From 3a8ea8609b82b7e5401698b7c63df6680e1257a8 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sat, 12 Sep 2020 09:08:07 -0400
Subject: [PATCH] [Intrinsics] define semantics for experimental fmax/fmin vector reductions

As discussed on llvm-dev:
http://lists.llvm.org/pipermail/llvm-dev/2020-April/140729.html

This is hopefully the final remaining showstopper before we can remove
the 'experimental' from the reduction intrinsics.

No behavior was specified for the FP min/max reductions, so we have a
mess of different interpretations.

There are a few potential options for the semantics of these max/min ops.
I think this is the simplest based on current behavior/implementation:
make the reductions inherit from the existing llvm.maxnum/minnum intrinsics.
These correspond to libm fmax/fmin, and those are similar to the (now
deprecated?) IEEE-754 maxNum/minNum functions (NaNs are treated as missing
data). So the default expansion creates calls to libm functions.

Another option would be to inherit from llvm.maximum/minimum (NaNs propagate),
but most targets just crash in codegen when given those nodes because no
default expansion was ever implemented AFAICT.

We could also just assume 'nnan' semantics by default (we are already
assuming 'nsz' semantics in the maxnum/minnum intrinsics), but some targets
(AArch64, PowerPC) support the more defined behavior, so it doesn't make much
sense to not allow a tighter spec. Fast-math-flags (nnan) can be used to
loosen the semantics.

(Note that D67507 was proposed to update the LangRef to acknowledge the more
recent IEEE-754 2019 standard, but that patch seems to have stalled. If we do
update based on the new standard, the reduction instructions can seamlessly
inherit from whatever updates are made to the max/min intrinsics.)

x86 sees a regression here on 'nnan' tests because we have underlying,
longstanding bugs in FMF creation/propagation. Those need to be fixed apart
from this change (for example: https://llvm.org/PR35538).
The expansion sequence before this patch may not have been correct. Differential Revision: https://reviews.llvm.org/D87391 --- llvm/docs/LangRef.rst | 14 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4 - llvm/lib/CodeGen/ExpandReductions.cpp | 16 +- .../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 22 +- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 9 +- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 - .../Target/AArch64/AArch64TargetTransformInfo.h | 5 - llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 6 +- .../AArch64/vecreduce-fmax-legalization-nan.ll | 20 +- .../CodeGen/AArch64/vecreduce-fmax-legalization.ll | 2 +- .../Generic/expand-experimental-reductions.ll | 40 +- llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll | 1307 ++++++-------------- llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll | 30 +- llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll | 348 ++++-- llvm/test/CodeGen/X86/vector-reduce-fmax.ll | 1088 +++++++++++++--- llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll | 358 ++++-- llvm/test/CodeGen/X86/vector-reduce-fmin.ll | 1078 +++++++++++++--- 17 files changed, 2835 insertions(+), 1514 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 781b238..5e35b91 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15824,7 +15824,12 @@ The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating-point ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the ``nnan`` fast-math flag then the operation can +This instruction has the same comparison semantics as the '``llvm.maxnum.*``' +intrinsic. That is, the result will always be a number unless all elements of +the vector are NaN. For a vector with maximum element magnitude 0.0 and +containing both +0.0 and -0.0 elements, the sign of the result is unspecified. 
+ +If the intrinsic call has the ``nnan`` fast-math flag, then the operation can assume that NaNs are not present in the input vector. Arguments: @@ -15850,7 +15855,12 @@ The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating-point ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the ``nnan`` fast-math flag then the operation can +This instruction has the same comparison semantics as the '``llvm.minnum.*``' +intrinsic. That is, the result will always be a number unless all elements of +the vector are NaN. For a vector with minimum element magnitude 0.0 and +containing both +0.0 and -0.0 elements, the sign of the result is unspecified. + +If the intrinsic call has the ``nnan`` fast-math flag, then the operation can assume that NaNs are not present in the input vector. Arguments: diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 2b72dc3..d5c0b83 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1349,13 +1349,9 @@ public: break; case Intrinsic::minnum: ISDs.push_back(ISD::FMINNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMINIMUM); break; case Intrinsic::maxnum: ISDs.push_back(ISD::FMAXNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMAXIMUM); break; case Intrinsic::copysign: ISDs.push_back(ISD::FCOPYSIGN); diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp index 45f21c1..dfaaafa 100644 --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -143,12 +143,24 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { case Intrinsic::experimental_vector_reduce_smax: case Intrinsic::experimental_vector_reduce_smin: case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case 
Intrinsic::experimental_vector_reduce_umin: { + Value *Vec = II->getArgOperand(0); + if (!isPowerOf2_32( + cast(Vec->getType())->getNumElements())) + continue; + + Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); + break; + } case Intrinsic::experimental_vector_reduce_fmax: case Intrinsic::experimental_vector_reduce_fmin: { + // FIXME: We only expand 'fast' reductions here because the underlying + // code in createMinMaxOp() assumes that comparisons use 'fast' + // semantics. Value *Vec = II->getArgOperand(0); if (!isPowerOf2_32( - cast(Vec->getType())->getNumElements())) + cast(Vec->getType())->getNumElements()) || + !FMF.isFast()) continue; Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 764472e..509ae2c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2146,7 +2146,6 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { EVT LoOpVT, HiOpVT; std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); - bool NoNaN = N->getFlags().hasNoNaNs(); unsigned CombineOpc = 0; switch (N->getOpcode()) { case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; @@ -2160,12 +2159,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; - case ISD::VECREDUCE_FMAX: - CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM; - break; - case ISD::VECREDUCE_FMIN: - CombineOpc = NoNaN ? 
ISD::FMINNUM : ISD::FMINIMUM; - break; + case ISD::VECREDUCE_FMAX: CombineOpc = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: CombineOpc = ISD::FMINNUM; break; default: llvm_unreachable("Unexpected reduce ISD node"); } @@ -4771,6 +4766,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { EVT OrigVT = N->getOperand(0).getValueType(); EVT WideVT = Op.getValueType(); EVT ElemVT = OrigVT.getVectorElementType(); + SDNodeFlags Flags = N->getFlags(); SDValue NeutralElem; switch (N->getOpcode()) { @@ -4802,12 +4798,18 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { NeutralElem = DAG.getConstantFP(1.0, dl, ElemVT); break; case ISD::VECREDUCE_FMAX: + // This has maxnum semantics, so NaN represents missing data. We must clear + // 'nnan' if it was set because the NaN would be a poison value. NeutralElem = DAG.getConstantFP( - -std::numeric_limits::infinity(), dl, ElemVT); + std::numeric_limits::quiet_NaN(), dl, ElemVT); + Flags.setNoNaNs(false); break; case ISD::VECREDUCE_FMIN: + // This has minnum semantics, so NaN represents missing data. We must clear + // 'nnan' if it was set because the NaN would be a poison value. 
NeutralElem = DAG.getConstantFP( - std::numeric_limits::infinity(), dl, ElemVT); + std::numeric_limits::quiet_NaN(), dl, ElemVT); + Flags.setNoNaNs(false); break; } @@ -4818,7 +4820,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem, DAG.getVectorIdxConstant(Idx, dl)); - return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, N->getFlags()); + return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, Flags); } SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a80ca04..ea2344e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7934,7 +7934,6 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result, SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const { SDLoc dl(Node); - bool NoNaN = Node->getFlags().hasNoNaNs(); unsigned BaseOpcode = 0; switch (Node->getOpcode()) { default: llvm_unreachable("Expected VECREDUCE opcode"); @@ -7949,12 +7948,8 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMIN: BaseOpcode = ISD::SMIN; break; case ISD::VECREDUCE_UMAX: BaseOpcode = ISD::UMAX; break; case ISD::VECREDUCE_UMIN: BaseOpcode = ISD::UMIN; break; - case ISD::VECREDUCE_FMAX: - BaseOpcode = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM; - break; - case ISD::VECREDUCE_FMIN: - BaseOpcode = NoNaN ? 
ISD::FMINNUM : ISD::FMINIMUM; - break; + case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; } SDValue Op = Node->getOperand(0); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d4f3244..6745b84 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9529,14 +9529,12 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, case ISD::VECREDUCE_UMIN: return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); case ISD::VECREDUCE_FMAX: { - assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), Op.getOperand(0)); } case ISD::VECREDUCE_FMIN: { - assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 05b7f70..3c3a246 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -223,11 +223,6 @@ public: // We don't have legalization support for ordered FP reductions. return !II->getFastMathFlags().allowReassoc(); - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: - // Lowering asserts that there are no NaNs. - return !II->getFastMathFlags().noNaNs(); - default: // Don't expand anything else, let legalization deal with it. 
return false; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index cc2019b..508bb9e 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -201,10 +201,8 @@ public: case Intrinsic::experimental_vector_reduce_fmin: case Intrinsic::experimental_vector_reduce_fmax: - // Can't legalize reductions with soft floats, and NoNan will create - // fminimum which we do not know how to lower. - return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs() || - !II->getFastMathFlags().noNaNs(); + // Can't legalize reductions with soft floats. + return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs(); default: // Don't expand anything else, let legalization deal with it. diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll index 4d88831..514a43a 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -54,19 +54,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind { define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 // =48 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill -; CHECK-NEXT: bl __gttf2 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: cmp w0, #0 // =0 -; CHECK-NEXT: b.le .LBB4_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 // =48 -; CHECK-NEXT: ret +; CHECK-NEXT: b fmaxl %b = call fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -77,11 +65,7 @@ define float @test_v16f32(<16 x float> %a) nounwind { ; CHECK-NEXT: fmaxnm v1.4s, v1.4s, 
v3.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v2.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.2d, v0.d[1] -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.4s, v0.s[1] -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret %b = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) ret float %b diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index 975ba26..7d6d424d 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -47,7 +47,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind { define float @test_v3f32(<3 x float> %a) nounwind { ; CHECK-LABEL: test_v3f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-8388608 +; CHECK-NEXT: mov w8, #2143289344 ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fmaxnmv s0, v0.4s diff --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll index 11abf90..e0e3149 100644 --- a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll +++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -93,8 +93,8 @@ define float @fadd_f32(<4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float 0.000000e+00, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float 0.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) @@ 
-109,8 +109,8 @@ define float @fadd_f32_accum(float %accum, <4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float %accum, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) @@ -161,8 +161,8 @@ define float @fmul_f32(<4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float 1.000000e+00, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float 1.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec) @@ -177,8 +177,8 @@ define float @fmul_f32_accum(float %accum, <4 x float> %vec) { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float %accum, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) @@ -277,40 +277,40 @@ 
entry: ret i64 %r } +; FIXME: Expand using maxnum intrinsic? + define double @fmax_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmax_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 -; CHECK-NEXT: ret double [[TMP0]] +; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: ret double [[R]] ; entry: %r = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %vec) ret double %r } +; FIXME: Expand using minnum intrinsic? + define double @fmin_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmin_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 -; CHECK-NEXT: ret double [[TMP0]] +; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: ret double [[R]] ; entry: %r = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %vec) ret double %r } +; FIXME: Why is this not expanded? + ; Test when the vector size is not power of two. 
define i8 @test_v3i8(<3 x i8> %a) nounwind { ; CHECK-LABEL: @test_v3i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: %b = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a) -; CHECK-NEXT: ret i8 %b +; CHECK-NEXT: [[B:%.*]] = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]]) +; CHECK-NEXT: ret i8 [[B]] ; entry: %b = call i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll index 6936b7e..a83fa68 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -2,30 +2,11 @@ ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP -; FIXME minnum nonan X, +Inf -> X ? 
define arm_aapcs_vfpcc float @fmin_v2f32(<2 x float> %x) { -; CHECK-FP-LABEL: fmin_v2f32: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s4, .LCPI0_0 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI0_0: -; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf -; -; CHECK-NOFP-LABEL: fmin_v2f32: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s4, .LCPI0_0 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI0_0: -; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-LABEL: fmin_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z @@ -99,17 +80,8 @@ define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) { ; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI3_0 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI3_0: -; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z @@ -237,23 +209,11 @@ entry: ret double %z } -; FIXME should not be vminnm -; FIXME better reductions (no vmovs/vdups) define arm_aapcs_vfpcc float @fmin_v2f32_nofast(<2 x float> %x) { -; CHECK-FP-LABEL: fmin_v2f32_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; 
CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmin_v2f32_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmin_v2f32_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z @@ -262,28 +222,16 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmin_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) @@ -294,38 +242,20 @@ define arm_aapcs_vfpcc float @fmin_v8f32_nofast(<8 x float> %x) { ; 
CHECK-FP-LABEL: fmin_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s8, s10 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s12, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f32 s2, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) @@ -335,30 +265,20 @@ entry: define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry 
-; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r0, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -368,47 +288,26 @@ entry: define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) { ; CHECK-FP-LABEL: fmin_v8f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: 
vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -419,73 +318,38 @@ define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) { ; CHECK-FP-LABEL: fmin_v16f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: 
vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 -; 
CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -504,9 +368,7 @@ entry: define arm_aapcs_vfpcc double @fmin_v2f64_nofast(<2 x double> %x) { ; CHECK-LABEL: fmin_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vminnm.f64 d0, 
d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) @@ -516,15 +378,9 @@ entry: define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmin_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d3, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d4, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d4 +; CHECK-NEXT: vminnm.f64 d4, d1, d3 +; CHECK-NEXT: vminnm.f64 d0, d0, d2 +; CHECK-NEXT: vminnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) @@ -532,30 +388,11 @@ entry: } define arm_aapcs_vfpcc float @fmin_v2f32_acc(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmin_v2f32_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s6, .LCPI18_0 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI18_0: -; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf -; -; CHECK-NOFP-LABEL: fmin_v2f32_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s6, .LCPI18_0 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI18_0: -; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-LABEL: fmin_v2f32_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vminnm.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp fast olt float 
%y, %z @@ -641,20 +478,11 @@ define arm_aapcs_vfpcc void @fmin_v4f16_acc(<4 x half> %x, half* %yy) { ; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI21_0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0 ; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI21_0: -; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -665,34 +493,14 @@ entry: } define arm_aapcs_vfpcc void @fmin_v2f16_acc(<2 x half> %x, half* %yy) { -; CHECK-FP-LABEL: fmin_v2f16_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0 -; CHECK-FP-NEXT: vstr.16 s0, [r0] -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmin_v2f16_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI22_0 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI22_0: -; CHECK-NOFP-NEXT: .short 0x7c00 @ half 
+Inf +; CHECK-LABEL: fmin_v2f16_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x) @@ -854,25 +662,13 @@ entry: } define arm_aapcs_vfpcc float @fmin_v2f32_acc_nofast(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmin_v2f32_acc_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vcmp.f32 s0, s4 -; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmin_v2f32_acc_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmin_v2f32_acc_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vcmp.f32 s0, s4 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vselgt.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp olt float %y, %z @@ -883,12 +679,9 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d4, d1 -; CHECK-FP-NEXT: vmov.f32 s9, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 ; 
CHECK-FP-NEXT: vcmp.f32 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -896,17 +689,9 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d4, d1 -; CHECK-NOFP-NEXT: vmov.f32 s9, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s6, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 -; CHECK-NOFP-NEXT: vcmp.f32 s6, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -922,12 +707,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vcmp.f32 s0, s8 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -935,27 +717,13 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s10, s12 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s14 -; CHECK-NOFP-NEXT: vcmp.f32 s2, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s12, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s10, s12 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s10, s0 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -970,35 +738,26 @@ entry: define arm_aapcs_vfpcc void @fmin_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: 
fmin_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1016,52 +775,32 @@ entry: define arm_aapcs_vfpcc void @fmin_v8f16_acc_nofast(<8 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; 
CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1080,78 +819,44 @@ define arm_aapcs_vfpcc void @fmin_v16f16_acc_nofast(<16 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; 
CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: 
vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1183,9 +888,7 @@ entry: define arm_aapcs_vfpcc double @fmin_v2f64_acc_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: 
fmin_v2f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d0, d2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d2, d0 @@ -1200,15 +903,9 @@ entry: define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmin_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d3, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d5, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d5 +; CHECK-NEXT: vminnm.f64 d5, d1, d3 +; CHECK-NEXT: vminnm.f64 d0, d0, d2 +; CHECK-NEXT: vminnm.f64 d0, d0, d5 ; CHECK-NEXT: vcmp.f64 d0, d4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 @@ -1221,28 +918,10 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32(<2 x float> %x) { -; CHECK-FP-LABEL: fmax_v2f32: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s4, .LCPI37_0 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI37_0: -; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf -; -; CHECK-NOFP-LABEL: fmax_v2f32: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s4, .LCPI37_0 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI37_0: -; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf +; CHECK-LABEL: fmax_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call fast float 
@llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z @@ -1315,17 +994,8 @@ define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) { ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI40_0 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI40_0: -; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z @@ -1454,20 +1124,10 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32_nofast(<2 x float> %x) { -; CHECK-FP-LABEL: fmax_v2f32_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmax_v2f32_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s0, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmax_v2f32_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z @@ -1476,28 +1136,16 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmax_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed 
$q0 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) @@ -1508,38 +1156,20 @@ define arm_aapcs_vfpcc float @fmax_v8f32_nofast(<8 x float> %x) { ; CHECK-FP-LABEL: fmax_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, 
fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s10, s8 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s12 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s2 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) @@ -1549,30 +1179,20 @@ entry: define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r0, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -1582,47 +1202,26 @@ entry: define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) { ; CHECK-FP-LABEL: fmax_v8f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, 
s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1633,73 +1232,38 @@ define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) { ; CHECK-FP-LABEL: fmax_v16f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: 
vmaxnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1718,9 +1282,7 @@ entry: define arm_aapcs_vfpcc double @fmax_v2f64_nofast(<2 x double> %x) { ; CHECK-LABEL: fmax_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d0, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) @@ -1730,15 +1292,9 @@ entry: define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmax_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d0, d4 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d4 +; CHECK-NEXT: vmaxnm.f64 d4, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) @@ -1746,30 +1302,11 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32_acc(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmax_v2f32_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vldr s6, .LCPI55_0 -; 
CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; CHECK-FP-NEXT: .p2align 2 -; CHECK-FP-NEXT: @ %bb.1: -; CHECK-FP-NEXT: .LCPI55_0: -; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf -; -; CHECK-NOFP-LABEL: fmax_v2f32_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vldr s6, .LCPI55_0 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 2 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI55_0: -; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf +; CHECK-LABEL: fmax_v2f32_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vmaxnm.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp fast ogt float %y, %z @@ -1837,34 +1374,14 @@ entry: } define arm_aapcs_vfpcc void @fmax_v2f16_acc(<2 x half> %x, half* %yy) { -; CHECK-FP-LABEL: fmax_v2f16_acc: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0 -; CHECK-FP-NEXT: vstr.16 s0, [r0] -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmax_v2f16_acc: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI58_0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0 -; CHECK-NOFP-NEXT: vstr.16 s0, [r0] -; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; 
CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI58_0: -; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf +; CHECK-LABEL: fmax_v2f16_acc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x) @@ -1893,20 +1410,11 @@ define arm_aapcs_vfpcc void @fmax_v4f16_acc(<4 x half> %x, half* %yy) { ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 ; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI59_0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0 ; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr -; CHECK-NOFP-NEXT: .p2align 1 -; CHECK-NOFP-NEXT: @ %bb.1: -; CHECK-NOFP-NEXT: .LCPI59_0: -; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -2068,25 +1576,13 @@ entry: } define arm_aapcs_vfpcc float @fmax_v2f32_acc_nofast(<2 x float> %x, float %y) { -; CHECK-FP-LABEL: fmax_v2f32_acc_nofast: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vcmp.f32 s4, s0 -; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmax_v2f32_acc_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s0, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 -; 
CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 -; CHECK-NOFP-NEXT: bx lr +; CHECK-LABEL: fmax_v2f32_acc_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vcmp.f32 s4, s0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: vselgt.f32 s0, s4, s0 +; CHECK-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp ogt float %y, %z @@ -2097,12 +1593,9 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d4, d1 -; CHECK-FP-NEXT: vmov.f32 s9, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 ; CHECK-FP-NEXT: vcmp.f32 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -2110,17 +1603,9 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d4, d1 -; CHECK-NOFP-NEXT: vmov.f32 s9, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 -; CHECK-NOFP-NEXT: vselgt.f32 s6, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -2136,12 +1621,9 @@ define 
arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vcmp.f32 s8, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -2149,27 +1631,13 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s12, s10 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s14 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s14 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s2 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s12, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s10, s12 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s10, s0 ; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 ; 
CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -2184,35 +1652,26 @@ entry: define arm_aapcs_vfpcc void @fmax_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2230,52 +1689,32 @@ entry: define arm_aapcs_vfpcc void @fmax_v8f16_acc_nofast(<8 x 
half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; 
CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2294,78 +1733,44 @@ define arm_aapcs_vfpcc void @fmax_v16f16_acc_nofast(<16 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; 
CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2397,9 +1802,7 @@ entry: define arm_aapcs_vfpcc double @fmax_v2f64_acc_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fmax_v2f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d0, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d2, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d2, d0 @@ -2414,15 +1817,9 @@ entry: define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmax_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d0, d5 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d5 +; CHECK-NEXT: vmaxnm.f64 d5, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d5 ; CHECK-NEXT: vcmp.f64 d4, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 diff --git 
a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll index 64a76f3..382c32d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -1512,13 +1512,10 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB15_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vminnm.f32 q0, q0, q1 ; CHECK-NEXT: beq .LBB15_9 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -1526,10 +1523,10 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB15_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r0!, {s4} -; CHECK-NEXT: vcmp.f32 s0, s4 +; CHECK-NEXT: vldmia r0!, {s2} +; CHECK-NEXT: vcmp.f32 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselge.f32 s0, s4, s0 +; CHECK-NEXT: vselge.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup ; CHECK-NEXT: vmov r0, s0 @@ -1620,13 +1617,10 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB16_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 ; CHECK-NEXT: beq .LBB16_9 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1 ; CHECK-NEXT: 
sub.w lr, r1, r2 @@ -1634,10 +1628,10 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r0!, {s4} -; CHECK-NEXT: vcmp.f32 s4, s0 +; CHECK-NEXT: vldmia r0!, {s2} +; CHECK-NEXT: vcmp.f32 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselge.f32 s0, s4, s0 +; CHECK-NEXT: vselge.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB16_8 ; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup ; CHECK-NEXT: vmov r0, s0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll index e2025be..d304a92 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -13,27 +13,46 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, 
%xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 @@ -43,35 +62,45 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE2-NEXT: maxss %xmm3, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm0 ; SSE2-NEXT: maxss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: maxss %xmm3, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm0 ; SSE41-NEXT: maxss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, 
%xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) @@ -82,43 +111,67 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup 
{{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm5, %xmm0, 
%xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) @@ -131,12 +184,16 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: maxps %xmm3, %xmm1 ; SSE2-NEXT: maxps %xmm2, %xmm0 ; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -144,35 +201,69 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: maxps %xmm3, %xmm1 ; SSE41-NEXT: maxps %xmm2, %xmm0 ; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: 
vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm12 = xmm5[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm15 = xmm3[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = 
xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm15, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm14, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm12, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm11, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm9, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) @@ -206,6 +297,76 @@ define double @test_v2f64(<2 x double> %a0) { ret double %1 } +define double @test_v3f64(<3 x double> %a0) { +; SSE2-LABEL: test_v3f64: +; SSE2: # %bb.0: +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE2-NEXT: movapd %xmm2, %xmm1 +; SSE2-NEXT: maxpd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v3f64: +; SSE41: # %bb.0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxpd %xmm0, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd 
{{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v3f64: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v3f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double> %a0) + ret double %1 +} + define double @test_v4f64(<4 x double> %a0) { ; SSE-LABEL: test_v4f64: ; SSE: # %bb.0: @@ -218,18 +379,22 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, 
%xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) @@ -250,21 +415,31 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) @@ -274,12 +449,12 @@ define double @test_v8f64(<8 x double> %a0) { define double @test_v16f64(<16 x double> %a0) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm6, %xmm2 -; SSE-NEXT: maxpd %xmm4, %xmm0 -; SSE-NEXT: maxpd %xmm2, %xmm0 ; SSE-NEXT: maxpd %xmm7, %xmm3 ; SSE-NEXT: maxpd %xmm5, %xmm1 ; SSE-NEXT: maxpd %xmm3, %xmm1 +; SSE-NEXT: maxpd %xmm6, %xmm2 +; SSE-NEXT: maxpd %xmm4, %xmm0 +; SSE-NEXT: maxpd %xmm2, %xmm0 ; SSE-NEXT: maxpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -291,22 +466,32 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = 
xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) @@ -319,6 +504,7 @@ declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double>) declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index d3b17d2..c5e025b 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -10,69 +10,225 @@ ; vXf32 ; +define float @test_v1f32(<1 x float> %a0) { +; ALL-LABEL: test_v1f32: +; ALL: # %bb.0: +; ALL-NEXT: retq + %1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a0) + ret float %1 +} + define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup 
{{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 } +define float @test_v3f32(<3 x float> %a0) { +; SSE2-LABEL: test_v3f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v3f32: +; SSE41: # 
%bb.0: +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v3f32: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v3f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: retq + %1 = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a0) + ret float %1 +} + define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: 
movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: maxss %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: andnps %xmm3, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andps %xmm3, %xmm4 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE41-NEXT: andnps %xmm3, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm1, %xmm3 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; 
SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmaxss %xmm4, %xmm2, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 @@ -81,46 +237,170 @@ define float @test_v4f32(<4 x float> %a0) { define float @test_v8f32(<8 x float> %a0) { ; 
SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxps %xmm0, %xmm2 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: 
movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: andnps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, 
%xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vmaxss %xmm1, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, 
%k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vmaxss %xmm2, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -128,53 +408,259 @@ define float @test_v8f32(<8 x float> %a0) { define float @test_v16f32(<16 x float> 
%a0) { ; SSE2-LABEL: test_v16f32: ; SSE2: # %bb.0: -; SSE2-NEXT: maxps %xmm3, %xmm1 -; SSE2-NEXT: maxps %xmm2, %xmm0 -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: maxps %xmm0, %xmm4 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm4, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: maxps %xmm1, %xmm2 +; SSE2-NEXT: cmpunordps %xmm1, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: -; SSE41-NEXT: maxps %xmm3, %xmm1 -; SSE41-NEXT: maxps 
%xmm2, %xmm0 -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm4 +; SSE41-NEXT: maxps %xmm0, %xmm4 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: maxps %xmm1, %xmm2 +; SSE41-NEXT: cmpunordps %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxps %xmm4, %xmm1 +; SSE41-NEXT: cmpunordps %xmm4, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; 
AVX-NEXT: vmaxps %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v16f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; 
AVX512BW-LABEL: test_v16f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm3 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; 
AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; 
AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm7 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1 +; AVX512VL-NEXT: vmaxss %xmm7, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm16, %xmm0 +; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm0 +; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm14, %xmm0 +; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm13, %xmm0 +; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm12, %xmm0 +; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm11, %xmm0 +; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: 
vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -186,50 +672,106 @@ define float @test_v16f32(<16 x float> %a0) { define double @test_v2f64(<2 x double> %a0) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 +; SSE-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: andpd %xmm2, %xmm3 +; SSE-NEXT: maxsd %xmm0, %xmm2 +; SSE-NEXT: andnpd %xmm2, %xmm1 +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0) ret double %1 } define double @test_v4f64(<4 x double> %a0) { -; SSE-LABEL: test_v4f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v4f64: +; SSE2: # 
%bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: maxpd %xmm0, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm1, %xmm3 +; SSE41-NEXT: maxsd %xmm2, %xmm1 +; SSE41-NEXT: andnpd %xmm1, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, 
%xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmaxsd %xmm4, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) @@ -237,83 +779,325 @@ define double @test_v4f64(<4 x double> %a0) { } define double @test_v8f64(<8 x double> %a0) { -; SSE-LABEL: test_v8f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm3, %xmm1 -; SSE-NEXT: maxpd %xmm2, %xmm0 -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: maxpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: 
maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: maxpd %xmm0, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: maxpd %xmm1, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxpd %xmm4, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; 
-; AVX512-LABEL: test_v8f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vmaxsd %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = 
xmm1[1,0] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vmaxsd %xmm2, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) ret double %1 } define double @test_v16f64(<16 x double> %a0) { -; SSE-LABEL: test_v16f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm6, %xmm2 -; SSE-NEXT: maxpd %xmm4, %xmm0 -; SSE-NEXT: maxpd %xmm2, %xmm0 -; SSE-NEXT: maxpd %xmm7, %xmm3 -; SSE-NEXT: maxpd %xmm5, %xmm1 -; SSE-NEXT: maxpd %xmm3, %xmm1 -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm4, %xmm8 +; SSE2-NEXT: maxpd %xmm0, %xmm8 +; 
SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm4 +; SSE2-NEXT: andnpd %xmm8, %xmm0 +; SSE2-NEXT: orpd %xmm4, %xmm0 +; SSE2-NEXT: movapd %xmm6, %xmm4 +; SSE2-NEXT: maxpd %xmm2, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm6 +; SSE2-NEXT: andnpd %xmm4, %xmm2 +; SSE2-NEXT: orpd %xmm6, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: maxpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm5, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm5 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm5, %xmm1 +; SSE2-NEXT: movapd %xmm7, %xmm2 +; SSE2-NEXT: maxpd %xmm3, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE2-NEXT: andpd %xmm3, %xmm7 +; SSE2-NEXT: andnpd %xmm2, %xmm3 +; SSE2-NEXT: orpd %xmm7, %xmm3 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: maxpd %xmm0, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: maxpd 
%xmm2, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: maxpd %xmm3, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: maxpd %xmm1, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: maxpd %xmm8, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: maxpd %xmm3, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: maxpd %xmm2, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %ymm0, %ymm2, %ymm4 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0 +; AVX-NEXT: vmaxpd %ymm1, %ymm3, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: 
vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxpd %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovapd %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, 
%k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } +declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>) declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float>) declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll index f25852f..28e8127 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -10,68 +10,176 @@ ; vXf32 ; +define float @test_v1f32(<1 x float> %a0) { +; ALL-LABEL: test_v1f32: +; ALL: # %bb.0: +; ALL-NEXT: retq + %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a0) + ret float %1 +} + define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; 
SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } +define float @test_v3f32(<3 x float> %a0) { +; SSE2-LABEL: test_v3f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; 
SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v3f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v3f32: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v3f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vminss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: retq + %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a0) + ret float %1 +} + define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE2-NEXT: minss %xmm3, %xmm0 +; SSE2-NEXT: minss %xmm2, %xmm0 ; SSE2-NEXT: minss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: minss %xmm3, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm0 ; SSE41-NEXT: minss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, 
%xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) @@ -82,43 +190,67 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: 
vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm5, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) @@ -131,12 +263,16 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: minps %xmm3, %xmm1 ; SSE2-NEXT: minps %xmm2, %xmm0 ; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, 
%xmm0 +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -144,35 +280,69 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: minps %xmm3, %xmm1 ; SSE41-NEXT: minps %xmm2, %xmm0 ; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: ; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: 
vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm12 = xmm5[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm15 = xmm3[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm15, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm14, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm12, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm11, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm9, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) @@ 
-218,18 +388,22 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) @@ -250,21 +424,31 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; 
AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) @@ -274,12 +458,12 @@ define double @test_v8f64(<8 x double> %a0) { define double @test_v16f64(<16 x double> %a0) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm6, %xmm2 -; SSE-NEXT: minpd %xmm4, %xmm0 -; SSE-NEXT: minpd %xmm2, %xmm0 ; SSE-NEXT: minpd %xmm7, %xmm3 ; SSE-NEXT: minpd %xmm5, %xmm1 ; SSE-NEXT: minpd %xmm3, %xmm1 +; SSE-NEXT: minpd %xmm6, %xmm2 +; SSE-NEXT: minpd %xmm4, %xmm0 +; SSE-NEXT: minpd %xmm2, %xmm0 ; SSE-NEXT: minpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -291,29 +475,41 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vminpd %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vminpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: 
vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) ret double %1 } +declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>) declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float>) declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll index d6c681f..1d7436e 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -13,27 +13,46 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps 
%xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 @@ -42,37 +61,95 @@ define float @test_v2f32(<2 x float> %a0) { define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: 
cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: andnps %xmm3, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andps %xmm3, %xmm4 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE41-NEXT: andnps %xmm3, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm1, %xmm3 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, 
%xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vminss %xmm4, %xmm2, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) ret float %1 @@ -81,46 +158,170 @@ define float @test_v4f32(<4 x float> %a0) { define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: 
unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: minps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: minps %xmm0, %xmm2 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm3 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: andnps %xmm1, %xmm0 
+; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm0, %xmm7, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0 +; AVX-NEXT: 
vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminss %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vminss %xmm1, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; 
AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vminss %xmm2, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -128,53 +329,259 @@ define float @test_v8f32(<8 x float> %a0) { define float @test_v16f32(<16 x float> %a0) { ; SSE2-LABEL: test_v16f32: ; SSE2: # %bb.0: -; SSE2-NEXT: minps %xmm3, %xmm1 -; SSE2-NEXT: minps 
%xmm2, %xmm0 -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: minps %xmm0, %xmm4 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm4, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: minps %xmm1, %xmm2 +; SSE2-NEXT: cmpunordps %xmm1, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: minps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: -; SSE41-NEXT: minps %xmm3, %xmm1 -; SSE41-NEXT: minps %xmm2, %xmm0 -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd 
{{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm4 +; SSE41-NEXT: minps %xmm0, %xmm4 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: minps %xmm1, %xmm2 +; SSE41-NEXT: cmpunordps %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: minps %xmm4, %xmm1 +; SSE41-NEXT: cmpunordps %xmm4, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvps 
%ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v16f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: 
vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm3 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpermilps 
{{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm7 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 
{%k1} +; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1 +; AVX512VL-NEXT: vminss %xmm7, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm16, %xmm0 +; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm0 +; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm14, %xmm0 +; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm13, %xmm0 +; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm12, %xmm0 +; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm11, %xmm0 +; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 +; 
AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) ret float %1 } @@ -186,50 +593,176 @@ define float @test_v16f32(<16 x float> %a0) { define double @test_v2f64(<2 x double> %a0) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 +; SSE-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: andpd %xmm2, %xmm3 +; SSE-NEXT: minsd %xmm0, %xmm2 +; SSE-NEXT: andnpd %xmm2, %xmm1 +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } +define double @test_v3f64(<3 x double> %a0) { +; SSE2-LABEL: test_v3f64: +; SSE2: # %bb.0: +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE2-NEXT: movapd %xmm2, %xmm1 +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v3f64: +; SSE41: # %bb.0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: minpd %xmm0, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: minsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v3f64: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v3f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double> %a0) + ret 
double %1 +} + define double @test_v4f64(<4 x double> %a0) { -; SSE-LABEL: test_v4f64: -; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: minpd %xmm0, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm1, %xmm3 +; SSE41-NEXT: minsd %xmm2, %xmm1 +; SSE41-NEXT: andnpd %xmm1, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, 
%xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1 +; AVX512-NEXT: vminsd %xmm4, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) @@ -237,76 +770,316 @@ define double @test_v4f64(<4 x double> %a0) { } define double @test_v8f64(<8 x double> %a0) { -; SSE-LABEL: test_v8f64: -; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm3, %xmm1 -; SSE-NEXT: minpd %xmm2, %xmm0 -; SSE-NEXT: minpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: minpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: minpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd 
%xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: minpd %xmm0, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: minpd %xmm1, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: minpd %xmm4, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: minsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vminsd %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd 
%xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vminsd %xmm2, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } define double @test_v16f64(<16 x double> %a0) { -; SSE-LABEL: test_v16f64: -; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm6, %xmm2 -; SSE-NEXT: minpd %xmm4, %xmm0 -; SSE-NEXT: minpd %xmm2, %xmm0 -; SSE-NEXT: minpd %xmm7, %xmm3 -; SSE-NEXT: minpd %xmm5, %xmm1 -; 
SSE-NEXT: minpd %xmm3, %xmm1 -; SSE-NEXT: minpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm4, %xmm8 +; SSE2-NEXT: minpd %xmm0, %xmm8 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm4 +; SSE2-NEXT: andnpd %xmm8, %xmm0 +; SSE2-NEXT: orpd %xmm4, %xmm0 +; SSE2-NEXT: movapd %xmm6, %xmm4 +; SSE2-NEXT: minpd %xmm2, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm6 +; SSE2-NEXT: andnpd %xmm4, %xmm2 +; SSE2-NEXT: orpd %xmm6, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: minpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm5, %xmm2 +; SSE2-NEXT: minpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm5 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm5, %xmm1 +; SSE2-NEXT: movapd %xmm7, %xmm2 +; SSE2-NEXT: minpd %xmm3, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE2-NEXT: andpd %xmm3, %xmm7 +; SSE2-NEXT: andnpd %xmm2, %xmm3 +; SSE2-NEXT: orpd %xmm7, %xmm3 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: minpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; 
SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: minpd %xmm0, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: minpd %xmm2, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: minpd %xmm3, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: minpd %xmm1, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: minpd %xmm8, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: minpd %xmm3, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: minpd %xmm2, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: minsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vminpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vminpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd 
%ymm0, %ymm2, %ymm4 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0 +; AVX-NEXT: vminpd %ymm1, %ymm3, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminpd %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovapd %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: 
vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) @@ -319,6 +1092,7 @@ declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double>) declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) -- 2.7.4