// We know that they are legal. See LowerAdd in ISelLowering.
return LT.first;
+ case ISD::FNEG:
case ISD::FADD:
case ISD::FSUB:
+ // Increase the cost for half and bfloat types if not architecturally
+ // supported.
+ if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
+ (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
+ return 2 * LT.first;
+ if (!Ty->getScalarType()->isFP128Ty())
+ return LT.first;
+ LLVM_FALLTHROUGH;
case ISD::FMUL:
case ISD::FDIV:
- case ISD::FNEG:
// These nodes are marked as 'custom' just to lower them to SVE.
// We know said lowering will incur no additional cost.
if (!Ty->getScalarType()->isFP128Ty())
define void @fadd() {
; CHECK-LABEL: 'fadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fadd <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fadd <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fadd <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fadd <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fadd <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fadd <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd <vscale x 16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fadd <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fadd <vscale x 4 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%V4F16 = fadd <vscale x 4 x half> undef, undef
define void @fsub() {
; CHECK-LABEL: 'fsub'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fsub <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub <vscale x 16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fsub <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fsub <vscale x 4 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%V4F16 = fsub <vscale x 4 x half> undef, undef
define void @fneg() {
; CHECK-LABEL: 'fneg'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <vscale x 2 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <vscale x 4 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <vscale x 8 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <vscale x 16 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <vscale x 2 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <vscale x 4 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fneg <vscale x 8 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fneg <vscale x 2 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fneg <vscale x 4 x double> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg <vscale x 2 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fneg <vscale x 4 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fneg <vscale x 8 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg <vscale x 16 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg <vscale x 2 x float> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fneg <vscale x 4 x float> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg <vscale x 8 x float> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg <vscale x 2 x double> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fneg <vscale x 4 x double> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%V2F16 = fneg <vscale x 2 x half> undef
define i32 @fadd(i32 %arg) {
; CHECK-LABEL: 'fadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fadd <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fadd <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fadd <16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fadd <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fadd <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fadd <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = fadd half undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd <4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd <8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = fadd float undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd <2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd <4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fadd double undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fadd <2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fadd <4 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%F16 = fadd half undef, undef
define i32 @fsub(i32 %arg) {
; CHECK-LABEL: 'fsub'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fsub <16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = fsub half undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = fsub float undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fsub double undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fsub <2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fsub <4 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%F16 = fsub half undef, undef
define i32 @fneg_idiom(i32 %arg) {
; CHECK-LABEL: 'fneg_idiom'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half 0xH8000, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float -0.000000e+00, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double -0.000000e+00, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = fsub half 0xH8000, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = fsub float -0.000000e+00, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fsub double -0.000000e+00, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%F16 = fsub half -0.0, undef
define i32 @fneg(i32 %arg) {
; CHECK-LABEL: 'fneg'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <2 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <4 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <8 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <16 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fneg float undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <2 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <4 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fneg <8 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fneg double undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fneg <2 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fneg <4 x double> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = fneg half undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg <2 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fneg <4 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fneg <8 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg <16 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = fneg float undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg <2 x float> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fneg <4 x float> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg <8 x float> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fneg double undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg <2 x double> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fneg <4 x double> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%F16 = fneg half undef
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
-; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+fullfp16 < %s | FileCheck %s
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FP16
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+bf16 < %s | FileCheck %s --check-prefix=BF16
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
; CHECK-LABEL: 'strict_fp_reductions'
; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; FP16-LABEL: 'strict_fp_reductions'
+; FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BF16-LABEL: 'strict_fp_reductions'
+; BF16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
%fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
%fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; FP16-LABEL: 'fast_fp_reductions'
+; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BF16-LABEL: 'fast_fp_reductions'
+; BF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
%fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
; CHECK: cost of [[#div(511,VBITS)+1]] for instruction: %add512.i16 = add <32 x i16> undef, undef
; CHECK: cost of [[#div(511,VBITS)+1]] for instruction: %add512.i32 = add <16 x i32> undef, undef
; CHECK: cost of [[#div(511,VBITS)+1]] for instruction: %add512.i64 = add <8 x i64> undef, undef
-; CHECK: cost of [[#mul(div(511,VBITS)+1,2)]] for instruction: %add512.f16 = fadd <32 x half> undef, undef
-; CHECK: cost of [[#mul(div(511,VBITS)+1,2)]] for instruction: %add512.f32 = fadd <16 x float> undef, undef
-; CHECK: cost of [[#mul(div(511,VBITS)+1,2)]] for instruction: %add512.f64 = fadd <8 x double> undef, undef
+; CHECK: cost of [[#div(511,VBITS)+1]] for instruction: %add512.f16 = fadd <32 x half> undef, undef
+; CHECK: cost of [[#div(511,VBITS)+1]] for instruction: %add512.f32 = fadd <16 x float> undef, undef
+; CHECK: cost of [[#div(511,VBITS)+1]] for instruction: %add512.f64 = fadd <8 x double> undef, undef
%add512.i8 = add <64 x i8> undef, undef
%add512.i16 = add <32 x i16> undef, undef
%add512.i32 = add <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
define void @strict_fp_reductions(<vscale x 4 x float> %v0, <vscale x 4 x double> %v1) {
; CHECK-LABEL: 'strict_fp_reductions'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'strict_fp_reductions'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
define <vscale x 2 x double> @fadd_v2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; THRU-LABEL: 'fadd_v2f64'
-; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = fadd <vscale x 2 x double> %a, %b
+; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = fadd <vscale x 2 x double> %a, %b
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <vscale x 2 x double> %r
;
; LATE-LABEL: 'fadd_v2f64'
target triple="aarch64-unknown-linux-gnu"
-; CHECK-VF4: Found an estimated cost of 21 for VF 4 For instruction: %add = fadd float %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 42 for VF 8 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 1 for VF 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 2 for VF 8 For instruction: %add = fadd float %0, %sum.07
define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) {
entry:
}
-; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction: %add = fadd double %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 2 for VF 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 4 for VF 8 For instruction: %add = fadd double %0, %sum.07
define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) {
entry:
ret double %add
}
-; CHECK-VF4: Found an estimated cost of 23 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
-; CHECK-VF8: Found an estimated cost of 46 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF4: Found an estimated cost of 1 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF8: Found an estimated cost of 38 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
define float @fmuladd_strict32(ptr %a, ptr %b, i64 %n) {
entry:
declare float @llvm.fmuladd.f32(float, float, float)
-; CHECK-VF4: Found an estimated cost of 22 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
-; CHECK-VF8: Found an estimated cost of 44 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF4: Found an estimated cost of 4 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF8: Found an estimated cost of 8 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
define double @fmuladd_strict64(ptr %a, ptr %b, i64 %n) {
entry:
target triple="aarch64-unknown-linux-gnu"
-; CHECK: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
-; CHECK: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
+; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
}
-; CHECK: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
+; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: @wrap_mul4(
; CHECK-NEXT: [[TEMP:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TEMP1:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[MUL_I:%.*]] = fmul double [[TEMP]], [[TEMP1]]
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 0, i64 1
; CHECK-NEXT: [[TEMP2:%.*]] = load double, ptr [[ARRAYIDX5_I]], align 8
-; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B:%.*]], i64 1, i64 0
+; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 0
+; CHECK-NEXT: [[TEMP3:%.*]] = load double, ptr [[ARRAYIDX7_I]], align 8
+; CHECK-NEXT: [[MUL8_I:%.*]] = fmul double [[TEMP2]], [[TEMP3]]
+; CHECK-NEXT: [[ADD_I:%.*]] = fadd double [[MUL_I]], [[MUL8_I]]
+; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 1
+; CHECK-NEXT: [[TEMP4:%.*]] = load double, ptr [[ARRAYIDX13_I]], align 8
+; CHECK-NEXT: [[MUL14_I:%.*]] = fmul double [[TEMP]], [[TEMP4]]
+; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 1
+; CHECK-NEXT: [[TEMP5:%.*]] = load double, ptr [[ARRAYIDX18_I]], align 8
+; CHECK-NEXT: [[MUL19_I:%.*]] = fmul double [[TEMP2]], [[TEMP5]]
+; CHECK-NEXT: [[ADD20_I:%.*]] = fadd double [[MUL14_I]], [[MUL19_I]]
; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 2
+; CHECK-NEXT: [[TEMP6:%.*]] = load double, ptr [[ARRAYIDX25_I]], align 8
+; CHECK-NEXT: [[MUL26_I:%.*]] = fmul double [[TEMP]], [[TEMP6]]
; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 2
+; CHECK-NEXT: [[TEMP7:%.*]] = load double, ptr [[ARRAYIDX30_I]], align 8
+; CHECK-NEXT: [[MUL31_I:%.*]] = fmul double [[TEMP2]], [[TEMP7]]
+; CHECK-NEXT: [[ADD32_I:%.*]] = fadd double [[MUL26_I]], [[MUL31_I]]
+; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 3
+; CHECK-NEXT: [[TEMP8:%.*]] = load double, ptr [[ARRAYIDX37_I]], align 8
+; CHECK-NEXT: [[MUL38_I:%.*]] = fmul double [[TEMP]], [[TEMP8]]
+; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 3
+; CHECK-NEXT: [[TEMP9:%.*]] = load double, ptr [[ARRAYIDX42_I]], align 8
+; CHECK-NEXT: [[MUL43_I:%.*]] = fmul double [[TEMP2]], [[TEMP9]]
+; CHECK-NEXT: [[ADD44_I:%.*]] = fadd double [[MUL38_I]], [[MUL43_I]]
; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 0
; CHECK-NEXT: [[TEMP10:%.*]] = load double, ptr [[ARRAYIDX47_I]], align 8
+; CHECK-NEXT: [[MUL50_I:%.*]] = fmul double [[TEMP1]], [[TEMP10]]
; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 1
; CHECK-NEXT: [[TEMP11:%.*]] = load double, ptr [[ARRAYIDX52_I]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX7_I]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[SHUFFLE1]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
-; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[ARRAYIDX25_I]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = load <2 x double>, ptr [[ARRAYIDX30_I]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[SHUFFLE1]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[TMP13]], [[TMP16]]
-; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[OUT]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP17]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
+; CHECK-NEXT: [[MUL55_I:%.*]] = fmul double [[TEMP3]], [[TEMP11]]
+; CHECK-NEXT: [[ADD56_I:%.*]] = fadd double [[MUL50_I]], [[MUL55_I]]
+; CHECK-NEXT: [[MUL62_I:%.*]] = fmul double [[TEMP4]], [[TEMP10]]
+; CHECK-NEXT: [[MUL67_I:%.*]] = fmul double [[TEMP5]], [[TEMP11]]
+; CHECK-NEXT: [[ADD68_I:%.*]] = fadd double [[MUL62_I]], [[MUL67_I]]
+; CHECK-NEXT: [[MUL74_I:%.*]] = fmul double [[TEMP6]], [[TEMP10]]
+; CHECK-NEXT: [[MUL79_I:%.*]] = fmul double [[TEMP7]], [[TEMP11]]
+; CHECK-NEXT: [[ADD80_I:%.*]] = fadd double [[MUL74_I]], [[MUL79_I]]
+; CHECK-NEXT: [[MUL86_I:%.*]] = fmul double [[TEMP8]], [[TEMP10]]
+; CHECK-NEXT: [[MUL91_I:%.*]] = fmul double [[TEMP9]], [[TEMP11]]
+; CHECK-NEXT: [[ADD92_I:%.*]] = fadd double [[MUL86_I]], [[MUL91_I]]
+; CHECK-NEXT: store double [[ADD_I]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 1
+; CHECK-NEXT: store double [[ADD20_I]], ptr [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2]], align 8
+; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 2
+; CHECK-NEXT: store double [[ADD32_I]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
+; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 3
+; CHECK-NEXT: store double [[ADD44_I]], ptr [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6]], align 8
; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
-; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP2]], [[SHUFFLE4]]
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0
-; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP6]], [[SHUFFLE5]]
-; CHECK-NEXT: [[TMP23:%.*]] = fadd <2 x double> [[TMP20]], [[TMP22]]
-; CHECK-NEXT: store <2 x double> [[TMP23]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
+; CHECK-NEXT: store double [[ADD56_I]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
+; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 5
+; CHECK-NEXT: store double [[ADD68_I]], ptr [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10]], align 8
; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
-; CHECK-NEXT: [[TMP25:%.*]] = fmul <2 x double> [[TMP12]], [[SHUFFLE4]]
-; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP15]], [[SHUFFLE5]]
-; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP25]], [[TMP26]]
-; CHECK-NEXT: store <2 x double> [[TMP27]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
+; CHECK-NEXT: store double [[ADD80_I]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
+; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 7
+; CHECK-NEXT: store double [[ADD92_I]], ptr [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14]], align 8
; CHECK-NEXT: ret void
;
%temp = load double, ptr %A, align 8
%add0 = fadd double %f0, %f0
%add1 = fadd double %f1, %f1
%w1 = getelementptr inbounds double, ptr %w, i64 1
-; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 3
store double %add0, ptr %w, !dbg !9
store double %add1, ptr %w1
ret void