From: dfukalov
Date: Thu, 22 Oct 2020 16:38:56 +0000 (+0300)
Subject: [AMDGPU][CostModel] Refine cost model for half- and quarter-rate instructions.
X-Git-Tag: llvmorg-13-init~8256
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9068c209655efc597b31b23fc41630d82c5b98a4;p=platform%2Fupstream%2Fllvm.git

[AMDGPU][CostModel] Refine cost model for half- and quarter-rate instructions.

1. Throughput and code-size cost estimations were separated and updated.
2. Updated fdiv cost estimation for different cases.
3. Added scalarization processing for types that are treated as !isSimple(),
   to improve code-size estimation in getArithmeticInstrCost() and
   getIntrinsicInstrCost(). The code was borrowed from the TCK_RecipThroughput
   path of the base implementation.

The next step is to unify the scalarization part in the base class, which
currently works for the TCK_RecipThroughput path only.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D89973
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 31585ed..2186739 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -472,9 +472,50 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // FIXME: We're having to query the throughput cost so that the basic
     // implementation tries to generate legalize and scalarization costs. Maybe
     // we could hoist the scalarization code here?
-    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
-                                         Opd1Info, Opd2Info, Opd1PropInfo,
-                                         Opd2PropInfo, Args, CxtI);
+    if (CostKind != TTI::TCK_CodeSize)
+      return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
+                                           Opd1Info, Opd2Info, Opd1PropInfo,
+                                           Opd2PropInfo, Args, CxtI);
+    // Scalarization
+
+    // Check if any of the operands are vector operands.
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Invalid opcode");
+
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+    bool IsFloat = Ty->isFPOrFPVectorTy();
+    // Assume that floating point arithmetic operations cost twice as much as
+    // integer operations.
+    unsigned OpCost = (IsFloat ? 2 : 1);
+
+    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+      // The operation is legal. Assume it costs 1.
+      // TODO: Once we have extract/insert subvector cost we need to use them.
+      return LT.first * OpCost;
+    }
+
+    if (!TLI->isOperationExpand(ISD, LT.second)) {
+      // If the operation is custom lowered, then assume that the code is twice
+      // as expensive.
+      return LT.first * 2 * OpCost;
+    }
+
+    // Else, assume that we need to scalarize this op.
+    // TODO: If one of the types get legalized by splitting, handle this
+    // similarly to what getCastInstrCost() does.
+    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+      unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
+      unsigned Cost = getArithmeticInstrCost(
+          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
+          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
+      // Return the cost of multiple scalar invocation plus the cost of
+      // inserting and extracting the values.
+      return getScalarizationOverhead(VTy, Args) + Num * Cost;
+    }
+
+    // We don't know anything about this scalar instruction.
+    return OpCost;
   }

   // Legalize the type.
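Note: assuming TCC_Basic = 1 and the per-rate costs defined in
AMDGPUTargetTransformInfo.h below (throughput: half rate = 2, quarter
rate = 4; code size: 2 for either rate), the new fdiv f64 estimates can be
re-derived from the formula in the FDIV hunk that follows:

    throughput, half-rate 64-bit ops:    7*2 + 4 + 3*2 = 24  (+3 full-rate insts where the div-scale workaround applies, i.e. SI: 27)
    throughput, quarter-rate 64-bit ops: 7*4 + 4 + 3*2 = 38  (+3 on SI: 41)
    code size, either rate:              7*2 + 2 + 3*2 = 22  (+3 on SI: 25)

These figures match the CIFASTF64/CISLOWF64/SIFASTF64/SISLOWF64 and
SIZECI/SIZESI checks in the updated fdiv.ll.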
@@ -493,7 +534,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   case ISD::SRL:
   case ISD::SRA:
     if (SLT == MVT::i64)
-      return get64BitInstrCost() * LT.first * NElts;
+      return get64BitInstrCost(CostKind) * LT.first * NElts;

     if (ST->has16BitInsts() && SLT == MVT::i16)
       NElts = (NElts + 1) / 2;
@@ -515,7 +556,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     return LT.first * NElts * getFullRateInstrCost();
   case ISD::MUL: {
-    const int QuarterRateCost = getQuarterRateInstrCost();
+    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
     if (SLT == MVT::i64) {
       const int FullRateCost = getFullRateInstrCost();
       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
@@ -552,7 +593,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   case ISD::FADD:
   case ISD::FSUB:
     if (SLT == MVT::f64)
-      return LT.first * NElts * get64BitInstrCost();
+      return LT.first * NElts * get64BitInstrCost(CostKind);

     if (ST->has16BitInsts() && SLT == MVT::f16)
       NElts = (NElts + 1) / 2;
@@ -565,7 +606,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // FIXME: frem should be handled separately. The fdiv in it is most of it,
     // but the current lowering is also not entirely correct.
     if (SLT == MVT::f64) {
-      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+      int Cost = 7 * get64BitInstrCost(CostKind) +
+                 getQuarterRateInstrCost(CostKind) +
+                 3 * getHalfRateInstrCost(CostKind);
       // Add cost of workaround.
       if (!ST->hasUsableDivScaleConditionOutput())
         Cost += 3 * getFullRateInstrCost();
@@ -577,7 +620,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       // TODO: This is more complicated, unsafe flags etc.
       if ((SLT == MVT::f32 && !HasFP32Denormals) ||
           (SLT == MVT::f16 && ST->has16BitInsts())) {
-        return LT.first * getQuarterRateInstrCost() * NElts;
+        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
       }
     }
@@ -587,12 +630,15 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       //  f32 fmul
       //  v_cvt_f16_f32
       //  f16 div_fixup
-      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
+      int Cost =
+          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
       return LT.first * Cost * NElts;
     }

     if (SLT == MVT::f32 || SLT == MVT::f16) {
-      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+      // 4 more v_cvt_* insts without f16 insts support
+      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
+                 1 * getQuarterRateInstrCost(CostKind);

       if (!HasFP32Denormals) {
         // FP mode switches.
@@ -642,7 +688,48 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   Type *RetTy = ICA.getReturnType();
   EVT OrigTy = TLI->getValueType(DL, RetTy);
   if (!OrigTy.isSimple()) {
-    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+    if (CostKind != TTI::TCK_CodeSize)
+      return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+    // TODO: Combine these two logic paths.
+    if (ICA.isTypeBasedOnly())
+      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
+    Type *RetTy = ICA.getReturnType();
+    unsigned VF = ICA.getVectorFactor();
+    unsigned RetVF =
+        (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements() : 1);
+    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+    const IntrinsicInst *I = ICA.getInst();
+    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
+    FastMathFlags FMF = ICA.getFlags();
+    // Assume that we need to scalarize this intrinsic.
+    SmallVector<Type *, 4> Types;
+    for (const Value *Op : Args) {
+      Type *OpTy = Op->getType();
+      assert(VF == 1 || !OpTy->isVectorTy());
+      Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
+    }
+
+    if (VF > 1 && !RetTy->isVoidTy())
+      RetTy = FixedVectorType::get(RetTy, VF);
+
+    // Compute the scalarization overhead based on Args for a vector
+    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+    // CostModel will pass a vector RetTy and VF is 1.
+    unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
+    if (RetVF > 1 || VF > 1) {
+      ScalarizationCost = 0;
+      if (!RetTy->isVoidTy())
+        ScalarizationCost +=
+            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
+      ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+    }
+
+    IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+                                  ScalarizationCost, I);
+    return getIntrinsicInstrCost(Attrs, CostKind);
   }

   // Legalize the type.
@@ -654,16 +741,16 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

   if (SLT == MVT::f64)
-    return LT.first * NElts * get64BitInstrCost();
+    return LT.first * NElts * get64BitInstrCost(CostKind);

   if (ST->has16BitInsts() && SLT == MVT::f16)
     NElts = (NElts + 1) / 2;

   // TODO: Get more refined intrinsic costs?
-  unsigned InstRate = getQuarterRateInstrCost();
+  unsigned InstRate = getQuarterRateInstrCost(CostKind);
   if (ICA.getID() == Intrinsic::fma) {
-    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
-                                   : getQuarterRateInstrCost();
+    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
+                                   : getQuarterRateInstrCost(CostKind);
   }

   return LT.first * NElts * InstRate;
@@ -714,7 +801,7 @@ int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                        CostKind);

   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
-  return LT.first * getHalfRateInstrCost();
+  return LT.first * getHalfRateInstrCost(CostKind);
 }

 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 29e30b6..22aa27e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -115,21 +115,26 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
     return TargetTransformInfo::TCC_Basic;
   }

-  static inline int getHalfRateInstrCost() {
-    return 2 * TargetTransformInfo::TCC_Basic;
+  static inline int getHalfRateInstrCost(
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+    return CostKind == TTI::TCK_CodeSize ? 2
+                                         : 2 * TargetTransformInfo::TCC_Basic;
   }

   // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
   // should be 2 or 4.
-  static inline int getQuarterRateInstrCost() {
-    return 3 * TargetTransformInfo::TCC_Basic;
+  static inline int getQuarterRateInstrCost(
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+    return CostKind == TTI::TCK_CodeSize ? 2
+                                         : 4 * TargetTransformInfo::TCC_Basic;
   }

-  // On some parts, normal fp64 operations are half rate, and others
-  // quarter. This also applies to some integer operations.
-  inline int get64BitInstrCost() const {
-    return ST->hasHalfRate64Ops() ?
-      getHalfRateInstrCost() : getQuarterRateInstrCost();
+  // On some parts, normal fp64 operations are half rate, and others
+  // quarter. This also applies to some integer operations.
+ inline int get64BitInstrCost( + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const { + return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) + : getQuarterRateInstrCost(CostKind); } public: diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index 1203182a..c2959bb 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -1,9 +1,9 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s -; ALL: 'fadd_f32' +; ALL-LABEL: 'fadd_f32' ; ALL: estimated cost of 1 for {{.*}} fadd float define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr @@ -12,7 +12,7 @@ define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1) ret void } -; ALL: 'fadd_v2f32' +; ALL-LABEL: 'fadd_v2f32' ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float> define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr @@ -21,10 +21,8 @@ define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float ret void } -; ALL: 'fadd_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float> +; ALL-LABEL: 'fadd_v3f32' +; ALL: estimated cost of 3 for {{.*}} fadd <3 x float> define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fadd <3 x float> %vec, %b @@ -32,10 +30,8 @@ define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float ret void } -; ALL: 'fadd_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. 
-; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float> +; ALL-LABEL: 'fadd_v5f32' +; ALL: estimated cost of 5 for {{.*}} fadd <5 x float> define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fadd <5 x float> %vec, %b @@ -43,9 +39,10 @@ define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float ret void } -; ALL: 'fadd_f64' +; ALL-LABEL: 'fadd_f64' ; FASTF64: estimated cost of 2 for {{.*}} fadd double -; SLOWF64: estimated cost of 3 for {{.*}} fadd double +; SLOWF64: estimated cost of 4 for {{.*}} fadd double +; SIZEALL: estimated cost of 2 for {{.*}} fadd double define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fadd double %vec, %b @@ -53,9 +50,10 @@ define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace( ret void } -; ALL: 'fadd_v2f64' +; ALL-LABEL: 'fadd_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double> define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fadd <2 x double> %vec, %b @@ -63,9 +61,10 @@ define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ret void } -; ALL: 'fadd_v3f64' +; ALL-LABEL: 'fadd_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double> define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fadd <3 x double> %vec, %b @@ -73,7 +72,7 @@ define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x doub ret void } -; ALL: 'fadd_f16' +; ALL-LABEL: 'fadd_f16' ; ALL: estimated cost of 1 for {{.*}} fadd half define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr @@ -82,7 +81,7 @@ define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* ret void } -; ALL: 'fadd_v2f16' +; ALL-LABEL: 'fadd_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half> ; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half> define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { @@ -92,7 +91,7 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ret void } -; ALL: 'fadd_v3f16' +; ALL-LABEL: 'fadd_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half> define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 { @@ -102,7 +101,7 @@ define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> ret void } -; ALL: 'fadd_v4f16' +; ALL-LABEL: 'fadd_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half> ; FASTF16: estimated cost of 2 
for {{.*}} fadd <4 x half> define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll index e898f0d..883db92 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -1,19 +1,18 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,NOFP16,NOFP16-FP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,THRPTALL,CIFASTF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,THRPTALL,CISLOWF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,THRPTALL,SIFASTF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,THRPTALL,SISLOWF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,THRPTALL,FP16,CISLOWF64 %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < 
%s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s -; ALL: 'fdiv_f32_ieee' -; ALL: estimated cost of 10 for {{.*}} fdiv float +; ALL-LABEL: 'fdiv_f32_ieee' +; THRPTALL: estimated cost of 14 for {{.*}} fdiv float +; SIZEALL: estimated cost of 12 for {{.*}} fdiv float define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float %vec, %b @@ -21,8 +20,9 @@ define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspa ret void } -; ALL: 'fdiv_f32_ftzdaz' -; ALL: estimated cost of 12 for {{.*}} fdiv float +; ALL-LABEL: 'fdiv_f32_ftzdaz' +; THRPTALL: estimated cost of 16 for {{.*}} fdiv float +; SIZEALL: estimated cost of 14 for {{.*}} fdiv float define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float %vec, %b @@ -30,8 +30,9 @@ define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrs ret void } -; ALL: 'fdiv_v2f32_ieee' -; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'fdiv_v2f32_ieee' +; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> %vec, %b @@ -39,8 +40,9 @@ define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x ret void } -; ALL: 'fdiv_v2f32_ftzdaz' -; ALL: estimated cost of 24 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'fdiv_v2f32_ftzdaz' +; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> %vec, %b @@ -48,10 +50,9 @@ define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 ret void } -; ALL: 'fdiv_v3f32_ieee' -; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 36/30 when it is legal. -; ALL: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float> +; ALL-LABEL: 'fdiv_v3f32_ieee' +; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float> +; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float> define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fdiv <3 x float> %vec, %b @@ -59,10 +60,9 @@ define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x ret void } -; ALL: 'fdiv_v3f32_ftzdaz' -; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 36/30 when it is legal. 
-; ALL: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float> +; ALL-LABEL: 'fdiv_v3f32_ftzdaz' +; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float> +; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float> define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fdiv <3 x float> %vec, %b @@ -70,10 +70,9 @@ define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 ret void } -; ALL: 'fdiv_v5f32_ieee' -; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 60/50 when it is legal. -; ALL: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float> +; ALL-LABEL: 'fdiv_v5f32_ieee' +; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float> +; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float> define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fdiv <5 x float> %vec, %b @@ -81,10 +80,9 @@ define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x ret void } -; ALL: 'fdiv_v5f32_ftzdaz' -; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 60/50 when it is legal. -; ALL: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float> +; ALL-LABEL: 'fdiv_v5f32_ftzdaz' +; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float> +; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float> define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fdiv <5 x float> %vec, %b @@ -92,11 +90,13 @@ define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 ret void } -; ALL: 'fdiv_f64' -; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double -; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double -; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double -; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double +; ALL-LABEL: 'fdiv_f64' +; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double +; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double +; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double +; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double +; SIZECI: estimated cost of 22 for {{.*}} fdiv double +; SIZESI: estimated cost of 25 for {{.*}} fdiv double define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fdiv double %vec, %b @@ -104,11 +104,13 @@ define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace( ret void } -; ALL: 'fdiv_v2f64' -; CIFASTF64: estimated cost of 58 for {{.*}} fdiv <2 x double> -; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double> -; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double> -; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double> +; ALL-LABEL: 'fdiv_v2f64' +; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double> +; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double> +; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x double> +; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double> +; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double> +; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double> define 
amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fdiv <2 x double> %vec, %b @@ -116,11 +118,13 @@ define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ret void } -; ALL: 'fdiv_v3f64' -; CIFASTF64: estimated cost of 87 for {{.*}} fdiv <3 x double> -; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double> -; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double> -; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double> +; ALL-LABEL: 'fdiv_v3f64' +; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double> +; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double> +; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double> +; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double> +; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double> +; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double> define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fdiv <3 x double> %vec, %b @@ -128,9 +132,11 @@ define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x doub ret void } -; ALL: 'fdiv_f16_f32_ieee' -; NOFP16: estimated cost of 10 for {{.*}} fdiv half -; FP16: estimated cost of 10 for {{.*}} fdiv half +; ALL-LABEL: 'fdiv_f16_f32_ieee' +; NOFP16: estimated cost of 14 for {{.*}} fdiv half +; FP16: estimated cost of 12 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half +; SIZEF16: estimated cost of 8 for {{.*}} fdiv half define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half %vec, %b @@ -138,9 +144,11 @@ define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrs ret void } -; ALL: 'fdiv_f16_f32_ftzdaz' -; NOFP16: estimated cost of 12 for {{.*}} fdiv half -; FP16: estimated cost of 10 for {{.*}} fdiv half +; ALL-LABEL: 'fdiv_f16_f32_ftzdaz' +; NOFP16: estimated cost of 16 for {{.*}} fdiv half +; FP16: estimated cost of 12 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half +; SIZEF16: estimated cost of 8 for {{.*}} fdiv half define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half %vec, %b @@ -148,9 +156,11 @@ define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half add ret void } -; ALL: 'fdiv_v2f16_f32_ieee' -; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'fdiv_v2f16_f32_ieee' +; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> %vec, %b @@ -158,9 +168,11 @@ define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 ret void } -; ALL: 'fdiv_v2f16_f32_ftzdaz' -; NOFP16: estimated cost of 24 for {{.*}} fdiv <2 x half> -; FP16: 
estimated cost of 20 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz' +; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> %vec, %b @@ -168,9 +180,11 @@ define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, ret void } -; ALL: 'fdiv_v4f16_f32_ieee' -; NOFP16: estimated cost of 40 for {{.*}} fdiv <4 x half> -; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half> +; ALL-LABEL: 'fdiv_v4f16_f32_ieee' +; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half> +; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr %add = fdiv <4 x half> %vec, %b @@ -178,9 +192,11 @@ define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 ret void } -; ALL: 'fdiv_v4f16_f32_ftzdaz' -; NOFP16: estimated cost of 48 for {{.*}} fdiv <4 x half> -; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half> +; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz' +; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half> +; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half> +; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 { %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr %add = fdiv <4 x half> %vec, %b @@ -188,9 +204,9 @@ define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, ret void } -; ALL: 'rcp_f32_ieee' -; SLOWFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float -; FASTFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float +; ALL-LABEL: 'rcp_f32_ieee' +; THRPTALL: estimated cost of 14 for {{.*}} fdiv float +; SIZEALL: estimated cost of 12 for {{.*}} fdiv float define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float 1.0, %vec @@ -198,8 +214,9 @@ define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspac ret void } -; ALL: 'rcp_f32_ftzdaz' -; ALL: estimated cost of 3 for {{.*}} fdiv float +; ALL-LABEL: 'rcp_f32_ftzdaz' +; THRPTALL: estimated cost of 4 for {{.*}} fdiv float +; SIZEALL: estimated cost of 2 for {{.*}} fdiv float define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float 1.0, %vec @@ -207,9 +224,11 @@ define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrsp ret void } -; ALL: 'rcp_f16_f32_ieee' -; NOFP16: estimated cost of 10 for {{.*}} fdiv half -; FP16: estimated cost of 3 for {{.*}} fdiv half +; ALL-LABEL: 'rcp_f16_f32_ieee' +; NOFP16: estimated cost of 14 for {{.*}} fdiv half +; FP16: estimated cost of 4 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 12 for {{.*}} 
fdiv half
+; SIZEF16: estimated cost of 2 for {{.*}} fdiv half
 define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half 1.0, %vec
@@ -217,9 +236,9 @@ define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrsp
   ret void
 }

-; ALL: 'rcp_f16_f32_ftzdaz'
-; NOFP16: estimated cost of 3 for {{.*}} fdiv half
-; FP16: estimated cost of 3 for {{.*}} fdiv half
+; ALL-LABEL: 'rcp_f16_f32_ftzdaz'
+; THRPTALL: estimated cost of 4 for {{.*}} fdiv half
+; SIZEALL: estimated cost of 2 for {{.*}} fdiv half
 define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half 1.0, %vec
@@ -227,11 +246,13 @@ define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addr
   ret void
 }

-; ALL: 'rcp_f64'
-; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
-; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
-; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
-; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
+; ALL-LABEL: 'rcp_f64'
+; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double
+; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double
+; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double
+; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
+; SIZECI: estimated cost of 22 for {{.*}} fdiv double
+; SIZESI: estimated cost of 25 for {{.*}} fdiv double
 define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fdiv double 1.0, %vec
@@ -239,9 +260,9 @@ define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1
   ret void
 }

-; ALL: 'rcp_v2f32_ieee'
-; SLOWFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
-; FASTFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
+; ALL-LABEL: 'rcp_v2f32_ieee'
+; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
 define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
@@ -249,8 +270,9 @@ define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x f
   ret void
 }

-; ALL: 'rcp_v2f32_ftzdaz'
-; ALL: estimated cost of 6 for {{.*}} fdiv <2 x float>
+; ALL-LABEL: 'rcp_v2f32_ftzdaz'
+; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float>
 define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
@@ -258,9 +280,11 @@ define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x
   ret void
 }

-; ALL: 'rcp_v2f16_f32_ieee'
-; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
-; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
+; ALL-LABEL: 'rcp_v2f16_f32_ieee'
+; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
+; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half>
+; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
+; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half>
 define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
@@ -268,9 +292,9 @@ define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2
   ret void
 }

-; ALL: 'rcp_v2f16_f32_ftzdaz'
-; NOFP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
-; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
+; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz'
+; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half>
+; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half>
 define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index 462163d..41cbe0f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -1,11 +1,12 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF32,FASTF16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF32,SLOWF16,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s

 ; ALL-LABEL: 'fma_f32'
-; SLOW32: estimated cost of 3 for {{.*}} call float @llvm.fma.f32
-; FAST32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
+; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32
+; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
+; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
 define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1
@@ -14,8 +15,9 @@ define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)*
 }

 ; ALL-LABEL: 'fma_v2f32'
-; SLOW32: estimated cost of 6 for {{.*}} call <2 x float> @llvm.fma.v2f32
-; FAST32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
+; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32
+; FASTF32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
+; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
 define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %fma = call <2 x float> @llvm.fma.v2f32(<2 x float>
%vec, <2 x float> %vec, <2 x float> %vec) #1 @@ -24,8 +26,9 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> } ; ALL-LABEL: 'fma_v3f32' -; SLOW32: estimated cost of 9 for {{.*}} call <3 x float> @llvm.fma.v3f32 -; FAST32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; FASTF32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1 @@ -34,8 +37,9 @@ define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> } ; ALL-LABEL: 'fma_v5f32' -; SLOW32: estimated cost of 15 for {{.*}} call <5 x float> @llvm.fma.v5f32 -; FAST32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; FASTF32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1 @@ -44,8 +48,9 @@ define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> } ; ALL-LABEL: 'fma_f64' -; SLOW64: estimated cost of 3 for {{.*}} call double @llvm.fma.f64 -; FAST64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 +; SLOWF64: estimated cost of 4 for {{.*}} call double @llvm.fma.f64 +; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 +; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { %vec = load double, double addrspace(1)* %vaddr %fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1 @@ -54,8 +59,9 @@ define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1 } ; ALL-LABEL: 'fma_v2f64' -; SLOW64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.fma.v2f64 -; FAST64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1 @@ -64,8 +70,9 @@ define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x doubl } ; ALL-LABEL: 'fma_v3f64' -; SLOW64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.fma.v3f64 -; FAST64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> 
@llvm.fma.v3f64 define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1 @@ -74,8 +81,9 @@ define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x doubl } ; ALL-LABEL: 'fma_f16' -; SLOW16: estimated cost of 3 for {{.*}} call half @llvm.fma.f16 -; FAST16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 +; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16 +; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 +; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { %vec = load half, half addrspace(1)* %vaddr %fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1 @@ -84,8 +92,10 @@ define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* % } ; ALL-LABEL: 'fma_v2f16' -; SLOW16: estimated cost of 6 for {{.*}} call <2 x half> @llvm.fma.v2f16 -; FAST16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16 define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1 @@ -94,8 +104,10 @@ define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> a } ; ALL-LABEL: 'fma_v3f16' -; SLOW16: estimated cost of 12 for {{.*}} call <3 x half> @llvm.fma.v3f16 -; FAST16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16 define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 { %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1 diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index ea07089..ba855b2 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -1,7 +1,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size 
-analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s ; ALL-LABEL: 'fmul_f32' ; ALL: estimated cost of 1 for {{.*}} fmul float @@ -22,9 +22,7 @@ define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float } ; ALL-LABEL: 'fmul_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float> +; ALL: estimated cost of 3 for {{.*}} fmul <3 x float> define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fmul <3 x float> %vec, %b @@ -33,9 +31,7 @@ define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float } ; ALL-LABEL: 'fmul_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. -; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float> +; ALL: estimated cost of 5 for {{.*}} fmul <5 x float> define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fmul <5 x float> %vec, %b @@ -45,7 +41,8 @@ define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float ; ALL-LABEL: 'fmul_f64' ; FASTF64: estimated cost of 2 for {{.*}} fmul double -; SLOWF64: estimated cost of 3 for {{.*}} fmul double +; SLOWF64: estimated cost of 4 for {{.*}} fmul double +; SIZEALL: estimated cost of 2 for {{.*}} fmul double define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fmul double %vec, %b @@ -55,7 +52,8 @@ define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace( ; ALL-LABEL: 'fmul_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double> define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fmul <2 x double> %vec, %b @@ -65,7 +63,8 @@ define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; ALL-LABEL: 'fmul_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double> define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fmul <3 x double> %vec, %b diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 8bc6ebc..287bba8 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -1,9 +1,9 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 
-mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s -; ALL: 'fsub_f32' +; ALL-LABEL: 'fsub_f32' ; ALL: estimated cost of 1 for {{.*}} fsub float define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr @@ -12,7 +12,7 @@ define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1) ret void } -; ALL: 'fsub_v2f32' +; ALL-LABEL: 'fsub_v2f32' ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float> define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr @@ -21,10 +21,8 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float ret void } -; ALL: 'fsub_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float> +; ALL-LABEL: 'fsub_v3f32' +; ALL: estimated cost of 3 for {{.*}} fsub <3 x float> define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fsub <3 x float> %vec, %b @@ -32,10 +30,8 @@ define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float ret void } -; ALL: 'fsub_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. 
-; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float>
+; ALL-LABEL: 'fsub_v5f32'
+; ALL: estimated cost of 5 for {{.*}} fsub <5 x float>
 define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fsub <5 x float> %vec, %b
@@ -43,9 +39,10 @@ define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float
   ret void
 }

-; ALL: 'fsub_f64'
+; ALL-LABEL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
-; SLOWF64: estimated cost of 3 for {{.*}} fsub double
+; SLOWF64: estimated cost of 4 for {{.*}} fsub double
+; SIZEALL: estimated cost of 2 for {{.*}} fsub double
 define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fsub double %vec, %b
@@ -53,9 +50,10 @@ define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(
   ret void
 }

-; ALL: 'fsub_v2f64'
+; ALL-LABEL: 'fsub_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
-; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
+; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double>
+; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double>
 define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fsub <2 x double> %vec, %b
@@ -63,9 +61,10 @@ define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
   ret void
 }

-; ALL: 'fsub_v3f64'
+; ALL-LABEL: 'fsub_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
-; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
+; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double>
+; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double>
 define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fsub <3 x double> %vec, %b
@@ -73,7 +72,7 @@ define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
   ret void
 }

-; ALL: 'fsub_f16'
+; ALL-LABEL: 'fsub_f16'
 ; ALL: estimated cost of 1 for {{.*}} fsub half
 define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
@@ -82,7 +81,7 @@ define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)*
   ret void
 }

-; ALL: 'fsub_v2f16'
+; ALL-LABEL: 'fsub_v2f16'
 ; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half>
 ; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half>
 define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
@@ -92,7 +91,7 @@ define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
   ret void
 }

-; ALL: 'fsub_v3f16'
+; ALL-LABEL: 'fsub_v3f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half>
 define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
@@ -102,7 +101,7 @@ define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
   ret void
 }

-; ALL: 'fsub_v4f16'
+; ALL-LABEL: 'fsub_v4f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half>
 define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
index 2106773..5fbd783 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
@@ -1,11 +1,11 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED32,FUSED16,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED32,FUSED16,CONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,SZNOCONTRACT,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s

 target triple = "amdgcn--"

@@ -113,8 +113,10 @@ define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r

 ; ALL-LABEL: 'fmul_fadd_f64':
 ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
-; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
-; ALL: estimated cost of 3 for instruction: %add = fadd double
+; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double
+; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double
+; THRPTALL: estimated cost of 4 for instruction: %add = fadd double
+; SIZEALL: estimated cost of 2 for instruction: %add = fadd double
 define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
   %mul = fmul double %r0, %r1
   %add = fadd double %mul, %r2
@@ -123,7 +125,8 @@ define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {

 ; ALL-LABEL: 'fmul_fadd_contract_f64':
 ; ALL: estimated cost of 0 for instruction: %mul = fmul contract double
-; ALL: estimated cost of 3 for instruction: %add = fadd contract double
+; THRPTALL: estimated cost of 4 for instruction: %add = fadd contract double
+; SIZEALL: estimated cost of 2 for instruction: %add = fadd contract double
 define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
   %mul = fmul contract double %r0, %r1
   %add = fadd contract double %mul, %r2
@@ -132,8 +135,10 @@ define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {

 ; ALL-LABEL: 'fmul_fadd_v2f64':
 ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
-; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
-; ALL: estimated cost of 6 for instruction: %add = fadd <2 x double>
+; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double>
+; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
+; THRPTALL: estimated cost of 8 for instruction: %add = fadd <2 x double>
+; SIZEALL: estimated cost of 4 for instruction: %add = fadd <2 x double>
 define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
   %mul = fmul <2 x double> %r0, %r1
   %add = fadd <2 x double> %mul, %r2
@@ -142,8 +147,10 @@ define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x do

 ; ALL-LABEL: 'fmul_fsub_f64':
 ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
-; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
-; ALL: estimated cost of 3 for instruction: %sub = fsub double
+; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double
+; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double
+; THRPTALL: estimated cost of 4 for instruction: %sub = fsub double
+; SIZEALL: estimated cost of 2 for instruction: %sub = fsub double
 define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
   %mul = fmul double %r0, %r1
   %sub = fsub double %mul, %r2
@@ -152,8 +159,10 @@ define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {

 ; ALL-LABEL: 'fmul_fsub_v2f64':
 ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
-; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
-; ALL: estimated cost of 6 for instruction: %sub = fsub <2 x double>
+; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double>
+; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
+; THRPTALL: estimated cost of 8 for instruction: %sub = fsub <2 x double>
+; SIZEALL: estimated cost of 4 for instruction: %sub = fsub <2 x double>
 define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
   %mul = fmul <2 x double> %r0, %r1
   %sub = fsub <2 x double> %mul, %r2
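The f64 expectations above all follow one rule: with this patch, a 64-bit FP add/sub/mul appears to be priced per legalized element by get64BitInstrCost(CostKind), which the checks pin at 4 for throughput on gfx900 (quarter rate, no half-rate 64-bit ops), at 2 with +half-rate-64-ops, and at a flat 2 for code size. A minimal sketch of that selection, using illustrative stand-in names rather than the real LLVM helpers:

// Sketch only: mirrors the 64-bit cost selection implied by the
// FASTF64/SLOWF64/SIZEALL and NOCONTRACT/SZNOCONTRACT checks above.
#include <cassert>

enum class Kind { Throughput, CodeSize };

// 'HasHalfRate64Ops' stands in for the +half-rate-64-ops target feature.
int cost64(Kind K, bool HasHalfRate64Ops) {
  if (K == Kind::CodeSize)
    return 2;                      // SIZEALL: fadd/fsub/fmul double -> 2
  return HasHalfRate64Ops ? 2 : 4; // FASTF64 -> 2, SLOWF64/NOCONTRACT -> 4
}

int main() {
  assert(cost64(Kind::Throughput, true) == 2);  // FASTF64
  assert(cost64(Kind::Throughput, false) == 4); // SLOWF64, NOCONTRACT
  assert(cost64(Kind::CodeSize, false) == 2);   // SIZEALL, SZNOCONTRACT
  // Vector cases scale linearly: <2 x double> doubles each number above.
}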
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index fa36d39..e4ca068 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -1,10 +1,11 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s

-; ALL: 'mul_i32'
-; ALL: estimated cost of 3 for {{.*}} mul i32
+; ALL-LABEL: 'mul_i32'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i32
+; SIZEALL: estimated cost of 2 for {{.*}} mul i32
 define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %mul = mul i32 %vec, %b
@@ -12,8 +13,9 @@ define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
   ret void
 }

-; ALL: 'mul_v2i32'
-; ALL: estimated cost of 6 for {{.*}} mul <2 x i32>
+; ALL-LABEL: 'mul_v2i32'
+; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32>
+; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32>
 define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %mul = mul <2 x i32> %vec, %b
@@ -21,10 +23,9 @@ define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
   ret void
 }

-; ALL: 'mul_v3i32'
-; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
-; and 9 when it is legal.
-; ALL: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
+; ALL-LABEL: 'mul_v3i32'
+; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32>
+; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32>
 define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
@@ -32,10 +33,9 @@ define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> add
   ret void
 }

-; ALL: 'mul_v5i32'
-; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
-; and 15 when it is legal.
-; ALL: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
+; ALL-LABEL: 'mul_v5i32'
+; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
+; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
 define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
   %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
   %mul = mul <5 x i32> %vec, %b
@@ -43,8 +43,9 @@ define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> add
   ret void
 }

-; ALL: 'mul_v4i32'
-; ALL: estimated cost of 12 for {{.*}} mul <4 x i32>
+; ALL-LABEL: 'mul_v4i32'
+; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32>
+; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32>
 define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %mul = mul <4 x i32> %vec, %b
@@ -52,8 +53,9 @@ define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
   ret void
 }

-; ALL: 'mul_i64'
-; ALL: estimated cost of 16 for {{.*}} mul i64
+; ALL-LABEL: 'mul_i64'
+; THRPTALL: estimated cost of 20 for {{.*}} mul i64
+; SIZEALL: estimated cost of 12 for {{.*}} mul i64
 define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %mul = mul i64 %vec, %b
@@ -61,8 +63,9 @@ define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
   ret void
 }

-; ALL: 'mul_v2i64'
-; ALL: estimated cost of 32 for {{.*}} mul <2 x i64>
+; ALL-LABEL: 'mul_v2i64'
+; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64>
+; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64>
 define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %mul = mul <2 x i64> %vec, %b
@@ -70,8 +73,9 @@ define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> add
   ret void
 }

-; ALL: 'mul_v3i64'
-; ALL: estimated cost of 48 for {{.*}} mul <3 x i64>
+; ALL-LABEL: 'mul_v3i64'
+; THRPTALL: estimated cost of 60 for {{.*}} mul <3 x i64>
+; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64>
 define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %mul = mul <3 x i64> %vec, %b
@@ -79,8 +83,9 @@ define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> add
   ret void
 }

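The 64-bit multiply numbers in this file follow from the patched MUL formula (4 quarter-rate plus 2 * 2 full-rate instructions per legalized 64-bit element), with the quarter-rate cost itself now depending on the cost kind. A small sketch under those assumptions; the function name and constants are illustrative, read off the checks rather than taken from the LLVM source:

// Sketch: reproduces the mul i64 expectations, assuming a quarter-rate
// cost of 4 (throughput) / 2 (code size) and a full-rate cost of 1.
#include <cassert>

int mulI64Cost(bool CodeSize, int LegalizedElts) {
  const int QuarterRate = CodeSize ? 2 : 4;
  const int FullRate = 1;
  // 4 quarter-rate + (2 * 2) full-rate instructions per 64-bit element.
  return (4 * QuarterRate + 2 * 2 * FullRate) * LegalizedElts;
}

int main() {
  assert(mulI64Cost(false, 1) == 20); // THRPTALL: mul i64
  assert(mulI64Cost(true, 1) == 12);  // SIZEALL:  mul i64
  assert(mulI64Cost(false, 3) == 60); // THRPTALL: mul <3 x i64>
  assert(mulI64Cost(true, 4) == 48);  // SIZEALL:  mul <4 x i64>
  // <8 x i64> is split during legalization, so its factor is 16, not 8:
  assert(mulI64Cost(false, 16) == 320 && mulI64Cost(true, 16) == 192);
}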
-; ALL: 'mul_v4i64'
-; ALL: estimated cost of 64 for {{.*}} mul <4 x i64>
+; ALL-LABEL: 'mul_v4i64'
+; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64>
+; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64>
 define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %mul = mul <4 x i64> %vec, %b
@@ -89,8 +94,9 @@ define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
 }


-; ALL: 'mul_v8i64'
-; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
+; ALL-LABEL: 'mul_v8i64'
+; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64>
+; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64>
 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b
@@ -98,8 +104,9 @@ define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> add
   ret void
 }

-; ALL: 'mul_i16'
-; ALL: estimated cost of 3 for {{.*}} mul i16
+; ALL-LABEL: 'mul_i16'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i16
+; SIZEALL: estimated cost of 2 for {{.*}} mul i16
 define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %mul = mul i16 %vec, %b
@@ -107,9 +114,11 @@ define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
   ret void
 }

-; ALL: 'mul_v2i16'
-; SLOW16: estimated cost of 6 for {{.*}} mul <2 x i16>
-; FAST16: estimated cost of 3 for {{.*}} mul <2 x i16>
+; ALL-LABEL: 'mul_v2i16'
+; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16>
+; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZEFAST16: estimated cost of 2 for {{.*}} mul <2 x i16>
 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %mul = mul <2 x i16> %vec, %b
@@ -117,9 +126,11 @@ define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
   ret void
 }

-; ALL: 'mul_v3i16'
-; SLOW16: estimated cost of 12 for {{.*}} mul <3 x i16>
-; FAST16: estimated cost of 6 for {{.*}} mul <3 x i16>
+; ALL-LABEL: 'mul_v3i16'
+; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16>
+; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16>
 define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
   %mul = mul <3 x i16> %vec, %b
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
index 55f547f..4293664 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s

-; ALL: 'shl_i32'
+; ALL-LABEL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
 define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -12,9 +12,10 @@ define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
   ret void
 }

-; ALL: 'shl_i64'
+; ALL-LABEL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
-; SLOW64: estimated cost of 3 for {{.*}} shl i64
+; SLOW64: estimated cost of 4 for {{.*}} shl i64
+; SIZEALL: estimated cost of 2 for {{.*}} shl i64
 define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = shl i64 %vec, %b
@@ -22,7 +23,7 @@ define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
   ret void
 }

-; ALL: 'shl_i16'
+; ALL-LABEL: 'shl_i16'
 ; ALL: estimated cost of 1 for {{.*}} shl i16
 define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -31,7 +32,7 @@ define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
   ret void
 }

-; ALL: 'shl_v2i16'
+; ALL-LABEL: 'shl_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
 define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -41,7 +42,7 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
   ret void
 }

-; ALL: 'lshr_i32'
+; ALL-LABEL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
 define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -50,9 +51,10 @@ define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %v
   ret void
 }

-; ALL: 'lshr_i64'
+; ALL-LABEL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
-; SLOW64: estimated cost of 3 for {{.*}} lshr i64
+; SLOW64: estimated cost of 4 for {{.*}} lshr i64
+; SIZEALL: estimated cost of 2 for {{.*}} lshr i64
 define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = lshr i64 %vec, %b
@@ -60,7 +62,7 @@ define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
   ret void
 }

-; ALL: 'lshr_i16'
+; ALL-LABEL: 'lshr_i16'
 ; ALL: estimated cost of 1 for {{.*}} lshr i16
 define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -69,7 +71,7 @@ define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %v
   ret void
 }

-; ALL: 'lshr_v2i16'
+; ALL-LABEL: 'lshr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
 define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -79,7 +81,7 @@ define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ad
   ret void
 }

-; ALL: 'ashr_i32'
+; ALL-LABEL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
 define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -88,9 +90,9 @@ define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %v
   ret void
 }

-; ALL: 'ashr_i64'
+; ALL-LABEL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
-; SLOW64: estimated cost of 3 for {{.*}} ashr i64
+; SLOW64: estimated cost of 4 for {{.*}} ashr i64
 define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = ashr i64 %vec, %b
@@ -98,7 +100,7 @@ define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
   ret void
 }

-; ALL: 'ashr_i16'
+; ALL-LABEL: 'ashr_i16'
 ; ALL: estimated cost of 1 for {{.*}} ashr i16
 define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -107,7 +109,7 @@ define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %v
   ret void
 }

-; ALL: 'ashr_v2i16'
+; ALL-LABEL: 'ashr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
 define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
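The shift checks mirror the 64-bit rule above: shl/lshr i64 cost the half-rate 2 with +half-rate-64-ops, the quarter-rate 4 without it, and a flat 2 for code size, while 16- and 32-bit shifts stay at the full-rate cost of 1. Any single table can be regenerated by running one of the file's own RUN lines by hand against the test file, e.g. for the code-size run of shifts.ll:

opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops llvm/test/Analysis/CostModel/AMDGPU/shifts.ll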