From: dfukalov Date: Tue, 5 Nov 2019 14:30:52 +0000 (+0300) Subject: [AMDGPU] Improve code size cost model (part 2) X-Git-Tag: llvmorg-11-init~4915 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=47a5c36b37f033496de01b48cef0b2f1903c33cc;p=platform%2Fupstream%2Fllvm.git [AMDGPU] Improve code size cost model (part 2) Summary: Added estimations for ShuffleVector, some cast and arithmetic instructions Reviewers: rampitec Reviewed By: rampitec Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, zzheng, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D69629 --- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 616196ad5..a6544cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -695,34 +695,114 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned GCNTTIImpl::getUserCost(const User *U, ArrayRef Operands) { - // Estimate extractelement elimination - if (const ExtractElementInst *EE = dyn_cast(U)) { - ConstantInt *CI = dyn_cast(EE->getOperand(1)); + const Instruction *I = dyn_cast(U); + if (!I) + return BaseT::getUserCost(U, Operands); + + // Estimate different operations to be optimized out + switch (I->getOpcode()) { + case Instruction::ExtractElement: { + ConstantInt *CI = dyn_cast(I->getOperand(1)); unsigned Idx = -1; if (CI) Idx = CI->getZExtValue(); - return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(), - Idx); + return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx); } - - // Estimate insertelement elimination - if (const InsertElementInst *IE = dyn_cast(U)) { - ConstantInt *CI = dyn_cast(IE->getOperand(2)); + case Instruction::InsertElement: { + ConstantInt *CI = dyn_cast(I->getOperand(2)); unsigned Idx = -1; if (CI) Idx = CI->getZExtValue(); - return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx); + return getVectorInstrCost(I->getOpcode(), I->getType(), Idx); + } + case Instruction::Call: { + if (const IntrinsicInst *II = dyn_cast(U)) { + SmallVector Args(II->arg_operands()); + FastMathFlags FMF; + if (auto *FPMO = dyn_cast(II)) + FMF = FPMO->getFastMathFlags(); + return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, + FMF); + } else { + return BaseT::getUserCost(U, Operands); + } } + case Instruction::ShuffleVector: { + const ShuffleVectorInst *Shuffle = cast(I); + Type *Ty = Shuffle->getType(); + Type *SrcTy = Shuffle->getOperand(0)->getType(); + + // TODO: Identify and add costs for insert subvector, etc. + int SubIndex; + if (Shuffle->isExtractSubvectorMask(SubIndex)) + return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty); + + if (Shuffle->changesLength()) + return -1; + + if (Shuffle->isIdentity()) + return 0; + + if (Shuffle->isReverse()) + return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr); - // Estimate different intrinsics, e.g. llvm.fabs - if (const IntrinsicInst *II = dyn_cast(U)) { - SmallVector Args(II->arg_operands()); - FastMathFlags FMF; - if (auto *FPMO = dyn_cast(II)) - FMF = FPMO->getFastMathFlags(); - return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, - FMF); + if (Shuffle->isSelect()) + return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr); + + if (Shuffle->isTranspose()) + return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr); + + if (Shuffle->isZeroEltSplat()) + return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr); + + if (Shuffle->isSingleSource()) + return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr); + + return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: { + return getCastInstrCost(I->getOpcode(), I->getType(), + I->getOperand(0)->getType(), I); } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::FNeg: { + return getArithmeticInstrCost(I->getOpcode(), I->getType(), + TTI::OK_AnyValue, TTI::OK_AnyValue, + TTI::OP_None, TTI::OP_None, Operands); + } + default: + break; + } + return BaseT::getUserCost(U, Operands); } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll index b538b12..3c4dd34 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll @@ -1,5 +1,8 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s + ; CHECK: 'add_i32' ; CHECK: estimated cost of 1 for {{.*}} add i32 diff --git a/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll b/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll index f15ab50..a87a965 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll @@ -1,4 +1,5 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s ; CHECK-LABEL: 'addrspacecast_global_to_flat' ; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8* diff --git a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll index aa70f50..bf8db67 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll @@ -1,4 +1,5 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s ; CHECK: 'or_i32' ; CHECK: estimated cost of 1 for {{.*}} or i32 diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index 5a5a0941..47742ee 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -1,5 +1,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s ; ALL: 'fadd_f32' ; ALL: estimated cost of 1 for {{.*}} fadd float diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll index 325960a..7a8012a 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -5,6 +5,13 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s + ; ALL: 'fdiv_f32' ; NOFP32DENORM: estimated cost of 12 for {{.*}} fdiv float ; FP32DENORMS: estimated cost of 10 for {{.*}} fdiv float diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index b1e2b51..a3fb632 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -1,5 +1,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s ; ALL: 'fmul_f32' ; ALL: estimated cost of 1 for {{.*}} fmul float diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 9ab5184..f84fd53 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -1,5 +1,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s ; ALL: 'fsub_f32' ; ALL: estimated cost of 1 for {{.*}} fsub float diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll index 85a3770..3239113 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll @@ -1,4 +1,5 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s ; CHECK: 'mul_i32' ; CHECK: estimated cost of 3 for {{.*}} mul i32 diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll index 85fb0eb..e2d3c47 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll @@ -1,5 +1,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s ; ALL: 'shl_i32' ; ALL: estimated cost of 1 for {{.*}} shl i32 diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll index cc756c8..413dd5e 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll @@ -1,7 +1,11 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s +; GCN-LABEL: 'shufflevector_00_v2i16' ; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer +; VI: estimated cost of 1 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer @@ -9,7 +13,8 @@ define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, ret void } -; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> +; GCN-LABEL: 'shufflevector_01_v2i16' +; GCN: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> @@ -17,7 +22,9 @@ define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, ret void } +; GCN-LABEL: 'shufflevector_10_v2i16' ; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> +; VI: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> define amdgpu_kernel void @shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> @@ -25,7 +32,9 @@ define amdgpu_kernel void @shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, ret void } +; GCN-LABEL: 'shufflevector_11_v2i16' ; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> +; VI: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> @@ -33,6 +42,7 @@ define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, ret void } +; GCN-LABEL: 'shufflevector_02_v2i16' ; GCN: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr0, <2 x i16> addrspace(1)* %vaddr1) { %vec0 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr0 diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll index e2606bb..74e124c 100644 --- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll +++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll @@ -1,4 +1,4 @@ -; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=20000 %s | FileCheck %s +; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=12000 %s | FileCheck %s ; Check that we full unroll loop to be able to eliminate alloca ; CHECK-LABEL: @non_invariant_ind