From f2072e0ae0f1a4e1bcf626ee1939f79a1a7ed30c Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Tue, 23 Aug 2022 15:22:52 +0000 Subject: [PATCH] [AArh64-SVE]: Improve cost model for div/udiv/mul 128-bit vector operations Differential Revision: https://reviews.llvm.org/D132477 --- .../Target/AArch64/AArch64TargetTransformInfo.cpp | 55 ++++++++-- llvm/test/Analysis/CostModel/AArch64/sve-arith.ll | 57 +++++++++++ .../Analysis/CostModel/AArch64/sve-fixed-length.ll | 111 +++++++++++++++++++++ .../Analysis/CostModel/AArch64/sve-remainder.ll | 36 +++---- 4 files changed, 231 insertions(+), 28 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/AArch64/sve-arith.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d1dcbdf..ebceac3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2084,12 +2084,40 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( InstructionCost Cost = BaseT::getArithmeticInstrCost( Opcode, Ty, CostKind, Op1Info, Op2Info); if (Ty->isVectorTy()) { - // On AArch64, vector divisions are not supported natively and are - // expanded into scalar divisions of each pair of elements. - Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, - Op1Info, Op2Info); - Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, - Op1Info, Op2Info); + if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { + // SDIV/UDIV operations are lowered, then we can have less costs. + if (isa(Ty) && + cast(Ty)->getPrimitiveSizeInBits().getFixedSize() < + 128) { + EVT VT = TLI->getValueType(DL, Ty); + static const CostTblEntry DivTbl[]{ + {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, + {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, + {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1}, + {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8}, + {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5}, + {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}}; + + const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT()); + if (nullptr != Entry) + return Entry->Cost; + } + // For 8/16-bit elements, the cost is higher because the type + // requires promotion and possibly splitting: + if (LT.second.getScalarType() == MVT::i8) + Cost *= 8; + else if (LT.second.getScalarType() == MVT::i16) + Cost *= 4; + return Cost; + } else { + // On AArch64, without SVE, vector divisions are expanded + // into scalar divisions of each pair of elements. + Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, + CostKind, Op1Info, Op2Info); + Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, + Op1Info, Op2Info); + } + // TODO: if one of the arguments is scalar, then it's not necessary to // double the cost of handling the vector elements. Cost += Cost; @@ -2097,16 +2125,23 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( return Cost; } case ISD::MUL: - // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive - // as elements are extracted from the vectors and the muls scalarized. - // As getScalarizationOverhead is a bit too pessimistic, we estimate the - // cost for a i64 vector directly here, which is: + // When SVE is available, then we can lower the v2i64 operation using + // the SVE mul instruction, which has a lower cost. + if (LT.second == MVT::v2i64 && ST->hasSVE()) + return LT.first; + + // When SVE is not available, there is no MUL.2d instruction, + // which means mul <2 x i64> is expensive as elements are extracted + // from the vectors and the muls scalarized. + // As getScalarizationOverhead is a bit too pessimistic, we + // estimate the cost for a i64 vector directly here, which is: // - four 2-cost i64 extracts, // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with // LT.first = 2 the cost is 28. If both operands are extensions it will not // need to scalarize so the cost can be cheaper (smull or umull). + // so the cost can be cheaper (smull or umull). if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) return LT.first; return LT.first * 14; diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith.ll new file mode 100644 index 0000000..f4dfea4 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -passes="print" 2>&1 -disable-output -aarch64-sve-vector-bits-min=128 | FileCheck %s -D#VBITS=128 + +target triple = "aarch64-unknown-linux-gnu" + +define void @scalable_sdiv() #0 { +; CHECK-LABEL: 'scalable_sdiv' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %sdiv_nxv16i8 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sdiv_nxv8i16 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sdiv_nxv4i32 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sdiv_nxv2i64 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %sdiv_nxv16i8 = sdiv undef, undef + %sdiv_nxv8i16 = sdiv undef, undef + %sdiv_nxv4i32 = sdiv undef, undef + %sdiv_nxv2i64 = sdiv undef, undef + + ret void +} + +define void @scalable_udiv() #0 { +; CHECK-LABEL: 'scalable_udiv' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %udiv_nxv16i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %udiv_nxv8i16 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %udiv_nxv4i32 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %udiv_nxv2i64 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %udiv_nxv16i8 = udiv undef, undef + %udiv_nxv8i16 = udiv undef, undef + %udiv_nxv4i32 = udiv undef, undef + %udiv_nxv2i64 = udiv undef, undef + + ret void +} + +define void @scalable_mul() #0 { +; CHECK-LABEL: 'scalable_mul' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv16i8 = mul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv8i16 = mul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv4i32 = mul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv2i64 = mul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %mul_nxv16i8 = mul undef, undef + %mul_nxv8i16 = mul undef, undef + %mul_nxv4i32 = mul undef, undef + %mul_nxv2i64 = mul undef, undef + + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll index 007f678..05afbea 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll @@ -58,4 +58,115 @@ define void @add() #0 { ret void } +; Assuming base_cost = 2 +; Assuming legalization_cost = (vec_len-1/VBITS)+1 +; Assuming extra cost of 8 for i8. +; Assuming extra cost of 4 for i16. +; The hard-coded expected cost is based on VBITS=128 +define void @sdiv() #0 { +; CHECK-LABEL: function 'sdiv' + +; CHECK: cost of 5 for instruction: %sdiv16.i8 = sdiv <2 x i8> undef, undef + %sdiv16.i8 = sdiv <2 x i8> undef, undef + +; CHECK: cost of 8 for instruction: %sdiv32.i8 = sdiv <4 x i8> undef, undef + %sdiv32.i8 = sdiv <4 x i8> undef, undef + +; CHECK: cost of 5 for instruction: %sdiv32.i16 = sdiv <2 x i16> undef, undef + %sdiv32.i16 = sdiv <2 x i16> undef, undef + +; CHECK: cost of 8 for instruction: %sdiv64.i8 = sdiv <8 x i8> undef, undef + %sdiv64.i8 = sdiv <8 x i8> undef, undef + +; CHECK: cost of 5 for instruction: %sdiv64.i16 = sdiv <4 x i16> undef, undef + %sdiv64.i16 = sdiv <4 x i16> undef, undef + +; CHECK: cost of 1 for instruction: %sdiv64.i32 = sdiv <2 x i32> undef, undef + %sdiv64.i32 = sdiv <2 x i32> undef, undef + +; CHECK: cost of [[#mul(mul(div(128-1, VBITS)+1, 2), 8)]] for instruction: %sdiv128.i8 = sdiv <16 x i8> undef, undef + %sdiv128.i8 = sdiv <16 x i8> undef, undef + +; CHECK: cost of [[#mul(mul(div(128-1, VBITS)+1, 2), 4)]] for instruction: %sdiv128.i16 = sdiv <8 x i16> undef, undef + %sdiv128.i16 = sdiv <8 x i16> undef, undef + +; CHECK: cost of [[#mul(div(128-1, VBITS)+1, 2)]] for instruction: %sdiv128.i64 = sdiv <2 x i64> undef, undef + %sdiv128.i64 = sdiv <2 x i64> undef, undef + +; CHECK: cost of [[#mul(mul(div(512-1, VBITS)+1, 2), 8)]] for instruction: %sdiv512.i8 = sdiv <64 x i8> undef, undef + %sdiv512.i8 = sdiv <64 x i8> undef, undef + +; CHECK: cost of [[#mul(mul(div(512-1, VBITS)+1, 2), 4)]] for instruction: %sdiv512.i16 = sdiv <32 x i16> undef, undef + %sdiv512.i16 = sdiv <32 x i16> undef, undef + +; CHECK: cost of [[#mul(div(512-1, VBITS)+1, 2)]] for instruction: %sdiv512.i32 = sdiv <16 x i32> undef, undef + %sdiv512.i32 = sdiv <16 x i32> undef, undef + +; CHECK: cost of [[#mul(div(512-1, VBITS)+1, 2)]] for instruction: %sdiv512.i64 = sdiv <8 x i64> undef, undef + %sdiv512.i64 = sdiv <8 x i64> undef, undef + + ret void +} + +; Assuming base_cost = 2 +; Assuming legalization_cost = (vec_len-1/VBITS)+1 +; Assuming extra cost of 8 for i8. +; Assuming extra cost of 4 for i16. +; The hard-coded expected cost is based on VBITS=128 +define void @udiv() #0 { +; CHECK-LABEL: function 'udiv' + +; CHECK: cost of 5 for instruction: %udiv16.i8 = udiv <2 x i8> undef, undef + %udiv16.i8 = udiv <2 x i8> undef, undef + +; CHECK: cost of 8 for instruction: %udiv32.i8 = udiv <4 x i8> undef, undef + %udiv32.i8 = udiv <4 x i8> undef, undef + +; CHECK: cost of 5 for instruction: %udiv32.i16 = udiv <2 x i16> undef, undef + %udiv32.i16 = udiv <2 x i16> undef, undef + +; CHECK: cost of 8 for instruction: %udiv64.i8 = udiv <8 x i8> undef, undef + %udiv64.i8 = udiv <8 x i8> undef, undef + +; CHECK: cost of 5 for instruction: %udiv64.i16 = udiv <4 x i16> undef, undef + %udiv64.i16 = udiv <4 x i16> undef, undef + +; CHECK: cost of 1 for instruction: %udiv64.i32 = udiv <2 x i32> undef, undef + %udiv64.i32 = udiv <2 x i32> undef, undef + +; CHECK: cost of [[#mul(mul(div(128-1, VBITS)+1, 2), 8)]] for instruction: %udiv128.i8 = udiv <16 x i8> undef, undef + %udiv128.i8 = udiv <16 x i8> undef, undef + +; CHECK: cost of [[#mul(mul(div(128-1, VBITS)+1, 2), 4)]] for instruction: %udiv128.i16 = udiv <8 x i16> undef, undef + %udiv128.i16 = udiv <8 x i16> undef, undef + +; CHECK: cost of [[#mul(div(128-1, VBITS)+1, 2)]] for instruction: %udiv128.i64 = udiv <2 x i64> undef, undef + %udiv128.i64 = udiv <2 x i64> undef, undef + +; CHECK: cost of [[#mul(mul(div(512-1, VBITS)+1, 2), 8)]] for instruction: %udiv512.i8 = udiv <64 x i8> undef, undef + %udiv512.i8 = udiv <64 x i8> undef, undef + +; CHECK: cost of [[#mul(mul(div(512-1, VBITS)+1, 2), 4)]] for instruction: %udiv512.i16 = udiv <32 x i16> undef, undef + %udiv512.i16 = udiv <32 x i16> undef, undef + +; CHECK: cost of [[#mul(div(512-1, VBITS)+1, 2)]] for instruction: %udiv512.i32 = udiv <16 x i32> undef, undef + %udiv512.i32 = udiv <16 x i32> undef, undef + +; CHECK: cost of [[#mul(div(512-1, VBITS)+1, 2)]] for instruction: %udiv512.i64 = udiv <8 x i64> undef, undef + %udiv512.i64 = udiv <8 x i64> undef, undef + + ret void +} + +; The hard-coded expected cost is based on VBITS=128 +define void @mul() #0 { +; CHECK: cost of [[#div(128-1, VBITS)+1]] for instruction: %mul128.i64 = mul <2 x i64> undef, undef + %mul128.i64 = mul <2 x i64> undef, undef + +; CHECK: cost of [[#div(512-1, VBITS)+1]] for instruction: %mul512.i64 = mul <8 x i64> undef, undef + %mul512.i64 = mul <8 x i64> undef, undef + + ret void + } + attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-remainder.ll b/llvm/test/Analysis/CostModel/AArch64/sve-remainder.ll index b0ea8b1..711f06c 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-remainder.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-remainder.ll @@ -5,30 +5,30 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @test_urem_srem_expand() { ; CHECK-LABEL: 'test_urem_srem_expand' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_urem_0 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %legal_type_urem_0 = urem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_urem_1 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_urem_2 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_urem_3 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_srem_0 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %legal_type_urem_2 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %legal_type_urem_3 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %legal_type_srem_0 = srem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_srem_1 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_srem_2 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %legal_type_srem_3 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_urem_0 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %legal_type_srem_2 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %legal_type_srem_3 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %split_type_urem_0 = urem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_urem_1 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_urem_2 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_urem_3 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_srem_0 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %split_type_urem_2 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %split_type_urem_3 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %split_type_srem_0 = srem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_srem_1 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_srem_2 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %split_type_srem_3 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_urem_0 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %split_type_srem_2 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %split_type_srem_3 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %widen_type_urem_0 = urem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_urem_1 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_urem_2 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_urem_3 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_srem_0 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %widen_type_urem_2 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %widen_type_urem_3 = urem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %widen_type_srem_0 = srem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_srem_1 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_srem_2 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %widen_type_srem_3 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %widen_type_srem_2 = srem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %widen_type_srem_3 = srem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: -- 2.7.4