From a4b64f772711308724ab4626549b414bcf79d8e0 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Wed, 3 Nov 2021 17:33:28 +0300
Subject: [PATCH] [BasicTTI] getInterleavedMemoryOpCost(): discount unused members of mask if mask for gap will be used

As can be seen in `InnerLoopVectorizer::vectorizeInterleaveGroup()`,
in some cases (reported by `UseMaskForGaps`) the gaps in the interleaved
load/store group will be masked away by another, constant mask, so there
is no need to account for the cost of replicating the mask for those
members.

Differential Revision: https://reviews.llvm.org/D112877
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h          | 16 ++++++++++++----
 .../X86/interleaved-store-accesses-with-gaps.ll   |  8 ++++----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9b116a8..1ebab92 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1239,6 +1239,9 @@ public:
     assert(Indices.size() <= Factor &&
            "Interleaved memory op has too many members");
 
+    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
+    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
+
     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
     for (unsigned Index : Indices) {
       assert(Index < Factor && "Invalid index for interleaved memory op");
@@ -1256,7 +1259,8 @@ public:
       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
       // <8 x i32> vector and insert them into a <4 x i32> vector.
       InstructionCost InsSubCost =
-          getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false);
+          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+                                            /*Insert*/ true, /*Extract*/ false);
       Cost += Indices.size() * InsSubCost;
       Cost +=
           thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
@@ -1276,7 +1280,8 @@ public:
       // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
       // i32> vector.
       InstructionCost ExtSubCost =
-          getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
+          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+                                            /*Insert*/ false, /*Extract*/ true);
       Cost += ExtSubCost * Indices.size();
       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                 /*Insert*/ true,
@@ -1300,9 +1305,12 @@ public:
     // The cost is estimated as extract all mask elements from the <8xi1> mask
     // vector and insert them factor times into the <24xi1> shuffled mask
     // vector.
-    Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
     Cost +=
-        getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false);
+        thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+                                          /*Insert*/ false, /*Extract*/ true);
+    Cost += thisT()->getScalarizationOverhead(
+        MaskVT, UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
+        /*Insert*/ true, /*Extract*/ false);
 
     // The Gaps mask is invariant and created outside the loop, therefore the
     // cost of creating it is not accounted for here. However, if we have both
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll
index 43d54d1..c3f36c3 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll
@@ -107,16 +107,16 @@ for.end:
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 41 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 83 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 181 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 152 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
 
 define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
 entry:
-- 
2.7.4
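
Not part of the patch: the standalone sketch below only illustrates the accounting idea behind the change, under the simplifying assumption of a unit cost per vector-element insert/extract. The helper maskReplicationCost and the example parameters are made up for illustration; they do not reproduce the exact numbers in the test above, which also include the insert/extract costs for the data vectors and the memory operation itself.

// illustrative_mask_cost.cpp -- standalone sketch, not LLVM code.
// Rough model of the mask-replication term this patch changes, assuming a
// unit cost per vector-element insert/extract.
#include <cstdio>

// Cost of turning a <VF x i1> per-member mask into a <VF*Factor x i1>
// interleaved mask: extract every sub-mask element once, then insert it
// into each demanded lane of the result mask.
static unsigned maskReplicationCost(unsigned VF, unsigned Factor,
                                    unsigned NumIndices, bool UseMaskForGaps) {
  unsigned Extracts = VF; // one extract per sub-mask element
  // If the gaps are already cleared by the invariant gaps mask, only the
  // lanes of members that are actually present are demanded; otherwise
  // every lane of the interleaved mask has to be populated.
  unsigned Inserts = (UseMaskForGaps ? NumIndices : Factor) * VF;
  return Extracts + Inserts;
}

int main() {
  // Example: interleave factor 4 with two members present (indices 0 and 1).
  for (unsigned VF : {2u, 4u, 8u, 16u})
    std::printf("VF=%-2u  all lanes demanded: %3u  gaps masked separately: %3u\n",
                VF, maskReplicationCost(VF, 4, 2, false),
                maskReplicationCost(VF, 4, 2, true));
  return 0;
}

The point being modelled: when the gaps are masked away by the separate constant mask (as the commit message notes), replicating mask bits into the gap lanes is wasted work, so only the lanes of the members listed in Indices are counted.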