[BasicTTI] getInterleavedMemoryOpCost(): discount unused members of mask if mask for gaps will be used

author Roman Lebedev <lebedev.ri@gmail.com>

Wed, 3 Nov 2021 14:33:28 +0000 (17:33 +0300)

committer Roman Lebedev <lebedev.ri@gmail.com>

Wed, 3 Nov 2021 14:33:28 +0000 (17:33 +0300)
author Roman Lebedev <lebedev.ri@gmail.com>
Wed, 3 Nov 2021 14:33:28 +0000 (17:33 +0300)
committer Roman Lebedev <lebedev.ri@gmail.com>
Wed, 3 Nov 2021 14:33:28 +0000 (17:33 +0300)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h

index 9b116a8..1ebab92 100644 (file)
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1239,6 +1239,9 @@ public:
      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");
  
+    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
+    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
+
      APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");
@@ -1256,7 +1259,8 @@ public:
        // The cost is estimated as extract elements at 0, 2, 4, 6 from the
        // <8 x i32> vector and insert them into a <4 x i32> vector.
        InstructionCost InsSubCost =
-          getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false);
+          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+                                            /*Insert*/ true, /*Extract*/ false);
        Cost += Indices.size() * InsSubCost;
        Cost +=
            thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
@@ -1276,7 +1280,8 @@ public:
        // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
        // i32> vector.
        InstructionCost ExtSubCost =
-          getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
+          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+                                            /*Insert*/ false, /*Extract*/ true);
        Cost += ExtSubCost * Indices.size();
        Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                  /*Insert*/ true,
@@ -1300,9 +1305,12 @@ public:
      // The cost is estimated as extract all mask elements from the <8xi1> mask
      // vector and insert them factor times into the <24xi1> shuffled mask
      // vector.
-    Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
      Cost +=
-        getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false);
+        thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+                                          /*Insert*/ false, /*Extract*/ true);
+    Cost += thisT()->getScalarizationOverhead(
+        MaskVT, UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
+        /*Insert*/ true, /*Extract*/ false);
  
      // The Gaps mask is invariant and created outside the loop, therefore the
      // cost of creating it is not accounted for here. However if we have both
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll

index 43d54d1..c3f36c3 100644 (file)
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll
@@ -107,16 +107,16 @@ for.end:
  ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %2, i16* %arrayidx7, align 2
  ;
  ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 2 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 2 For instruction:   store i16 %2, i16* %arrayidx7, align 2
  ;
  ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 41 for VF 4 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 4 For instruction:   store i16 %2, i16* %arrayidx7, align 2
  ;
  ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 83 for VF 8 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 68 for VF 8 For instruction:   store i16 %2, i16* %arrayidx7, align 2
  ;
  ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 181 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 152 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
  
  define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
  entry:
author	Roman Lebedev <lebedev.ri@gmail.com>
	Wed, 3 Nov 2021 14:33:28 +0000 (17:33 +0300)
committer	Roman Lebedev <lebedev.ri@gmail.com>
	Wed, 3 Nov 2021 14:33:28 +0000 (17:33 +0300)
llvm/include/llvm/CodeGen/BasicTTIImpl.h		patch \| blob \| history
llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll		patch \| blob \| history