assert(Indices.size() <= Factor &&
"Interleaved memory op has too many members");
+ const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
+ const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
+
APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");
// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.
InstructionCost InsSubCost =
- getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false);
+ thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+ /*Insert*/ true, /*Extract*/ false);
Cost += Indices.size() * InsSubCost;
Cost +=
thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
// excluding gaps) from both <4 x i32> vectors and insert into the <12 x
// i32> vector.
InstructionCost ExtSubCost =
- getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
+ thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+ /*Insert*/ false, /*Extract*/ true);
Cost += ExtSubCost * Indices.size();
Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
/*Insert*/ true,
// The cost is estimated as extract all mask elements from the <8xi1> mask
// vector and insert them factor times into the <24xi1> shuffled mask
// vector.
- Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
Cost +=
- getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false);
+ thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+ /*Insert*/ false, /*Extract*/ true);
+ Cost += thisT()->getScalarizationOverhead(
+ MaskVT, UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
+ /*Insert*/ true, /*Extract*/ false);
// The Gaps mask is invariant and created outside the loop, therefore the
// cost of creating it is not accounted for here. However if we have both
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2
;
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
;
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 41 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
;
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 83 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
;
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 181 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 152 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
entry: