unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
-
- if (UseMaskForCond || UseMaskForGaps)
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind,
- UseMaskForCond, UseMaskForGaps);
-
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
// VecTy = <12 x i32>.
return Cost;
}
-// Get estimation for interleaved load/store operations for AVX2.
-// \p Factor is the interleaved-access factor (stride) - number of
-// (interleaved) elements in the group.
-// \p Indices contains the indices for a strided load: when the
-// interleaved load has gaps they indicate which elements are used.
-// If Indices is empty (or if the number of indices is equal to the size
-// of the interleaved-access as given in \p Factor) the access has no gaps.
-//
-// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
-// computing the cost using a generic formula as a function of generic
-// shuffles. We therefore use a lookup table instead, filled according to
-// the instruction sequences that codegen currently generates.
-InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
- unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
- ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
-
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
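+  // Note: the generic TTI hook receives the group's vector type as a plain
+  // Type; the X86 handling below only deals with fixed-width vectors.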
+ auto *VecTy = cast<FixedVectorType>(BaseTy);
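+  // Neither the AVX-512 formulas nor the AVX2 lookup tables below model
+  // masked or gapped groups, so those fall back to the base implementation.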
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
- // We currently Support only fully-interleaved groups, with no gaps.
+ auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
+ Type *EltTy = cast<VectorType>(VecTy)->getElementType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+ EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+ return true;
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
+ (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
+ return HasBW;
+ return false;
+ };
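+  // For example, <16 x i32> groups always qualify on AVX-512, while
+  // <16 x i16> and <64 x i8> groups qualify only when BWI is available.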
+ if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
+    return getInterleavedMemoryOpCostAVX512(
+        Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, CostKind,
+        UseMaskForCond, UseMaskForGaps);
+
+  // Estimate the cost of interleaved load/store operations for SSE-AVX2.
+  // As opposed to AVX-512, SSE-AVX2 targets do not have generic shuffles
+  // that would allow computing the cost with a generic formula as a function
+  // of generic shuffles. We therefore use a lookup table instead, filled
+  // according to the instruction sequences that codegen currently generates.
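+  // Each table entry is keyed on the interleave factor and the legalized
+  // per-member sub-vector type <VF x Elt>; its cost is the shuffle overhead
+  // added on top of the plain memory-op cost.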
+
+ // We currently support only fully-interleaved groups, with no gaps.
// TODO: Support also strided loads (interleaved-groups with gaps).
if (Indices.size() && Indices.size() != Factor)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind);

  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
{2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
{2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
{2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
- {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
- {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
+ {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
+ {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
- {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
- {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
- {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
- {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
+ {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
+ {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
+ {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
+ {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
{2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
- {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
- {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
- {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
- {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
+ {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
+ {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
+ {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
+ {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
{2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
- {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
- {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
- {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
+ {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
+ {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
+ {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
{2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
- {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
+ {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
{3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
{3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
{3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
{3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
- {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
- {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
- {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
- {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
- {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
+ {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
+ {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
+ {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
+ {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
+ {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
- {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
- {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
- {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
+ {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
+ {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
+ {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
{3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
- {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
- {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
+ {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
+ {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
{3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
- {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
+ {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
- {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
+ {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
{4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
{4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
{4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
{4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
- {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
- {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
- {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
- {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
+ {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
+ {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
+ {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
+ {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
{4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
- {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
- {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
- {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
+ {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
+ {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
+ {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
{4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
- {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
- {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
+ {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
+ {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
{4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
- {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
- {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
- {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
+ {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
+ {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
+ {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
{6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
{6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
- {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
- {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
- {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
+ {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
+ {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
+ {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
{6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
- {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
- {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
- {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
+ {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
+ {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
+ {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
{6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
- {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
+ {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
{6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
{6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
};
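  // Reading one load entry as an example: {3, MVT::v8i32, 7} says that a
  // stride-3 group legalized to three v8i32 members (a 24 x i32 load) is
  // charged 7 shuffles on top of the memory-op cost.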
static const CostTblEntry AVX2InterleavedStoreTbl[] = {
- {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
- {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
- {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
+ {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
+ {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
+ {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
{2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
{2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
- {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
- {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
- {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
+ {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
+ {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
+ {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
{2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
{2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
- {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
- {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
- {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
- {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
+ {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
+ {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
+ {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
+ {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
{2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
- {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
- {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
- {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
+ {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
+ {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
+ {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
{2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
{3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
{3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
- {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
+ {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
{3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
{3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
{3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
{3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
- {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
- {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
- {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
+ {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
+ {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
+ {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
{3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
{3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
- {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
- {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
+ {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
+ {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
{3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
{3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
- {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
- {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
+ {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
+ {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
- {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
+ {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
{4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
- {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
- {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
+ {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
+ {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
{4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
- {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
- {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
+ {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
+ {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
{4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
- {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
- {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
+ {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
+ {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
- {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
- {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
+ {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
+ {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
{4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
- {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
+ {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
{4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
{4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
- {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
+ {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
- {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
- {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
+ {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
+ {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
{6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
- {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
- {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
+ {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
+ {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
{6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
{6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
{6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
- {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
+ {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
- {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
+ {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
{6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
{6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
- {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
+ {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
{6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
- {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
- {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
+ {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
+ {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
};
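  // Store entries read the same way: {4, MVT::v4i8, 4} charges 4 shuffles to
  // interleave four v4i8 members into a single 16i8 store, so the total
  // returned below is MemOpCosts + 4 for that shape.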
if (Opcode == Instruction::Load) {
- if (const auto *Entry =
- CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
- return MemOpCosts + Entry->Cost;
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
+ ETy.getSimpleVT()))
+ return MemOpCosts + Entry->Cost;
} else {
assert(Opcode == Instruction::Store &&
- "Expected Store Instruction at this point");
- if (const auto *Entry =
- CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
- return MemOpCosts + Entry->Cost;
+ "Expected Store Instruction at this point");
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
+ ETy.getSimpleVT()))
+ return MemOpCosts + Entry->Cost;
}
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind);
-}
-
-InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
- unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
- bool UseMaskForCond, bool UseMaskForGaps) {
- auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
- Type *EltTy = cast<VectorType>(VecTy)->getElementType();
- if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
- EltTy->isIntegerTy(32) || EltTy->isPointerTy())
- return true;
- if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
- (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
- return HasBW;
- return false;
- };
- if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
- return getInterleavedMemoryOpCostAVX512(
- Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
- AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
- if (ST->hasAVX2())
- return getInterleavedMemoryOpCostAVX2(
- Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
- AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
-
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);