[CostModel][X86] getScalarizationOverhead - improve extraction costs for > 128-bit vectors
author: Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 24 May 2022 14:17:59 +0000 (15:17 +0100)
committer: Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 24 May 2022 14:18:08 +0000 (15:18 +0100)
commit: 6c80267d0ff445c0c47c6ddb283da5a8bc4feb64
tree: 0bba66e8e38ad41e2dd9b5b2e0d1fa56738bd89d
parent: 1586e1dc957677df0f37da603bb35586e1b6a172
[CostModel][X86] getScalarizationOverhead - improve extraction costs for > 128-bit vectors

We were using the default getScalarizationOverhead expansion for extraction costs, which adds up all the individual element extraction costs.

This is fine for 128-bit vectors, but for 256/512-bit vectors each element extraction also has to account for extracting the upper 128-bit subvector before it can handle the element. For scalarization costs we only need to extract each demanded subvector once.

Differential Revision: https://reviews.llvm.org/D125527
120 files changed:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/Analysis/CostModel/X86/arith-fp.ll
llvm/test/Analysis/CostModel/X86/fptoi_sat.ll
llvm/test/Analysis/CostModel/X86/fptosi.ll
llvm/test/Analysis/CostModel/X86/fptoui.ll
llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-5.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-7.ll
llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-8.ll
llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/masked-store-i16.ll
llvm/test/Analysis/CostModel/X86/masked-store-i8.ll
llvm/test/Analysis/CostModel/X86/reduce-fadd.ll
llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll
llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll
llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll
llvm/test/Analysis/CostModel/X86/shuffle-replication-i8.ll
llvm/test/Analysis/CostModel/X86/sitofp.ll
llvm/test/Analysis/CostModel/X86/trunc.ll