From 049799c311515c8c8b5daf91b4a731870ed54afe Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Fri, 19 Nov 2021 15:55:31 +0300 Subject: [PATCH] [X86][Costmodel] `getReplicationShuffleCost()`: promote 1-bit-wide elements to 8 bits when we have AVX512BW+AVX512VBMI If, in addition to AVX512BW (which provides `{k}<->{i8,i16}` casts and i16 shuffles), we also have AVX512VBMI, which provides i8 shuffles, we are in an optimal situation: i1 elements can be promoted to i8 instead of i16. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D114071 --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 5 +- .../CostModel/X86/shuffle-replication-i1.ll | 79 ++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index a68e6a9..06dacb6 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3668,7 +3668,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, case 1: // There is no support for shuffling i1 elements. We *must* promote. if (ST->hasBWI()) { - PromEltTyBits = 16; // promote to i16, AVX512BW. + if (ST->hasVBMI()) + PromEltTyBits = 8; // promote to i8, AVX512VBMI. + else + PromEltTyBits = 16; // promote to i16, AVX512BW. 
break; } return bailout(); diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1.ll index 6fb11bb..ba9c820 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1.ll @@ -10,6 +10,8 @@ ; RUN: opt < %s -cost-model -mtriple=x86_64-pc-linux-gnu -analyze -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512F ; RUN: opt < %s -cost-model -mtriple=x86_64-pc-linux-gnu -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW ; RUN: opt < %s -cost-model -mtriple=x86_64-pc-linux-gnu -analyze -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-pc-linux-gnu -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512VBMI +; RUN: opt < %s -cost-model -mtriple=x86_64-pc-linux-gnu -analyze -mattr=+avx512f,+avx512vl,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512VBMI define void @replication_i1_stride2() nounwind { ; SSE2-LABEL: 'replication_i1_stride2' @@ -100,6 +102,17 @@ define void @replication_i1_stride2() nounwind { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; AVX512VBMI-LABEL: 'replication_i1_stride2' +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> @@ -200,6 +213,17 @@ define void @replication_i1_stride3() nounwind { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; AVX512VBMI-LABEL: 'replication_i1_stride3' +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf16 = 
shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> @@ -300,6 +324,17 @@ define void @replication_i1_stride4() nounwind { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; AVX512VBMI-LABEL: 'replication_i1_stride4' +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> 
undef, <32 x i1> poison, <128 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> @@ -400,6 +435,17 @@ define void @replication_i1_stride5() nounwind { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; AVX512VBMI-LABEL: 'replication_i1_stride5' +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> 
poison, <320 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> @@ -500,6 +546,17 @@ define void @replication_i1_stride6() nounwind { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; AVX512VBMI-LABEL: 'replication_i1_stride6' +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x 
i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> @@ -600,6 +657,17 @@ define void @replication_i1_stride7() nounwind { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; AVX512VBMI-LABEL: 'replication_i1_stride7' +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> 
zeroinitializer %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> @@ -700,6 +768,17 @@ define void @replication_i1_stride8() nounwind { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; AVX512VBMI-LABEL: 'replication_i1_stride8' +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> +; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> -- 2.7.4