From 688cdddafb0dfdeb5f3c5e1e22b88a0cdfc54c0c Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Mon, 1 Mar 2021 14:44:12 +0100 Subject: [PATCH] [SLP] Honor min/max regsize and min/max VF in vectorizeStores Make sure we use PowerOf2Floor instead of PowerOf2Ceil when calculating max number of elements that fits inside a vector register (otherwise we could end up creating vectors larger than the maximum vector register size). Also make sure we honor the min/max VF (as given by TTI or cmd line parameters) when doing vectorizeStores. Reviewed By: anton-afanasyev Differential Revision: https://reviews.llvm.org/D97691 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 11 +++++---- .../Transforms/SLPVectorizer/slp-max-reg-size.ll | 27 ++++++++++++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0ec8027..f36d2fc 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6152,17 +6152,18 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, I = ConsecutiveChain[I]; } - // If a vector register can't hold 1 element, we are done. unsigned MaxVecRegSize = R.getMaxVecRegSize(); unsigned EltSize = R.getVectorElementSize(Operands[0]); - if (MaxVecRegSize % EltSize != 0) - continue; + unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); + + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); + unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), + MaxElts); - unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? unsigned StartIdx = 0; - for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { ArrayRef Slice = makeArrayRef(Operands).slice(Cnt, Size); if (!VectorizedStores.count(Slice.front()) && diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll index 367795a..be03ab7 100644 --- a/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-max-reg-size.ll @@ -15,15 +15,20 @@ define void @foo() { ; CHECK-VF8-160-LABEL: @foo( -; CHECK-VF8-160-NEXT: store <8 x i32> , <8 x i32>* bitcast ([8 x i32]* @X to <8 x i32>*), align 1 +; CHECK-VF8-160-NEXT: store <4 x i32> , <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1 +; CHECK-VF8-160-NEXT: store <4 x i32> , <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1 ; CHECK-VF8-160-NEXT: ret void ; ; CHECK-VF4-160-LABEL: @foo( -; CHECK-VF4-160-NEXT: store <8 x i32> , <8 x i32>* bitcast ([8 x i32]* @X to <8 x i32>*), align 1 +; CHECK-VF4-160-NEXT: store <4 x i32> , <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1 +; CHECK-VF4-160-NEXT: store <4 x i32> , <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1 ; CHECK-VF4-160-NEXT: ret void ; ; CHECK-VF2-160-LABEL: @foo( -; CHECK-VF2-160-NEXT: store <8 x i32> , <8 x i32>* bitcast ([8 x i32]* @X to <8 x i32>*), align 1 +; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast ([8 x i32]* @X to <2 x i32>*), align 1 +; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2) to <2 x i32>*), align 1 +; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <2 x i32>*), align 1 +; CHECK-VF2-160-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6) to <2 x i32>*), align 1 ; CHECK-VF2-160-NEXT: ret void ; ; CHECK-VF8-128-LABEL: @foo( @@ -37,8 +42,10 @@ define void @foo() { ; CHECK-VF4-128-NEXT: ret void ; ; CHECK-VF2-128-LABEL: @foo( -; CHECK-VF2-128-NEXT: store <4 x i32> , <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1 -; CHECK-VF2-128-NEXT: store <4 x i32> , <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1 +; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast ([8 x i32]* @X to <2 x i32>*), align 1 +; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2) to <2 x i32>*), align 1 +; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <2 x i32>*), align 1 +; CHECK-VF2-128-NEXT: store <2 x i32> , <2 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6) to <2 x i32>*), align 1 ; CHECK-VF2-128-NEXT: ret void ; ; CHECK-VF8-256-LABEL: @foo( @@ -46,8 +53,14 @@ define void @foo() { ; CHECK-VF8-256-NEXT: ret void ; ; CHECK-VF2-128-128-LABEL: @foo( -; CHECK-VF2-128-128-NEXT: store <4 x i32> , <4 x i32>* bitcast ([8 x i32]* @X to <4 x i32>*), align 1 -; CHECK-VF2-128-128-NEXT: store <4 x i32> , <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4) to <4 x i32>*), align 1 +; CHECK-VF2-128-128-NEXT: store i32 1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 0), align 1 +; CHECK-VF2-128-128-NEXT: store i32 2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 1), align 1 +; CHECK-VF2-128-128-NEXT: store i32 3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 2), align 1 +; CHECK-VF2-128-128-NEXT: store i32 4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 3), align 1 +; CHECK-VF2-128-128-NEXT: store i32 5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 4), align 1 +; CHECK-VF2-128-128-NEXT: store i32 6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 5), align 1 +; CHECK-VF2-128-128-NEXT: store i32 7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 6), align 1 +; CHECK-VF2-128-128-NEXT: store i32 8, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 7), align 1 ; CHECK-VF2-128-128-NEXT: ret void ; store i32 1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @X, i16 0, i16 0), align 1 -- 2.7.4