From: Alexey Bataev
Date: Fri, 12 Nov 2021 21:34:32 +0000 (-0800)
Subject: [SLP]Do not create unused gather nodes for scalar arguments of vector intrinsics.
X-Git-Tag: upstream/15.0.7~25708
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6fb5bed7d16bffc8549db0523323b182dd43fe4e;p=platform%2Fupstream%2Fllvm.git

[SLP]Do not create unused gather nodes for scalar arguments of vector intrinsics.

If the vector intrinsic has a scalar argument, we currently still create a
tree entry for this argument. This entry is never used; it only consumes
resources and increases the cost of the tree.

Differential Revision: https://reviews.llvm.org/D113806
---

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8fd5adb..a077b25 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4024,6 +4024,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndicies);
       TE->setOperandsInOrder();
       for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
+        // For scalar operands no need to create an entry since no need to
+        // vectorize it.
+        if (hasVectorInstrinsicScalarOpd(ID, i))
+          continue;
         ValueList Operands;
         // Prepare the operand vector.
         for (Value *V : VL) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
index 2c0ce09..950bbcb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
@@ -5,24 +5,17 @@ declare float @llvm.powi.f32.i32(float, i32)
 define void @vec_powi_f32(float* %a, float* %c, i32 %P) {
 ; CHECK-LABEL: @vec_powi_f32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I0:%.*]] = load float, float* [[A:%.*]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[I0]], i32 [[P:%.*]])
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 1
-; CHECK-NEXT:    [[I2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[I2]], i32 [[P]])
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2
-; CHECK-NEXT:    [[I4:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[I4]], i32 [[P]])
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3
-; CHECK-NEXT:    [[I6:%.*]] = load float, float* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[I6]], i32 [[P]])
-; CHECK-NEXT:    store float [[CALL1]], float* [[C:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1
-; CHECK-NEXT:    store float [[CALL2]], float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]])
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
-; CHECK-NEXT:    store float [[CALL3]], float* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
-; CHECK-NEXT:    store float [[CALL4]], float* [[ARRAYIDX10]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
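
Editor's note: for readers unfamiliar with the VectorUtils helper the patch calls, below is a
minimal illustrative sketch, not part of the commit. The helper name
collectVectorizableOperandIndices is invented for the example; it uses the same queries as the
patched loop in buildTree_rec to decide which operands of an intrinsic call should be widened
and which must stay scalar.

  // Sketch only: collect the indices of call operands that SLP should
  // vectorize, skipping operands the intrinsic requires to remain scalar
  // (for example the i32 exponent of @llvm.powi).
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/IntrinsicInst.h"

  using namespace llvm;

  static void collectVectorizableOperandIndices(CallInst *CI,
                                                SmallVectorImpl<unsigned> &Indices) {
    // Identify the vectorizable intrinsic being called, if any.
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, /*TLI=*/nullptr);
    for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
      // Same query the patch adds: scalar-only operands get no gather node.
      if (hasVectorInstrinsicScalarOpd(ID, I))
        continue;
      Indices.push_back(I);
    }
  }

For the @llvm.powi.f32.i32 calls in the test above, only operand 0 would be collected: the i32
%P exponent stays scalar, which is why the vectorized @llvm.powi.v4f32.i32 call in the new
CHECK lines still takes i32 [[P]] directly and no gather node is built for it.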