From a3fd82c289878e1a8fa5833d87b688cd50624247 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 28 Apr 2021 07:02:33 -0700 Subject: [PATCH] [SLP]Fix the crash on cost calculation if non-compatible vectors shuffled. If the extracts from non-power-of-2 vectors are recognized as shuffles, we need some extra checks to avoid crashing the cost calculation when trying to get the cost of subvector extracts. In this case we need to check carefully that we do not go out of bounds of the original vector; otherwise the TTI cost model will crash on an assert. Differential Revision: https://reviews.llvm.org/D101477 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 24 +++++++++++---- .../AMDGPU/crash_extract_subvector_cost.ll | 35 ++++++++++++++++++++++ 2 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cd3d4b8..5e2a8a1 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3589,13 +3589,27 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { for (const auto &Data : ExtractVectorsTys) { auto *EEVTy = cast(Data.first->getType()); unsigned NumElts = VecTy->getNumElements(); - if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) - Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, None, - (Data.second / NumElts) * NumElts, VecTy); - else + if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { + unsigned Idx = (Data.second / NumElts) * NumElts; + unsigned EENumElts = EEVTy->getNumElements(); + if (Idx + NumElts <= EENumElts) { + Cost += + TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, None, Idx, VecTy); + } else { + // Need to round up the subvector type vectorization factor to avoid a + // crash in cost model functions. 
Make SubVT so that Idx + VF of SubVT + // <= EENumElts. + auto *SubVT = + FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); + Cost += + TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, None, Idx, SubVT); + } + } else { Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, VecTy, None, 0, EEVTy); + } } }; if (E->State == TreeEntry::NeedToGather) { diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll new file mode 100644 index 0000000..3a25530 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck %s + +define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) { +; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7 +; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8 +; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7 +; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[ARG0_1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 [[ARG0_2]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[ARG1_1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> [[TMP2]], i16 [[ARG1_2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0 +; CHECK-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], 
i32 1 +; CHECK-NEXT: [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[TMP6]], i64 1 +; CHECK-NEXT: ret <2 x i16> [[INS_2]] +; +bb: + %arg0.1 = extractelement <9 x i16> undef, i64 7 + %arg0.2 = extractelement <9 x i16> %arg0, i64 8 + %arg1.1 = extractelement <9 x i16> %arg1, i64 7 + %arg1.2 = extractelement <9 x i16> %arg1, i64 8 + %add.1 = call i16 @llvm.uadd.sat.i16(i16 %arg0.1, i16 %arg1.1) + %add.2 = call i16 @llvm.uadd.sat.i16(i16 %arg0.2, i16 %arg1.2) + %ins.1 = insertelement <2 x i16> undef, i16 %add.1, i64 0 + %ins.2 = insertelement <2 x i16> %ins.1, i16 %add.2, i64 1 + ret <2 x i16> %ins.2 +} + +declare i16 @llvm.uadd.sat.i16(i16, i16) #0 +attributes #0 = { nounwind readnone speculatable willreturn } -- 2.7.4