From 3f8f7840bf12ffa4bfd558e5115acbd66b39280a Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Tue, 30 Aug 2016 23:53:59 +0000 Subject: [PATCH] [LoadStoreVectorizer] Change VectorSet to Vector to match head and tail positions. Resolves PR29148. Summary: LSV was using two vector sets (heads and tails) to track pairs of adjacent positions to vectorize. A recent optimization is trying to obtain the longest chain to vectorize and assumes the positions in heads(H) and tails(T) match, which is not the case if there are multiple tails for the same head. e.g.: i1: store a[0] i2: store a[1] i3: store a[1] Leads to: H: i1 T: i2 i3 Instead of: H: i1 i1 T: i2 i3 So the positions for instructions that follow i3 will have different indexes in H/T. This patch resolves PR29148. This issue also surfaced the fact that if the chain is too long, and TLI returns a "not-fast" answer, the whole chain will be abandoned for vectorization, even though a smaller one would be beneficial. Added a testcase and FIXME for this. 
Reviewers: tstellarAMD, arsenm, jlebar Subscribers: mzolotukhin, wdng, llvm-commits Differential Revision: https://reviews.llvm.org/D24057 llvm-svn: 280179 --- .../Transforms/Vectorize/LoadStoreVectorizer.cpp | 14 ++--- .../LoadStoreVectorizer/AMDGPU/multiple_tails.ll | 64 ++++++++++++++++++++++ .../X86/subchain-interleaved.ll | 30 ++++++++++ 3 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 9fd0b8c..90adf9f 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -628,7 +628,7 @@ bool Vectorizer::vectorizeChains(InstrListMap &Map) { bool Vectorizer::vectorizeInstructions(ArrayRef Instrs) { DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"); - SmallSetVector Heads, Tails; + SmallVector Heads, Tails; int ConsecutiveChain[64]; // Do a quadratic search on all of the given stores and find all of the pairs @@ -647,8 +647,8 @@ bool Vectorizer::vectorizeInstructions(ArrayRef Instrs) { continue; // Should not insert. } - Tails.insert(j); - Heads.insert(i); + Tails.push_back(j); + Heads.push_back(i); ConsecutiveChain[i] = j; } } @@ -660,21 +660,21 @@ bool Vectorizer::vectorizeInstructions(ArrayRef Instrs) { for (int Head : Heads) { if (InstructionsProcessed.count(Instrs[Head])) continue; - bool longerChainExists = false; + bool LongerChainExists = false; for (unsigned TIt = 0; TIt < Tails.size(); TIt++) if (Head == Tails[TIt] && !InstructionsProcessed.count(Instrs[Heads[TIt]])) { - longerChainExists = true; + LongerChainExists = true; break; } - if (longerChainExists) + if (LongerChainExists) continue; // We found an instr that starts a chain. Now follow the chain and try to // vectorize it. 
SmallVector Operands; int I = Head; - while (I != -1 && (Tails.count(I) || Heads.count(I))) { + while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { if (InstructionsProcessed.count(Instrs[I])) break; diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll new file mode 100644 index 0000000..88eca36 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -0,0 +1,64 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + +; Checks that there is no crash when there are multiple tails +; for a the same head starting a chain. +@0 = internal addrspace(3) global [16384 x i32] undef + +; CHECK-LABEL: @no_crash( +; CHECK: store <2 x i32> zeroinitializer +; CHECK: store i32 0 +; CHECK: store i32 0 + +define void @no_crash(i32 %arg) { + %tmp2 = add i32 %arg, 14 + %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 + %tmp4 = add i32 %arg, 15 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4 + + store i32 0, i32 addrspace(3)* %tmp3, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + +; Check adjiacent memory locations are properly matched and the +; longest chain vectorized + +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: load i32 + +define void @interleave_get_longest(i32 %arg) { + %a1 = add i32 %arg, 1 + %a2 = add i32 %arg, 2 + %a3 = add i32 %arg, 3 + %a4 = add i32 %arg, 4 + 
%tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg + %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1 + %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2 + %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4 + + %l1 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l2 = load i32, i32 addrspace(3)* %tmp1, align 4 + store i32 0, i32 addrspace(3)* %tmp2, align 4 + store i32 0, i32 addrspace(3)* %tmp1, align 4 + %l3 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l4 = load i32, i32 addrspace(3)* %tmp3, align 4 + %l5 = load i32, i32 addrspace(3)* %tmp4, align 4 + %l6 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l7 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l8 = load i32, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll index 34ec43d..915b94a 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -85,3 +85,33 @@ define void @chain_prefix_suffix(i32* noalias %ptr) { ret void } +; FIXME: If the chain is too long and TLI says misaligned is not fast, +; then LSV fails to vectorize anything in that chain. +; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7. 
+ +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <3 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 + +define void @interleave_get_longest(i32* noalias %ptr) { + %tmp1 = getelementptr i32, i32* %ptr, i64 0 + %tmp2 = getelementptr i32, i32* %ptr, i64 1 + %tmp3 = getelementptr i32, i32* %ptr, i64 2 + %tmp4 = getelementptr i32, i32* %ptr, i64 3 + + %l1 = load i32, i32* %tmp2, align 4 + %l2 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp2, align 4 + store i32 0, i32* %tmp1, align 4 + %l3 = load i32, i32* %tmp2, align 4 + %l4 = load i32, i32* %tmp3, align 4 + %l5 = load i32, i32* %tmp4, align 4 + %l6 = load i32, i32* %tmp4, align 4 + %l7 = load i32, i32* %tmp4, align 4 + + ret void +} -- 2.7.4