From 9cf33729316852043371235a33e310f34132afa8 Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Mon, 12 Nov 2012 21:21:02 +0000 Subject: [PATCH] BBVectorize: Use a more sophisticated check for input cost The old checking code, which assumed that input shuffles and insert-elements could always be folded (and thus were free) is too simple. This can only happen in special circumstances. Using the simple check caused infinite recursion. llvm-svn: 167750 --- llvm/lib/Transforms/Vectorize/BBVectorize.cpp | 57 +++++++++++++++++++------- llvm/test/Transforms/BBVectorize/X86/sh-rec.ll | 54 ++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 14 deletions(-) create mode 100644 llvm/test/Transforms/BBVectorize/X86/sh-rec.ll diff --git a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 407cd7b..fb31d91 100644 --- a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -28,6 +28,7 @@ #include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" @@ -400,6 +401,7 @@ namespace { DEBUG(dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << BB.getParent()->getName() << "...\n"); +assert(n < 10 && "hrmm, really?"); if (vectorizePairs(BB)) changed = true; else @@ -1765,9 +1767,12 @@ namespace { bool NeedsExtraction = false; for (Value::use_iterator I = S->first->use_begin(), IE = S->first->use_end(); I != IE; ++I) { - if (isa(*I) || - isa(*I) || - isa(*I)) + if (ShuffleVectorInst *SI = dyn_cast(*I)) { + // Shuffle can be folded if it has no other input + if (isa(SI->getOperand(1))) + continue; + } + if (isa(*I)) continue; if (PrunedTreeInstrs.count(*I)) continue; @@ -1792,9 +1797,12 @@ namespace { NeedsExtraction = false; for (Value::use_iterator I = S->second->use_begin(), IE = S->second->use_end(); I != IE; ++I) { - if (isa(*I) || - isa(*I) || - isa(*I)) + if (ShuffleVectorInst *SI = dyn_cast(*I)) { + // Shuffle can be folded if it has no other input + if (isa(SI->getOperand(1))) + continue; + } + if (isa(*I)) continue; if (PrunedTreeInstrs.count(*I)) continue; @@ -1844,14 +1852,35 @@ namespace { // Combining vector operations of the same type is also assumed // folded with other operations. - if (Ty1 == Ty2 && - (isa(O1) || - isa(O1) || - isa(O1)) && - (isa(O2) || - isa(O2) || - isa(O2))) - continue; + if (Ty1 == Ty2) { + // If both are insert elements, then both can be widened. + if (isa(O1) && isa(O2)) + continue; + // If both are extract elements, and both have the same input + // type, then they can be replaced with a shuffle + ExtractElementInst *EIO1 = dyn_cast(O1), + *EIO2 = dyn_cast(O2); + if (EIO1 && EIO2 && + EIO1->getOperand(0)->getType() == + EIO2->getOperand(0)->getType()) + continue; + // If both are a shuffle with equal operand types and only two + // unqiue operands, then they can be replaced with a single + // shuffle + ShuffleVectorInst *SIO1 = dyn_cast(O1), + *SIO2 = dyn_cast(O2); + if (SIO1 && SIO2 && + SIO1->getOperand(0)->getType() == + SIO2->getOperand(0)->getType()) { + SmallSet SIOps; + SIOps.insert(SIO1->getOperand(0)); + SIOps.insert(SIO1->getOperand(1)); + SIOps.insert(SIO2->getOperand(0)); + SIOps.insert(SIO2->getOperand(1)); + if (SIOps.size() <= 2) + continue; + } + } int ESContrib; // This pair has already been formed. diff --git a/llvm/test/Transforms/BBVectorize/X86/sh-rec.ll b/llvm/test/Transforms/BBVectorize/X86/sh-rec.ll new file mode 100644 index 0000000..1e0492c --- /dev/null +++ b/llvm/test/Transforms/BBVectorize/X86/sh-rec.ll @@ -0,0 +1,54 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s + +define void @ptoa() nounwind uwtable { +entry: + %call = call i8* @malloc() nounwind + br i1 undef, label %return, label %if.end10 + +if.end10: ; preds = %entry + %incdec.ptr = getelementptr inbounds i8* %call, i64 undef + %call17 = call i32 @ptou() nounwind + %incdec.ptr26.1 = getelementptr inbounds i8* %incdec.ptr, i64 -2 + store i8 undef, i8* %incdec.ptr26.1, align 1 + %div27.1 = udiv i32 %call17, 100 + %rem.2 = urem i32 %div27.1, 10 + %add2230.2 = or i32 %rem.2, 48 + %conv25.2 = trunc i32 %add2230.2 to i8 + %incdec.ptr26.2 = getelementptr inbounds i8* %incdec.ptr, i64 -3 + store i8 %conv25.2, i8* %incdec.ptr26.2, align 1 + %incdec.ptr26.3 = getelementptr inbounds i8* %incdec.ptr, i64 -4 + store i8 undef, i8* %incdec.ptr26.3, align 1 + %div27.3 = udiv i32 %call17, 10000 + %rem.4 = urem i32 %div27.3, 10 + %add2230.4 = or i32 %rem.4, 48 + %conv25.4 = trunc i32 %add2230.4 to i8 + %incdec.ptr26.4 = getelementptr inbounds i8* %incdec.ptr, i64 -5 + store i8 %conv25.4, i8* %incdec.ptr26.4, align 1 + %div27.4 = udiv i32 %call17, 100000 + %rem.5 = urem i32 %div27.4, 10 + %add2230.5 = or i32 %rem.5, 48 + %conv25.5 = trunc i32 %add2230.5 to i8 + %incdec.ptr26.5 = getelementptr inbounds i8* %incdec.ptr, i64 -6 + store i8 %conv25.5, i8* %incdec.ptr26.5, align 1 + %incdec.ptr26.6 = getelementptr inbounds i8* %incdec.ptr, i64 -7 + store i8 0, i8* %incdec.ptr26.6, align 1 + %incdec.ptr26.7 = getelementptr inbounds i8* %incdec.ptr, i64 -8 + store i8 undef, i8* %incdec.ptr26.7, align 1 + %div27.7 = udiv i32 %call17, 100000000 + %rem.8 = urem i32 %div27.7, 10 + %add2230.8 = or i32 %rem.8, 48 + %conv25.8 = trunc i32 %add2230.8 to i8 + %incdec.ptr26.8 = getelementptr inbounds i8* %incdec.ptr, i64 -9 + store i8 %conv25.8, i8* %incdec.ptr26.8, align 1 + unreachable + +return: ; preds = %entry + ret void +; CHECK: @ptoa +} + +declare noalias i8* @malloc() nounwind + +declare i32 @ptou() -- 2.7.4