BS->clear();
}
MinBWs.clear();
+ InstrElementSize.clear();
}
/// \returns the number of entries (nodes) currently held in the
/// vectorizable tree built by buildTree.
unsigned getTreeSize() const {
  return VectorizableTree.size();
}
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
- SmallVector<Instruction *, 16> Worklist;
+ SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
if (auto *I = dyn_cast<Instruction>(V)) {
- Worklist.push_back(I);
+ Worklist.emplace_back(I, I->getParent());
Visited.insert(I);
}
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
- auto MaxWidth = 0u;
- auto FoundUnknownInst = false;
- while (!Worklist.empty() && !FoundUnknownInst) {
- auto *I = Worklist.pop_back_val();
+ auto Width = 0u;
+ while (!Worklist.empty()) {
+ Instruction *I;
+ BasicBlock *Parent;
+ std::tie(I, Parent) = Worklist.pop_back_val();
// We should only be looking at scalar instructions here. If the current
- // instruction has a vector type, give up.
+ // instruction has a vector type, skip.
auto *Ty = I->getType();
if (isa<VectorType>(Ty))
- FoundUnknownInst = true;
+ continue;
// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.
- else if (isa<LoadInst>(I))
- MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+ if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ExtractValueInst>(I))
+ Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an
- // instruction we haven't yet visited, we add it to the worklist.
+ // instruction we haven't yet visited and from the same basic block as the
+ // user or the use is a PHI node, we add it to the worklist.
else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
- isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) ||
+ isa<UnaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (Visited.insert(J).second)
- Worklist.push_back(J);
+ if (Visited.insert(J).second &&
+ (isa<PHINode>(I) || J->getParent() == Parent))
+ Worklist.emplace_back(J, J->getParent());
+ } else {
+ break;
}
-
- // If we don't yet handle the instruction, give up.
- else
- FoundUnknownInst = true;
}
- int Width = MaxWidth;
// If we didn't encounter a memory access in the expression tree, or if we
// gave up for some reason, just return the width of V. Otherwise, return the
// maximum width we found.
- if (!MaxWidth || FoundUnknownInst)
+ if (!Width) {
+ if (auto *CI = dyn_cast<CmpInst>(V))
+ V = CI->getOperand(0);
Width = DL->getTypeSizeInBits(V->getType());
+ }
for (Instruction *I : Visited)
InstrElementSize[I] = Width;
; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
-; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
-; CHECK-NEXT: [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
-; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[A0]]
+; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
-; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
-; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64
-; CHECK-NEXT: [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A1]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
-; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT: [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP8]]
; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
-; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT: [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP9]]
; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
; CHECK-NEXT: ret void
; CHECK-LABEL: @inst_size(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
-; CHECK-NEXT: [[TMPL1:%.*]] = load i64, i64* [[A:%.*]], align 4
-; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
-; CHECK-NEXT: [[TMPL2:%.*]] = load i64, i64* [[PTR2]], align 4
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
-; CHECK-NEXT: [[TMPL3:%.*]] = load i64, i64* [[PTR3]], align 4
; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 3
-; CHECK-NEXT: [[TMPL4:%.*]] = load i64, i64* [[PTR4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 4
; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]]
-; CHECK-NEXT: [[T42:%.*]] = icmp sgt i64 0, [[TMPL1]]
-; CHECK-NEXT: [[T43:%.*]] = icmp sgt i64 0, [[TMPL2]]
-; CHECK-NEXT: [[T44:%.*]] = icmp sgt i64 0, [[TMPL3]]
-; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]]
; CHECK-NEXT: br label [[BLOCK:%.*]]
; CHECK: block:
; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[PHI2:%.*]] = phi i1 [ [[T42]], [[ENTRY]] ]
-; CHECK-NEXT: [[PHI3:%.*]] = phi i1 [ [[T43]], [[ENTRY]] ]
-; CHECK-NEXT: [[PHI4:%.*]] = phi i1 [ [[T44]], [[ENTRY]] ]
-; CHECK-NEXT: [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ]
; CHECK-NEXT: ret void
;
entry: