SmallVector<ShuffledInsertData> ShuffledInserts;
// Maps vector instruction to original insertelement instruction
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
+ // Maps an extracted scalar to the extractelement instruction emitted for it
+ // in a given basic block. Only one extractelement per block should be emitted.
+ DenseMap<Value *, DenseMap<BasicBlock *, Value *>> ScalarToEEs;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
Value *Lane = Builder.getInt32(ExternalUse.Lane);
auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
if (Scalar->getType() != Vec->getType()) {
- Value *Ex;
- // "Reuse" the existing extract to improve final codegen.
- if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
- Ex = Builder.CreateExtractElement(ES->getOperand(0),
- ES->getOperand(1));
- } else {
- Ex = Builder.CreateExtractElement(Vec, Lane);
+ Value *Ex = nullptr;
+ auto It = ScalarToEEs.find(Scalar);
+ if (It != ScalarToEEs.end()) {
+ // No need to emit another extract; reuse (and, if necessary, move) the
+ // single extract already emitted in the current block.
+ auto EEIt = It->second.find(Builder.GetInsertBlock());
+ if (EEIt != It->second.end()) {
+ auto *I = cast<Instruction>(EEIt->second);
+ if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
+ Builder.GetInsertPoint()->comesBefore(I))
+ I->moveBefore(&*Builder.GetInsertPoint());
+ Ex = I;
+ }
+ }
+ if (!Ex) {
+ // "Reuse" the existing extract to improve final codegen.
+ if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+ Ex = Builder.CreateExtractElement(ES->getOperand(0),
+ ES->getOperand(1));
+ } else {
+ Ex = Builder.CreateExtractElement(Vec, Lane);
+ }
+ ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), Ex);
}
// The then branch of the previous if may produce constants, since 0
// operand might be a constant.
"Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
- Builder.SetInsertPoint(VecI->getParent(),
- std::next(VecI->getIterator()));
+ if (auto *PHI = dyn_cast<PHINode>(VecI))
+ Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI());
+ else
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
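For reference, a minimal standalone C++ sketch of the caching scheme above,
using toy types and hypothetical names rather than the LLVM API: at most one
extract is kept per (scalar, basic block) pair, and a cached extract that sits
after the current insertion point is hoisted (mirroring the moveBefore call)
instead of a duplicate being emitted.

#include <cstdio>
#include <map>
#include <string>

// Toy stand-in for an extractelement instruction.
struct Extract {
  std::string Block; // basic block holding the extract
  int Pos;           // position of the extract within that block
};

// Scalar -> (basic block -> single cached extract), mirroring ScalarToEEs.
static std::map<std::string, std::map<std::string, Extract>> ScalarToEEs;

// Return the extract for Scalar in Block, creating it at InsertPos on first
// use; if a cached extract sits after the insertion point, hoist it.
Extract &getOrCreateExtract(const std::string &Scalar,
                            const std::string &Block, int InsertPos) {
  auto &PerBlock = ScalarToEEs[Scalar];
  auto It = PerBlock.find(Block);
  if (It != PerBlock.end()) {
    if (InsertPos < It->second.Pos) // insertion point comes before the extract
      It->second.Pos = InsertPos;   // "moveBefore": hoist, do not re-emit
    return It->second;
  }
  return PerBlock.emplace(Block, Extract{Block, InsertPos}).first->second;
}

int main() {
  Extract &A = getOrCreateExtract("%s", "entry", /*InsertPos=*/5);
  Extract &B = getOrCreateExtract("%s", "entry", /*InsertPos=*/2); // reused + hoisted
  std::printf("reused: %d, pos: %d\n", &A == &B, B.Pos);
}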
; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; SSE-NEXT: [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 1
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; AVX-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; AVX-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
+; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; AVX-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; AVX-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; AVX-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; AVX-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
; AVX-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
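In both the SSE and AVX runs the updated checks show the intended effect: the
scalar extractelement of lane 1 (the old [[TMP5]]) and one of the two
insertelements are gone; the lane is blended directly out of [[TMP4]] with a
shufflevector, and a final shufflevector restores the lane order before
feeding the add.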