From d5f8878a6e7db01004a3bf61f6b6efbb7d5add25 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 18 Dec 2022 06:55:37 -0500
Subject: [PATCH] [InstCombine] canonicalize insertelement order based on index

This puts lower insert indexes before higher. This is independent of
endianness, so it requires an adjustment to a fold added with 4446f71ce392,
but it makes that fold more robust. That's also where this patch was
suggested - D139668.

This matches what we already do in DAGCombiner, but there is one more
constraint because there's an existing canonicalization for
insert-of-scalar-constant. I'm not sure if that is still needed, so it may
be adjusted/removed as a follow-up.
---
 .../InstCombine/InstCombineVectorOps.cpp           | 58 +++++++++++++++-------
 .../InstCombine/broadcast-inseltpoison.ll          |  6 +--
 llvm/test/Transforms/InstCombine/broadcast.ll      |  8 +--
 .../insert-extract-shuffle-inseltpoison.ll         |  4 +-
 .../InstCombine/insert-extract-shuffle.ll          |  4 +-
 .../test/Transforms/InstCombine/insertelt-trunc.ll | 47 ++++++++++--------
 .../InstCombine/vec_shuffle-inseltpoison.ll        |  4 +-
 llvm/test/Transforms/InstCombine/vec_shuffle.ll    |  4 +-
 .../PhaseOrdering/X86/vector-reductions.ll         | 48 +++++++++---------
 9 files changed, 105 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 84d3134..9617511 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1524,41 +1524,51 @@ static Instruction *foldTruncInsEltPair(InsertElementInst &InsElt,
   Value *ScalarOp = InsElt.getOperand(1);
   Value *IndexOp  = InsElt.getOperand(2);
 
+  // Pattern depends on endianness; we expect the lower index inserted first.
+  // Big endian:
+  // inselt (inselt BaseVec, (trunc (lshr X, BW/2)), Index0), (trunc X), Index1
+  // Little endian:
   // inselt (inselt BaseVec, (trunc X), Index0), (trunc (lshr X, BW/2)), Index1
   // Note: It is not safe to do this transform with an arbitrary base vector
   //       because the bitcast of that vector to fewer/larger elements could
   //       allow poison to spill into an element that was not poison before.
-  // TODO: The insertion order could be reversed.
   // TODO: Detect smaller fractions of the scalar.
   // TODO: One-use checks are conservative.
   auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());
-  Value *X, *BaseVec;
-  uint64_t ShAmt, Index0, Index1;
+  Value *Scalar0, *BaseVec;
+  uint64_t Index0, Index1;
   if (!VTy || (VTy->getNumElements() & 1) ||
-      !match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec), m_Trunc(m_Value(X)),
-                                         m_ConstantInt(Index0)))) ||
-      !match(ScalarOp, m_OneUse(m_Trunc(m_LShr(m_Specific(X),
-                                               m_ConstantInt(ShAmt))))) ||
       !match(IndexOp, m_ConstantInt(Index1)) ||
+      !match(VecOp, m_InsertElt(m_Value(BaseVec), m_Value(Scalar0),
+                                m_ConstantInt(Index0))) ||
       !match(BaseVec, m_Undef()))
     return nullptr;
 
+  // The first insert must be to the index one less than this one, and
+  // the first insert must be to an even index.
+  if (Index0 + 1 != Index1 || Index0 & 1)
+    return nullptr;
+
+  // For big endian, the high half of the value should be inserted first.
+  // For little endian, the low half of the value should be inserted first.
+  Value *X;
+  uint64_t ShAmt;
+  if (IsBigEndian) {
+    if (!match(ScalarOp, m_Trunc(m_Value(X))) ||
+        !match(Scalar0, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt)))))
+      return nullptr;
+  } else {
+    if (!match(Scalar0, m_Trunc(m_Value(X))) ||
+        !match(ScalarOp, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt)))))
+      return nullptr;
+  }
+
   Type *SrcTy = X->getType();
   unsigned ScalarWidth = SrcTy->getScalarSizeInBits();
   unsigned VecEltWidth = VTy->getScalarSizeInBits();
   if (ScalarWidth != VecEltWidth * 2 || ShAmt != VecEltWidth)
     return nullptr;
 
-  // The low half must be inserted at element +1 for big-endian.
-  // The high half must be inserted at element +1 for little-endian
-  if (IsBigEndian ? Index0 != Index1 + 1 : Index0 + 1 != Index1)
-    return nullptr;
-
-  // The high half must be inserted at an even element for big-endian.
-  // The low half must be inserted at an even element for little-endian.
-  if (IsBigEndian ? Index1 & 1 : Index0 & 1)
-    return nullptr;
-
   // Bitcast the base vector to a vector type with the source element type.
   Type *CastTy = FixedVectorType::get(SrcTy, VTy->getNumElements() / 2);
   Value *CastBaseVec = Builder.CreateBitCast(BaseVec, CastTy);
@@ -1580,10 +1590,22 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
     return replaceInstUsesWith(IE, V);
 
   // Canonicalize type of constant indices to i64 to simplify CSE
-  if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp))
+  if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp)) {
     if (auto *NewIdx = getPreferredVectorIndex(IndexC))
       return replaceOperand(IE, 2, NewIdx);
 
+    Value *BaseVec, *OtherScalar;
+    uint64_t OtherIndexVal;
+    if (match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec),
+                                          m_Value(OtherScalar),
+                                          m_ConstantInt(OtherIndexVal)))) &&
+        !isa<Constant>(OtherScalar) && OtherIndexVal > IndexC->getZExtValue()) {
+      Value *NewIns = Builder.CreateInsertElement(BaseVec, ScalarOp, IdxOp);
+      return InsertElementInst::Create(NewIns, OtherScalar,
+                                       Builder.getInt64(OtherIndexVal));
+    }
+  }
+
   // If the scalar is bitcast and inserted into undef, do the insert in the
   // source type followed by bitcast.
   // TODO: Generalize for insert into any constant, not just undef?
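Worked example (not part of the applied diff; the function and value names
below are illustrative): the new canonicalization in visitInsertElementInst
reorders a one-use pair of inserts written with the higher constant index
first, so running "opt -passes=instcombine" on

  define <4 x float> @reorder_example(float %a, float %b, <4 x float> %q) {
    %hi = insertelement <4 x float> %q, float %b, i64 2  ; higher index first
    %r = insertelement <4 x float> %hi, float %a, i64 1  ; lower index second
    ret <4 x float> %r
  }

is expected to emit the inserts in index order (i64 1, then i64 2), matching
the insert_insert_shuffle_translate updates below. The !isa<Constant>(OtherScalar)
guard keeps this from fighting the existing insert-of-scalar-constant
canonicalization mentioned in the commit message.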
diff --git a/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll b/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll index d327c7e..ad868b4 100644 --- a/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll @@ -163,9 +163,9 @@ define <4 x float> @bad7(float %v) { ; CHECK-LABEL: @bad7( ; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> poison, float [[V:%.*]], i64 1 ; CHECK-NEXT: [[A1:%.*]] = fadd <4 x float> [[INS1]], [[INS1]] -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 2 -; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x float> [[INS2]], float [[V]], i64 3 -; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[INS3]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[V]], i64 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[TMP2]], float [[V]], i64 3 ; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[A1]], [[INS4]] ; CHECK-NEXT: ret <4 x float> [[RES]] ; diff --git a/llvm/test/Transforms/InstCombine/broadcast.ll b/llvm/test/Transforms/InstCombine/broadcast.ll index 8970c1a..7ee7060 100644 --- a/llvm/test/Transforms/InstCombine/broadcast.ll +++ b/llvm/test/Transforms/InstCombine/broadcast.ll @@ -16,7 +16,7 @@ define <4 x float> @good1(float %arg) { define <4 x float> @good2(float %arg) { ; CHECK-LABEL: @good2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[ARG:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i64 0 ; CHECK-NEXT: [[T6:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x float> [[T6]] ; @@ -163,9 +163,9 @@ define <4 x float> @bad7(float %v) { ; CHECK-LABEL: @bad7( ; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> undef, float [[V:%.*]], i64 1 ; CHECK-NEXT: [[A1:%.*]] = fadd <4 x float> [[INS1]], [[INS1]] -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 2 -; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x float> [[INS2]], float [[V]], i64 3 -; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[INS3]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[V]], i64 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[TMP2]], float [[V]], i64 3 ; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[A1]], [[INS4]] ; CHECK-NEXT: ret <4 x float> [[RES]] ; diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll index febb5e0..0b7b8fa 100644 --- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll @@ -355,8 +355,8 @@ define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x fl define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) { ; CHECK-LABEL: @insert_insert_shuffle_translate( -; CHECK-NEXT: [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[XV2]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[R:%.*]] = insertelement 
<4 x float> [[TMP1]], float [[X2:%.*]], i64 2 ; CHECK-NEXT: ret <4 x float> [[R]] ; %xv1 = insertelement <4 x float> %q, float %x1, i32 0 diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll index 70d491a..d34ca0c 100644 --- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll +++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll @@ -355,8 +355,8 @@ define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x fl define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) { ; CHECK-LABEL: @insert_insert_shuffle_translate( -; CHECK-NEXT: [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[XV2]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[TMP1]], float [[X2:%.*]], i64 2 ; CHECK-NEXT: ret <4 x float> [[R]] ; %xv1 = insertelement <4 x float> %q, float %x1, i32 0 diff --git a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll index cd735b8..a2721bf 100644 --- a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll +++ b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll @@ -38,8 +38,8 @@ define <8 x i16> @insert_10_poison_v8i16(i32 %x) { ; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; LE-NEXT: [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1 -; LE-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0 +; LE-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[HI16]], i64 0 +; LE-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[LO16]], i64 1 ; LE-NEXT: ret <8 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -76,8 +76,8 @@ define <4 x i16> @insert_21_poison_v4i16(i32 %x) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 2 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 1 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 2 ; ALL-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -120,8 +120,8 @@ define <4 x i16> @insert_32_poison_v4i16(i32 %x) { ; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; LE-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3 -; LE-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 +; LE-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 2 +; LE-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 3 ; LE-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -165,8 +165,8 @@ define <8 x i16> @insert_10_v8i16(i32 %x, <8 x i16> %v) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1 -; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x 
i16> [[INS0]], i16 [[HI16]], i64 0 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[HI16]], i64 0 +; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[LO16]], i64 1 ; ALL-NEXT: ret <8 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -203,8 +203,8 @@ define <4 x i16> @insert_21_v4i16(i32 %x, <4 x i16> %v) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 2 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[HI16]], i64 1 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 2 ; ALL-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -241,8 +241,8 @@ define <4 x i16> @insert_32_v4i16(i32 %x, <4 x i16> %v) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[HI16]], i64 2 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 3 ; ALL-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -321,8 +321,8 @@ define <8 x i16> @insert_76_v4i16_uses2(i32 %x, <8 x i16> %v) { ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 ; ALL-NEXT: call void @use(i16 [[LO16]]) -; ALL-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 7 -; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 6 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[HI16]], i64 6 +; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[LO16]], i64 7 ; ALL-NEXT: ret <8 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -358,13 +358,18 @@ define <8 x i16> @insert_67_v4i16_uses3(i32 %x, <8 x i16> %v) { ; TODO: This is equivalent to the 1st test. 
define <4 x i16> @insert_01_poison_v4i16_high_first(i32 %x) { -; ALL-LABEL: @insert_01_poison_v4i16_high_first( -; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 -; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 -; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 1 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[INS1]], i16 [[LO16]], i64 0 -; ALL-NEXT: ret <4 x i16> [[INS0]] +; BE-LABEL: @insert_01_poison_v4i16_high_first( +; BE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; BE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; BE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; BE-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0 +; BE-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[HI16]], i64 1 +; BE-NEXT: ret <4 x i16> [[INS0]] +; +; LE-LABEL: @insert_01_poison_v4i16_high_first( +; LE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0 +; LE-NEXT: [[INS0:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16> +; LE-NEXT: ret <4 x i16> [[INS0]] ; %hi32 = lshr i32 %x, 16 %hi16 = trunc i32 %hi32 to i16 diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll index c2a31b5..3256d40 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll @@ -229,8 +229,8 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y define <2 x i8> @test13a(i8 %x1, i8 %x2) { ; CHECK-LABEL: @test13a( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X1:%.*]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[TMP3]] ; diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index 46d72bf..fdbd9c14 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -229,8 +229,8 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y define <2 x i8> @test13a(i8 %x1, i8 %x2) { ; CHECK-LABEL: @test13a( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X1:%.*]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[TMP3]] ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index 2f5640e..6f7e6d8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -61,12 +61,12 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x define i32 @TestVectorsEqual(ptr noalias %Vec0, ptr noalias %Vec1, i32 %Tolerance) { ; CHECK-LABEL: @TestVectorsEqual( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr 
[[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp sle i32 [[TMP6]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp sle i32 [[TMP4]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND6:%.*]] = zext i1 [[CMP5_NOT]] to i32 ; CHECK-NEXT: ret i32 [[COND6]] ; @@ -119,11 +119,11 @@ for.end: define i32 @TestVectorsEqual_alt(ptr noalias %Vec0, ptr noalias %Vec1, i32 %Tolerance) { ; CHECK-LABEL: @TestVectorsEqual_alt( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP3]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3_NOT]] to i32 ; CHECK-NEXT: ret i32 [[COND]] ; @@ -164,12 +164,12 @@ for.end: define i32 @TestVectorsEqualFP(ptr noalias %Vec0, ptr noalias %Vec1, float %Tolerance) { ; CHECK-LABEL: @TestVectorsEqualFP( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ole float [[TMP6]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <4 x float> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ole float [[TMP4]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND5:%.*]] = zext i1 [[CMP4]] to i32 ; CHECK-NEXT: ret i32 [[COND5]] ; @@ -222,11 +222,11 @@ for.end: define i32 @TestVectorsEqualFP_alt(ptr noalias %Vec0, ptr noalias %Vec1, float %Tolerance) { ; CHECK-LABEL: @TestVectorsEqualFP_alt( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr 
[[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]]) -; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <4 x float> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]]) +; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP3]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 ; CHECK-NEXT: ret i32 [[COND]] ; @@ -272,8 +272,8 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[FNEG]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -- 2.7.4
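For reference, a minimal sketch of the trunc+lshr pair that foldTruncInsEltPair
matches once the inserts are in low-index-first order (function name is
illustrative, not from the patch):

  define <4 x i16> @pair_example(i32 %x) {
    %hi32 = lshr i32 %x, 16
    %hi16 = trunc i32 %hi32 to i16
    %lo16 = trunc i32 %x to i16
    %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0 ; low half, even index
    %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1  ; high half, index + 1
    ret <4 x i16> %ins1
  }

On a little-endian target this should collapse to a single wider insert plus a
bitcast, as the LE CHECK lines of insert_01_poison_v4i16_high_first show
(value names here are illustrative):

  %t = insertelement <2 x i32> poison, i32 %x, i64 0
  %r = bitcast <2 x i32> %t to <4 x i16>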