From d5f8878a6e7db01004a3bf61f6b6efbb7d5add25 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 18 Dec 2022 06:55:37 -0500
Subject: [PATCH] [InstCombine] canonicalize insertelement order based on index

This puts lower insert indexes before higher. This is independent of
endianness, so it requires an adjustment to a fold added with 4446f71ce392,
but it makes that fold more robust. That's also where this patch was
suggested - D139668.

This matches what we already do in DAGCombiner, but there is one more
constraint because there's an existing canonicalization for
insert-of-scalar-constant. I'm not sure if that is still needed, so it may
be adjusted/removed as a follow-up.
---
 .../InstCombine/InstCombineVectorOps.cpp           | 58 +++++++++++++++-------
 .../InstCombine/broadcast-inseltpoison.ll          |  6 +--
 llvm/test/Transforms/InstCombine/broadcast.ll      |  8 +--
 .../insert-extract-shuffle-inseltpoison.ll         |  4 +-
 .../InstCombine/insert-extract-shuffle.ll          |  4 +-
 .../test/Transforms/InstCombine/insertelt-trunc.ll | 47 ++++++++++--------
 .../InstCombine/vec_shuffle-inseltpoison.ll        |  4 +-
 llvm/test/Transforms/InstCombine/vec_shuffle.ll    |  4 +-
 .../PhaseOrdering/X86/vector-reductions.ll         | 48 +++++++++---------
 9 files changed, 105 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 84d3134..9617511 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1524,41 +1524,51 @@ static Instruction *foldTruncInsEltPair(InsertElementInst &InsElt,
   Value *ScalarOp = InsElt.getOperand(1);
   Value *IndexOp  = InsElt.getOperand(2);
 
+  // Pattern depends on endianness; we expect the lower index inserted first.
+  // Big endian:
+  // inselt (inselt BaseVec, (trunc (lshr X, BW/2)), Index0), (trunc X), Index1
+  // Little endian:
   // inselt (inselt BaseVec, (trunc X), Index0), (trunc (lshr X, BW/2)), Index1
   // Note: It is not safe to do this transform with an arbitrary base vector
   //       because the bitcast of that vector to fewer/larger elements could
   //       allow poison to spill into an element that was not poison before.
-  // TODO: The insertion order could be reversed.
   // TODO: Detect smaller fractions of the scalar.
   // TODO: One-use checks are conservative.
   auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());
-  Value *X, *BaseVec;
-  uint64_t ShAmt, Index0, Index1;
+  Value *Scalar0, *BaseVec;
+  uint64_t Index0, Index1;
   if (!VTy || (VTy->getNumElements() & 1) ||
-      !match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec), m_Trunc(m_Value(X)),
-                                         m_ConstantInt(Index0)))) ||
-      !match(ScalarOp, m_OneUse(m_Trunc(m_LShr(m_Specific(X),
-                                               m_ConstantInt(ShAmt))))) ||
       !match(IndexOp, m_ConstantInt(Index1)) ||
+      !match(VecOp, m_InsertElt(m_Value(BaseVec), m_Value(Scalar0),
+                                m_ConstantInt(Index0))) ||
       !match(BaseVec, m_Undef()))
     return nullptr;
 
+  // The first insert must be to the index one less than this one, and
+  // the first insert must be to an even index.
+  if (Index0 + 1 != Index1 || Index0 & 1)
+    return nullptr;
+
+  // For big endian, the high half of the value should be inserted first.
+  // For little endian, the low half of the value should be inserted first.
+  Value *X;
+  uint64_t ShAmt;
+  if (IsBigEndian) {
+    if (!match(ScalarOp, m_Trunc(m_Value(X))) ||
+        !match(Scalar0, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt)))))
+      return nullptr;
+  } else {
+    if (!match(Scalar0, m_Trunc(m_Value(X))) ||
+        !match(ScalarOp, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt)))))
+      return nullptr;
+  }
+
   Type *SrcTy = X->getType();
   unsigned ScalarWidth = SrcTy->getScalarSizeInBits();
   unsigned VecEltWidth = VTy->getScalarSizeInBits();
   if (ScalarWidth != VecEltWidth * 2 || ShAmt != VecEltWidth)
     return nullptr;
 
-  // The low half must be inserted at element +1 for big-endian.
-  // The high half must be inserted at element +1 for little-endian
-  if (IsBigEndian ? Index0 != Index1 + 1 : Index0 + 1 != Index1)
-    return nullptr;
-
-  // The high half must be inserted at an even element for big-endian.
-  // The low half must be inserted at an even element for little-endian.
-  if (IsBigEndian ? Index1 & 1 : Index0 & 1)
-    return nullptr;
-
   // Bitcast the base vector to a vector type with the source element type.
   Type *CastTy = FixedVectorType::get(SrcTy, VTy->getNumElements() / 2);
   Value *CastBaseVec = Builder.CreateBitCast(BaseVec, CastTy);
@@ -1580,10 +1590,22 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
     return replaceInstUsesWith(IE, V);
 
   // Canonicalize type of constant indices to i64 to simplify CSE
-  if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp))
+  if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp)) {
     if (auto *NewIdx = getPreferredVectorIndex(IndexC))
       return replaceOperand(IE, 2, NewIdx);
 
+    Value *BaseVec, *OtherScalar;
+    uint64_t OtherIndexVal;
+    if (match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec),
+                                          m_Value(OtherScalar),
+                                          m_ConstantInt(OtherIndexVal)))) &&
+        !isa<Constant>(OtherScalar) && OtherIndexVal > IndexC->getZExtValue()) {
+      Value *NewIns = Builder.CreateInsertElement(BaseVec, ScalarOp, IdxOp);
+      return InsertElementInst::Create(NewIns, OtherScalar,
+                                       Builder.getInt64(OtherIndexVal));
+    }
+  }
+
   // If the scalar is bitcast and inserted into undef, do the insert in the
   // source type followed by bitcast.
   // TODO: Generalize for insert into any constant, not just undef?
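Worked example (not part of the applied diff; the function and value names
below are illustrative): the new canonicalization in visitInsertElementInst
reorders a one-use pair of inserts written with the higher constant index
first, so running "opt -passes=instcombine" on

  define <4 x float> @reorder_example(float %a, float %b, <4 x float> %q) {
    %hi = insertelement <4 x float> %q, float %b, i64 2  ; higher index first
    %r = insertelement <4 x float> %hi, float %a, i64 1  ; lower index second
    ret <4 x float> %r
  }

is expected to emit the inserts in index order (i64 1, then i64 2), matching
the insert_insert_shuffle_translate updates below. The !isa<Constant>(OtherScalar)
guard keeps this from fighting the existing insert-of-scalar-constant
canonicalization mentioned in the commit message.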
diff --git a/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll b/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll index d327c7e..ad868b4 100644 --- a/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll @@ -163,9 +163,9 @@ define <4 x float> @bad7(float %v) { ; CHECK-LABEL: @bad7( ; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> poison, float [[V:%.*]], i64 1 ; CHECK-NEXT: [[A1:%.*]] = fadd <4 x float> [[INS1]], [[INS1]] -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 2 -; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x float> [[INS2]], float [[V]], i64 3 -; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[INS3]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[V]], i64 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[TMP2]], float [[V]], i64 3 ; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[A1]], [[INS4]] ; CHECK-NEXT: ret <4 x float> [[RES]] ; diff --git a/llvm/test/Transforms/InstCombine/broadcast.ll b/llvm/test/Transforms/InstCombine/broadcast.ll index 8970c1a..7ee7060 100644 --- a/llvm/test/Transforms/InstCombine/broadcast.ll +++ b/llvm/test/Transforms/InstCombine/broadcast.ll @@ -16,7 +16,7 @@ define <4 x float> @good1(float %arg) { define <4 x float> @good2(float %arg) { ; CHECK-LABEL: @good2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[ARG:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i64 0 ; CHECK-NEXT: [[T6:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x float> [[T6]] ; @@ -163,9 +163,9 @@ define <4 x float> @bad7(float %v) { ; CHECK-LABEL: @bad7( ; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> undef, float [[V:%.*]], i64 1 ; CHECK-NEXT: [[A1:%.*]] = fadd <4 x float> [[INS1]], [[INS1]] -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 2 -; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x float> [[INS2]], float [[V]], i64 3 -; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[INS3]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[V]], i64 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x float> [[TMP2]], float [[V]], i64 3 ; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[A1]], [[INS4]] ; CHECK-NEXT: ret <4 x float> [[RES]] ; diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll index febb5e0..0b7b8fa 100644 --- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll @@ -355,8 +355,8 @@ define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x fl define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) { ; CHECK-LABEL: @insert_insert_shuffle_translate( -; CHECK-NEXT: [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[XV2]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[R:%.*]] = insertelement 
<4 x float> [[TMP1]], float [[X2:%.*]], i64 2 ; CHECK-NEXT: ret <4 x float> [[R]] ; %xv1 = insertelement <4 x float> %q, float %x1, i32 0 diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll index 70d491a..d34ca0c 100644 --- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll +++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll @@ -355,8 +355,8 @@ define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x fl define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) { ; CHECK-LABEL: @insert_insert_shuffle_translate( -; CHECK-NEXT: [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[XV2]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i64 1 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[TMP1]], float [[X2:%.*]], i64 2 ; CHECK-NEXT: ret <4 x float> [[R]] ; %xv1 = insertelement <4 x float> %q, float %x1, i32 0 diff --git a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll index cd735b8..a2721bf 100644 --- a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll +++ b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll @@ -38,8 +38,8 @@ define <8 x i16> @insert_10_poison_v8i16(i32 %x) { ; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; LE-NEXT: [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1 -; LE-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0 +; LE-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[HI16]], i64 0 +; LE-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[LO16]], i64 1 ; LE-NEXT: ret <8 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -76,8 +76,8 @@ define <4 x i16> @insert_21_poison_v4i16(i32 %x) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 2 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 1 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 2 ; ALL-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -120,8 +120,8 @@ define <4 x i16> @insert_32_poison_v4i16(i32 %x) { ; LE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; LE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; LE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; LE-NEXT: [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3 -; LE-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 +; LE-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 2 +; LE-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 3 ; LE-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -165,8 +165,8 @@ define <8 x i16> @insert_10_v8i16(i32 %x, <8 x i16> %v) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1 -; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x 
i16> [[INS0]], i16 [[HI16]], i64 0 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[HI16]], i64 0 +; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[LO16]], i64 1 ; ALL-NEXT: ret <8 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -203,8 +203,8 @@ define <4 x i16> @insert_21_v4i16(i32 %x, <4 x i16> %v) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 2 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[HI16]], i64 1 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 2 ; ALL-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -241,8 +241,8 @@ define <4 x i16> @insert_32_v4i16(i32 %x, <4 x i16> %v) { ; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[HI16]], i64 2 +; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LO16]], i64 3 ; ALL-NEXT: ret <4 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -321,8 +321,8 @@ define <8 x i16> @insert_76_v4i16_uses2(i32 %x, <8 x i16> %v) { ; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 ; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 ; ALL-NEXT: call void @use(i16 [[LO16]]) -; ALL-NEXT: [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 7 -; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 6 +; ALL-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[HI16]], i64 6 +; ALL-NEXT: [[INS1:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[LO16]], i64 7 ; ALL-NEXT: ret <8 x i16> [[INS1]] ; %hi32 = lshr i32 %x, 16 @@ -358,13 +358,18 @@ define <8 x i16> @insert_67_v4i16_uses3(i32 %x, <8 x i16> %v) { ; TODO: This is equivalent to the 1st test. 
define <4 x i16> @insert_01_poison_v4i16_high_first(i32 %x) { -; ALL-LABEL: @insert_01_poison_v4i16_high_first( -; ALL-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 -; ALL-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 -; ALL-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 -; ALL-NEXT: [[INS1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 1 -; ALL-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[INS1]], i16 [[LO16]], i64 0 -; ALL-NEXT: ret <4 x i16> [[INS0]] +; BE-LABEL: @insert_01_poison_v4i16_high_first( +; BE-NEXT: [[HI32:%.*]] = lshr i32 [[X:%.*]], 16 +; BE-NEXT: [[HI16:%.*]] = trunc i32 [[HI32]] to i16 +; BE-NEXT: [[LO16:%.*]] = trunc i32 [[X]] to i16 +; BE-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0 +; BE-NEXT: [[INS0:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[HI16]], i64 1 +; BE-NEXT: ret <4 x i16> [[INS0]] +; +; LE-LABEL: @insert_01_poison_v4i16_high_first( +; LE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0 +; LE-NEXT: [[INS0:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16> +; LE-NEXT: ret <4 x i16> [[INS0]] ; %hi32 = lshr i32 %x, 16 %hi16 = trunc i32 %hi32 to i16 diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll index c2a31b5..3256d40 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll @@ -229,8 +229,8 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y define <2 x i8> @test13a(i8 %x1, i8 %x2) { ; CHECK-LABEL: @test13a( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X1:%.*]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[TMP3]] ; diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index 46d72bf..fdbd9c14 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -229,8 +229,8 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y define <2 x i8> @test13a(i8 %x1, i8 %x2) { ; CHECK-LABEL: @test13a( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[X1:%.*]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[TMP3]] ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index 2f5640e..6f7e6d8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -61,12 +61,12 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x define i32 @TestVectorsEqual(ptr noalias %Vec0, ptr noalias %Vec1, i32 %Tolerance) { ; CHECK-LABEL: @TestVectorsEqual( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr 
[[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp sle i32 [[TMP6]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp sle i32 [[TMP4]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND6:%.*]] = zext i1 [[CMP5_NOT]] to i32 ; CHECK-NEXT: ret i32 [[COND6]] ; @@ -119,11 +119,11 @@ for.end: define i32 @TestVectorsEqual_alt(ptr noalias %Vec0, ptr noalias %Vec1, i32 %Tolerance) { ; CHECK-LABEL: @TestVectorsEqual_alt( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP3]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3_NOT]] to i32 ; CHECK-NEXT: ret i32 [[COND]] ; @@ -164,12 +164,12 @@ for.end: define i32 @TestVectorsEqualFP(ptr noalias %Vec0, ptr noalias %Vec1, float %Tolerance) { ; CHECK-LABEL: @TestVectorsEqualFP( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ole float [[TMP6]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <4 x float> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ole float [[TMP4]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND5:%.*]] = zext i1 [[CMP4]] to i32 ; CHECK-NEXT: ret i32 [[COND5]] ; @@ -222,11 +222,11 @@ for.end: define i32 @TestVectorsEqualFP_alt(ptr noalias %Vec0, ptr noalias %Vec1, float %Tolerance) { ; CHECK-LABEL: @TestVectorsEqualFP_alt( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr 
[[VEC1:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]]) -; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VEC0:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[VEC1:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <4 x float> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]]) +; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP3]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 ; CHECK-NEXT: ret i32 [[COND]] ; @@ -272,8 +272,8 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[FNEG]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -- 2.7.4
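For reference, a minimal sketch of the trunc+lshr pair that foldTruncInsEltPair
matches once the inserts are in low-index-first order (function name is
illustrative, not from the patch):

  define <4 x i16> @pair_example(i32 %x) {
    %hi32 = lshr i32 %x, 16
    %hi16 = trunc i32 %hi32 to i16
    %lo16 = trunc i32 %x to i16
    %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0 ; low half, even index
    %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1  ; high half, index + 1
    ret <4 x i16> %ins1
  }

On a little-endian target this should collapse to a single wider insert plus a
bitcast, as the LE CHECK lines of insert_01_poison_v4i16_high_first show
(value names here are illustrative):

  %t = insertelement <2 x i32> poison, i32 %x, i64 0
  %r = bitcast <2 x i32> %t to <4 x i16>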