From 595a74391daaff8daffa2f6e19274792d0074565 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 14 Jun 2023 11:06:15 +0100 Subject: [PATCH] [CostModel][X86] Tweak SSE2 v2i64 multiply costs based off D46276 script It looks like we were trying to account for SLM costs, which are actually handled separately Fixes #62969 --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 2 +- llvm/test/Analysis/CostModel/X86/arith-fix.ll | 12 +- .../Analysis/CostModel/X86/arith-int-codesize.ll | 6 +- .../CostModel/X86/arith-int-sizelatency.ll | 6 +- llvm/test/Analysis/CostModel/X86/arith-int.ll | 6 +- llvm/test/Analysis/CostModel/X86/arith-overflow.ll | 12 +- .../Analysis/CostModel/X86/intrinsic-cost-kinds.ll | 6 +- llvm/test/Analysis/CostModel/X86/mul64.ll | 144 ++++++++++----------- llvm/test/Analysis/CostModel/X86/reduce-mul.ll | 16 +-- llvm/test/Analysis/CostModel/X86/rem-codesize.ll | 12 +- .../test/Analysis/CostModel/X86/rem-sizelatency.ll | 12 +- llvm/test/Analysis/CostModel/X86/rem.ll | 12 +- llvm/test/Transforms/SLPVectorizer/X86/mul64.ll | 19 +-- .../SLPVectorizer/X86/multi-nodes-to-shuffle.ll | 58 +++------ 14 files changed, 141 insertions(+), 182 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index ecba9b2..7a5baa8 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1335,7 +1335,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle - { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add + { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } }, diff --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll index e976fb63..3595f30 100644 --- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll @@ -38,9 +38,9 @@ define i32 @smul(i32 %arg) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3) @@ -272,9 +272,9 @@ define i32 @umul(i32 %arg) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3) diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll index f469967..64dce90 100644 --- a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll @@ -776,9 +776,9 @@ define i32 @and(i32 %arg) { define i32 @mul(i32 %arg) { ; SSE2-LABEL: 'mul' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = mul <2 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = mul <4 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = mul <8 x i64> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = mul <4 x i32> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = mul <8 x i32> undef, undef diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll index 580145e..c5ae4d8 100644 --- a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll @@ -620,9 +620,9 @@ define i32 @and(i32 %arg) { define i32 @mul(i32 %arg) { ; SSE2-LABEL: 'mul' ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = mul <2 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = mul <4 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = mul <8 x i64> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = mul <4 x i32> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = mul <8 x i32> undef, undef diff --git a/llvm/test/Analysis/CostModel/X86/arith-int.ll b/llvm/test/Analysis/CostModel/X86/arith-int.ll index cd70079..45bb058 100644 --- a/llvm/test/Analysis/CostModel/X86/arith-int.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-int.ll @@ -890,9 +890,9 @@ define i32 @and(i32 %arg) { define i32 @mul(i32 %arg) { ; SSSE3-LABEL: 'mul' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = mul <2 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = mul <4 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = mul <8 x i64> undef, undef ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = mul <4 x i32> undef, undef ; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = mul <8 x i32> undef, undef diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll index 56efbb2..ba74526 100644 --- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll @@ -994,9 +994,9 @@ define i32 @smul(i32 %arg) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) @@ -1232,9 +1232,9 @@ define i32 @umul(i32 %arg) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll index c87ac68..f5936a7 100644 --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -51,7 +51,7 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1) define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) { ; THRU-LABEL: 'umul' ; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b) -; THRU-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb) +; THRU-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'umul' @@ -61,12 +61,12 @@ define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) { ; ; SIZE-LABEL: 'umul' ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b) -; SIZE-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb) +; SIZE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'umul' ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b) diff --git a/llvm/test/Analysis/CostModel/X86/mul64.ll b/llvm/test/Analysis/CostModel/X86/mul64.ll index 6626048..7189720 100644 --- a/llvm/test/Analysis/CostModel/X86/mul64.ll +++ b/llvm/test/Analysis/CostModel/X86/mul64.ll @@ -31,12 +31,12 @@ define void @mul_sext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i8> %b ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = sext <32 x i8> %b32 to <32 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = sext <64 x i8> %b64 to <64 x i64> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'mul_sext_vXi8' @@ -52,12 +52,12 @@ define void @mul_sext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i8> %b ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = sext <32 x i8> %b32 to <32 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = sext <64 x i8> %b64 to <64 x i64> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'mul_sext_vXi8' @@ -453,12 +453,12 @@ define void @mul_sext_zext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = zext <32 x i8> %b32 to <32 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = zext <64 x i8> %b64 to <64 x i64> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'mul_sext_zext_vXi8' @@ -474,12 +474,12 @@ define void @mul_sext_zext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = zext <32 x i8> %b32 to <32 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = zext <64 x i8> %b64 to <64 x i64> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'mul_sext_zext_vXi8' @@ -689,12 +689,12 @@ define void @mul_sext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 x i1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %xb32 = sext <32 x i16> %b32 to <32 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xb64 = sext <64 x i16> %b64 to <64 x i64> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'mul_sext_vXi16' @@ -710,12 +710,12 @@ define void @mul_sext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 x i1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %xb32 = sext <32 x i16> %b32 to <32 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xb64 = sext <64 x i16> %b64 to <64 x i64> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'mul_sext_vXi16' @@ -1111,12 +1111,12 @@ define void @mul_sext_zext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = zext <32 x i16> %b32 to <32 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i64> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'mul_sext_zext_vXi16' @@ -1132,12 +1132,12 @@ define void @mul_sext_zext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = zext <32 x i16> %b32 to <32 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i64> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'mul_sext_zext_vXi16' @@ -1347,12 +1347,12 @@ define void @mul_sext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 x i3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = sext <32 x i32> %b32 to <32 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = sext <64 x i32> %b64 to <64 x i64> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'mul_sext_vXi32' @@ -1368,12 +1368,12 @@ define void @mul_sext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 x i3 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = sext <32 x i32> %b32 to <32 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = sext <64 x i32> %b64 to <64 x i64> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'mul_sext_vXi32' @@ -1769,12 +1769,12 @@ define void @mul_sext_zext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %xb32 = zext <32 x i32> %b32 to <32 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb64 = zext <64 x i32> %b64 to <64 x i64> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'mul_sext_zext_vXi32' @@ -1790,12 +1790,12 @@ define void @mul_sext_zext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %xb32 = zext <32 x i32> %b32 to <32 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb64 = zext <64 x i32> %b64 to <64 x i64> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'mul_sext_zext_vXi32' diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll index 19ebace..93d3246 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll @@ -11,18 +11,18 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' diff --git a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll index 108db72..9ae78fb 100644 --- a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll @@ -272,9 +272,9 @@ define i32 @urem_uniformconst() { define i32 @srem_constpow2() { ; SSE2-LABEL: 'srem_constpow2' ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8i32 = srem <8 x i32> undef, @@ -510,9 +510,9 @@ define i32 @urem_constpow2() { define i32 @srem_uniformconstpow2() { ; SSE2-LABEL: 'srem_uniformconstpow2' ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, diff --git a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll index ea52ce2..f737b43 100644 --- a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll @@ -272,9 +272,9 @@ define i32 @urem_uniformconst() { define i32 @srem_constpow2() { ; SSE2-LABEL: 'srem_constpow2' ; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V8i32 = srem <8 x i32> undef, @@ -510,9 +510,9 @@ define i32 @urem_constpow2() { define i32 @srem_uniformconstpow2() { ; SSE2-LABEL: 'srem_uniformconstpow2' ; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll index a5f2b85..8bbf775 100644 --- a/llvm/test/Analysis/CostModel/X86/rem.ll +++ b/llvm/test/Analysis/CostModel/X86/rem.ll @@ -709,9 +709,9 @@ define i32 @urem_uniformconst() { define i32 @srem_constpow2() { ; SSE2-LABEL: 'srem_constpow2' ; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, @@ -966,9 +966,9 @@ define i32 @urem_constpow2() { define i32 @srem_uniformconstpow2() { ; SSE2-LABEL: 'srem_uniformconstpow2' ; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, diff --git a/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll b/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll index d764d5c0..4b5b5da 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll @@ -1,27 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64 | FileCheck %s --check-prefix=SCALAR +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64 | FileCheck %s ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v2 | FileCheck %s ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 | FileCheck %s ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 | FileCheck %s define void @PR62969(ptr dereferenceable(16) %out, ptr dereferenceable(16) %in) { -; SCALAR-LABEL: @PR62969( -; SCALAR-NEXT: [[IN0:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN:%.*]], i64 0, i64 0 -; SCALAR-NEXT: [[IN1:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN]], i64 0, i64 1 -; SCALAR-NEXT: [[X:%.*]] = load i64, ptr [[IN0]], align 8 -; SCALAR-NEXT: [[Y:%.*]] = load i64, ptr [[IN1]], align 8 -; SCALAR-NEXT: [[XL:%.*]] = and i64 [[X]], 4294967295 -; SCALAR-NEXT: [[YL:%.*]] = and i64 [[Y]], 4294967295 -; SCALAR-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32 -; SCALAR-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32 -; SCALAR-NEXT: [[M0:%.*]] = mul i64 [[XL]], [[XH]] -; SCALAR-NEXT: [[M1:%.*]] = mul i64 [[YL]], [[YH]] -; SCALAR-NEXT: [[OUT0:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT:%.*]], i64 0, i64 0 -; SCALAR-NEXT: [[OUT1:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT]], i64 0, i64 1 -; SCALAR-NEXT: store i64 [[M0]], ptr [[OUT0]], align 8 -; SCALAR-NEXT: store i64 [[M1]], ptr [[OUT1]], align 8 -; SCALAR-NEXT: ret void -; ; CHECK-LABEL: @PR62969( ; CHECK-NEXT: [[IN0:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[OUT0:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT:%.*]], i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll index 2ec4c04..cf3d40d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -4,49 +4,25 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[P0:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[A2:%.*]] = add i64 [[P2:%.*]], [[P2]] -; CHECK-NEXT: [[A3:%.*]] = add i64 [[P3:%.*]], [[P3]] -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[M2:%.*]] = mul i64 [[P2]], [[P2]] -; CHECK-NEXT: [[M3:%.*]] = mul i64 [[P3]], [[P3]] -; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]] -; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[S0:%.*]] = sub i64 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sub i64 [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[S2:%.*]] = sub i64 [[M2]], [[D2]] -; CHECK-NEXT: [[S3:%.*]] = sub i64 [[M3]], [[D3]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[SHL1:%.*]] = shl i64 [[TMP9]], [[S0]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[SHL2:%.*]] = shl i64 [[TMP10]], [[S1]] -; CHECK-NEXT: [[SHL3:%.*]] = shl i64 [[A2]], [[S2]] -; CHECK-NEXT: [[SHL4:%.*]] = shl i64 [[A3]], [[S3]] -; CHECK-NEXT: [[O0:%.*]] = or i64 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TT0:%.*]] = trunc i64 [[O0]] to i32 -; CHECK-NEXT: [[O1:%.*]] = or i64 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TT1:%.*]] = trunc i64 [[O1]] to i32 -; CHECK-NEXT: [[O2:%.*]] = or i64 [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TT2:%.*]] = trunc i64 [[O2]] to i32 -; CHECK-NEXT: [[O3:%.*]] = or i64 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TT3:%.*]] = trunc i64 [[O3]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[PHI0:%.*]] = phi i32 [ [[T1:%.*]], [[BB]] ], [ [[TT0]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[T2:%.*]], [[BB]] ], [ [[TT1]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[T3:%.*]], [[BB]] ], [ [[TT2]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ [[T4:%.*]], [[BB]] ], [ [[TT3]], [[ENTRY]] ] -; CHECK-NEXT: [[T1]] = trunc i64 [[SHL1]] to i32 -; CHECK-NEXT: [[T2]] = trunc i64 [[SHL2]] to i32 -; CHECK-NEXT: [[T3]] = trunc i64 [[SHL3]] to i32 -; CHECK-NEXT: [[T4]] = trunc i64 [[SHL4]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; entry: -- 2.7.4