From 595a74391daaff8daffa2f6e19274792d0074565 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 14 Jun 2023 11:06:15 +0100
Subject: [PATCH] [CostModel][X86] Tweak SSE2 v2i64 multiply costs based off
 D46276 script

It looks like we were trying to account for SLM costs, which are actually handled separately

Fixes #62969
---
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp     |   2 +-
 llvm/test/Analysis/CostModel/X86/arith-fix.ll      |  12 +-
 .../Analysis/CostModel/X86/arith-int-codesize.ll   |   6 +-
 .../CostModel/X86/arith-int-sizelatency.ll         |   6 +-
 llvm/test/Analysis/CostModel/X86/arith-int.ll      |   6 +-
 llvm/test/Analysis/CostModel/X86/arith-overflow.ll |  12 +-
 .../Analysis/CostModel/X86/intrinsic-cost-kinds.ll |   6 +-
 llvm/test/Analysis/CostModel/X86/mul64.ll          | 144 ++++++++++-----------
 llvm/test/Analysis/CostModel/X86/reduce-mul.ll     |  16 +--
 llvm/test/Analysis/CostModel/X86/rem-codesize.ll   |  12 +-
 .../test/Analysis/CostModel/X86/rem-sizelatency.ll |  12 +-
 llvm/test/Analysis/CostModel/X86/rem.ll            |  12 +-
 llvm/test/Transforms/SLPVectorizer/X86/mul64.ll    |  19 +--
 .../SLPVectorizer/X86/multi-nodes-to-shuffle.ll    |  58 +++------
 14 files changed, 141 insertions(+), 182 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ecba9b2..7a5baa8 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1335,7 +1335,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::MUL,  MVT::v16i8,  {  5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
     { ISD::MUL,  MVT::v8i16,  {  1,  5, 1, 1 } }, // pmullw
     { ISD::MUL,  MVT::v4i32,  {  6,  8, 7, 7 } }, // 3*pmuludq/4*shuffle
-    { ISD::MUL,  MVT::v2i64,  {  8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
+    { ISD::MUL,  MVT::v2i64,  {  7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
 
     { X86ISD::PMULUDQ, MVT::v2i64, { 1,  5, 1, 1 } },
 
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
index e976fb63..3595f30 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
@@ -38,9 +38,9 @@ define i32 @smul(i32 %arg) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
@@ -272,9 +272,9 @@ define i32 @umul(i32 %arg) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
index f469967..64dce90 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
@@ -776,9 +776,9 @@ define i32 @and(i32 %arg) {
 define i32 @mul(i32 %arg) {
 ; SSE2-LABEL: 'mul'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = mul <2 x i64> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = mul <4 x i64> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = mul <8 x i64> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = mul <4 x i32> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = mul <8 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
index 580145e..c5ae4d8 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
@@ -620,9 +620,9 @@ define i32 @and(i32 %arg) {
 define i32 @mul(i32 %arg) {
 ; SSE2-LABEL: 'mul'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = mul <2 x i64> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = mul <4 x i64> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = mul <8 x i64> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = mul <4 x i32> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = mul <8 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int.ll b/llvm/test/Analysis/CostModel/X86/arith-int.ll
index cd70079..45bb058 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int.ll
@@ -890,9 +890,9 @@ define i32 @and(i32 %arg) {
 define i32 @mul(i32 %arg) {
 ; SSSE3-LABEL: 'mul'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = mul <2 x i64> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = mul <4 x i64> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = mul <8 x i64> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = mul <4 x i32> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = mul <8 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index 56efbb2..ba74526 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -994,9 +994,9 @@ define i32 @smul(i32 %arg) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -1232,9 +1232,9 @@ define i32 @umul(i32 %arg) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index c87ac68..f5936a7 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -51,7 +51,7 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
 define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; THRU-LABEL: 'umul'
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; THRU-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'umul'
@@ -61,12 +61,12 @@ define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ;
 ; SIZE-LABEL: 'umul'
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'umul'
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/Analysis/CostModel/X86/mul64.ll b/llvm/test/Analysis/CostModel/X86/mul64.ll
index 6626048..7189720 100644
--- a/llvm/test/Analysis/CostModel/X86/mul64.ll
+++ b/llvm/test/Analysis/CostModel/X86/mul64.ll
@@ -31,12 +31,12 @@ define void @mul_sext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i8> %b
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb32 = sext <32 x i8> %b32 to <32 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xb64 = sext <64 x i8> %b64 to <64 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'mul_sext_vXi8'
@@ -52,12 +52,12 @@ define void @mul_sext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i8> %b
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb32 = sext <32 x i8> %b32 to <32 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xb64 = sext <64 x i8> %b64 to <64 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'mul_sext_vXi8'
@@ -453,12 +453,12 @@ define void @mul_sext_zext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb32 = zext <32 x i8> %b32 to <32 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xb64 = zext <64 x i8> %b64 to <64 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'mul_sext_zext_vXi8'
@@ -474,12 +474,12 @@ define void @mul_sext_zext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb32 = zext <32 x i8> %b32 to <32 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %xb64 = zext <64 x i8> %b64 to <64 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'mul_sext_zext_vXi8'
@@ -689,12 +689,12 @@ define void @mul_sext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 x i1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %xb32 = sext <32 x i16> %b32 to <32 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %xb64 = sext <64 x i16> %b64 to <64 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'mul_sext_vXi16'
@@ -710,12 +710,12 @@ define void @mul_sext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 x i1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %xb32 = sext <32 x i16> %b32 to <32 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %xb64 = sext <64 x i16> %b64 to <64 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'mul_sext_vXi16'
@@ -1111,12 +1111,12 @@ define void @mul_sext_zext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %xb32 = zext <32 x i16> %b32 to <32 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'mul_sext_zext_vXi16'
@@ -1132,12 +1132,12 @@ define void @mul_sext_zext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %xb32 = zext <32 x i16> %b32 to <32 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'mul_sext_zext_vXi16'
@@ -1347,12 +1347,12 @@ define void @mul_sext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 x i3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %xb32 = sext <32 x i32> %b32 to <32 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb64 = sext <64 x i32> %b64 to <64 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'mul_sext_vXi32'
@@ -1368,12 +1368,12 @@ define void @mul_sext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 x i3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %xb32 = sext <32 x i32> %b32 to <32 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xb64 = sext <64 x i32> %b64 to <64 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'mul_sext_vXi32'
@@ -1769,12 +1769,12 @@ define void @mul_sext_zext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %xb32 = zext <32 x i32> %b32 to <32 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %xb64 = zext <64 x i32> %b64 to <64 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'mul_sext_zext_vXi32'
@@ -1790,12 +1790,12 @@ define void @mul_sext_zext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %xb32 = zext <32 x i32> %b32 to <32 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %xb64 = zext <64 x i32> %b64 to <64 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'mul_sext_zext_vXi32'
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
index 19ebace..93d3246 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -11,18 +11,18 @@
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
diff --git a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
index 108db72..9ae78fb 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
@@ -272,9 +272,9 @@ define i32 @urem_uniformconst() {
 define i32 @srem_constpow2() {
 ; SSE2-LABEL: 'srem_constpow2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
@@ -510,9 +510,9 @@ define i32 @urem_constpow2() {
 define i32 @srem_uniformconstpow2() {
 ; SSE2-LABEL: 'srem_uniformconstpow2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
index ea52ce2..f737b43 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
@@ -272,9 +272,9 @@ define i32 @urem_uniformconst() {
 define i32 @srem_constpow2() {
 ; SSE2-LABEL: 'srem_constpow2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
@@ -510,9 +510,9 @@ define i32 @urem_constpow2() {
 define i32 @srem_uniformconstpow2() {
 ; SSE2-LABEL: 'srem_uniformconstpow2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll
index a5f2b85..8bbf775 100644
--- a/llvm/test/Analysis/CostModel/X86/rem.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem.ll
@@ -709,9 +709,9 @@ define i32 @urem_uniformconst() {
 define i32 @srem_constpow2() {
 ; SSE2-LABEL: 'srem_constpow2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
@@ -966,9 +966,9 @@ define i32 @urem_constpow2() {
 define i32 @srem_uniformconstpow2() {
 ; SSE2-LABEL: 'srem_uniformconstpow2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll b/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll
index d764d5c0..4b5b5da 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll
@@ -1,27 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64 | FileCheck %s --check-prefix=SCALAR
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64 | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v2 | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 | FileCheck %s
 
 define void @PR62969(ptr dereferenceable(16) %out, ptr dereferenceable(16) %in) {
-; SCALAR-LABEL: @PR62969(
-; SCALAR-NEXT:    [[IN0:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN:%.*]], i64 0, i64 0
-; SCALAR-NEXT:    [[IN1:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN]], i64 0, i64 1
-; SCALAR-NEXT:    [[X:%.*]] = load i64, ptr [[IN0]], align 8
-; SCALAR-NEXT:    [[Y:%.*]] = load i64, ptr [[IN1]], align 8
-; SCALAR-NEXT:    [[XL:%.*]] = and i64 [[X]], 4294967295
-; SCALAR-NEXT:    [[YL:%.*]] = and i64 [[Y]], 4294967295
-; SCALAR-NEXT:    [[XH:%.*]] = lshr i64 [[X]], 32
-; SCALAR-NEXT:    [[YH:%.*]] = lshr i64 [[Y]], 32
-; SCALAR-NEXT:    [[M0:%.*]] = mul i64 [[XL]], [[XH]]
-; SCALAR-NEXT:    [[M1:%.*]] = mul i64 [[YL]], [[YH]]
-; SCALAR-NEXT:    [[OUT0:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT:%.*]], i64 0, i64 0
-; SCALAR-NEXT:    [[OUT1:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT]], i64 0, i64 1
-; SCALAR-NEXT:    store i64 [[M0]], ptr [[OUT0]], align 8
-; SCALAR-NEXT:    store i64 [[M1]], ptr [[OUT1]], align 8
-; SCALAR-NEXT:    ret void
-;
 ; CHECK-LABEL: @PR62969(
 ; CHECK-NEXT:    [[IN0:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    [[OUT0:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT:%.*]], i64 0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
index 2ec4c04..cf3d40d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
@@ -4,49 +4,25 @@
 define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[P0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[A2:%.*]] = add i64 [[P2:%.*]], [[P2]]
-; CHECK-NEXT:    [[A3:%.*]] = add i64 [[P3:%.*]], [[P3]]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[M2:%.*]] = mul i64 [[P2]], [[P2]]
-; CHECK-NEXT:    [[M3:%.*]] = mul i64 [[P3]], [[P3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sdiv <2 x i64> [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[D2:%.*]] = sdiv i64 [[P2]], [[P2]]
-; CHECK-NEXT:    [[D3:%.*]] = sdiv i64 [[P3]], [[P3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[S0:%.*]] = sub i64 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = sub i64 [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[S2:%.*]] = sub i64 [[M2]], [[D2]]
-; CHECK-NEXT:    [[S3:%.*]] = sub i64 [[M3]], [[D3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[SHL1:%.*]] = shl i64 [[TMP9]], [[S0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[SHL2:%.*]] = shl i64 [[TMP10]], [[S1]]
-; CHECK-NEXT:    [[SHL3:%.*]] = shl i64 [[A2]], [[S2]]
-; CHECK-NEXT:    [[SHL4:%.*]] = shl i64 [[A3]], [[S3]]
-; CHECK-NEXT:    [[O0:%.*]] = or i64 [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TT0:%.*]] = trunc i64 [[O0]] to i32
-; CHECK-NEXT:    [[O1:%.*]] = or i64 [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TT1:%.*]] = trunc i64 [[O1]] to i32
-; CHECK-NEXT:    [[O2:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TT2:%.*]] = trunc i64 [[O2]] to i32
-; CHECK-NEXT:    [[O3:%.*]] = or i64 [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TT3:%.*]] = trunc i64 [[O3]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[PHI0:%.*]] = phi i32 [ [[T1:%.*]], [[BB]] ], [ [[TT0]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[PHI1:%.*]] = phi i32 [ [[T2:%.*]], [[BB]] ], [ [[TT1]], [[ENTRY]] ]
-; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ [[T3:%.*]], [[BB]] ], [ [[TT2]], [[ENTRY]] ]
-; CHECK-NEXT:    [[PHI3:%.*]] = phi i32 [ [[T4:%.*]], [[BB]] ], [ [[TT3]], [[ENTRY]] ]
-; CHECK-NEXT:    [[T1]] = trunc i64 [[SHL1]] to i32
-; CHECK-NEXT:    [[T2]] = trunc i64 [[SHL2]] to i32
-; CHECK-NEXT:    [[T3]] = trunc i64 [[SHL3]] to i32
-; CHECK-NEXT:    [[T4]] = trunc i64 [[SHL4]] to i32
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    br label [[BB]]
 ;
 entry:
-- 
2.7.4