From: Sanjay Patel
Date: Sat, 27 May 2017 14:07:03 +0000 (+0000)
Subject: [DAGCombiner] use narrow load to avoid vector extract
X-Git-Tag: llvmorg-5.0.0-rc1~4097
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=33f4a9728741b1305496ec89f5b472a57e21a32b;p=platform%2Fupstream%2Fllvm.git

[DAGCombiner] use narrow load to avoid vector extract

If we have (extract_subvector(load wide vector)) with no other users,
that can just be (load narrow vector). This is intentionally
conservative. Follow-ups may loosen the one-use constraint to account
for the extract cost or just remove the one-use check.

The memop chain updating is based on code that already exists multiple
times in x86 lowering, so that should be pulled into a helper function
as a follow-up.

Background: this is a potential improvement noticed via regressions
caused by making x86's peekThroughBitcasts() not loop on consecutive
bitcasts (see comments in D33137).

Differential Revision: https://reviews.llvm.org/D33578

llvm-svn: 304072
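As a rough sketch of the transform (hypothetical SelectionDAG values on a
little-endian target; the node ids, types, and offset are illustrative and
not taken from the patch):

    Before:  t1: v8f32,ch = load t0, %ptr, undef
             t2: v4f32 = extract_subvector t1, Constant:i64<4>   ; t1 has no other users

    After:   t2: v4f32,ch = load t0, (add %ptr, Constant:i64<16>), undef

The byte offset is the extract index scaled by the scalar store size
(4 elements * 4 bytes = 16 here), and users of the wide load's chain are
rerouted through a TokenFactor of the old and new chains so memory
ordering is preserved.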
---

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6928ad6b..23a302f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14555,6 +14555,52 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   return DAG.getBitcast(VT, NarrowBinOp);
 }
 
+/// If we are extracting a subvector from a wide vector load, convert to a
+/// narrow load to eliminate the extraction:
+/// (extract_subvector (load wide vector)) --> (load narrow vector)
+static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
+  // TODO: Add support for big-endian. The offset calculation must be adjusted.
+  if (DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
+  // TODO: The one-use check is overly conservative. Check the cost of the
+  // extract instead or remove that condition entirely.
+  auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
+  auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+  if (!Ld || !Ld->hasOneUse() || Ld->isVolatile() || !ExtIdx)
+    return SDValue();
+
+  // The narrow load will be offset from the base address of the old load if
+  // we are extracting from something besides index 0 (little-endian).
+  EVT VT = Extract->getValueType(0);
+  SDLoc DL(Extract);
+  SDValue BaseAddr = Ld->getOperand(1);
+  unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
+
+  // TODO: Use "BaseIndexOffset" to make this more effective.
+  SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
+                                                   VT.getStoreSize());
+  SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
+
+  // The new load must have the same position as the old load in terms of memory
+  // dependency. Create a TokenFactor for Ld and NewLd and update uses of Ld's
+  // output chain to use that TokenFactor.
+  // TODO: This code is based on a similar sequence in x86 lowering. It should
+  // be moved to a helper function, so it can be shared and reused.
+  if (Ld->hasAnyUseOfValue(1)) {
+    SDValue OldChain = SDValue(Ld, 1);
+    SDValue NewChain = SDValue(NewLd.getNode(), 1);
+    SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                      OldChain, NewChain);
+    DAG.ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
+    DAG.UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+  }
+
+  return NewLd;
+}
+
 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
   EVT NVT = N->getValueType(0);
   SDValue V = N->getOperand(0);
@@ -14563,6 +14609,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
   if (V.isUndef())
     return DAG.getUNDEF(NVT);
 
+  if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
+    if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
+      return NarrowLoad;
+
   // Combine:
   // (extract_subvec (concat V1, V2, ...), i)
   // Into:
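To make the offset arithmetic concrete (an editorial walkthrough of a test
that appears below, not text from the commit): in shuffle_v16f32_extract_256,
the upper <8 x float> half of a <16 x float> load is extracted, so

    Offset = ExtIdx * scalar store size = 8 * 4 bytes = 32

and the extract folds into the narrow load 'vmovups 32(%rsi), %ymm0' that the
updated CHECK line expects.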
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index ff7a0a8..6b754b0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -33,7 +33,7 @@ define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_8h:
-;CHECK: sabdl2.8h
+;CHECK: sabdl.8h
   %load1 = load <16 x i8>, <16 x i8>* %A
   %load2 = load <16 x i8>, <16 x i8>* %B
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -45,7 +45,7 @@ define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_4s:
-;CHECK: sabdl2.4s
+;CHECK: sabdl.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -57,7 +57,7 @@ define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_2d:
-;CHECK: sabdl2.2d
+;CHECK: sabdl.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -99,7 +99,7 @@ define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_8h:
-;CHECK: uabdl2.8h
+;CHECK: uabdl.8h
   %load1 = load <16 x i8>, <16 x i8>* %A
   %load2 = load <16 x i8>, <16 x i8>* %B
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -112,7 +112,7 @@ define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_4s:
-;CHECK: uabdl2.4s
+;CHECK: uabdl.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -124,7 +124,7 @@ define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_2d:
-;CHECK: uabdl2.2d
+;CHECK: uabdl.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -561,7 +561,7 @@ define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
 
 define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
 ;CHECK-LABEL: sabal2_8h:
-;CHECK: sabal2.8h
+;CHECK: sabal.8h
   %load1 = load <16 x i8>, <16 x i8>* %A
   %load2 = load <16 x i8>, <16 x i8>* %B
   %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -575,7 +575,7 @@ define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin
 
 define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sabal2_4s:
-;CHECK: sabal2.4s
+;CHECK: sabal.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -589,7 +589,7 @@ define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin
 
 define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sabal2_2d:
-;CHECK: sabal2.2d
+;CHECK: sabal.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -639,7 +639,7 @@ define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
 
 define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
 ;CHECK-LABEL: uabal2_8h:
-;CHECK: uabal2.8h
+;CHECK: uabal.8h
   %load1 = load <16 x i8>, <16 x i8>* %A
   %load2 = load <16 x i8>, <16 x i8>* %B
   %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -653,7 +653,7 @@ define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin
 
 define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: uabal2_4s:
-;CHECK: uabal2.4s
+;CHECK: uabal.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -667,7 +667,7 @@ define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin
 
 define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: uabal2_2d:
-;CHECK: uabal2.2d
+;CHECK: uabal.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index 9d09251..2a25538 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -318,7 +318,7 @@ define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_8h:
-;CHECK: uaddw2.8h
+;CHECK: uaddw.8h
   %tmp1 = load <8 x i16>, <8 x i16>* %A
 
   %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -331,7 +331,7 @@ define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_4s:
-;CHECK: uaddw2.4s
+;CHECK: uaddw.4s
   %tmp1 = load <4 x i32>, <4 x i32>* %A
 
   %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -344,7 +344,7 @@ define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_2d:
-;CHECK: uaddw2.2d
+;CHECK: uaddw.2d
   %tmp1 = load <2 x i64>, <2 x i64>* %A
 
   %tmp2 = load <4 x i32>, <4 x i32>* %B
@@ -387,7 +387,7 @@ define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: saddw2_8h:
-;CHECK: saddw2.8h
+;CHECK: saddw.8h
   %tmp1 = load <8 x i16>, <8 x i16>* %A
 
   %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -400,7 +400,7 @@ define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: saddw2_4s:
-;CHECK: saddw2.4s
+;CHECK: saddw.4s
   %tmp1 = load <4 x i32>, <4 x i32>* %A
 
   %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -413,7 +413,7 @@ define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: saddw2_2d:
-;CHECK: saddw2.2d
+;CHECK: saddw.2d
   %tmp1 = load <2 x i64>, <2 x i64>* %A
 
   %tmp2 = load <4 x i32>, <4 x i32>* %B
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index a7668ec..f70ed9a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -83,7 +83,7 @@ define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_4s:
-;CHECK: sqdmull2.4s
+;CHECK: sqdmull.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -94,7 +94,7 @@ define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_2d:
-;CHECK: sqdmull2.2d
+;CHECK: sqdmull.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -324,7 +324,7 @@ define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin
 
 define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_4s:
-;CHECK: sqdmlal2.4s
+;CHECK: sqdmlal.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -337,7 +337,7 @@ define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw
 
 define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_2d:
-;CHECK: sqdmlal2.2d
+;CHECK: sqdmlal.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -372,7 +372,7 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin
 
 define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_4s:
-;CHECK: sqdmlsl2.4s
+;CHECK: sqdmlsl.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -385,7 +385,7 @@ define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw
 
 define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_2d:
-;CHECK: sqdmlsl2.2d
+;CHECK: sqdmlsl.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -874,7 +874,7 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmull2.4s
+;CHECK: sqdmull.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -886,7 +886,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmull2.2d
+;CHECK: sqdmull.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -994,7 +994,7 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n
 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmlal2.4s
+;CHECK: sqdmlal.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -1008,7 +1008,7 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C)
 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmlal2.2d
+;CHECK: sqdmlal.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -1147,7 +1147,7 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n
 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmlsl2.4s
+;CHECK: sqdmlsl.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -1161,7 +1161,7 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C)
 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmlsl2.2d
+;CHECK: sqdmlsl.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
index c1c4649..6b0fe40 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1164,7 +1164,7 @@ define <2 x i64> @ushll2d(<2 x i32>* %A) nounwind {
 
 define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
 ;CHECK-LABEL: ushll2_8h:
-;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1
+;CHECK: ushll.8h v0, {{v[0-9]+}}, #1
   %load1 = load <16 x i8>, <16 x i8>* %A
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -1174,7 +1174,7 @@ define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
 
 define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: ushll2_4s:
-;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1
+;CHECK: ushll.4s v0, {{v[0-9]+}}, #1
   %load1 = load <8 x i16>, <8 x i16>* %A
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -1184,7 +1184,7 @@ define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
 
 define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: ushll2_2d:
-;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1
+;CHECK: ushll.2d v0, {{v[0-9]+}}, #1
   %load1 = load <4 x i32>, <4 x i32>* %A
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -1221,7 +1221,7 @@ define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind {
 
 define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
 ;CHECK-LABEL: sshll2_8h:
-;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1
+;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
   %load1 = load <16 x i8>, <16 x i8>* %A
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -1231,7 +1231,7 @@ define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
 
 define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: sshll2_4s:
-;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1
+;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
   %load1 = load <8 x i16>, <8 x i16>* %A
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -1241,7 +1241,7 @@ define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
 
 define <2 x i64> @sshll2_2d(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: sshll2_2d:
-;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1
+;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
   %load1 = load <4 x i32>, <4 x i32>* %A
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/arm64-vsub.ll b/llvm/test/CodeGen/AArch64/arm64-vsub.ll
index 7af6911..6746e49 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vsub.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vsub.ll
@@ -157,7 +157,7 @@ define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_8h:
-;CHECK: ssubl2.8h
+;CHECK: ssubl.8h
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %ext1 = sext <8 x i8> %high1 to <8 x i16>
@@ -172,7 +172,7 @@ define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_4s:
-;CHECK: ssubl2.4s
+;CHECK: ssubl.4s
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %ext1 = sext <4 x i16> %high1 to <4 x i32>
@@ -187,7 +187,7 @@ define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_2d:
-;CHECK: ssubl2.2d
+;CHECK: ssubl.2d
   %tmp1 = load <4 x i32>, <4 x i32>* %A
   %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %ext1 = sext <2 x i32> %high1 to <2 x i64>
@@ -235,7 +235,7 @@ define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: usubl2_8h:
-;CHECK: usubl2.8h
+;CHECK: usubl.8h
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %ext1 = zext <8 x i8> %high1 to <8 x i16>
@@ -250,7 +250,7 @@ define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: usubl2_4s:
-;CHECK: usubl2.4s
+;CHECK: usubl.4s
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %ext1 = zext <4 x i16> %high1 to <4 x i32>
@@ -265,7 +265,7 @@ define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: usubl2_2d:
-;CHECK: usubl2.2d
+;CHECK: usubl.2d
   %tmp1 = load <4 x i32>, <4 x i32>* %A
   %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %ext1 = zext <2 x i32> %high1 to <2 x i64>
@@ -310,7 +310,7 @@ define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_8h:
-;CHECK: ssubw2.8h
+;CHECK: ssubw.8h
   %tmp1 = load <8 x i16>, <8 x i16>* %A
 
   %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -323,7 +323,7 @@ define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_4s:
-;CHECK: ssubw2.4s
+;CHECK: ssubw.4s
   %tmp1 = load <4 x i32>, <4 x i32>* %A
 
   %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -336,7 +336,7 @@ define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_2d:
-;CHECK: ssubw2.2d
+;CHECK: ssubw.2d
   %tmp1 = load <2 x i64>, <2 x i64>* %A
 
   %tmp2 = load <4 x i32>, <4 x i32>* %B
@@ -379,7 +379,7 @@ define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: usubw2_8h:
-;CHECK: usubw2.8h
+;CHECK: usubw.8h
   %tmp1 = load <8 x i16>, <8 x i16>* %A
 
   %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -392,7 +392,7 @@ define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: usubw2_4s:
-;CHECK: usubw2.4s
+;CHECK: usubw.4s
   %tmp1 = load <4 x i32>, <4 x i32>* %A
 
   %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -405,7 +405,7 @@ define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: usubw2_2d:
-;CHECK: usubw2.2d
+;CHECK: usubw.2d
   %tmp1 = load <2 x i64>, <2 x i64>* %A
 
   %tmp2 = load <4 x i32>, <4 x i32>* %B
diff --git a/llvm/test/CodeGen/ARM/vcombine.ll b/llvm/test/CodeGen/ARM/vcombine.ll
index 81b22ee..c08ed81 100644
--- a/llvm/test/CodeGen/ARM/vcombine.ll
+++ b/llvm/test/CodeGen/ARM/vcombine.ll
@@ -99,7 +99,9 @@ define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind {
 define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind {
 ; CHECK: vget_high8
 ; CHECK-NOT: vst
-; CHECK-LE: vmov r0, r1, d17
+; CHECK-LE-NOT: vld1.64 {d16, d17}, [r0]
+; CHECK-LE: vldr d16, [r0, #8]
+; CHECK-LE: vmov r0, r1, d16
 ; CHECK-BE: vmov r1, r0, d16
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
index e44e757..5742dc3 100644
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -199,10 +199,10 @@ define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ; CHECK-LABEL: test_undef:
 ; CHECK:       @ BB#0:
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vzip.16 d19, d16
-; CHECK-NEXT:    vmov r0, r1, d19
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0, #8]
+; CHECK-NEXT:    vzip.16 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %tmp2 = load <8 x i16>, <8 x i16>* %B
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index 3337f42..51f9a38 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2216,9 +2216,9 @@ define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckwd %k1, %k0, %k0
+; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    kunpckwd %k0, %k1, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index fa3471c..2e65bd8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -282,8 +282,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
 ; ALL-LABEL: shuffle_v16f32_extract_256:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovups (%rsi), %zmm0
-; ALL-NEXT:    vextractf32x8 $1, %zmm0, %ymm0
+; ALL-NEXT:    vmovups 32(%rsi), %ymm0
 ; ALL-NEXT:    retq
   %ptr_a = bitcast float* %a to <16 x float>*
   %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 5aab217..706edd2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -511,11 +511,10 @@ define <8 x float> @expand14(<4 x float> %a) {
 ;
 ; KNL64-LABEL: expand14:
 ; KNL64:       # BB#0:
+; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
+; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL64-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
-; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL64-NEXT:    retq
 ;
@@ -529,11 +528,10 @@ define <8 x float> @expand14(<4 x float> %a) {
 ;
 ; KNL32-LABEL: expand14:
 ; KNL32:       # BB#0:
+; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
+; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL32-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
-; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL32-NEXT:    retl
 %addV = fadd <4 x float> ,
@@ -545,39 +543,35 @@ define <8 x float> @expand14(<4 x float> %a) {
 define <8 x float> @expand15(<4 x float> %a) {
 ; SKX64-LABEL: expand15:
 ; SKX64:       # BB#0:
-; SKX64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SKX64-NEXT:    vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
-; SKX64-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX64-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; SKX64-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SKX64-NEXT:    vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
-; SKX64-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX64-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
 ; SKX64-NEXT:    retq
 ;
 ; KNL64-LABEL: expand15:
 ; KNL64:       # BB#0:
+; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL64-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
-; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL64-NEXT:    retq
 ;
 ; SKX32-LABEL: expand15:
 ; SKX32:       # BB#0:
-; SKX32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SKX32-NEXT:    vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
-; SKX32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX32-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; SKX32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SKX32-NEXT:    vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
-; SKX32-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX32-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
 ; SKX32-NEXT:    retl
 ;
 ; KNL32-LABEL: expand15:
 ; KNL32:       # BB#0:
+; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL32-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
-; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL32-NEXT:    retl
 %addV = fadd <4 x float> ,
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
index 6b2e4de..42c4c23 100644
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -151,8 +151,7 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl
 ;
 ; AVX1-LABEL: load_splat_8i32_8i32_01010101:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovapd (%rdi), %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -288,8 +287,7 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou
 ;
 ; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -315,22 +313,10 @@ define <16 x i16> @load_splat_16i16_16i16_0123012301230123(<16 x i16>* %ptr) nou
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %ymm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i16_16i16_0123012301230123:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i16>, <16 x i16>* %ptr
   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -513,8 +499,7 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8
 ;
 ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -587,26 +572,10 @@ define <4 x float> @load_splat_4f32_8f32_0000(<8 x float>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_4f32_8f32_0000:
-; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_4f32_8f32_0000:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %ymm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_4f32_8f32_0000:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_4f32_8f32_0000:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x float>, <8 x float>* %ptr
   %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer
@@ -627,22 +596,10 @@ define <8 x float> @load_splat_8f32_16f32_89898989(<16 x float>* %ptr) nounwind
 ; SSE42-NEXT:    movapd %xmm0, %xmm1
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8f32_16f32_89898989:
-; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vbroadcastsd 32(%rdi), %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8f32_16f32_89898989:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vbroadcastsd 32(%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8f32_16f32_89898989:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovapd (%rdi), %zmm0
-; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8f32_16f32_89898989:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vbroadcastsd 32(%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x float>, <16 x float>* %ptr
   %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 8, i32 9, i32 8, i32 9, i32 8, i32 9>
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91..f4d0503 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -57,10 +57,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovups 96(%rdi), %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -69,10 +67,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovupd (%rdi), %ymm0
 ; AVX2-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovupd 64(%rdi), %ymm2
-; AVX2-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
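A minimal standalone reproducer sketch (hypothetical; the function name,
types, and target flags are chosen for illustration and are not part of the
commit). A contiguous high-half shuffle of a one-use wide load is lowered via
extract_subvector, so after this patch it should compile to a single narrow
load on a little-endian target, e.g. with
'llc -mtriple=x86_64-unknown-unknown -mattr=+avx':

    define <2 x i64> @extract_high(<4 x i64>* %p) {
      ; The 32-byte load has a single user, so DAGCombiner can replace the
      ; high-half extract with a 16-byte load at offset 16 from %p.
      %wide = load <4 x i64>, <4 x i64>* %p
      %high = shufflevector <4 x i64> %wide, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
      ret <2 x i64> %high
    }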