From 005173cbb609f79adc2018e378bc6897cf84b06d Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 22 Jan 2023 00:12:27 +0300 Subject: [PATCH] [X86] `X86TargetLowering`: override `allowsMemoryAccess()` The baseline `allowsMemoryAccess()` is wrong for X86. It assumes that aligned memory operations are always allowed, but that is not true. For example, without AVX2 we cannot perform a 32-byte aligned non-temporal load of a 32-byte vector, yet `allowsMemoryAccess()` will say it is allowed, so we may end up merging non-temporal loads, only to split them back up during legalization, and then merge them again, endlessly. NOTE: the test changes here are incidental. The main effect is that without this change, in D141777, we'd get stuck endlessly merging and splitting non-temporal stores. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D141776 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 74 ++++-- llvm/lib/Target/X86/X86ISelLowering.h | 19 ++ llvm/test/CodeGen/X86/add-sub-bool.ll | 8 +- llvm/test/CodeGen/X86/bswap-wide-int.ll | 4 +- llvm/test/CodeGen/X86/fshl.ll | 68 +++--- llvm/test/CodeGen/X86/fshr.ll | 51 ++-- llvm/test/CodeGen/X86/i128-add.ll | 16 +- llvm/test/CodeGen/X86/icmp-shift-opt.ll | 28 +-- llvm/test/CodeGen/X86/legalize-shl-vec.ll | 128 +++++----- .../CodeGen/X86/merge-consecutive-stores-nt.ll | 8 +- llvm/test/CodeGen/X86/setcc-wide-types.ll | 262 +++++++++------------ llvm/test/CodeGen/X86/smin.ll | 26 +- llvm/test/CodeGen/X86/smul-with-overflow.ll | 4 +- .../CodeGen/X86/smulo-128-legalisation-lowering.ll | 20 +- llvm/test/CodeGen/X86/umin.ll | 26 +- llvm/test/CodeGen/X86/umul-with-overflow.ll | 4 +- llvm/test/CodeGen/X86/wide-integer-cmp.ll | 2 +- llvm/test/CodeGen/X86/xaluo128.ll | 24 +- 18 files changed, 404 insertions(+), 368 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f999e27..8ffc494 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2730,24 +2730,30 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const { return true; } +static bool isBitAligned(Align Alignment, uint64_t SizeInBits) { + return (8 * Alignment.value()) % SizeInBits == 0; +} + +bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const { + if (isBitAligned(Alignment, VT.getSizeInBits())) + return true; + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + return true; + case 128: + return !Subtarget.isUnalignedMem16Slow(); + case 256: + return !Subtarget.isUnalignedMem32Slow(); + // TODO: What about AVX-512 (512-bit) accesses? + } +} + bool X86TargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const { - if (Fast) { - switch (VT.getSizeInBits()) { - default: - // 8-byte and under are always assumed to be fast. - *Fast = 1; - break; - case 128: - *Fast = !Subtarget.isUnalignedMem16Slow(); - break; - case 256: - *Fast = !Subtarget.isUnalignedMem32Slow(); - break; - // TODO: What about AVX-512 (512-bit) accesses? - } - } + if (Fast) + *Fast = isMemoryAccessFast(VT, Alignment); // NonTemporal vector memory ops must be aligned. 
if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { // NT loads can only be vector aligned, so if its less aligned than the @@ -2762,6 +2768,44 @@ bool X86TargetLowering::allowsMisalignedMemoryAccesses( return true; } +bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context, + const DataLayout &DL, EVT VT, + unsigned AddrSpace, Align Alignment, + MachineMemOperand::Flags Flags, + unsigned *Fast) const { + if (Fast) + *Fast = isMemoryAccessFast(VT, Alignment); + if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { + if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, + /*Fast=*/nullptr)) + return true; + // NonTemporal vector memory ops are special, and must be aligned. + if (!isBitAligned(Alignment, VT.getSizeInBits())) + return false; + switch (VT.getSizeInBits()) { + case 128: + if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41()) + return true; + if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2()) + return true; + return false; + case 256: + if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2()) + return true; + if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX()) + return true; + return false; + case 512: + if (Subtarget.hasAVX512()) + return true; + return false; + default: + return false; // Don't have NonTemporal vector memory ops of this size. + } + } + return true; +} + /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index c08227b..c5c1150 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1003,12 +1003,31 @@ namespace llvm { /// legal as the hook is used before type legalization. bool isSafeMemOpType(MVT VT) const override; + bool isMemoryAccessFast(EVT VT, Align Alignment) const; + /// Returns true if the target allows unaligned memory accesses of the /// specified type. Returns whether it is "fast" in the last argument. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override; + /// This function returns true if the memory access is aligned or if the + /// target allows this specific unaligned memory access. If the access is + /// allowed, the optional final parameter returns a relative speed of the + /// access (as defined by the target). + bool allowsMemoryAccess( + LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, + Align Alignment, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + unsigned *Fast = nullptr) const override; + + bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, + const MachineMemOperand &MMO, + unsigned *Fast) const { + return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), + MMO.getAlign(), MMO.getFlags(), Fast); + } + /// Provide custom lowering hooks for some operations. 
/// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll index 17eda59..c2bfcf5 100644 --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -113,17 +113,17 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: btl $5, {{[0-9]+}}(%esp) ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll index 1ba107a..6d5e995 100644 --- a/llvm/test/CodeGen/X86/bswap-wide-int.ll +++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll @@ -71,8 +71,8 @@ define i128 @bswap_i128(i128 %a0) nounwind { ; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-MOVBE-NEXT: movbel %esi, 12(%eax) ; X86-MOVBE-NEXT: movbel %edi, 8(%eax) -; X86-MOVBE-NEXT: movbel %edx, 4(%eax) -; X86-MOVBE-NEXT: movbel %ecx, (%eax) +; X86-MOVBE-NEXT: movbel %ecx, 4(%eax) +; X86-MOVBE-NEXT: movbel %edx, (%eax) ; X86-MOVBE-NEXT: popl %esi ; X86-MOVBE-NEXT: popl %edi ; X86-MOVBE-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index bf561a19..36bf74f 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -275,12 +275,12 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: testb $64, %cl ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %esi, %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl %ebx, %ebp ; X86-FAST-NEXT: movl %edx, %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl %edi, %eax +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: @@ -324,72 +324,72 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: pushl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: testb $64, %al ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebp, %ecx -; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: movl %edx, %ebp +; X86-SLOW-NEXT: movl %edi, %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl %edx, %ebx -; X86-SLOW-NEXT: movl %esi, %edx +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movl %esi, %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: je .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill -; 
X86-SLOW-NEXT: movl %ebp, %esi -; X86-SLOW-NEXT: movl %edx, %ebp +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %edi +; X86-SLOW-NEXT: movl %edx, %ebx ; X86-SLOW-NEXT: movl %ecx, %edx ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: jne .LBB6_4 ; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %ebp +; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: movl %edx, %edi +; X86-SLOW-NEXT: movl %edx, %esi ; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: shrl %ebx +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: shrl %ebp ; X86-SLOW-NEXT: movb %al, %ch ; X86-SLOW-NEXT: notb %ch ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %ebx -; X86-SLOW-NEXT: orl %edi, %ebx -; X86-SLOW-NEXT: movl %ebp, %edi +; X86-SLOW-NEXT: shrl %cl, %ebp +; X86-SLOW-NEXT: orl %esi, %ebp +; X86-SLOW-NEXT: movl %ebx, %esi ; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: shrl %edx ; X86-SLOW-NEXT: movb %ch, %cl ; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %edi, %edx -; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: orl %esi, %edx +; X86-SLOW-NEXT: movl %edi, %esi ; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: shrl %ebp +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: shrl %ebx ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: orl %edi, %ebp +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: orl %esi, %ebx ; X86-SLOW-NEXT: movb %al, %cl ; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: shrl %edi ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: orl %eax, %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %esi, 12(%eax) -; X86-SLOW-NEXT: movl %ebp, 8(%eax) +; X86-SLOW-NEXT: movl %edi, 12(%eax) +; X86-SLOW-NEXT: movl %ebx, 8(%eax) ; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %ebx, (%eax) +; X86-SLOW-NEXT: movl %ebp, (%eax) ; X86-SLOW-NEXT: addl $4, %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index eb1f040..367a3dd 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -263,20 +263,20 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: pushl %esi ; X86-FAST-NEXT: pushl %eax ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-FAST-NEXT: testb $64, %cl ; X86-FAST-NEXT: je .LBB6_1 ; X86-FAST-NEXT: # %bb.2: +; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl %edi, %ebp ; X86-FAST-NEXT: movl %ebx, %edi ; X86-FAST-NEXT: movl 
{{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl %edx, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_4 ; X86-FAST-NEXT: jmp .LBB6_5 @@ -287,20 +287,20 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %edx, %ebx -; X86-FAST-NEXT: movl %edi, %edx -; X86-FAST-NEXT: movl %esi, %edi -; X86-FAST-NEXT: movl %ebp, %esi +; X86-FAST-NEXT: movl %esi, %ebx +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: movl %ebp, %edx ; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: shrdl %cl, %esi, %ebp -; X86-FAST-NEXT: shrdl %cl, %edi, %esi -; X86-FAST-NEXT: shrdl %cl, %edx, %edi +; X86-FAST-NEXT: shrdl %cl, %edx, %ebp +; X86-FAST-NEXT: shrdl %cl, %edi, %edx +; X86-FAST-NEXT: shrdl %cl, %esi, %edi ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shrdl %cl, %ebx, %edx -; X86-FAST-NEXT: movl %edx, 12(%eax) +; X86-FAST-NEXT: shrdl %cl, %ebx, %esi +; X86-FAST-NEXT: movl %esi, 12(%eax) ; X86-FAST-NEXT: movl %edi, 8(%eax) -; X86-FAST-NEXT: movl %esi, 4(%eax) +; X86-FAST-NEXT: movl %edx, 4(%eax) ; X86-FAST-NEXT: movl %ebp, (%eax) ; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi @@ -316,25 +316,25 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: subl $8, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: testb $64, %cl ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebx, %edx -; X86-SLOW-NEXT: movl %edi, %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movl %esi, %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl %ebx, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl %esi, %edx +; X86-SLOW-NEXT: movl %edi, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: jne .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %edi +; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl %ebp, %esi ; X86-SLOW-NEXT: movl %edx, %ebp ; X86-SLOW-NEXT: movl %eax, %edx @@ -345,8 +345,7 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: je .LBB6_4 ; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebx, %esi +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: .LBB6_6: ; X86-SLOW-NEXT: shrl %cl, %edx ; X86-SLOW-NEXT: movl %ecx, %ebx diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll index b033fc1..2849e44 100644 --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -14,16 +14,16 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl 
{{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl $1, %esi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -55,16 +55,16 @@ define <1 x i128> @add_v1i128(<1 x i128> %x, <1 x i128> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl $1, %esi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll index 38815d9..7482de0 100644 --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -75,12 +75,12 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: shldl $15, %edx, %eax +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -100,12 +100,12 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: shldl $15, %edx, %eax +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; @@ -173,22 +173,22 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shldl $17, %esi, %edx -; X86-NEXT: shldl $17, %ecx, %esi +; X86-NEXT: shldl $17, %edx, %esi +; X86-NEXT: shldl $17, %ecx, %edx ; X86-NEXT: shldl $17, %eax, %ecx ; X86-NEXT: shll $17, %eax ; X86-NEXT: movl %ecx, %edi -; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: orl %edx, %ebx ; X86-NEXT: orl %edi, %ebx ; X86-NEXT: sete %bl -; X86-NEXT: pushl %edx ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edx ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: calll use@PLT diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll 
b/llvm/test/CodeGen/X86/legalize-shl-vec.ll index 845fc60..2c24db9 100644 --- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -6,8 +6,8 @@ define <2 x i256> @test_shl(<2 x i256> %In) { ; X32-LABEL: test_shl: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: shldl $2, %ecx, %edx ; X32-NEXT: movl %edx, 60(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -48,13 +48,13 @@ define <2 x i256> @test_shl(<2 x i256> %In) { ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: shldq $2, %rcx, %rdx -; X64-NEXT: shldq $2, %rdi, %rcx +; X64-NEXT: shldq $2, %rdx, %rcx +; X64-NEXT: shldq $2, %rdi, %rdx ; X64-NEXT: shldq $2, %r9, %rdi ; X64-NEXT: shlq $63, %rsi ; X64-NEXT: shlq $2, %r9 -; X64-NEXT: movq %rdx, 56(%rax) -; X64-NEXT: movq %rcx, 48(%rax) +; X64-NEXT: movq %rcx, 56(%rax) +; X64-NEXT: movq %rdx, 48(%rax) ; X64-NEXT: movq %rdi, 40(%rax) ; X64-NEXT: movq %r9, 32(%rax) ; X64-NEXT: movq %rsi, 24(%rax) @@ -84,36 +84,37 @@ define <2 x i256> @test_srl(<2 x i256> %In) { ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: shldl $28, %edx, %ebx +; X32-NEXT: shldl $28, %esi, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: shldl $28, %ecx, %esi +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: shldl $28, %edi, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $28, %esi, %edi -; X32-NEXT: shldl $28, %edx, %esi -; X32-NEXT: shldl $28, %eax, %edx -; X32-NEXT: shldl $28, %ebp, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: shldl $28, %eax, %edi +; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shldl $28, %eax, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: shrdl $4, %eax, %ecx -; X32-NEXT: shrl $4, %ebx +; X32-NEXT: shldl $28, %eax, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: shrdl $4, %eax, %edx +; X32-NEXT: shrl $4, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %ebx, 60(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebp, 60(%eax) ; X32-NEXT: movl %ebx, 56(%eax) -; X32-NEXT: movl %edi, 52(%eax) -; X32-NEXT: movl %esi, 48(%eax) -; X32-NEXT: movl %edx, 44(%eax) -; X32-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-NEXT: movl %edx, 40(%eax) -; X32-NEXT: movl %ebp, 36(%eax) -; X32-NEXT: movl %ecx, 32(%eax) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 52(%eax) +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 48(%eax) +; X32-NEXT: movl %ecx, 44(%eax) +; X32-NEXT: movl %edi, 40(%eax) +; X32-NEXT: movl %esi, 36(%eax) +; X32-NEXT: movl %edx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrl $31, %ecx ; X32-NEXT: movl %ecx, (%eax) @@ -143,12 +144,12 
@@ define <2 x i256> @test_srl(<2 x i256> %In) { ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: shrdq $4, %rsi, %r9 -; X64-NEXT: shrdq $4, %rcx, %rsi +; X64-NEXT: shrdq $4, %rdx, %rsi +; X64-NEXT: shrdq $4, %rcx, %rdx ; X64-NEXT: shrq $63, %r8 -; X64-NEXT: shrdq $4, %rdx, %rcx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: movq %rdx, 56(%rdi) -; X64-NEXT: movq %rcx, 48(%rdi) +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: movq %rcx, 56(%rdi) +; X64-NEXT: movq %rdx, 48(%rdi) ; X64-NEXT: movq %rsi, 40(%rdi) ; X64-NEXT: movq %r9, 32(%rdi) ; X64-NEXT: movq %r8, (%rdi) @@ -178,36 +179,37 @@ define <2 x i256> @test_sra(<2 x i256> %In) { ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: shldl $26, %edx, %ebx +; X32-NEXT: shldl $26, %esi, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: shldl $26, %ecx, %esi +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: shldl $26, %edi, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $26, %esi, %edi -; X32-NEXT: shldl $26, %edx, %esi -; X32-NEXT: shldl $26, %eax, %edx -; X32-NEXT: shldl $26, %ebp, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: shldl $26, %eax, %edi +; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shldl $26, %eax, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: shrdl $6, %eax, %ecx -; X32-NEXT: sarl $6, %ebx +; X32-NEXT: shldl $26, %eax, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: shrdl $6, %eax, %edx +; X32-NEXT: sarl $6, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %ebx, 60(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebp, 60(%eax) ; X32-NEXT: movl %ebx, 56(%eax) -; X32-NEXT: movl %edi, 52(%eax) -; X32-NEXT: movl %esi, 48(%eax) -; X32-NEXT: movl %edx, 44(%eax) -; X32-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-NEXT: movl %edx, 40(%eax) -; X32-NEXT: movl %ebp, 36(%eax) -; X32-NEXT: movl %ecx, 32(%eax) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 52(%eax) +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 48(%eax) +; X32-NEXT: movl %ecx, 44(%eax) +; X32-NEXT: movl %edi, 40(%eax) +; X32-NEXT: movl %esi, 36(%eax) +; X32-NEXT: movl %edx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: sarl $31, %ecx ; X32-NEXT: movl %ecx, 28(%eax) @@ -237,12 +239,12 @@ define <2 x i256> @test_sra(<2 x i256> %In) { ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: shrdq $6, %rsi, %r9 -; X64-NEXT: shrdq $6, %rcx, %rsi +; X64-NEXT: shrdq $6, %rdx, %rsi +; X64-NEXT: shrdq $6, %rcx, %rdx ; X64-NEXT: sarq $63, %r8 -; X64-NEXT: shrdq $6, %rdx, %rcx -; X64-NEXT: sarq $6, %rdx -; X64-NEXT: movq %rdx, 56(%rdi) -; X64-NEXT: movq %rcx, 48(%rdi) +; X64-NEXT: sarq $6, %rcx +; X64-NEXT: movq %rcx, 56(%rdi) +; X64-NEXT: movq %rdx, 48(%rdi) ; X64-NEXT: movq %rsi, 40(%rdi) ; X64-NEXT: movq %r9, 
32(%rdi) ; X64-NEXT: movq %r8, 24(%rdi) diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll index 93052e4..c080569 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -48,10 +48,10 @@ define void @merge_2_v4f32_align32(ptr %a0, ptr %a1) nounwind { ; ; X64-AVX1-LABEL: merge_2_v4f32_align32: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 -; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; X64-AVX1-NEXT: vmovntdq %xmm1, (%rsi) -; X64-AVX1-NEXT: vmovntdq %xmm0, 16(%rsi) +; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi) +; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: merge_2_v4f32_align32: diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 25c0719..44a6fad 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX512 --check-prefix=AVX512F @@ -734,63 +734,34 @@ define i32 @eq_i128_pair(ptr %a, ptr %b) { ; if we allowed 2 pairs of 32-byte loads per block. 
define i32 @ne_i256_pair(ptr %a, ptr %b) { -; SSE2-LABEL: ne_i256_pair: -; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %rax -; SSE2-NEXT: movq 24(%rdi), %rcx -; SSE2-NEXT: movq (%rdi), %rdx -; SSE2-NEXT: movq 8(%rdi), %r8 -; SSE2-NEXT: xorq 8(%rsi), %r8 -; SSE2-NEXT: xorq 24(%rsi), %rcx -; SSE2-NEXT: xorq (%rsi), %rdx -; SSE2-NEXT: xorq 16(%rsi), %rax -; SSE2-NEXT: movq 48(%rdi), %r9 -; SSE2-NEXT: movq 32(%rdi), %r10 -; SSE2-NEXT: movq 56(%rdi), %r11 -; SSE2-NEXT: movq 40(%rdi), %rdi -; SSE2-NEXT: xorq 40(%rsi), %rdi -; SSE2-NEXT: orq %r8, %rdi -; SSE2-NEXT: xorq 56(%rsi), %r11 -; SSE2-NEXT: orq %rcx, %r11 -; SSE2-NEXT: orq %rdi, %r11 -; SSE2-NEXT: xorq 32(%rsi), %r10 -; SSE2-NEXT: orq %rdx, %r10 -; SSE2-NEXT: xorq 48(%rsi), %r9 -; SSE2-NEXT: orq %rax, %r9 -; SSE2-NEXT: orq %r10, %r9 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %r11, %r9 -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: ne_i256_pair: -; SSE41: # %bb.0: -; SSE41-NEXT: movq 16(%rdi), %rax -; SSE41-NEXT: movq 24(%rdi), %rcx -; SSE41-NEXT: movq (%rdi), %rdx -; SSE41-NEXT: movq 8(%rdi), %r8 -; SSE41-NEXT: xorq 8(%rsi), %r8 -; SSE41-NEXT: xorq 24(%rsi), %rcx -; SSE41-NEXT: xorq (%rsi), %rdx -; SSE41-NEXT: xorq 16(%rsi), %rax -; SSE41-NEXT: movq 48(%rdi), %r9 -; SSE41-NEXT: movq 32(%rdi), %r10 -; SSE41-NEXT: movq 56(%rdi), %r11 -; SSE41-NEXT: movq 40(%rdi), %rdi -; SSE41-NEXT: xorq 40(%rsi), %rdi -; SSE41-NEXT: orq %r8, %rdi -; SSE41-NEXT: xorq 56(%rsi), %r11 -; SSE41-NEXT: orq %rcx, %r11 -; SSE41-NEXT: orq %rdi, %r11 -; SSE41-NEXT: xorq 32(%rsi), %r10 -; SSE41-NEXT: orq %rdx, %r10 -; SSE41-NEXT: xorq 48(%rsi), %r9 -; SSE41-NEXT: orq %rax, %r9 -; SSE41-NEXT: orq %r10, %r9 -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %r11, %r9 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: ne_i256_pair: +; SSE: # %bb.0: +; SSE-NEXT: movq 16(%rdi), %rax +; SSE-NEXT: movq 24(%rdi), %rcx +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq 8(%rdi), %r8 +; SSE-NEXT: xorq 8(%rsi), %r8 +; SSE-NEXT: xorq 24(%rsi), %rcx +; SSE-NEXT: xorq (%rsi), %rdx +; SSE-NEXT: xorq 16(%rsi), %rax +; SSE-NEXT: movq 48(%rdi), %r9 +; SSE-NEXT: movq 32(%rdi), %r10 +; SSE-NEXT: movq 56(%rdi), %r11 +; SSE-NEXT: movq 40(%rdi), %rdi +; SSE-NEXT: xorq 40(%rsi), %rdi +; SSE-NEXT: orq %r8, %rdi +; SSE-NEXT: xorq 56(%rsi), %r11 +; SSE-NEXT: orq %rcx, %r11 +; SSE-NEXT: orq %rdi, %r11 +; SSE-NEXT: xorq 32(%rsi), %r10 +; SSE-NEXT: orq %rdx, %r10 +; SSE-NEXT: xorq 48(%rsi), %r9 +; SSE-NEXT: orq %rax, %r9 +; SSE-NEXT: orq %r10, %r9 +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: ne_i256_pair: ; AVX1: # %bb.0: @@ -848,63 +819,34 @@ define i32 @ne_i256_pair(ptr %a, ptr %b) { ; if we allowed 2 pairs of 32-byte loads per block. 
define i32 @eq_i256_pair(ptr %a, ptr %b) { -; SSE2-LABEL: eq_i256_pair: -; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %rax -; SSE2-NEXT: movq 24(%rdi), %rcx -; SSE2-NEXT: movq (%rdi), %rdx -; SSE2-NEXT: movq 8(%rdi), %r8 -; SSE2-NEXT: xorq 8(%rsi), %r8 -; SSE2-NEXT: xorq 24(%rsi), %rcx -; SSE2-NEXT: xorq (%rsi), %rdx -; SSE2-NEXT: xorq 16(%rsi), %rax -; SSE2-NEXT: movq 48(%rdi), %r9 -; SSE2-NEXT: movq 32(%rdi), %r10 -; SSE2-NEXT: movq 56(%rdi), %r11 -; SSE2-NEXT: movq 40(%rdi), %rdi -; SSE2-NEXT: xorq 40(%rsi), %rdi -; SSE2-NEXT: orq %r8, %rdi -; SSE2-NEXT: xorq 56(%rsi), %r11 -; SSE2-NEXT: orq %rcx, %r11 -; SSE2-NEXT: orq %rdi, %r11 -; SSE2-NEXT: xorq 32(%rsi), %r10 -; SSE2-NEXT: orq %rdx, %r10 -; SSE2-NEXT: xorq 48(%rsi), %r9 -; SSE2-NEXT: orq %rax, %r9 -; SSE2-NEXT: orq %r10, %r9 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %r11, %r9 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: eq_i256_pair: -; SSE41: # %bb.0: -; SSE41-NEXT: movq 16(%rdi), %rax -; SSE41-NEXT: movq 24(%rdi), %rcx -; SSE41-NEXT: movq (%rdi), %rdx -; SSE41-NEXT: movq 8(%rdi), %r8 -; SSE41-NEXT: xorq 8(%rsi), %r8 -; SSE41-NEXT: xorq 24(%rsi), %rcx -; SSE41-NEXT: xorq (%rsi), %rdx -; SSE41-NEXT: xorq 16(%rsi), %rax -; SSE41-NEXT: movq 48(%rdi), %r9 -; SSE41-NEXT: movq 32(%rdi), %r10 -; SSE41-NEXT: movq 56(%rdi), %r11 -; SSE41-NEXT: movq 40(%rdi), %rdi -; SSE41-NEXT: xorq 40(%rsi), %rdi -; SSE41-NEXT: orq %r8, %rdi -; SSE41-NEXT: xorq 56(%rsi), %r11 -; SSE41-NEXT: orq %rcx, %r11 -; SSE41-NEXT: orq %rdi, %r11 -; SSE41-NEXT: xorq 32(%rsi), %r10 -; SSE41-NEXT: orq %rdx, %r10 -; SSE41-NEXT: xorq 48(%rsi), %r9 -; SSE41-NEXT: orq %rax, %r9 -; SSE41-NEXT: orq %r10, %r9 -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %r11, %r9 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: eq_i256_pair: +; SSE: # %bb.0: +; SSE-NEXT: movq 16(%rdi), %rax +; SSE-NEXT: movq 24(%rdi), %rcx +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq 8(%rdi), %r8 +; SSE-NEXT: xorq 8(%rsi), %r8 +; SSE-NEXT: xorq 24(%rsi), %rcx +; SSE-NEXT: xorq (%rsi), %rdx +; SSE-NEXT: xorq 16(%rsi), %rax +; SSE-NEXT: movq 48(%rdi), %r9 +; SSE-NEXT: movq 32(%rdi), %r10 +; SSE-NEXT: movq 56(%rdi), %r11 +; SSE-NEXT: movq 40(%rdi), %rdi +; SSE-NEXT: xorq 40(%rsi), %rdi +; SSE-NEXT: orq %r8, %rdi +; SSE-NEXT: xorq 56(%rsi), %r11 +; SSE-NEXT: orq %rcx, %r11 +; SSE-NEXT: orq %rdi, %r11 +; SSE-NEXT: xorq 32(%rsi), %r10 +; SSE-NEXT: orq %rdx, %r10 +; SSE-NEXT: xorq 48(%rsi), %r9 +; SSE-NEXT: orq %rax, %r9 +; SSE-NEXT: orq %r10, %r9 +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: eq_i256_pair: ; AVX1: # %bb.0: @@ -1238,35 +1180,65 @@ define i1 @eq_i256_op(i256 %a, i256 %b) { } define i1 @eq_i512_op(i512 %a, i512 %b) { -; ANY-LABEL: eq_i512_op: -; ANY: # %bb.0: -; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: addq $1, %rdi -; ANY-NEXT: adcq $0, %rsi -; ANY-NEXT: adcq $0, %rdx -; ANY-NEXT: adcq $0, %rcx -; ANY-NEXT: adcq $0, %r8 -; ANY-NEXT: adcq $0, %r9 -; ANY-NEXT: adcq $0, %r10 -; ANY-NEXT: adcq $0, %rax -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; ANY-NEXT: orq %rsi, %r9 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: orq %r9, %rax -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; ANY-NEXT: orq %rdx, %r10 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; ANY-NEXT: orq 
%r8, %rdi -; ANY-NEXT: orq %r10, %rdi -; ANY-NEXT: orq %rax, %rdi -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; SSE-LABEL: eq_i512_op: +; SSE: # %bb.0: +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: addq $1, %rdi +; SSE-NEXT: adcq $0, %rsi +; SSE-NEXT: adcq $0, %rdx +; SSE-NEXT: adcq $0, %rcx +; SSE-NEXT: adcq $0, %r8 +; SSE-NEXT: adcq $0, %r9 +; SSE-NEXT: adcq $0, %r10 +; SSE-NEXT: adcq $0, %rax +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; SSE-NEXT: orq %rsi, %r9 +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: orq %r9, %rax +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: orq %rdx, %r10 +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; SSE-NEXT: orq %r8, %rdi +; SSE-NEXT: orq %r10, %rdi +; SSE-NEXT: orq %rax, %rdi +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVXANY-LABEL: eq_i512_op: +; AVXANY: # %bb.0: +; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVXANY-NEXT: addq $1, %rdi +; AVXANY-NEXT: adcq $0, %rsi +; AVXANY-NEXT: adcq $0, %rdx +; AVXANY-NEXT: adcq $0, %rcx +; AVXANY-NEXT: adcq $0, %r8 +; AVXANY-NEXT: adcq $0, %r9 +; AVXANY-NEXT: adcq $0, %r10 +; AVXANY-NEXT: adcq $0, %rax +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; AVXANY-NEXT: orq %rsi, %r9 +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; AVXANY-NEXT: orq %rcx, %rax +; AVXANY-NEXT: orq %r9, %rax +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; AVXANY-NEXT: orq %rdx, %r10 +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; AVXANY-NEXT: orq %r8, %rdi +; AVXANY-NEXT: orq %r10, %rdi +; AVXANY-NEXT: orq %rax, %rdi +; AVXANY-NEXT: sete %al +; AVXANY-NEXT: retq %a2 = add i512 %a, 1 %r = icmp eq i512 %a2, %b ret i1 %r diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll index e7b318c..f353853 100644 --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -158,24 +158,24 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %ecx, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %edx, %eax ; X86-NEXT: cmovbl %edi, %eax ; X86-NEXT: cmpl %esi, %ebp -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: cmovbl %edi, %ebx ; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: cmovbl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmpl %ecx, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cmovbl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -183,24 +183,24 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: sbbl %edi, %ebp ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovll {{[0-9]+}}(%esp), 
%edx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: xorl %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %ebp, %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmovel %ebx, %edx ; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll index fb7bc96..fbdb6e7 100644 --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -441,8 +441,8 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %edi diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index fbbc857..a3d94f7 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -89,8 +89,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx @@ -251,10 +251,10 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: addl %eax, %esi ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %ebp @@ -585,8 +585,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ebx @@ -1295,8 +1295,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ecx @@ -1315,9 +1315,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ebx @@ -1379,9 +1379,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll index e37950e..0a747b8 100644 --- a/llvm/test/CodeGen/X86/umin.ll +++ b/llvm/test/CodeGen/X86/umin.ll @@ -154,24 +154,24 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %ecx, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %edx, %eax ; X86-NEXT: cmovbl %edi, %eax ; X86-NEXT: cmpl %esi, %ebp -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: cmovbl %edi, %ebx ; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: cmovbl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmpl %ecx, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cmovbl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -179,24 +179,24 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: sbbl %edi, %ebp ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: xorl %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %ebp, %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmovel %ebx, %edx ; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovbl %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git 
a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll index ccd27dd..f5248d8 100644 --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -87,8 +87,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi @@ -532,8 +532,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: movq %r8, %r11 ; X64-NEXT: movq %rcx, %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbx diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll index a15d633..189f516 100644 --- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll +++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll @@ -99,8 +99,8 @@ define i32 @test_wide(i128 %a, i128 %b) { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: jge .LBB4_2 ; CHECK-NEXT: # %bb.1: # %bb1 ; CHECK-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/xaluo128.ll b/llvm/test/CodeGen/X86/xaluo128.ll index 977df0f..740a2dd 100644 --- a/llvm/test/CodeGen/X86/xaluo128.ll +++ b/llvm/test/CodeGen/X86/xaluo128.ll @@ -24,13 +24,13 @@ define zeroext i1 @saddoi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: seto %al ; X86-NEXT: movl %edi, (%ecx) ; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -64,13 +64,13 @@ define zeroext i1 @uaddoi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: setb %al ; X86-NEXT: movl %edi, (%ecx) ; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -105,13 +105,13 @@ define zeroext i1 @ssuboi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: seto %al ; X86-NEXT: movl %edi, (%ecx) ; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl 
%edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -145,13 +145,13 @@ define zeroext i1 @usuboi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: setb %al ; X86-NEXT: movl %edi, (%ecx) ; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -- 2.7.4
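
Editor's note, for illustration only and not part of the commit itself: below is a minimal LLVM IR sketch of the situation the commit message describes, loosely modeled on the merge_2_v4f32_align32 test in merge-consecutive-stores-nt.ll touched above. The function name and exact IR are hypothetical, written for this note rather than copied from the patch.

; Two adjacent 16-byte non-temporal loads/stores, where the first access is
; 32-byte aligned. Before this change, allowsMemoryAccess() reported that a
; merged 32-byte non-temporal access was allowed, but without AVX2 there is
; no 256-bit non-temporal load (vmovntdqa with a ymm destination requires
; AVX2), so a merged 32-byte NT load had to be split right back apart during
; legalization.
define void @nt_copy_2_v4f32_align32(ptr %src, ptr %dst) nounwind {
  %src.hi = getelementptr inbounds <4 x float>, ptr %src, i64 1
  %lo = load <4 x float>, ptr %src, align 32, !nontemporal !0
  %hi = load <4 x float>, ptr %src.hi, align 16, !nontemporal !0
  %dst.hi = getelementptr inbounds <4 x float>, ptr %dst, i64 1
  store <4 x float> %lo, ptr %dst, align 32, !nontemporal !0
  store <4 x float> %hi, ptr %dst.hi, align 16, !nontemporal !0
  ret void
}

!0 = !{i32 1}

Compiling IR of this shape with llc for x86_64 and -mattr=avx (AVX1, no AVX2) is the scenario the X64-AVX1 check lines above exercise: with the override in place, the two 16-byte vmovntdqa/vmovntdq pairs are emitted directly instead of first being merged into a 32-byte non-temporal access and then re-split, which is consistent with the reordered X64-AVX1 output in the merge-consecutive-stores-nt.ll diff.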