From a207436b01b198c1fad9888d4b7953ccdb1ee830 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 8 Feb 2016 23:03:46 +0000
Subject: [PATCH] [X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding
 support

As discussed on PR26491, this patch adds support for lowering v4f32 shuffles to the MOVLHPS/MOVHLPS instructions. It also adds support for memory folding with their MOVLPS/MOVHPS load equivalents.

This first patch only really helps SSE1 targets as SSE2+ targets will widen the shuffle mask and use v2f64 equivalents (although they still combine to MOVLHPS/MOVHLPS for v2f64 splats). This will have to be addressed in a future patch, most likely when we add support for binary target shuffle combines.

Differential Revision: http://reviews.llvm.org/D16956

llvm-svn: 260168
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  6 ++++++
 llvm/lib/Target/X86/X86InstrInfo.cpp         | 23 ++++++++++++++++++++++
 llvm/test/CodeGen/X86/vector-shuffle-sse1.ll | 29 ++++++++++++++++++++++------
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f853c79..51b02e8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9022,6 +9022,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
         return BlendPerm;
   }
 
+  // Use low/high mov instructions.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (SDValue V =
           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 092e443..bc73440 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -998,6 +998,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::MINSDrr_Int,     X86::MINSDrm_Int,   0 },
     { X86::MINSSrr,         X86::MINSSrm,       0 },
     { X86::MINSSrr_Int,     X86::MINSSrm_Int,   0 },
+    { X86::MOVLHPSrr,       X86::MOVHPSrm,      TB_NO_REVERSE },
     { X86::MPSADBWrri,      X86::MPSADBWrmi,    TB_ALIGN_16 },
     { X86::MULPDrr,         X86::MULPDrm,       TB_ALIGN_16 },
     { X86::MULPSrr,         X86::MULPSrm,       TB_ALIGN_16 },
@@ -1298,6 +1299,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMINSDrr_Int,      X86::VMINSDrm_Int,       0 },
     { X86::VMINSSrr,          X86::VMINSSrm,           0 },
     { X86::VMINSSrr_Int,      X86::VMINSSrm_Int,       0 },
+    { X86::VMOVLHPSrr,        X86::VMOVHPSrm,          TB_NO_REVERSE },
     { X86::VMPSADBWrri,       X86::VMPSADBWrmi,        0 },
     { X86::VMULPDrr,          X86::VMULPDrm,           0 },
     { X86::VMULPSrr,          X86::VMULPSrm,           0 },
@@ -5531,6 +5533,23 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
       }
     }
     break;
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+    // Move the upper 64-bits of the second operand to the lower 64-bits.
+    // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
+    // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
+    if (OpNum == 2) {
+      unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 8 <= Align) {
+        unsigned NewOpCode =
+            (MI->getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm
+                                                : X86::MOVLPSrm);
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
+        return NewMI;
+      }
+    }
+    break;
   };
 
   return nullptr;
@@ -5741,6 +5760,10 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
   case X86::CVTSS2SDrm:
   case X86::Int_CVTSS2SDrr:
   case X86::Int_CVTSS2SDrm:
+  case X86::MOVHPDrm:
+  case X86::MOVHPSrm:
+  case X86::MOVLPDrm:
+  case X86::MOVLPSrm:
   case X86::RCPSSr:
   case X86::RCPSSm:
   case X86::RCPSSr_Int:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll
index 75c0f57..fc4652e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -94,7 +94,7 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
 ; SSE1-LABEL: shuffle_v4f32_0145:
 ; SSE1:       # BB#0:
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shuffle
@@ -102,8 +102,7 @@ define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
 ; SSE1-LABEL: shuffle_v4f32_6723:
 ; SSE1:       # BB#0:
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[2,3]
-; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   ret <4 x float> %shuffle
@@ -211,7 +210,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE1-NEXT:    xorps %xmm2, %xmm2
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE1-NEXT:    movaps %xmm1, %xmm0
 ; SSE1-NEXT:    retq
@@ -232,8 +231,8 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE1-NEXT:    xorps %xmm2, %xmm2
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE1-NEXT:    retq
   %a = load <2 x float>, <2 x float>* %ptr
   %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -251,3 +250,21 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
   %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %shuffle
 }
+define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_0145:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movhps (%rdi), %xmm0
+; SSE1-NEXT:    retq
+  %b = load <4 x float>, <4 x float>* %pb, align 16
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_6723:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movlps 8(%rdi), %xmm0
+; SSE1-NEXT:    retq
+  %b = load <4 x float>, <4 x float>* %pb, align 16
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
-- 
2.7.4