From 1425e10cc6f8306cba01589c9e3f8459fa7e2ffe Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 18 Jul 2018 05:10:51 +0000
Subject: [PATCH] [X86] Generate v2f64 X86ISD::UNPCKL/UNPCKH instead of
 X86ISD::MOVLHPS/MOVHLPS for unary v2f64 {0,0} and {1,1} shuffles with SSE2.

I'm trying to restrict the MOVLHPS/MOVHLPS ISD nodes to SSE1 only. With SSE2 we can use unpcks. I believe this will allow some patterns to be cleaned up to require fewer bitcasts.

I've put in an odd isel hack to still select MOVHLPS instruction from the unpckh node to avoid changing tests and because movhlps is a shorter encoding. Ideally we'd do execution domain switching on this, but the operands are in the wrong order and are tied. We might be able to try a commute in the domain switching using custom code.

We already support domain switching for UNPCKLPD and MOVLHPS.

llvm-svn: 337348
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  8 ++++----
 llvm/lib/Target/X86/X86InstrSSE.td           | 13 +++++++++++++
 llvm/test/CodeGen/X86/extractelement-load.ll |  2 +-
 3 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 37338ab..7a32ce5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29387,14 +29387,14 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
       V2 = V1;
       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
-      Shuffle = X86ISD::MOVLHPS;
-      SrcVT = DstVT = MVT::v4f32;
+      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
+      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
       V2 = V1;
-      Shuffle = X86ISD::MOVHLPS;
-      SrcVT = DstVT = MVT::v4f32;
+      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
+      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index c8ad7d9..af40b00 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -862,6 +862,19 @@ let Constraints = "$src1 = $dst" in {
                       Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
 }
 
+// TODO: This is largely to trick fastisel into ignoring the pattern.
+def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2),
+                          (X86Unpckh node:$src1, node:$src2), [{
+  return N->getOperand(0) == N->getOperand(1);
+}]>;
+
+let Predicates = [UseSSE2] in {
+  // TODO: This is a hack pattern to allow lowering to emit unpckh instead of
+  // movhlps for sse2 without changing a bunch of tests.
+  def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)),
+            (MOVHLPSrr VR128:$src, VR128:$src)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Conversion Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index f6249c6..94235d2 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -85,7 +85,7 @@ define i64 @t4(<2 x double>* %a) {
 ; X32-SSE2-LABEL: t4:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X32-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X32-SSE2-NEXT:    movd %xmm1, %eax
 ; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-- 
2.7.4