From 6736096ac3efd149c34dc95a8ab3a8105349a535 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 17 Sep 2016 18:50:54 +0000
Subject: [PATCH] [X86][SSE] Improve target shuffle mask extraction

Add ability to extract vXi64 'vzext_movl' masks on 32-bit targets

llvm-svn: 281834
---
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 24 +++++++++++++---------
 .../CodeGen/X86/vector-shuffle-combining-avx.ll    |  9 +-------
 .../CodeGen/X86/vector-shuffle-combining-xop.ll    |  5 +----
 3 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b757290..a905188 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4763,6 +4763,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         SmallVectorImpl<uint64_t> &RawMask) {
   MVT VT = MaskNode.getSimpleValueType();
   assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
+  unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
 
   // Split an APInt element into MaskEltSizeInBits sized pieces and
   // insert into the shuffle mask.
@@ -4794,17 +4795,20 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
 
   if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
       MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
-
-    // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
-    if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
-      return false;
-    unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
-
     SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
     if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
-      SplitElementToMask(CN->getAPIntValue());
-      RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
-      return true;
+      if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
+        RawMask.push_back(CN->getZExtValue());
+        RawMask.append(NumMaskElts - 1, 0);
+        return true;
+      }
+
+      if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
+        unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+        SplitElementToMask(CN->getAPIntValue());
+        RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
+        return true;
+      }
     }
     return false;
   }
@@ -4815,7 +4819,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
   // We can always decode if the buildvector is all zero constants,
   // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
   if (all_of(MaskNode->ops(), X86::isZeroNode)) {
-    RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
+    RawMask.append(NumMaskElts, 0);
     return true;
   }
 
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 9112fea..2d1bf08 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -258,10 +258,6 @@ define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
 define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
 ; X32-LABEL: combine_vpermilvar_2f64_identity:
 ; X32:       # BB#0:
-; X32-NEXT:    movl $2, %eax
-; X32-NEXT:    vmovd %eax, %xmm1
-; X32-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermilvar_2f64_identity:
@@ -365,10 +361,7 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
 define <2 x double> @constant_fold_vpermilvar_pd() {
 ; X32-LABEL: constant_fold_vpermilvar_pd:
 ; X32:       # BB#0:
-; X32-NEXT:    movl $2, %eax
-; X32-NEXT:    vmovd %eax, %xmm0
-; X32-NEXT:    vmovapd {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00]
-; X32-NEXT:    vpermilpd %xmm0, %xmm1, %xmm0
+; X32-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 4b760d1..121fafc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -15,10 +15,7 @@ declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
 define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) {
 ; X32-LABEL: combine_vpermil2pd_identity:
 ; X32:       # BB#0:
-; X32-NEXT:    movl $2, %eax
-; X32-NEXT:    vmovd %eax, %xmm2
-; X32-NEXT:    vpermil2pd $0, %xmm2, %xmm0, %xmm1, %xmm0
-; X32-NEXT:    vpermil2pd $0, %xmm2, %xmm0, %xmm0, %xmm0
+; X32-NEXT:    vmovaps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermil2pd_identity:
-- 
2.7.4
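For reference, the sketch below restates the two constant-mask extraction rules from the X86ISelLowering.cpp hunk as a standalone function. It is illustrative only, not LLVM code: extractMovlMask is a hypothetical name, plain uint64_t/std::vector stand in for APInt and SmallVectorImpl, and power-of-two element widths are assumed.

#include <cstdint>
#include <optional>
#include <vector>

// Sketch of extracting a shuffle mask from vzext_movl(scalar_to_vector(C)):
// the scalar constant C lands in element 0 and the movl zeros the rest.
std::optional<std::vector<uint64_t>>
extractMovlMask(uint64_t C, unsigned ScalarSizeInBits,
                unsigned MaskEltSizeInBits, unsigned VecSizeInBits) {
  unsigned NumMaskElts = VecSizeInBits / MaskEltSizeInBits;
  std::vector<uint64_t> RawMask;

  // New case added by the patch: mask elements at least as wide as the
  // materialized scalar (e.g. a vXi64 mask built with a 32-bit vmovd on
  // x86-32). C becomes mask element 0; the remaining elements are zero.
  if ((MaskEltSizeInBits % ScalarSizeInBits) == 0) {
    RawMask.push_back(C);
    RawMask.resize(NumMaskElts, 0);
    return RawMask;
  }

  // Pre-existing case: one scalar splits into several narrower mask
  // elements, emitted low bits first; the rest of the mask is zero.
  if ((ScalarSizeInBits % MaskEltSizeInBits) == 0) {
    unsigned ElementSplit = ScalarSizeInBits / MaskEltSizeInBits;
    uint64_t EltMask =
        MaskEltSizeInBits >= 64 ? ~0ULL : (1ULL << MaskEltSizeInBits) - 1;
    for (unsigned I = 0; I != ElementSplit; ++I)
      RawMask.push_back((C >> (I * MaskEltSizeInBits)) & EltMask);
    RawMask.resize(NumMaskElts, 0);
    return RawMask;
  }

  return std::nullopt; // Neither element width divides the other.
}

With the operands from the combine_vpermilvar_2f64_identity test above (C = 2 materialized by vmovd, 32-bit scalar, 64-bit mask elements, 128-bit vector), this returns {2, 0}: the vXi64 mask that 32-bit targets previously failed to extract, which is why the X32 vpermilpd pair could not be folded away before this change.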