From 0bab0f6161193cd0cd24b7b0fc51590a60e810d2 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 25 May 2021 07:52:48 -0400
Subject: [PATCH] [InstCombine] canonicalize cast before unary shuffle

We could go either direction on this transform. VectorCombine already goes this
way for bitcasts (and handles more complicated cases using the cost model), so
let's try cast-first.

Deferring completely to VectorCombine is another possibility. But the backend
should be able to invert this easily when the vectors have the same shape, so
it doesn't seem like a transform that we need to avoid.

The motivating example from https://llvm.org/PR49081 has an int-to-float
sandwiched between 2 shuffles, and the backend currently does not reduce that,
so on x86, we get something like:

  pshufd	$249, %xmm0, %xmm0
  cvtdq2ps	%xmm0, %xmm0
  shufps	$144, %xmm0, %xmm0

...instead of just a single conversion instruction.

Differential Revision: https://reviews.llvm.org/D103038
---
 .../Transforms/InstCombine/InstCombineCasts.cpp    | 20 +++++++++-
 .../InstCombine/X86/x86-f16c-inseltpoison.ll       |  4 +-
 llvm/test/Transforms/InstCombine/X86/x86-f16c.ll   |  4 +-
 .../Transforms/InstCombine/shuffle-cast-dist.ll    |  4 +-
 .../InstCombine/shufflevec-bitcast-inseltpoison.ll |  6 +--
 .../Transforms/InstCombine/shufflevec-bitcast.ll   |  6 +--
 llvm/test/Transforms/InstCombine/vector-casts.ll   | 45 ++++++++++++++++++----
 7 files changed, 68 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 785ee24..49c6057 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -277,13 +277,13 @@ InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
 /// Implement the transforms common to all CastInst visitors.
 Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
   Value *Src = CI.getOperand(0);
+  Type *Ty = CI.getType();
 
   // Try to eliminate a cast of a cast.
   if (auto *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
     if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
       // The first cast (CSrc) is eliminable so we need to fix up or replace
       // the second cast (CI). CSrc will then have a good chance of being dead.
-      auto *Ty = CI.getType();
       auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
       // Point debug users of the dying cast to the new one.
       if (CSrc->hasOneUse())
@@ -319,6 +319,24 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
         return NV;
   }
 
+  // Canonicalize a unary shuffle after the cast if neither operation changes
+  // the size or element size of the input vector.
+  // TODO: We could allow size-changing ops if that doesn't harm codegen.
+  // cast (shuffle X, Mask) --> shuffle (cast X), Mask
+  Value *X;
+  ArrayRef<int> Mask;
+  if (match(Src, m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(Mask))))) {
+    // TODO: Allow scalable vectors?
+    auto *SrcTy = dyn_cast<FixedVectorType>(X->getType());
+    auto *DestTy = dyn_cast<FixedVectorType>(Ty);
+    if (SrcTy && DestTy &&
+        SrcTy->getNumElements() == DestTy->getNumElements() &&
+        SrcTy->getPrimitiveSizeInBits() == DestTy->getPrimitiveSizeInBits()) {
+      Value *CastX = Builder.CreateCast(CI.getOpcode(), X, DestTy);
+      return new ShuffleVectorInst(CastX, UndefValue::get(DestTy), Mask);
+    }
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
index bc0b6792..041d55e 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
@@ -24,8 +24,8 @@ define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
 ; All 8 elements required.
 define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
 ; CHECK-LABEL: @demand_vcvtph2ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float>
 ; CHECK-NEXT:    ret <8 x float> [[CVTPH2PS]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
index 19a850c..bc7671e 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
@@ -24,8 +24,8 @@ define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
 ; All 8 elements required.
 define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
 ; CHECK-LABEL: @demand_vcvtph2ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float>
 ; CHECK-NEXT:    ret <8 x float> [[CVTPH2PS]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll b/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
index c491b82..df0cb49 100644
--- a/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
+++ b/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
@@ -4,8 +4,8 @@
 define <2 x float> @vtrn1(<2 x i32> %v)
 ; CHECK-LABEL: @vtrn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R_UNCASTED:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[R:%.*]] = bitcast <2 x i32> [[R_UNCASTED]] to <2 x float>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <2 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
 {
diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
index 0a88370d..e1cf1cf 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
@@ -56,9 +56,9 @@ define <4 x i16> @splat_bitcast_operand_uses(<8 x i8> %x) {
 
 define <4 x i32> @splat_bitcast_operand_same_size_src_elt(<4 x float> %x) {
 ; CHECK-LABEL: @splat_bitcast_operand_same_size_src_elt(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x float> [[S1]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[S2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i32> [[BC]]
 ;
   %s1 = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %bc = bitcast <4 x float> %s1 to <4 x i32>
diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
index 5f0ca89..ba57de3 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -56,9 +56,9 @@ define <4 x i16> @splat_bitcast_operand_uses(<8 x i8> %x) {
 
 define <4 x i32> @splat_bitcast_operand_same_size_src_elt(<4 x float> %x) {
 ; CHECK-LABEL: @splat_bitcast_operand_same_size_src_elt(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x float> [[S1]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[S2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i32> [[BC]]
 ;
   %s1 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %bc = bitcast <4 x float> %s1 to <4 x i32>
diff --git a/llvm/test/Transforms/InstCombine/vector-casts.ll b/llvm/test/Transforms/InstCombine/vector-casts.ll
index 6bb4f45..0cabb62 100644
--- a/llvm/test/Transforms/InstCombine/vector-casts.ll
+++ b/llvm/test/Transforms/InstCombine/vector-casts.ll
@@ -413,8 +413,8 @@ define <2 x i64> @zext_less_casting_with_wideop(<2 x i64> %x, <2 x i64> %y) {
 
 define <4 x float> @sitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @sitofp_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
-; CHECK-NEXT:    [[R:%.*]] = sitofp <4 x i32> [[S]] to <4 x float>
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[X:%.*]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -424,8 +424,8 @@ define <4 x float> @sitofp_shuf(<4 x i32> %x) {
 
 define <3 x half> @uitofp_shuf(<3 x i16> %x) {
 ; CHECK-LABEL: @uitofp_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x i16> [[X:%.*]], <3 x i16> poison, <3 x i32> <i32 2, i32 undef, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = uitofp <3 x i16> [[S]] to <3 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp <3 x i16> [[X:%.*]] to <3 x half>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x half> [[TMP1]], <3 x half> undef, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x half> [[R]]
 ;
   %s = shufflevector <3 x i16> %x, <3 x i16> poison, <3 x i32> <i32 2, i32 undef, i32 0>
@@ -435,8 +435,8 @@ define <3 x half> @uitofp_shuf(<3 x i16> %x) {
 
 define <4 x i64> @fptosi_shuf(<4 x double> %x) {
 ; CHECK-LABEL: @fptosi_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[X:%.*]], <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
-; CHECK-NEXT:    [[R:%.*]] = fptosi <4 x double> [[S]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <4 x double> [[X:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
 ;
   %s = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
@@ -446,8 +446,8 @@ define <4 x i64> @fptosi_shuf(<4 x double> %x) {
 
 define <2 x i32> @fptoui_shuf(<2 x float> %x) {
 ; CHECK-LABEL: @fptoui_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[R:%.*]] = fptoui <2 x float> [[S]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptoui <2 x float> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %s = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 1>
@@ -455,6 +455,9 @@ define <2 x i32> @fptoui_shuf(<2 x float> %x) {
   ret <2 x i32> %r
 }
 
+; negative test
+; TODO: Should we reduce the width of the shuffle?
+
 define <4 x half> @narrowing_sitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @narrowing_sitofp_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -466,6 +469,8 @@ define <4 x half> @narrowing_sitofp_shuf(<4 x i32> %x) {
   ret <4 x half> %r
 }
 
+; negative test
+
 define <4 x double> @widening_uitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @widening_uitofp_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -477,6 +482,8 @@ define <4 x double> @widening_uitofp_shuf(<4 x i32> %x) {
   ret <4 x double> %r
 }
 
+; negative test
+
 define <3 x i64> @fptosi_narrowing_shuf(<4 x double> %x) {
 ; CHECK-LABEL: @fptosi_narrowing_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[X:%.*]], <4 x double> poison, <3 x i32> <i32 undef, i32 2, i32 3>
@@ -488,6 +495,9 @@ define <3 x i64> @fptosi_narrowing_shuf(<4 x double> %x) {
   ret <3 x i64> %r
 }
 
+; negative test
+; TODO: Should we reduce the width of the cast?
+
 define <3 x i32> @fptoui_widening_shuf(<2 x float> %x) {
 ; CHECK-LABEL: @fptoui_widening_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <3 x i32> <i32 1, i32 1, i32 0>
@@ -499,6 +509,9 @@ define <3 x i32> @fptoui_widening_shuf(<2 x float> %x) {
   ret <3 x i32> %r
 }
 
+; negative test
+; TODO: Should we reduce the width of the cast?
+
 define <4 x half> @narrowing_sitofp_widening_shuf(<2 x i32> %x) {
 ; CHECK-LABEL: @narrowing_sitofp_widening_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 undef>
@@ -512,6 +525,8 @@ define <4 x half> @narrowing_sitofp_widening_shuf(<2 x i32> %x) {
 
 declare void @use(<4 x i32>)
 
+; negative test
+
 define <4 x float> @sitofp_shuf_extra_use(<4 x i32> %x) {
 ; CHECK-LABEL: @sitofp_shuf_extra_use(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -524,3 +539,17 @@ define <4 x float> @sitofp_shuf_extra_use(<4 x i32> %x) {
   %r = sitofp <4 x i32> %s to <4 x float>
   ret <4 x float> %r
 }
+
+; negative test
+; TODO: Allow scalable vectors?
+
+define <vscale x 4 x float> @sitofp_shuf_scalable(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: @sitofp_shuf_scalable(
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <vscale x 4 x i32> [[X:%.*]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = sitofp <vscale x 4 x i32> [[S]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[R]]
+;
+  %s = shufflevector <vscale x 4 x i32> %x, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %r = sitofp <vscale x 4 x i32> %s to <vscale x 4 x float>
+  ret <vscale x 4 x float> %r
+}
-- 
2.7.4