From b6050ca181684997d0284ca39a78adc6833ddfc4 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 2 Apr 2020 13:21:29 -0400
Subject: [PATCH] [VectorCombine] transform bitcasted shuffle to narrower
 elements

bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'

We do not attempt this in InstCombine because we do not want to change
types and create new shuffle ops that are potentially not lowered as
well as the original code. Here, we can check the cost model to see if
it is worthwhile.

I've aggressively enabled this transform even if the types are the same
size and/or equal cost because moving the bitcast allows InstCombine to
make further simplifications.

In the motivating cases from PR35454:
https://bugs.llvm.org/show_bug.cgi?id=35454
...this is enough to let instcombine and the backend eliminate the
redundant shuffles, but we probably want to extend VectorCombine to
handle the inverse pattern (shuffle-of-bitcast) to get that
simplification directly in IR.

Differential Revision: https://reviews.llvm.org/D76727
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp   | 45 ++++++++++++
 llvm/test/Transforms/VectorCombine/X86/shuffle.ll | 87 ++++++++++++++++-------
 2 files changed, 107 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 70ee8cc..66095af 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -244,6 +245,49 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) {
   return true;
 }
 
+/// If this is a bitcast to narrow elements from a shuffle of wider elements,
+/// try to bitcast the source vector to the narrow type followed by shuffle.
+/// This can enable further transforms by moving bitcasts or shuffles together.
+static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) {
+  Value *V;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_BitCast(m_OneUse(m_ShuffleVector(m_Value(V), m_Undef(),
+                                                    m_Mask(Mask))))))
+    return false;
+
+  Type *DestTy = I.getType();
+  Type *SrcTy = V->getType();
+  if (!DestTy->isVectorTy() || I.getOperand(0)->getType() != SrcTy)
+    return false;
+
+  // TODO: Handle bitcast from narrow element type to wide element type.
+  assert(SrcTy->isVectorTy() && "Shuffle of non-vector type?");
+  unsigned DestNumElts = DestTy->getVectorNumElements();
+  unsigned SrcNumElts = SrcTy->getVectorNumElements();
+  if (SrcNumElts > DestNumElts)
+    return false;
+
+  // The new shuffle must not cost more than the old shuffle. The bitcast is
+  // moved ahead of the shuffle, so assume that it has the same cost as before.
+  if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
+    return false;
+
+  // Bitcast the source vector and expand the shuffle mask to the equivalent for
+  // narrow elements.
+  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+  IRBuilder<> Builder(&I);
+  Value *CastV = Builder.CreateBitCast(V, DestTy);
+  SmallVector<int, 16> NewMask;
+  assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+  unsigned ScaleFactor = DestNumElts / SrcNumElts;
+  scaleShuffleMask(ScaleFactor, Mask, NewMask);
+  Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy),
+                                            NewMask);
+  I.replaceAllUsesWith(Shuf);
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -265,6 +309,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       MadeChange |= foldExtractExtract(I, TTI);
+      MadeChange |= foldBitcastShuf(I, TTI);
     }
   }
 
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
index b551654..5a758d7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -2,28 +2,39 @@
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
 
+; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb)
+
 define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) {
-; CHECK-LABEL: @bitcast_shuf_narrow_element(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @bitcast_shuf_narrow_element(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    ret <16 x i8> [[TMP2]]
 ;
   %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <16 x i8>
   ret <16 x i8> %r
 }
 
+; v4f32 is the same cost as v4i32, so this always works
+
 define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_same_size(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <4 x float>
   ret <4 x float> %r
 }
 
+; Negative test - length-changing shuffle
+
 define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_size(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -35,6 +46,8 @@ define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
   ret <16 x i8> %r
 }
 
+; Negative test - must cast to vector type
+
 define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -46,6 +59,8 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
   ret i128 %r
 }
 
+; Negative test - but might want to try this
+
 define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
 ; CHECK-LABEL: @bitcast_shuf_wide_element(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
@@ -59,6 +74,8 @@ define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
 
 declare void @use(<4 x i32>)
 
+; Negative test - don't create an extra shuffle
+
 define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_uses(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -73,15 +90,25 @@ define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
 }
 
 define <2 x i64> @PR35454_1(<2 x i64> %v) {
-; CHECK-LABEL: @PR35454_1(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
-; CHECK-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[BC3]]
+; SSE-LABEL: @PR35454_1(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
+; SSE-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_1(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <16 x i8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[TMP2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
 ;
   %bc = bitcast <2 x i64> %v to <4 x i32>
   %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -94,15 +121,25 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) {
 }
 
 define <2 x i64> @PR35454_2(<2 x i64> %v) {
-; CHECK-LABEL: @PR35454_2(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
-; CHECK-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[BC3]]
+; SSE-LABEL: @PR35454_2(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
+; SSE-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_2(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <8 x i16>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
+; AVX-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[TMP2]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
 ;
   %bc = bitcast <2 x i64> %v to <4 x i32>
   %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-- 
2.7.4