From b6050ca181684997d0284ca39a78adc6833ddfc4 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 2 Apr 2020 13:21:29 -0400 Subject: [PATCH] [VectorCombine] transform bitcasted shuffle to narrower elements bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' We do not attempt this in InstCombine because we do not want to change types and create new shuffle ops that are potentially not lowered as well as the original code. Here, we can check the cost model to see if it is worthwhile. I've aggressively enabled this transform even if the types are the same size and/or equal cost because moving the bitcast allows InstCombine to make further simplifications. In the motivating cases from PR35454: https://bugs.llvm.org/show_bug.cgi?id=35454 ...this is enough to let instcombine and the backend eliminate the redundant shuffles, but we probably want to extend VectorCombine to handle the inverse pattern (shuffle-of-bitcast) to get that simplification directly in IR. Differential Revision: https://reviews.llvm.org/D76727 --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 45 ++++++++++++ llvm/test/Transforms/VectorCombine/X86/shuffle.ll | 87 ++++++++++++++++------- 2 files changed, 107 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 70ee8cc..66095af 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -244,6 +245,49 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) { return true; } +/// If this is a bitcast to narrow elements from a shuffle of wider elements, +/// try to bitcast the source vector to the narrow type followed by shuffle. +/// This can enable further transforms by moving bitcasts or shuffles together. +static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) { + Value *V; + ArrayRef Mask; + if (!match(&I, m_BitCast(m_OneUse(m_ShuffleVector(m_Value(V), m_Undef(), + m_Mask(Mask)))))) + return false; + + Type *DestTy = I.getType(); + Type *SrcTy = V->getType(); + if (!DestTy->isVectorTy() || I.getOperand(0)->getType() != SrcTy) + return false; + + // TODO: Handle bitcast from narrow element type to wide element type. + assert(SrcTy->isVectorTy() && "Shuffle of non-vector type?"); + unsigned DestNumElts = DestTy->getVectorNumElements(); + unsigned SrcNumElts = SrcTy->getVectorNumElements(); + if (SrcNumElts > DestNumElts) + return false; + + // The new shuffle must not cost more than the old shuffle. The bitcast is + // moved ahead of the shuffle, so assume that it has the same cost as before. + if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) > + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy)) + return false; + + // Bitcast the source vector and expand the shuffle mask to the equivalent for + // narrow elements. + // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' + IRBuilder<> Builder(&I); + Value *CastV = Builder.CreateBitCast(V, DestTy); + SmallVector NewMask; + assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = DestNumElts / SrcNumElts; + scaleShuffleMask(ScaleFactor, Mask, NewMask); + Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), + NewMask); + I.replaceAllUsesWith(Shuf); + return true; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. static bool runImpl(Function &F, const TargetTransformInfo &TTI, @@ -265,6 +309,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI, if (isa(I)) continue; MadeChange |= foldExtractExtract(I, TTI); + MadeChange |= foldBitcastShuf(I, TTI); } } diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll index b551654..5a758d7 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -2,28 +2,39 @@ ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX +; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb) + define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_narrow_element( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> -; CHECK-NEXT: ret <16 x i8> [[R]] +; SSE-LABEL: @bitcast_shuf_narrow_element( +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> +; SSE-NEXT: ret <16 x i8> [[R]] +; +; AVX-LABEL: @bitcast_shuf_narrow_element( +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> +; AVX-NEXT: ret <16 x i8> [[TMP2]] ; %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %r = bitcast <4 x i32> %shuf to <16 x i8> ret <16 x i8> %r } +; v4f32 is the same cost as v4i32, so this always works + define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_same_size( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float> -; CHECK-NEXT: ret <4 x float> [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %r = bitcast <4 x i32> %shuf to <4 x float> ret <4 x float> %r } +; Negative test - length-changing shuffle + define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_size( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <4 x i32> @@ -35,6 +46,8 @@ define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) { ret <16 x i8> %r } +; Negative test - must cast to vector type + define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> @@ -46,6 +59,8 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { ret i128 %r } +; Negative test - but might want to try this + define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { ; CHECK-LABEL: @bitcast_shuf_wide_element( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> @@ -59,6 +74,8 @@ define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { declare void @use(<4 x i32>) +; Negative test - don't create an extra shuffle + define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { ; CHECK-LABEL: @bitcast_shuf_uses( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> @@ -73,15 +90,25 @@ define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { } define <2 x i64> @PR35454_1(<2 x i64> %v) { -; CHECK-LABEL: @PR35454_1( -; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> -; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8> -; CHECK-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], -; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> -; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> -; CHECK-NEXT: ret <2 x i64> [[BC3]] +; SSE-LABEL: @PR35454_1( +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8> +; SSE-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], +; SSE-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> +; SSE-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[BC3]] +; +; AVX-LABEL: @PR35454_1( +; AVX-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <16 x i8> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> +; AVX-NEXT: [[ADD:%.*]] = shl <16 x i8> [[TMP2]], +; AVX-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> +; AVX-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; AVX-NEXT: ret <2 x i64> [[BC3]] ; %bc = bitcast <2 x i64> %v to <4 x i32> %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> @@ -94,15 +121,25 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) { } define <2 x i64> @PR35454_2(<2 x i64> %v) { -; CHECK-LABEL: @PR35454_2( -; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> -; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16> -; CHECK-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], -; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> -; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> -; CHECK-NEXT: ret <2 x i64> [[BC3]] +; SSE-LABEL: @PR35454_2( +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16> +; SSE-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], +; SSE-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> +; SSE-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[BC3]] +; +; AVX-LABEL: @PR35454_2( +; AVX-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <8 x i16> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> +; AVX-NEXT: [[ADD:%.*]] = shl <8 x i16> [[TMP2]], +; AVX-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> +; AVX-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; AVX-NEXT: ret <2 x i64> [[BC3]] ; %bc = bitcast <2 x i64> %v to <4 x i32> %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> -- 2.7.4