// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
- SelectionDAG &DAG,
+ const SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned MaxStages = 1) {
unsigned NumElts = VT.getVectorNumElements();
auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
unsigned NumSrcBits = PackVT.getScalarSizeInBits();
unsigned NumPackedBits = NumSrcBits - BitSize;
- SDValue VV1 = DAG.getBitcast(PackVT, N1);
- SDValue VV2 = DAG.getBitcast(PackVT, N2);
+ N1 = peekThroughBitcasts(N1);
+ N2 = peekThroughBitcasts(N2);
+ unsigned NumBits1 = N1.getScalarValueSizeInBits();
+ unsigned NumBits2 = N2.getScalarValueSizeInBits();
+ bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
+ bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
+ if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
+ (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
+ return false;
if (Subtarget.hasSSE41() || BitSize == 8) {
APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
- if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
- (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
- V1 = VV1;
- V2 = VV2;
+ if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
+ (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
+ V1 = N1;
+ V2 = N2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKUS;
return true;
}
}
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
- V1 = VV1;
- V2 = VV2;
+ bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
+ bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
+ if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
+ DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
+ (N2.isUndef() || IsZero2 || IsAllOnes2 ||
+ DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
+ V1 = N1;
+ V2 = N2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKSS;
return true;
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,6,7],zmm1[2,3,6,7]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
+; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[0,1,4,5]
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512BW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: