Attempt to narrow the broadcast source when the upper half of the broadcast element's bits are not demanded. This helps fix cases where we've splatted a smaller type into a wider vector element type without needing the upper bits.
Avoid this on AVX512 targets as it can affect broadcast folding.
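For example, a minimal IR sketch of the pattern this targets, modelled on the ext_i4_4i64 tests updated below (the function name and the choice of sext are illustrative, not copied verbatim from the test files):

  define <4 x i64> @ext_i4_4i64(i4 %a0) {
    %bits = bitcast i4 %a0 to <4 x i1>
    %ext = sext <4 x i1> %bits to <4 x i64>
    ret <4 x i64> %ext
  }

The i4 mask gets splatted across four 64-bit lanes, but the following mask-and-compare only reads the low bits of each lane, so on AVX2 the 64-bit vmovq/vpbroadcastq sequence can be narrowed to vmovd/vpbroadcastd.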
    if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
                             TLO, Depth + 1))
      return true;
+    // If we don't need the upper bits, attempt to narrow the broadcast source.
+    // Don't attempt this on AVX512 as it might affect broadcast folding.
+    // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
+    if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
+        OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
+      MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
+      SDValue NewSrc =
+          TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
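+      // The narrowed broadcast yields a vector with twice as many (half-width)
+      // lanes, so bitcast the result back to the original type.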
+      MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
+      SDValue NewBcst =
+          TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
+      return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
+    }
    break;
  }
  case X86ISD::PCMPGT:
;
; AVX2-LABEL: ext_i4_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: ext_i4_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0