If the inner broadcast's scalar type is the same width as, or narrower than, the outer broadcast's scalar type, then we can broadcast directly using the inner scalar type and bitcast the result back to the outer type. This also works when the inner node is a vbroadcast_load.
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
}
+ // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
+ // If we're re-broadcasting a smaller type then broadcast with that type and
+ // bitcast.
+ // TODO: Do this for any splat?
+ if (Src.getOpcode() == ISD::BITCAST &&
+ (BC.getOpcode() == X86ISD::VBROADCAST ||
+ BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
+ (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
+ (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
+ MVT NewVT =
+ MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
+ VT.getSizeInBits() / BCVT.getScalarSizeInBits());
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
+ }
+
// Reduce broadcast source vector to lowest 128-bits.
if (SrcVT.getSizeInBits() > 128)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
; AVX2-NEXT: vbroadcastss (%eax), %xmm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
; AVX2-NEXT: vmovaps %ymm0, (%eax)
; AVX2-NEXT: movl $1091567616, 30256(%rax) # imm = 0x41100000
; AVX2-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001
; AVX2-NEXT: movq %rcx, 46348(%rax)
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm1
-; AVX2-NEXT: vmovups %ymm1, 48296(%rax)
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vmovups %ymm0, 48296(%rax)
; AVX2-NEXT: vmovlps %xmm0, 47372(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq