for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
+ (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
+ // If we have register broadcast instructions, use the scalar size as the
+ // element type for the shuffle. Then cast to the wider element type. The
+ // widened bits won't be used, and this might allow the use of a broadcast
+ // load.
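+ // For example, an i8 mask extended to v8i32 is splatted as v32i8 with an
+ // all-zeros shuffle mask (a single vpbroadcastb), then bitcast to v8i32;
+ // the per-element bit test below only uses the low byte of each i32 lane,
+ // so the duplicated upper bytes are harmless.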
+ assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
+ unsigned Scale = EltSizeInBits / NumElts;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ ShuffleMask.append(NumElts * Scale, 0);
+ Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
+ Vec = DAG.getBitcast(VT, Vec);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to
// all elements.
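+ // For example, without AVX2 an i8 mask destined for v8i16 is any-extended
+ // to i16, moved into the vector, and splatted with an all-zeros mask; the
+ // garbage upper bits are cleared by the per-element bit test below.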
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
}
- Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
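+ // Lane i is ANDed with the constant 1 << (i % EltSizeInBits) and compared
+ // for equality against it, so a lane becomes all-ones iff its mask bit was
+ // set; these constants are the [1,2,4,8,...] vectors in the tests below.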
SmallVector<SDValue, 32> Bits;
; AVX2-LABEL: ext_i8_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
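+; Note: the mask byte is now splatted with vpbroadcastb and reinterpreted as
+; words; the vpand constants are all below 256, so the duplicated high byte
+; of each word is cleared before the vpcmpeqw test.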
; AVX2-LABEL: ext_i8_8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
;
; AVX2-LABEL: ext_i8_8i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8]
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0
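+; Note: broadcasting the byte directly also removes the implicit
+; zero-extension into %rdi and the vmovq previously needed for a qword splat.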
; AVX2-LABEL: ext_i16_16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm1
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
; AVX2-LABEL: bitcast_i8_8i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
;
; AVX2-LABEL: load_sext_8i1_to_8i16:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movzbl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
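+; Note: with a byte-sized broadcast the scalar load now folds into a single
+; vpbroadcastb from memory, the broadcast load the combine's comment aims for.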
;
; AVX2-LABEL: load_sext_8i1_to_8i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movzbl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0