isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
V1.getScalarValueSizeInBits() == 64 &&
V2.getScalarValueSizeInBits() == 64) {
+ // Use (SSE41) PACKUSDW if the leading zero bits extend to the lowest 16-bits.
+ unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
+ unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
+ if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
+ SrcVT = MVT::v4i32;
+ DstVT = MVT::v8i16;
+ Shuffle = X86ISD::PACKUS;
+ return true;
+ }
+ // Use PACKUSWB if the leading zero bits extend to the lowest 8-bits.
+ if (MinLZV1 >= 56 && MinLZV2 >= 56) {
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v16i8;
+ Shuffle = X86ISD::PACKUS;
+ return true;
+ }
// Use PACKSSWD if the signbits extend to the lowest 16-bits.
if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
SrcVT = MVT::v4i32;
}
define <4 x i32> @shuffle_lshr_2v2i64(<2 x i64> %a0, <2 x i64> %a1) {
-; SSE-LABEL: shuffle_lshr_2v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlq $63, %xmm0
-; SSE-NEXT: psrlq $63, %xmm1
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: ret{{[l|q]}}
+; SSE2-LABEL: shuffle_lshr_2v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psrlq $63, %xmm0
+; SSE2-NEXT: psrlq $63, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: ret{{[l|q]}}
+;
+; SSE4-LABEL: shuffle_lshr_2v2i64:
+; SSE4: # %bb.0:
+; SSE4-NEXT: psrlq $63, %xmm0
+; SSE4-NEXT: psrlq $63, %xmm1
+; SSE4-NEXT: packusdw %xmm1, %xmm0
+; SSE4-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: shuffle_lshr_2v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $63, %xmm1, %xmm1
-; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
%lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
%lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>
}
define <4 x float> @shuffle_lshr_2v2i64_bitcast(<2 x i64> %a0, <2 x i64> %a1) {
-; SSE-LABEL: shuffle_lshr_2v2i64_bitcast:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlq $63, %xmm0
-; SSE-NEXT: psrlq $63, %xmm1
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: ret{{[l|q]}}
+; SSE2-LABEL: shuffle_lshr_2v2i64_bitcast:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psrlq $63, %xmm0
+; SSE2-NEXT: psrlq $63, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: ret{{[l|q]}}
+;
+; SSE4-LABEL: shuffle_lshr_2v2i64_bitcast:
+; SSE4: # %bb.0:
+; SSE4-NEXT: psrlq $63, %xmm0
+; SSE4-NEXT: psrlq $63, %xmm1
+; SSE4-NEXT: packusdw %xmm1, %xmm0
+; SSE4-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: shuffle_lshr_2v2i64_bitcast:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $63, %xmm1, %xmm1
-; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
%lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
%lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>