@@ ... @@ static SDValue lowerV4F32Shuffle(
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
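+ // Lower as a blend straight away when SSE4.1 is available: BLENDPS covers
+ // every mask that takes whole elements in place from either input, and a
+ // single blend beats the SHUFPS/INSERTPS sequences built further down, so
+ // this no longer waits for the hasSSE41 block below.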
+ if (Subtarget.hasSSE41())
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
  if (NumV2Elements == 0) {
    // ... (single-input lowerings elided: broadcast, MOVSLDUP/MOVSHDUP, SHUFPS) ...
  }

  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
- if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
- return Blend;
-
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
@@ ... @@ static SDValue lowerV16F32Shuffle(
    return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
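+ // 512-bit vectors have no immediate-encoded BLENDPS, but with AVX512 the
+ // blend lowering can load the blend mask into a k-register and emit a
+ // masked move, or fold to a plain AND when one input is zero; the
+ // regenerated tests below show both forms replacing VPERMI2PS sequences.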
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
    return V;
@@ ... @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefixes=CHECK,KNL %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s
define <16 x float> @test1(<16 x float> %x, ptr %br, float %y) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT: vbroadcastss %xmm1, %zmm1
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15]
-; CHECK-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: test1:
+; KNL: ## %bb.0:
+; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT: movw $16384, %ax ## imm = 0x4000
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test1:
+; SKX: ## %bb.0:
+; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT: movw $16384, %ax ## imm = 0x4000
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
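+; The insert of %y at lane 14 is now a blend folded into a masked broadcast:
+; 0x4000 has only bit 14 set, so the vbroadcastss writes lane 14 and leaves
+; the rest of %zmm0 intact. KNL and SKX need separate prefixes only because
+; kmovw is AVX512F while kmovd requires AVX512BW.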
%rrr = load float, ptr %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
ret <16 x float> %rrr3
}
@@ ... @@
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL: # %bb.0:
-; ALL-NEXT: vmovups (%rdi), %zmm1
-; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
+; ALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT: vmovups (%eax), %zmm1
-; X86-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X86-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
+; X86-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
+; X86-AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT: retl
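+; Every demanded element already sits in its load lane, so blending with zero
+; degenerates to a bitmask: a single vpandd against a constant-pool vector
+; keeps the loaded lanes and clears the zeroed ones, replacing the old
+; xor + index vector + vpermi2ps sequence.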
%ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
%ptrC = getelementptr inbounds float, ptr %ptr, i64 12