lowerBuildVectorAsBroadcast does not broadcast splat constants in all cases. This leaves many situations where a full-width vector load that failed to fold, but is loading splat constant values, could instead use a broadcast load instruction just as cheaply — saving constant pool space.
NOTE: SSE3 targets can use MOVDDUP, but not all SSE-era CPUs can perform it as cheaply as a full vector load; we will need to add scheduler model checks if we want to pursue this.
MachineBasicBlock &MBB,
MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
+ MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
+ bool HasDQI = ST->hasDQI();
auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
unsigned OpBcst64, unsigned OpBcst32,
return false;
};
+ // Attempt to convert full width vector loads into broadcast loads.
+ switch (Opc) {
+ /* FP Loads */
+ case X86::MOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPDrm:
+ case X86::MOVUPSrm:
+ // TODO: SSE3 MOVDDUP Handling
+ return false;
+ case X86::VMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVUPSrm:
+ return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
+ 1);
+ case X86::VMOVAPDYrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVUPSYrm:
+ return ConvertToBroadcast(0, X86::VBROADCASTF128, X86::VBROADCASTSDYrm,
+ X86::VBROADCASTSSYrm, 0, 0, 1);
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPSZ128rm:
+ return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
+ X86::VBROADCASTSSZ128rm, 0, 0, 1);
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVUPSZ256rm:
+ return ConvertToBroadcast(
+ 0, HasDQI ? X86::VBROADCASTF64X2Z128rm : X86::VBROADCASTF32X4Z256rm,
+ X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm, 0, 0, 1);
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVUPSZrm:
+ return ConvertToBroadcast(
+ HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm,
+ HasDQI ? X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm,
+ X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1);
+ }
+
// Attempt to find a AVX512 mapping from a full width memory-fold instruction
// to a broadcast-fold instruction variant.
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) {
define <16 x float> @fneg(<16 x float> %a) nounwind {
; CHECK-LABEL: fneg:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: retq
define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e2:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT: retq
entry:
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
;
; X86-FAST-ALL-LABEL: trunc4:
; X86-FAST-ALL: # %bb.0:
-; X86-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; X86-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; X86-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; X86-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X86-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X86-FAST-ALL-NEXT: vzeroupper
;
; X64-FAST-ALL-LABEL: trunc4:
; X64-FAST-ALL: # %bb.0:
-; X64-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; X64-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; X64-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; X64-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-FAST-ALL-NEXT: vzeroupper
define <32 x i8> @test_x86_avx2_packsswb_fold() {
; X86-AVX-LABEL: test_x86_avx2_packsswb_fold:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: test_x86_avx2_packsswb_fold:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
; X64-AVX512VL: # %bb.0:
-; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
ret <32 x i8> %res
define <32 x i8> @test_x86_avx2_packuswb_fold() {
; X86-AVX-LABEL: test_x86_avx2_packuswb_fold:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: test_x86_avx2_packuswb_fold:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
; X64-AVX512VL: # %bb.0:
-; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
ret <32 x i8> %res
define <8 x i8> @_e4(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e4:
; X86: ## %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X86-NEXT: retl
;
; X64-LABEL: _e4:
; X64: ## %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X64-NEXT: retq
%vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %edi
; X32-NEXT: subl $88, %esp
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2,1,2,1]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
+; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
; X32-NEXT: vmovups %zmm0, (%esp)
; X32-NEXT: movl $1, {{[0-9]+}}(%esp)
; X32-NEXT: movl $2, {{[0-9]+}}(%esp)
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u>
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,10,6,15,0,10,6,15]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u>
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,10,6,15,0,10,6,15]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3]
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,3,7,3]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3]
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3]
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [7,3,7,3]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP: # %bb.0:
-; X86XOP-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
+; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; X86XOP-NEXT: retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI: # %bb.0:
-; GFNI-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
+; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; GFNI-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
; AVX1-LABEL: trunc:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-LABEL: example25:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB5_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,8]
; SSE-NEXT: retq
;
-; AVX1-LABEL: and_or_v2i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,8]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: and_or_v2i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,8]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: and_or_v2i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = [8,8]
-; AVX512-NEXT: # xmm0 = mem[0,0]
-; AVX512-NEXT: retq
+; AVX-LABEL: and_or_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [8,8]
+; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: retq
%1 = or <2 x i64> %a0, <i64 255, i64 255>
%2 = and <2 x i64> %1, <i64 8, i64 8>
ret <2 x i64> %2
; SSE-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
; SSE-NEXT: retq
;
-; AVX1-LABEL: and_or_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [3,3,3,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: and_or_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: and_or_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: and_or_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
+; AVX-NEXT: retq
%1 = or <4 x i32> %a0, <i32 15, i32 15, i32 15, i32 15>
%2 = and <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
ret <4 x i32> %2
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_vec_sdiv_dupe:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; AVX1-NEXT: retq
-;
-; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
-; AVX2ORLATER: # %bb.0:
-; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; AVX2ORLATER-NEXT: retq
-;
-; XOP-LABEL: combine_vec_sdiv_dupe:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; XOP-NEXT: retq
+; AVX-LABEL: combine_vec_sdiv_dupe:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT: retq
%1 = sdiv <4 x i32> %x, %x
ret <4 x i32> %1
}
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_vec_udiv_dupe:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_vec_udiv_dupe:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_vec_udiv_dupe:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_dupe:
; XOP: # %bb.0:
-; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; XOP-NEXT: retq
%1 = udiv <4 x i32> %x, %x
ret <4 x i32> %1
; X64-SSSE3-NEXT: popq %rbp
; X64-SSSE3-NEXT: retq
;
-; X64-AVX1-LABEL: main:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rbp
-; X64-AVX1-NEXT: movq %rsp, %rbp
-; X64-AVX1-NEXT: andq $-32, %rsp
-; X64-AVX1-NEXT: subq $64, %rsp
-; X64-AVX1-NEXT: movq n1@GOTPCREL(%rip), %rax
-; X64-AVX1-NEXT: vmovaps (%rax), %ymm0
-; X64-AVX1-NEXT: movl zero+4(%rip), %ecx
-; X64-AVX1-NEXT: movl zero+8(%rip), %eax
-; X64-AVX1-NEXT: vmovaps %ymm0, zero(%rip)
-; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
-; X64-AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; X64-AVX1-NEXT: vmovaps (%rsp), %ymm0
-; X64-AVX1-NEXT: vextractps $2, %xmm0, %esi
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %esi
-; X64-AVX1-NEXT: movl %eax, %esi
-; X64-AVX1-NEXT: vextractps $1, %xmm0, %edi
-; X64-AVX1-NEXT: movl %ecx, %eax
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %edi
-; X64-AVX1-NEXT: addl %esi, %eax
-; X64-AVX1-NEXT: movq %rbp, %rsp
-; X64-AVX1-NEXT: popq %rbp
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: main:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: pushq %rbp
-; X64-AVX2-NEXT: movq %rsp, %rbp
-; X64-AVX2-NEXT: andq $-32, %rsp
-; X64-AVX2-NEXT: subq $64, %rsp
-; X64-AVX2-NEXT: movq n1@GOTPCREL(%rip), %rax
-; X64-AVX2-NEXT: vmovaps (%rax), %ymm0
-; X64-AVX2-NEXT: movl zero+4(%rip), %ecx
-; X64-AVX2-NEXT: movl zero+8(%rip), %eax
-; X64-AVX2-NEXT: vmovaps %ymm0, zero(%rip)
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
-; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; X64-AVX2-NEXT: vmovaps (%rsp), %ymm0
-; X64-AVX2-NEXT: vextractps $2, %xmm0, %esi
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %esi
-; X64-AVX2-NEXT: movl %eax, %esi
-; X64-AVX2-NEXT: vextractps $1, %xmm0, %edi
-; X64-AVX2-NEXT: movl %ecx, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %edi
-; X64-AVX2-NEXT: addl %esi, %eax
-; X64-AVX2-NEXT: movq %rbp, %rsp
-; X64-AVX2-NEXT: popq %rbp
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: main:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbp
+; X64-AVX-NEXT: movq %rsp, %rbp
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $64, %rsp
+; X64-AVX-NEXT: movq n1@GOTPCREL(%rip), %rax
+; X64-AVX-NEXT: vmovaps (%rax), %ymm0
+; X64-AVX-NEXT: movl zero+4(%rip), %ecx
+; X64-AVX-NEXT: movl zero+8(%rip), %eax
+; X64-AVX-NEXT: vmovaps %ymm0, zero(%rip)
+; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
+; X64-AVX-NEXT: vmovaps %ymm0, (%rsp)
+; X64-AVX-NEXT: vmovaps (%rsp), %ymm0
+; X64-AVX-NEXT: vextractps $2, %xmm0, %esi
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: divl %esi
+; X64-AVX-NEXT: movl %eax, %esi
+; X64-AVX-NEXT: vextractps $1, %xmm0, %edi
+; X64-AVX-NEXT: movl %ecx, %eax
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: divl %edi
+; X64-AVX-NEXT: addl %esi, %eax
+; X64-AVX-NEXT: movq %rbp, %rsp
+; X64-AVX-NEXT: popq %rbp
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
%stackptr = alloca <8 x i32>, align 32
%z = load <8 x i32>, ptr @zero, align 32
%t1 = load <8 x i32>, ptr @n1, align 32
define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x double> %b) {
; FMA3-LABEL: negated_constant_v4f64_2fma_undefs:
; FMA3: # %bb.0:
-; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; FMA3-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; FMA3-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + mem
; FMA3-NEXT: vfmadd132pd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2
; FMA3-NEXT: vaddpd %ymm1, %ymm0, %ymm0
;
; FMA4-LABEL: negated_constant_v4f64_2fma_undefs:
; FMA4: # %bb.0:
-; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + mem
; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2
; FMA4-NEXT: vaddpd %ymm1, %ymm0, %ymm0
define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_mm_fnmsub_ps:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorps %xmm3, %xmm0, %xmm4
; CHECK-NEXT: vxorps %xmm3, %xmm2, %xmm0
; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test_mm_fnmsub_pd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT: # xmm3 = mem[0,0]
; CHECK-NEXT: vxorpd %xmm3, %xmm0, %xmm4
; CHECK-NEXT: vxorpd %xmm3, %xmm2, %xmm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: test_mm256_fnmsub_ps:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorps %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: test_mm256_fnmsub_pd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorpd %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vxorpd %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
; FMA-INFS-LABEL: test_v4f32_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1
; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
;
; FMA4-INFS-LABEL: test_v4f32_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1
; FMA4-INFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
; FMA-INFS-LABEL: test_v8f32_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3
; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1
; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
;
; FMA4-INFS-LABEL: test_v8f32_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3
; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1
; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1
define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
; FMA-INFS-LABEL: test_v2f64_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: # xmm3 = mem[0,0]
; FMA-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1
; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
;
; FMA4-INFS-LABEL: test_v2f64_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: # xmm3 = mem[0,0]
; FMA4-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1
; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
; FMA-INFS-LABEL: test_v4f64_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3
; FMA-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
;
; FMA4-INFS-LABEL: test_v4f64_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3
; FMA4-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1
define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
;
; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
;
; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) {
; FMA-INFS-LABEL: test_v16f32_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7
; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6
; FMA-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3
;
; FMA4-INFS-LABEL: test_v16f32_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7
; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6
; FMA4-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3
define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) {
; FMA-INFS-LABEL: test_v8f64_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7
; FMA-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6
; FMA-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3
;
; FMA4-INFS-LABEL: test_v8f64_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7
; FMA4-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6
; FMA4-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3
; FMA: # %bb.0:
; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-NEXT: vxorpd %ymm2, %ymm0, %ymm0
; FMA-NEXT: vxorpd %ymm2, %ymm1, %ymm1
; FMA-NEXT: retq
; FMA4: # %bb.0:
; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA4-NEXT: vxorpd %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vxorpd %ymm2, %ymm1, %ymm1
; FMA4-NEXT: retq
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; AVX1-LABEL: test_fmaximum_vector_signed_zero:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: test_fmaximum_vector_signed_zero:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: test_fmaximum_vector_signed_zero:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; X86-LABEL: test_fmaximum_vector_signed_zero:
; X86: # %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
%r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; AVX1-LABEL: test_fmaximum_vector_signed_zero_first:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: test_fmaximum_vector_signed_zero_first:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: test_fmaximum_vector_signed_zero_first:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; X86-LABEL: test_fmaximum_vector_signed_zero_first:
; X86: # %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
%r = call <4 x float> @llvm.maximum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
define <4 x i16> @test_sext_4i8_4i16() {
; X32-LABEL: test_sext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <4 x i16> @test_sext_4i8_4i16_undef() {
; X32-LABEL: test_sext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,0,65533,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,0,65533,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <4 x i16> @test_zext_4i8_4i16() {
; X32-LABEL: test_zext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <4 x i16> @test_zext_4i8_4i16_undef() {
; X32-LABEL: test_zext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <4 x float> @test1() {
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%1 = trunc <4 x i3> <i3 -1, i3 -22, i3 7, i3 8> to <4 x i1>
%2 = sitofp <4 x i1> %1 to <4 x float>
;
; AVX1-LABEL: round_v16f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
;
; AVX1-LABEL: round_v8f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3
-; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
;
; X86-AVX-LABEL: elt1_v2f64:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u>
+; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4.2E+1,4.2E+1]
+; X86-AVX-NEXT: # xmm0 = mem[0,0]
; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: elt1_v2f64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u>
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; X64-AVX-NEXT: # xmm1 = mem[0,0]
; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT: retq
%ins = insertelement <2 x double> <double 42.0, double 1.0>, double %x, i32 1
define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X86-LABEL: knownbits_mask_or_shuffle_uitofp:
; X86: # %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
; X86-NEXT: retl
;
; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
%2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
; X86-LABEL: knownbits_mask_concat_uitofp:
; X86: # %bb.0:
; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X86-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X86-NEXT: vandps %xmm2, %xmm0, %xmm0
; X64-LABEL: knownbits_mask_concat_uitofp:
; X64: # %bb.0:
; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X64-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
; X64-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X64-NEXT: vandps %xmm2, %xmm0, %xmm0
;
; AVX1-LABEL: truncstore_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
;
; AVX1-LABEL: truncstore_v16i32_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1
;
; AVX1-LABEL: truncstore_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1
;
; AVX1-LABEL: truncstore_v32i16_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX2-NEXT: # xmm2 = mem[0,0]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX-LABEL: truncstore_v2i64_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535]
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX-LABEL: truncstore_v2i64_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [255,255]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %xmm0, (%rdi)
; AVX-NEXT: retq
%call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 16, i64 -1)
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX1-LABEL: memset_64_nonzero_bytes:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
;
; AVX2-LABEL: memset_64_nonzero_bytes:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-NEXT: vmovups %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
;
; AVX512BW-LABEL: memset_64_nonzero_bytes:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX1-LABEL: memset_128_nonzero_bytes:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
;
; AVX2-LABEL: memset_128_nonzero_bytes:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
;
; AVX512BW-LABEL: memset_128_nonzero_bytes:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
;
; AVX1-LABEL: memset_256_nonzero_bytes:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
;
; AVX2-LABEL: memset_256_nonzero_bytes:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 224(%rdi)
; AVX2-NEXT: vmovups %ymm0, 192(%rdi)
; AVX2-NEXT: vmovups %ymm0, 160(%rdi)
;
; AVX512BW-LABEL: memset_256_nonzero_bytes:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
; X32-LABEL: big_nonzero_32_bytes_splat:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: big_nonzero_32_bytes_splat:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm2
; AVX2-FAST-ALL-NEXT: vbroadcastsd %xmm1, %ymm3
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <u,u,u,u,u,1,4,7>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
+; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1
; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = <5,u,u,6,u,u,7,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,0,7,6,5,0,7,6]
+; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3
; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
;
; AVX1-LABEL: test17:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
;
; AVX1-LABEL: test35:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; CHECK: # %bb.0:
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
;
; AVX1-LABEL: PR32368_512:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <u,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1>
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
+; CHECK-NEXT: # xmm2 = mem[0,0]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: retq
%1 = shufflevector <4 x double> %a, <4 x double> <double undef, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C>, <8 x i32> <i32 6, i32 5, i32 2, i32 3, i32 5, i32 1, i32 3, i32 7>
; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-RECIP-LABEL: v4f32_no_estimate:
-; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: retq
-;
-; FMA-RECIP-LABEL: v4f32_no_estimate:
-; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; FMA-RECIP-NEXT: retq
-;
-; BDVER2-LABEL: v4f32_no_estimate:
-; BDVER2: # %bb.0:
-; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; BDVER2-NEXT: retq
-;
-; BTVER2-LABEL: v4f32_no_estimate:
-; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: retq
-;
-; SANDY-LABEL: v4f32_no_estimate:
-; SANDY: # %bb.0:
-; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: retq
-;
-; HASWELL-LABEL: v4f32_no_estimate:
-; HASWELL: # %bb.0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; HASWELL-NEXT: retq
-;
-; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
-; HASWELL-NO-FMA: # %bb.0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: retq
-;
-; AVX512-LABEL: v4f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v4f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
; FMA-RECIP-LABEL: v4f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; BDVER2-LABEL: v4f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX-RECIP-LABEL: v8f32_no_estimate:
-; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT: retq
-;
-; FMA-RECIP-LABEL: v8f32_no_estimate:
-; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; FMA-RECIP-NEXT: retq
-;
-; BDVER2-LABEL: v8f32_no_estimate:
-; BDVER2: # %bb.0:
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; BDVER2-NEXT: retq
-;
-; BTVER2-LABEL: v8f32_no_estimate:
-; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT: retq
-;
-; SANDY-LABEL: v8f32_no_estimate:
-; SANDY: # %bb.0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT: retq
-;
-; HASWELL-LABEL: v8f32_no_estimate:
-; HASWELL: # %bb.0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; HASWELL-NEXT: retq
-;
-; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
-; HASWELL-NO-FMA: # %bb.0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT: retq
-;
-; AVX512-LABEL: v8f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v8f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
;
; BTVER2-LABEL: v8f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
; FMA-RECIP-LABEL: v8f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
; BDVER2-LABEL: v8f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
;
; BTVER2-LABEL: v8f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
;
; AVX-RECIP-LABEL: v16f32_no_estimate:
; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_estimate:
; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_estimate:
; BDVER2: # %bb.0:
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_estimate:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_estimate:
; SANDY: # %bb.0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: retq
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
; FMA-RECIP-LABEL: v16f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; BDVER2-LABEL: v16f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
;
; BTVER2-LABEL: v16f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vrcpps %ymm1, %ymm4
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm4
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; FMA-RECIP-LABEL: v16f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; BDVER2-LABEL: v16f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
;
; BTVER2-LABEL: v16f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
; FMA-RECIP-LABEL: v4f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
;
; BTVER2-LABEL: v8f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
; FMA-RECIP-LABEL: v8f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
;
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; BDVER2-LABEL: v16f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
;
; BTVER2-LABEL: v16f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; FMA-RECIP-LABEL: v16f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm3 = mem[0,0]
; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX512F-LABEL: v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm3 = mem[0,0]
; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX1-NEXT: vxorpd %ymm5, %ymm4, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm7, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_min:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
+; AVX2-NEXT: # xmm1 = mem[0,0]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-FAST-ALL-NEXT: vzeroupper
;
; AVX-LABEL: const_vector:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
; AVX-NEXT: retq
;
; AVX2-LABEL: const_vector:
; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1
; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; SNB-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0
; SNB-NEXT: retq
; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1
; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; SNB-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0
; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0
; SNB-NEXT: retq
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: v4f32_no_estimate:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vsqrtps %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX1-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: v4f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vsqrtps %xmm0, %xmm0
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v4f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %xmm0, %xmm0
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
ret <4 x float> %div
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: v8f32_no_estimate:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vsqrtps %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: v8f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vsqrtps %ymm0, %ymm0
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v8f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
ret <8 x float> %div
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm1, %ymm1
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
; AVX1-LABEL: v16f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vrsqrtps %ymm1, %ymm5
; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtpd %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovupd %xmm1, (%rdi)
; AVX-NEXT: retq
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_srem_one_eq:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_srem_one_eq:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_srem_one_eq:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_srem_one_eq:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX-NEXT: retq
%srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_srem_allones:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_srem_allones:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_srem_allones:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_srem_allones:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX-NEXT: retq
%srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
; X86-SSE-NEXT: movaps %xmm0, (%eax)
; X86-SSE-NEXT: retl
;
-; X86-AVX1-LABEL: test17:
-; X86-AVX1: # %bb.0: # %entry
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
-; X86-AVX1-NEXT: vmovaps %xmm0, (%eax)
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX512-LABEL: test17:
-; X86-AVX512: # %bb.0: # %entry
-; X86-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
-; X86-AVX512-NEXT: vmovaps %xmm0, (%eax)
-; X86-AVX512-NEXT: retl
+; X86-AVX-LABEL: test17:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
+; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
+; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test17:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movaps %xmm0, (%rax)
; X64-SSE-NEXT: retq
;
-; X64-AVX1-LABEL: test17:
-; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
-; X64-AVX1-NEXT: vmovaps %xmm0, (%rax)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX512-LABEL: test17:
-; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
-; X64-AVX512-NEXT: vmovaps %xmm0, (%rax)
-; X64-AVX512-NEXT: retq
+; X64-AVX-LABEL: test17:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
+; X64-AVX-NEXT: vmovaps %xmm0, (%rax)
+; X64-AVX-NEXT: retq
entry:
%0 = insertelement <4 x i32> undef, i32 undef, i32 1
%1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
%m = mul <4 x i32> %x, %y
ret <4 x i32> %m
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-AVX1: {{.*}}
+; X64-AVX512: {{.*}}
+; X86-AVX1: {{.*}}
+; X86-AVX512: {{.*}}
;
; X64-AVX2-LABEL: vec_v2i64:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807]
+; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: # xmm2 = mem[0,0]
+; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807]
+; X64-AVX2-NEXT: # xmm3 = mem[0,0]
; X64-AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm3
; X64-AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2
; X64-AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm2 = mem[0,0]
; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm2 = mem[0,0]
; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX512F-NEXT: retq
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_one_eq:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_urem_one_eq:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_urem_one_eq:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_urem_one_eq:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX-NEXT: retq
%urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 {
; X86-LABEL: two_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT: vandps %ymm0, %ymm1, %ymm0
;
; X64-LABEL: two_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
define <8 x i32> @three_ands(<8 x float> %x) {
; X86-LABEL: three_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: three_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @four_ands(<8 x float> %x) {
; X86-LABEL: four_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: four_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @five_ands(<8 x float> %x) {
; X86-LABEL: five_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: five_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @two_or(<8 x float> %x) {
; X86-LABEL: two_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
;
; X64-LABEL: two_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
define <8 x i32> @three_or(<8 x float> %x) {
; X86-LABEL: three_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: three_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @four_or(<8 x float> %x) {
; X86-LABEL: four_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: four_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @five_or(<8 x float> %x) {
; X86-LABEL: five_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: five_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @three_or_and(<8 x float> %x) {
; X86-LABEL: three_or_and:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: three_or_and:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @four_or_and(<8 x float> %x) {
; X86-LABEL: four_or_and:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vandps %ymm2, %ymm1, %ymm1
;
; X64-LABEL: four_or_and:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
define <8 x i32> @five_or_and(<8 x float> %x) {
; X86-LABEL: five_or_and:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: five_or_and:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X86-LABEL: four_or_and_xor:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1
;
; X64-LABEL: four_or_and_xor:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1
define <8 x i32> @five_or_and_xor(<8 x float> %x) {
; X86-LABEL: five_or_and_xor:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: five_or_and_xor:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X86-LABEL: six_or_and_xor:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; X64-LABEL: six_or_and_xor:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
;
; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0
define <4 x i32> @strict_vector_fptoui_v4f64_to_v4i32(<4 x double> %a) #0 {
; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2]
; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 {
; AVX-LABEL: strict_vector_fptoui_v8f32_to_v8i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm4
; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1
; AVX-NEXT: vsubps %ymm1, %ymm0, %ymm0
define <4 x i16> @const_16_32() nounwind {
; CHECK-LABEL: const_16_32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u>
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
+; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i32> <i32 0, i32 3, i32 8, i32 7> to <4 x i16>
ret <4 x i16> %G
define <4 x i16> @const_16_64() nounwind {
; CHECK-LABEL: const_16_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u>
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
+; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i64> <i64 0, i64 3, i64 8, i64 7> to <4 x i16>
ret <4 x i16> %G
define <8 x double> @fabs_v8f64(<8 x double> %p) {
; X86-AVX-LABEL: fabs_v8f64:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: fabs_v8f64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64-AVX-NEXT: retq
define <16 x float> @fabs_v16f32(<16 x float> %p) {
; X86-AVX-LABEL: fabs_v16f32:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: fabs_v16f32:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64-AVX-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,1,4294967295,1]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
;
; AVX-LABEL: fptoui_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [2,4,2,4]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd (%rdi), %ymm2
; AVX1-NEXT: vmovapd 32(%rdi), %ymm3
-; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1]
; AVX1-NEXT: vandpd %ymm4, %ymm3, %ymm5
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vsubpd %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vsubpd %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vaddpd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovupd %xmm0, (%rdi)
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT: # xmm6 = mem[0,0]
; AVX2-NEXT: vsubpd %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vsubpd %xmm6, %xmm1, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX2-NEXT: # xmm2 = mem[0,0]
; AVX2-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovupd %xmm0, (%rdi)
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT: # xmm6 = mem[0,0]
; AVX512F-NEXT: vsubpd %xmm6, %xmm0, %xmm0
; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX512F-NEXT: vsubpd %xmm6, %xmm1, %xmm1
; AVX512F-NEXT: vaddpd %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512F-NEXT: # xmm2 = mem[0,0]
; AVX512F-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vmovupd %xmm0, (%rdi)
; AVX512DQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512DQ-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512DQ-NEXT: # xmm2 = mem[0,0]
; AVX512DQ-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vmovupd %xmm0, (%rdi)
;
; AVX-LABEL: constrained_vector_fmul_v2f64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
;
-; AVX1-LABEL: constrained_vector_fmul_v4f64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: constrained_vector_fmul_v4f64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: constrained_vector_fmul_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
entry:
%mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64(
<4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
;
; AVX-LABEL: constrained_vector_fadd_v2f64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
;
-; AVX1-LABEL: constrained_vector_fadd_v4f64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: constrained_vector_fadd_v4f64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: constrained_vector_fadd_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
entry:
%add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
<4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
;
; AVX-LABEL: constrained_vector_fsub_v2f64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
;
-; AVX1-LABEL: constrained_vector_fsub_v4f64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
-; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: constrained_vector_fsub_v4f64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
-; AVX512-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: constrained_vector_fsub_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
entry:
%sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64(
<4 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
;
; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1]
; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vsubps %xmm0, %xmm1, %xmm0
;
; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1]
; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2]
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0
define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; XOPAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <u,u,u,u,u,1,4,7>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
+; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
;
; AVX2-ONLY-LABEL: load_i32_stride4_vf4:
; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm5
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm5, %ymm6
; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7
; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm11, %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm11
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm11, %ymm4
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm5, %ymm0
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm10
; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm9, %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm14
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm1 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <4,2,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0
; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx)
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <4,2,u,u>
+; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
+; AVX512F-SLOW-NEXT: # xmm2 = mem[0,0]
; AVX512F-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512F-SLOW-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512F-SLOW-NEXT: # xmm6 = mem[0,0]
; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx)
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX512F-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512F-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512F-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <4,2,u,u>
+; AVX512BW-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
+; AVX512BW-SLOW-NEXT: # xmm2 = mem[0,0]
; AVX512BW-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512BW-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512BW-SLOW-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512BW-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512BW-SLOW-NEXT: # xmm6 = mem[0,0]
; AVX512BW-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512BW-SLOW-NEXT: vmovq %xmm1, (%rdx)
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX512BW-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512BW-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512BW-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm11 = mem[0,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm5 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <u,u,u,4,2,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm9 = [2,4,2,4,2,4,2,4]
; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm11 = mem[0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm5 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm4 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm4 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm14 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm14 = mem[0,0]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm8 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm8 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = <4,3,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
+; AVX2-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
+; AVX2-SLOW-NEXT: # xmm10 = mem[0,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vbroadcastss 84(%rdi), %xmm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <1,0,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3
; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
+; AVX2-FAST-NEXT: # xmm10 = mem[0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3]
+; AVX2-SLOW-NEXT: # xmm3 = mem[0,0]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-SLOW-NEXT: # xmm5 = mem[0,0]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FAST-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,1,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,4,1,5,0,4,1,5]
; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,u,1>
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,0,2,4,u>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-SLOW-NEXT: # xmm9 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm6
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1]
; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = <u,1,5,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,1,5,1,5,1,5,1]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm8
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [5,0,2,6,5,0,2,6]
; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5],ymm8[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = <u,u,u,u,u,3,7,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [2,6,0,3,2,6,0,3]
; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <u,u,0,4>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [0,4,0,4]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,4,0,1,0,4,0,1]
; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vbroadcastss (%r10), %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm6
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1]
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm6, %ymm9
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm10
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm11
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm11, %ymm8
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm7
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
;
; AVX1-LABEL: test_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,1,1,1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1]
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
;
; AVX1-SLOW-LABEL: test_v16i32_v16i8:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
;
; AVX1-FAST-LABEL: test_v16i32_v16i8:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
;
; AVX1-SLOW-LABEL: test_v32i32_v32i8:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1
;
; AVX1-FAST-LABEL: test_v32i32_v32i8:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1
;
; AVX1-LABEL: test_v64i16_v64i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
;
; AVX1-LABEL: trunc_v16i32_v16i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
;
; AVX2-FAST-ALL-LABEL: shuffle_v8f32_c348cda0:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,4,0,0,3,4,0]
+; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0]
; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
;
; AVX2-FAST-ALL-LABEL: shuffle_v8f32_f511235a:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
+; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7,2,7,2,7,2,7,2]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX512-FAST-LABEL: expand15:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <u,u,0,u,1,u,u,u>
+; AVX512-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [1,0,0,0,1,0,0,0]
+; AVX512-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; AVX512-FAST-NEXT: ret{{[l|q]}}
; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0]
; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
-; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,0,11,0,u,u,u,u>
+; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,0,11,0,3,0,11,0]
+; X86-AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0]
; X86-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1]
; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3
; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
-; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,11,u,u>
+; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,11,3,11]
+; X64-AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3]
; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
;
; AVX-LABEL: PR30264:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.0E+0,1.0E+0,4.0E+0,1.0E+0]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
; SSE41-NEXT: movaps %xmm2, (%rax)
; SSE41-NEXT: retq
;
-; AVX1-LABEL: SpinningCube:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
-; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX1-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovaps %xmm2, (%rax)
-; AVX1-NEXT: vbroadcastss (%rax), %xmm2
-; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovaps %xmm0, (%rax)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: SpinningCube:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
-; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX2-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovaps %xmm2, (%rax)
-; AVX2-NEXT: vbroadcastss (%rax), %xmm2
-; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovaps %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX-LABEL: SpinningCube:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, (%rax)
+; AVX-NEXT: vbroadcastss (%rax), %xmm2
+; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rax)
+; AVX-NEXT: retq
entry:
store float 1.000000e+00, ptr undef, align 4
%0 = load float, ptr undef, align 4
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
;
; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <u,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
;
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
;
; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
;
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
;
; AVX-LABEL: trunc_usat_v2i64_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX-LABEL: trunc_usat_v2i64_v2i32_store:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,429496729]
; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
;
; AVX-LABEL: trunc_usat_v2i64_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; AVX-LABEL: trunc_usat_v2i64_v2i8_store:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
;
; AVX1-LABEL: trunc2x16i16_32i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
;
; AVX1-LABEL: store_merge_split:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq (%rdi,%rsi,8), %rax
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX1-NEXT: vmovupd %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX-LABEL: fsel_nonzero_false_val:
; AVX: # %bb.0:
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX-LABEL: fsel_nonzero_constants:
; AVX: # %bb.0:
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX-NEXT: retq
;
define <4 x float> @undef1() {
ret <4 x float> <float 1.0, float 1.0, float undef, float undef>
-; CHECK: .globl __xmm@00000000000000003f8000003f800000
-; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000003f8000003f800000
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
-; CHECK-NEXT: .long 0x3f800000 # float 1
+; CHECK: .globl __real@3f800000
+; CHECK-NEXT: .section .rdata,"dr",discard,__real@3f800000
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: __real@3f800000:
; CHECK-NEXT: .long 0x3f800000 # float 1
-; CHECK-NEXT: .zero 4
-; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .text
; CHECK: undef1:
-; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0
+; CHECK: vbroadcastss __real@3f800000(%rip), %xmm0
; CHECK-NEXT: ret
}