From 07a1787501fc7141c8788421c265ae95008f7c13 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 16 Jul 2018 06:56:09 +0000
Subject: [PATCH] [X86] Merge the FR128 and VR128 regclasses since they have identical spill and alignment characteristics.

This unfortunately requires a bunch of bitcasts to be added to
SUBREG_TO_REG, COPY_TO_REGCLASS, and instructions in output patterns.
Otherwise tablegen seems to default to picking f128, and then we fail
when something tries to get the register class for f128, which isn't
always valid.

The test changes are because we were previously mixing FR128 and VR128
due to constrainRegClass finding FR128 first, and passes like live
range shrinking weren't handling that well.

llvm-svn: 337147
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +-
 llvm/lib/Target/X86/X86InstrAVX512.td | 290 ++++++++++----------
 llvm/lib/Target/X86/X86InstrCompiler.td | 2 +-
 llvm/lib/Target/X86/X86InstrFMA.td | 24 +-
 llvm/lib/Target/X86/X86InstrSSE.td | 292 +++++++++++----------
 llvm/lib/Target/X86/X86InstrVecCompiler.td | 2 +-
 llvm/lib/Target/X86/X86RegisterInfo.td | 10 +-
 llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll | 52 ++--
 llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 160 +++++------
 llvm/test/CodeGen/X86/avx512-intrinsics.ll | 140 +++++-----
 .../CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 4 +-
 llvm/test/CodeGen/X86/avx512dq-intrinsics.ll | 8 +-
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 104 ++++----
 llvm/test/CodeGen/X86/buildvec-insertvec.ll | 8 +-
 llvm/test/CodeGen/X86/domain-reassignment.mir | 8 +-
 llvm/test/CodeGen/X86/haddsub-2.ll | 44 ++--
 llvm/test/CodeGen/X86/half.ll | 16 +-
 .../CodeGen/X86/merge-consecutive-loads-128.ll | 16 +-
 llvm/test/CodeGen/X86/pr29112.ll | 20 +-
 llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 52 ++--
 llvm/test/CodeGen/X86/sse1.ll | 8 +-
 llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll | 77 +++---
 llvm/test/CodeGen/X86/var-permute-128.ll | 34 +--
 llvm/test/CodeGen/X86/vec_int_to_fp.ll | 84 +++---
 .../CodeGen/X86/vector-shuffle-variable-128.ll | 20 +-
 llvm/test/CodeGen/X86/vector-sqrt.ll | 12 +-
 26 files changed, 753 insertions(+), 740 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 98995a7..37338ab 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -613,7 +613,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // Long double always uses X87, except f128 in MMX.
if (UseX87) { if (Subtarget.is64Bit() && Subtarget.hasMMX()) { - addRegisterClass(MVT::f128, &X86::FR128RegClass); + addRegisterClass(MVT::f128, &X86::VR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); setOperationAction(ISD::FNEG , MVT::f128, Custom); @@ -3078,7 +3078,7 @@ SDValue X86TargetLowering::LowerFormalArguments( else if (RegVT == MVT::f80) RC = &X86::RFP80RegClass; else if (RegVT == MVT::f128) - RC = &X86::FR128RegClass; + RC = &X86::VR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) @@ -28544,7 +28544,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR64: - case X86::CMOV_FR128: + case X86::CMOV_F128: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 36c859a..0791df5 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1118,18 +1118,18 @@ multiclass avx512_broadcast_scalar opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)), (!cast(Name#DestInfo.ZSuffix#r) - (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; + (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, (X86VBroadcast SrcInfo.FRC:$src), DestInfo.RC:$src0)), (!cast(Name#DestInfo.ZSuffix#rk) DestInfo.RC:$src0, DestInfo.KRCWM:$mask, - (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; + (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, (X86VBroadcast SrcInfo.FRC:$src), DestInfo.ImmAllZerosV)), (!cast(Name#DestInfo.ZSuffix#rkz) - DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; + DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; } // Split version to allow mask and broadcast node to be different types. This @@ -1328,10 +1328,11 @@ defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, // automatically does the extract. multiclass avx512_int_broadcast_rm_lowering { + X86VectorVTInfo SrcInfo, + X86VectorVTInfo ExtInfo> { def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), (!cast(Name#DestInfo.ZSuffix#"r") - (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>; + (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>; } multiclass avx512_int_broadcast_rm_vl opc, string OpcodeStr, @@ -1339,15 +1340,15 @@ multiclass avx512_int_broadcast_rm_vl opc, string OpcodeStr, let Predicates = [prd] in { defm Z : avx512_broadcast_rm, - avx512_int_broadcast_rm_lowering, + avx512_int_broadcast_rm_lowering, EVEX_V512; // Defined separately to avoid redefinition. 
- defm Z_Alt : avx512_int_broadcast_rm_lowering; + defm Z_Alt : avx512_int_broadcast_rm_lowering; } let Predicates = [prd, HasVLX] in { defm Z256 : avx512_broadcast_rm, - avx512_int_broadcast_rm_lowering, + avx512_int_broadcast_rm_lowering, EVEX_V256; defm Z128 : avx512_broadcast_rm, @@ -1677,20 +1678,20 @@ defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", let Predicates = [HasVLX] in { def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZ256r (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>; + (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZ256r (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; + (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; } def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), - (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; + (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>; def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>; + (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), - (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; + (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>; def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; + (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER @@ -3730,7 +3731,7 @@ multiclass masked_move_for_extract(InstrStr#"rrk") Cast.RC:$src0, Cast.KRCWM:$mask, - (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>; + (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>; def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, (bitconvert @@ -3739,7 +3740,7 @@ multiclass masked_move_for_extract(InstrStr#"rrkz") Cast.KRCWM:$mask, - (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>; + (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>; } @@ -4005,10 +4006,10 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (_.EltVT _.FRC:$src1), (_.EltVT _.FRC:$src2))))))), (!cast(InstrStr#rrk) - (COPY_TO_REGCLASS _.FRC:$src2, _.RC), + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, _.RC)), VK1WM:$mask, (_.VT _.RC:$src0), - (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>; def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector @@ -4018,7 +4019,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (!cast(InstrStr#rrkz) VK1WM:$mask, (_.VT _.RC:$src0), - (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>; } multiclass avx512_store_scalar_lowering; def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)), VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; + (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>; def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)), - (COPY_TO_REGCLASS (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; + (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), + (v4f32 (COPY_TO_REGCLASS 
FR32X:$src1, VR128X)))), FR32X)>; def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)), VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; + (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)), - (COPY_TO_REGCLASS (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; + (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), @@ -4334,38 +4337,40 @@ let Predicates = [HasAVX512, OptForSize] in { // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; + (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4i32 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; + (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSDZrr (v2f64 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; + (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), - (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>; + (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)), + (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4i32 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>; + (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)), + (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSDZrr (v2f64 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>; + (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), - (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>; + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)), + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>; } @@ -4374,25 +4379,25 @@ let Predicates = [HasAVX512, OptForSize] in { let Predicates = [HasAVX512, OptForSpeed] in { def : Pat<(v16f32 (X86vzmovl (v16f32 
VR512:$src))), (SUBREG_TO_REG (i32 0), - (VBLENDPSrri (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm), - (i8 1)), sub_xmm)>; + (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), + (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), + (i8 1))), sub_xmm)>; def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VPBLENDWrri (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm), - (i8 3)), sub_xmm)>; + (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), + (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), + (i8 3))), sub_xmm)>; def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VBLENDPDrri (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm), - (i8 1)), sub_xmm)>; + (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), + (i8 1))), sub_xmm)>; def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VPBLENDWrri (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm), - (i8 0xf)), sub_xmm)>; + (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), + (i8 0xf))), sub_xmm)>; } let Predicates = [HasAVX512] in { @@ -4421,7 +4426,7 @@ let Predicates = [HasAVX512] in { // 256-bit types def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; @@ -4437,7 +4442,7 @@ let Predicates = [HasAVX512] in { // 512-bit types def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; def : Pat<(v16f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; @@ -4451,7 +4456,7 @@ let Predicates = [HasAVX512] in { def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; // Extract and store. def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), @@ -4487,11 +4492,11 @@ let Predicates = [HasAVX512] in { def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>; def : Pat<(v8i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>; // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), @@ -4503,7 +4508,7 @@ let Predicates = [HasAVX512] in { def : Pat<(v4i32 (X86vzload addr:$src)), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v8i32 (X86vzload addr:$src)), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (VMOVQI2PQIZrm addr:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), @@ -4511,21 +4516,21 @@ let Predicates = [HasAVX512] in { def : Pat<(v2i64 (X86vzload addr:$src)), (VMOVQI2PQIZrm addr:$src)>; def : Pat<(v4i64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>; def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>; // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. def : Pat<(v16i32 (X86vzload addr:$src)), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; def : Pat<(v8i64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; } //===----------------------------------------------------------------------===// @@ -5506,38 +5511,46 @@ defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>; let Predicates = [HasVLX,HasDQI] in { // Use packed logical operations for scalar ops. 
def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS (VANDPDZ128rr - (COPY_TO_REGCLASS FR64X:$src1, VR128X), - (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + (COPY_TO_REGCLASS + (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS (VORPDZ128rr - (COPY_TO_REGCLASS FR64X:$src1, VR128X), - (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + (COPY_TO_REGCLASS + (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS (VXORPDZ128rr - (COPY_TO_REGCLASS FR64X:$src1, VR128X), - (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + (COPY_TO_REGCLASS + (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS (VANDNPDZ128rr - (COPY_TO_REGCLASS FR64X:$src1, VR128X), - (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + (COPY_TO_REGCLASS + (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS (VANDPSZ128rr - (COPY_TO_REGCLASS FR32X:$src1, VR128X), - (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; + (COPY_TO_REGCLASS + (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS (VORPSZ128rr - (COPY_TO_REGCLASS FR32X:$src1, VR128X), - (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; + (COPY_TO_REGCLASS + (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS (VXORPSZ128rr - (COPY_TO_REGCLASS FR32X:$src1, VR128X), - (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; + (COPY_TO_REGCLASS + (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS (VANDNPSZ128rr - (COPY_TO_REGCLASS FR32X:$src1, VR128X), - (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; + (COPY_TO_REGCLASS + (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; } multiclass avx512_fp_scalef_p opc, string OpcodeStr, SDNode OpNode, @@ -6836,36 +6849,36 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zr_Int") - VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (Op _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), (!cast(Prefix#"231"#Suffix#"Zr_Int") - VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (Op _.FRC:$src2, (_.EltVT 
(extractelt (_.VT VR128X:$src1), (iPTR 0))), (_.ScalarLdFrag addr:$src3)))))), (!cast(Prefix#"213"#Suffix#"Zm_Int") - VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), (_.ScalarLdFrag addr:$src3), _.FRC:$src2))))), (!cast(Prefix#"132"#Suffix#"Zm_Int") - VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), (!cast(Prefix#"231"#Suffix#"Zm_Int") - VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector @@ -6876,8 +6889,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zr_Intk") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6887,7 +6900,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zm_Intk") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6896,7 +6909,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"132"#Suffix#"Zm_Intk") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6905,8 +6918,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zr_Intk") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6915,7 +6928,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zm_Intk") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6925,8 +6938,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zr_Intkz") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6935,8 +6948,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zr_Intkz") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector 
(X86selects VK1WM:$mask, @@ -6946,7 +6959,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zm_Intkz") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6955,7 +6968,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"132"#Suffix#"Zm_Intkz") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6964,7 +6977,7 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zm_Intkz") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; // Patterns with rounding mode. def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector @@ -6972,16 +6985,16 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zrb_Int") - VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (RndOp _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), (i32 imm:$rc)))))), (!cast(Prefix#"231"#Suffix#"Zrb_Int") - VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -6991,8 +7004,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zrb_Intk") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -7002,8 +7015,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zrb_Intk") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -7013,8 +7026,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zrb_Intkz") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, @@ -7024,8 +7037,8 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zrb_Intkz") VR128X:$src1, VK1WM:$mask, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X), - (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; } } @@ -8467,16 +8480,17 @@ let Predicates = [HasVLX] in { // more consistent with other 
instructions, which are always controlled by it. // It's encoded as 0b100. def : Pat<(fp_to_f16 FR32X:$src), - (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr - (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>; + (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (v8i16 (VCVTPS2PHZ128rr + (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4))), sub_16bit))>; def : Pat<(f16_to_fp GR16:$src), - (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr - (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >; + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr + (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)))), FR32X)) >; def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))), - (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr - (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >; + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr + (v8i16 (VCVTPS2PHZ128rr + (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >; } // Unordered/Ordered scalar fp compare with Sea and set EFLAGS @@ -10798,17 +10812,17 @@ let Predicates = [HasVLX] in { def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), - (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask, - (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), (bitconvert (v4i32 immAllZerosV))), - (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), (v2f64 VR128X:$src0)), @@ -11521,7 +11535,7 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#Zrr_Int) _.VT:$dst, - (COPY_TO_REGCLASS _.FRC:$src, VR128X))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), @@ -11532,9 +11546,9 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#Zrr_Intk) - (COPY_TO_REGCLASS _.FRC:$src0, VR128X), + (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), VK1WM:$mask, _.VT:$src1, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X))>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), @@ -11543,9 +11557,9 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#Zrr_Intkz) - VK1WM:$mask, _.VT:$src1, - (COPY_TO_REGCLASS _.FRC:$src2, VR128X))>; + (!cast("V"#OpcPrefix#Zrr_Intkz) + VK1WM:$mask, _.VT:$src1, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; } } diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index f1aa4fa..f360c0a 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -563,7 +563,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _FR32 : CMOVrr_PSEUDO; defm _FR64 : CMOVrr_PSEUDO; - defm _FR128 : CMOVrr_PSEUDO; + defm _F128 : CMOVrr_PSEUDO; defm _V4F32 : CMOVrr_PSEUDO; defm _V2F64 : CMOVrr_PSEUDO; defm _V2I64 : CMOVrr_PSEUDO; diff --git a/llvm/lib/Target/X86/X86InstrFMA.td 
b/llvm/lib/Target/X86/X86InstrFMA.td index f418bfa..7a35d07 100644 --- a/llvm/lib/Target/X86/X86InstrFMA.td +++ b/llvm/lib/Target/X86/X86InstrFMA.td @@ -337,29 +337,29 @@ multiclass scalar_fma_patterns(Prefix#"213"#Suffix#"r_Int") - VR128:$src1, (COPY_TO_REGCLASS RC:$src2, VR128), - (COPY_TO_REGCLASS RC:$src3, VR128))>; + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>; def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector (Op RC:$src2, (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), (mem_frag addr:$src3)))))), (!cast(Prefix#"213"#Suffix#"m_Int") - VR128:$src1, (COPY_TO_REGCLASS RC:$src2, VR128), + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>; def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), (mem_frag addr:$src3), RC:$src2))))), (!cast(Prefix#"132"#Suffix#"m_Int") - VR128:$src1, (COPY_TO_REGCLASS RC:$src2, VR128), + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>; def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector (Op RC:$src2, (mem_frag addr:$src3), (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))), (!cast(Prefix#"231"#Suffix#"m_Int") - VR128:$src1, (COPY_TO_REGCLASS RC:$src2, VR128), + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>; } } @@ -598,23 +598,23 @@ multiclass scalar_fma4_patterns(Name#"rr_Int") - (COPY_TO_REGCLASS RC:$src1, VR128), - (COPY_TO_REGCLASS RC:$src2, VR128), - (COPY_TO_REGCLASS RC:$src3, VR128))>; + (VT (COPY_TO_REGCLASS RC:$src1, VR128)), + (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>; def : Pat<(VT (X86vzmovl (VT (scalar_to_vector (Op RC:$src1, RC:$src2, (mem_frag addr:$src3)))))), (!cast(Name#"rm_Int") - (COPY_TO_REGCLASS RC:$src1, VR128), - (COPY_TO_REGCLASS RC:$src2, VR128), addr:$src3)>; + (VT (COPY_TO_REGCLASS RC:$src1, VR128)), + (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>; def : Pat<(VT (X86vzmovl (VT (scalar_to_vector (Op RC:$src1, (mem_frag addr:$src2), RC:$src3))))), (!cast(Name#"mr_Int") - (COPY_TO_REGCLASS RC:$src1, VR128), addr:$src2, - (COPY_TO_REGCLASS RC:$src3, VR128))>; + (VT (COPY_TO_REGCLASS RC:$src1, VR128)), addr:$src2, + (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>; } } diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index ae4b3ce..2d03a8d 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -308,21 +308,23 @@ let Predicates = [UseAVX, OptForSize] in { // Move low f32 and clear high bits. 
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>; + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>; + (v4i32 (VMOVSSrr (v4i32 (V_SET0)), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>; + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), + sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>; + (v2i64 (VMOVSDrr (v2i64 (V_SET0)), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), + sub_xmm)>; } let Predicates = [UseSSE1] in { @@ -2421,78 +2423,94 @@ let Predicates = [HasAVX1Only] in { let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { // Use packed logical operations for scalar ops. def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (VANDPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (VORPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (VXORPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (VANDNPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (VANDPSrr - (COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (VORPSrr - (COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (VXORPSrr - (COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (VANDNPSrr - 
(COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; } let Predicates = [UseSSE1] in { // Use packed logical operations for scalar ops. def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (ANDPSrr - (COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (ORPSrr - (COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (XORPSrr - (COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS (ANDNPSrr - (COPY_TO_REGCLASS FR32:$src1, VR128), - (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + (COPY_TO_REGCLASS + (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; } let Predicates = [UseSSE2] in { // Use packed logical operations for scalar ops. def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (ANDPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (ORPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (XORPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS (ANDNPDrr - (COPY_TO_REGCLASS FR64:$src1, VR128), - (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + (COPY_TO_REGCLASS + (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; } // Patterns for packed operations when we don't have integer type available. @@ -2679,7 +2697,7 @@ multiclass scalar_math_patterns(OpcPrefix#rr_Int) VT:$dst, - (COPY_TO_REGCLASS RC:$src, VR128))>; + (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; } // Repeat for AVX versions of the instructions. 
@@ -2690,7 +2708,7 @@ multiclass scalar_math_patterns("V"#OpcPrefix#rr_Int) VT:$dst, - (COPY_TO_REGCLASS RC:$src, VR128))>; + (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; } } @@ -4163,7 +4181,7 @@ let Predicates = [UseAVX] in { def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>; // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), @@ -4176,13 +4194,13 @@ let Predicates = [UseAVX] in { (VMOVDI2PDIrm addr:$src)>; def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; def : Pat<(v8i32 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>; } let Predicates = [UseSSE2] in { @@ -4276,9 +4294,9 @@ let Predicates = [UseAVX] in { (VMOVQI2PQIrm addr:$src)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; def : Pat<(v4i64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; } let Predicates = [UseSSE2] in { @@ -6448,25 +6466,25 @@ let Predicates = [HasAVX, OptForSpeed] in { // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VBLENDPSrri (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm), - (i8 1)), sub_xmm)>; + (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), + (i8 1))), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VPBLENDWrri (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm), - (i8 3)), sub_xmm)>; + (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), + (i8 3))), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VBLENDPDrri (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm), - (i8 1)), sub_xmm)>; + (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), + (i8 1))), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), (SUBREG_TO_REG (i32 0), - (VPBLENDWrri (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm), - (i8 0xf)), sub_xmm)>; + (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), + (i8 0xf))), sub_xmm)>; } // Prefer a movss or movsd over a blendps when optimizing for size. 
these were @@ -7111,10 +7129,10 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), } // SchedRW def : Pat<(nontemporalstore FR32:$src, addr:$dst), - (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; def : Pat<(nontemporalstore FR64:$src, addr:$dst), - (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; } // AddedComplexity } // HasSSE4A @@ -7535,16 +7553,16 @@ let Predicates = [HasF16C, NoVLX] in { // more consistent with other instructions, which are always controlled by it. // It's encoded as 0b100. def : Pat<(fp_to_f16 FR32:$src), - (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr - (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>; + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr + (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>; def : Pat<(f16_to_fp GR16:$src), - (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr - (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr + (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >; def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), - (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr - (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >; + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr + (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >; } //===----------------------------------------------------------------------===// @@ -7718,45 +7736,45 @@ let Predicates = [HasAVX2, NoVLX] in { // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
def : Pat<(v4f32 (X86VBroadcast FR32:$src)), - (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; def : Pat<(v8f32 (X86VBroadcast FR32:$src)), - (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; def : Pat<(v4f64 (X86VBroadcast FR64:$src)), - (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i8 (X86VBroadcast GR8:$src)), - (VPBROADCASTBrr (COPY_TO_REGCLASS + (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit)), - VR128))>; + VR128)))>; def : Pat<(v32i8 (X86VBroadcast GR8:$src)), - (VPBROADCASTBYrr (COPY_TO_REGCLASS + (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit)), - VR128))>; + VR128)))>; def : Pat<(v8i16 (X86VBroadcast GR16:$src)), - (VPBROADCASTWrr (COPY_TO_REGCLASS + (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)), - VR128))>; + VR128)))>; def : Pat<(v16i16 (X86VBroadcast GR16:$src)), - (VPBROADCASTWYrr (COPY_TO_REGCLASS + (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)), - VR128))>; + VR128)))>; } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86VBroadcast GR32:$src)), - (VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), - (VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; def : Pat<(v2i64 (X86VBroadcast GR64:$src)), - (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), - (VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; } // AVX1 broadcast patterns @@ -7774,7 +7792,7 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), let Predicates = [HasAVX, NoVLX] in { // 128bit broadcasts: def : Pat<(v2f64 (X86VBroadcast f64:$src)), - (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), (VMOVDDUPrm addr:$src)>; @@ -7786,29 +7804,29 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [HasAVX1Only] in { def : Pat<(v4f32 (X86VBroadcast FR32:$src)), - (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; + (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; def : Pat<(v8f32 (X86VBroadcast FR32:$src)), (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), - (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm), - (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>; + (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), + (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; def : Pat<(v4f64 (X86VBroadcast FR64:$src)), (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), - (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm), - (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>; + (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), + (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; def 
: Pat<(v4i32 (X86VBroadcast GR32:$src)), - (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>; + (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), - (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm), - (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>; + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm), + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), - (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), - (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm), + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>; def : Pat<(v2i64 (X86VBroadcast i64:$src)), - (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>; + (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>; def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), (VMOVDDUPrm addr:$src)>; } @@ -7991,7 +8009,7 @@ multiclass maskmov_lowering(BlendStr#"rr") RC:$src0, - (!cast(InstrStr#"rm") RC:$mask, addr:$ptr), + (VT (!cast(InstrStr#"rm") RC:$mask, addr:$ptr)), RC:$mask)>; } let Predicates = [HasAVX] in { @@ -8171,49 +8189,49 @@ let Predicates = [UseAVX2] in { } //===----------------------------------------------------------------------===// -// Extra selection patterns for FR128, f128, f128mem +// Extra selection patterns for f128, f128mem // movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. -def : Pat<(alignedstore (f128 FR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; -def : Pat<(store (f128 FR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; +def : Pat<(alignedstore (f128 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>; +def : Pat<(store (f128 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>; def : Pat<(alignedloadf128 addr:$src), - (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>; def : Pat<(loadf128 addr:$src), - (COPY_TO_REGCLASS (MOVUPSrm addr:$src), FR128)>; + (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>; // andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 -def : Pat<(X86fand FR128:$src1, (memopf128 addr:$src2)), +def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))), (COPY_TO_REGCLASS - (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), - FR128)>; + (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), + VR128)>; -def : Pat<(X86fand FR128:$src1, FR128:$src2), +def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)), (COPY_TO_REGCLASS - (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), - (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), + (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; -def : Pat<(X86for FR128:$src1, (memopf128 addr:$src2)), +def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))), (COPY_TO_REGCLASS - (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), - FR128)>; + (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), + VR128)>; -def : Pat<(X86for FR128:$src1, FR128:$src2), +def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)), (COPY_TO_REGCLASS - (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), - (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), + (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; -def : Pat<(X86fxor FR128:$src1, (memopf128 addr:$src2)), +def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))), (COPY_TO_REGCLASS - (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), - FR128)>; + (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), + VR128)>; -def : Pat<(X86fxor FR128:$src1, FR128:$src2), +def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), (COPY_TO_REGCLASS - (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), - (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), + (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; //===----------------------------------------------------------------------===// // GFNI instructions diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td index ae36391..322bdb7 100644 --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -263,7 +263,7 @@ multiclass subvec_zero_lowering("VMOV"#MoveStr#"rr") RC:$src), SubIdx)>; + (SrcTy (!cast("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>; } let Predicates = [HasAVX, NoVLX] in { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index b17b3d8..ee9e789 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -504,8 +504,6 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; -def FR128 : RegisterClass<"X86", [f128], 128, (add FR32)>; - // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill @@ -527,16 +525,16 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { // Generic vector registers: VR64 and VR128. // Ensure that float types are declared first - only float is legal on SSE1. 
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32)>; def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Special classes that help the assembly parser choose some alternate // instructions to favor 2-byte VEX encodings. -def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], +def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], 128, (sequence "XMM%u", 0, 7)>; -def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], +def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], 128, (sequence "XMM%u", 8, 15)>; def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 7)>; @@ -567,7 +565,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32X)>; def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 6277fdd..6e58ffe 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1772,11 +1772,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl ; X86: # %bb.0: ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X86-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_set_pd: @@ -1797,19 +1797,19 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3 ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; X86-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vinsertps {{.*#+}} 
xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_set_ps: @@ -2391,10 +2391,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub ; X86: # %bb.0: ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; @@ -2418,16 +2418,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X86-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] -; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] -; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 55049ea..0482092 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -8798,12 +8798,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; X86-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0] ; X86-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xe2] ; X86-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 -; X86-NEXT: vmovapd %xmm0, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xe8] -; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xea] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X86-NEXT: vmovapd %xmm0, %xmm4 ## encoding: 
[0xc5,0xf9,0x28,0xe0] +; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xe2] ; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa9,0xc2] -; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x58,0xcc] -; X86-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd1,0x58,0xc0] -; X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x58,0xc0] +; X86-NEXT: vaddpd %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc0] +; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_sd: @@ -8815,12 +8815,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; X64-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0] ; X64-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xe2] ; X64-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 -; X64-NEXT: vmovapd %xmm0, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xe8] -; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xea] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X64-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0] +; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xe2] ; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa9,0xc2] -; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x58,0xcc] -; X64-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd1,0x58,0xc0] -; X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x58,0xc0] +; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc0] +; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -8845,12 +8845,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; X86-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0] ; X86-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xe2] ; X86-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 -; X86-NEXT: vmovaps %xmm0, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xe8] -; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xea] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X86-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0] +; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xe2] ; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa9,0xc2] -; X86-NEXT: vaddps %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe0,0x58,0xcc] -; X86-NEXT: vaddps %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd0,0x58,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc0] +; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss: @@ -8862,12 +8862,12 @@ define <4 x 
float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; X64-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0] ; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xe2] ; X64-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 -; X64-NEXT: vmovaps %xmm0, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xe8] -; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xea] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X64-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0] +; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xe2] ; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa9,0xc2] -; X64-NEXT: vaddps %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe0,0x58,0xcc] -; X64-NEXT: vaddps %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd0,0x58,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc0] +; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -8943,12 +8943,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] ; X86-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xe1] ; X86-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 -; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] -; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe9] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe1] ; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xb9,0xd1] -; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] -; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2] +; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd: @@ -8960,12 +8960,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] ; X64-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xe1] ; X64-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 -; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] -; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe9] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe1] ; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: 
[0x62,0xf2,0xfd,0x79,0xb9,0xd1] -; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] -; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2] +; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -8990,12 +8990,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] ; X86-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xe1] ; X86-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 -; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] -; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe9] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe1] ; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xb9,0xd1] -; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] -; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] -; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2] +; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss: @@ -9007,12 +9007,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] ; X64-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xe1] ; X64-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 -; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] -; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe9] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe1] ; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xb9,0xd1] -; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] -; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] -; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2] +; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -9213,12 +9213,12 @@ define <2 x 
double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] ; X86-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xe1] ; X86-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 -; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] -; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe9] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe1] ; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbb,0xd1] -; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] -; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2] +; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_sd: @@ -9230,12 +9230,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] ; X64-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xe1] ; X64-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 -; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] -; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe9] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe1] ; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbb,0xd1] -; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] -; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2] +; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -9260,12 +9260,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] ; X86-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xe1] ; X86-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 -; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] -; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe9] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe1] ; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: 
[0x62,0xf2,0x7d,0x79,0xbb,0xd1] -; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] -; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] -; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2] +; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ss: @@ -9277,12 +9277,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] ; X64-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xe1] ; X64-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 -; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] -; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe9] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe1] ; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbb,0xd1] -; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] -; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] -; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2] +; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -9307,12 +9307,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] ; X86-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xe1] ; X86-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 -; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] -; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe9] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe1] ; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbf,0xd1] -; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] -; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2] +; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd: @@ -9324,12 +9324,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] ; X64-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xe1] ; 
X64-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 -; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] -; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe9] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc] +; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe1] ; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbf,0xd1] -; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] -; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2] +; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -9354,12 +9354,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] ; X86-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xe1] ; X86-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 -; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] -; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe9] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe1] ; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbf,0xd1] -; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] -; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] -; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2] +; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss: @@ -9371,12 +9371,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] ; X64-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xe1] ; X64-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 -; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] -; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe9] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc] +; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe1] ; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbf,0xd1] -; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] -; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] -; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 
## encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2] +; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 1b30660..8494bf2 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -574,11 +574,11 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> % ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) @@ -600,11 +600,11 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) @@ -2605,8 +2605,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> % ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovqb %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovqb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovqb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2641,8 +2641,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovsqb %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovsqb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovsqb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2677,8 +2677,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> ; CHECK-NEXT: kmovw %edi, %k1 ; 
CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovusqb %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovusqb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovusqb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2713,8 +2713,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> % ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovqw %zmm0, %xmm0 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovqw %zmm0, %xmm0 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2749,8 +2749,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovsqw %zmm0, %xmm0 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovsqw %zmm0, %xmm0 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2785,8 +2785,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovusqw %zmm0, %xmm0 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovusqw %zmm0, %xmm0 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2926,8 +2926,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovdb %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovdb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2962,8 +2962,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovsdb %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovsdb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovsdb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2998,8 +2998,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovusdb %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vpmovusdb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovusdb %zmm0, %xmm0 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3333,10 +3333,10 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddps %xmm5, %xmm4, %xmm4 ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm4, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x 
float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) @@ -3359,10 +3359,10 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm3, %xmm5, %xmm3 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm3, %xmm5, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) @@ -3505,10 +3505,10 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm5, %xmm4, %xmm4 ; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm5, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4) @@ -3529,10 +3529,10 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm1 ; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4) @@ -4203,9 +4203,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovaps %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4328,9 +4328,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovapd %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z} +; CHECK-NEXT: 
vaddpd %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) @@ -4351,12 +4351,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2 -; CHECK-NEXT: vmovapd %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = extractelement <2 x double> %x0, i64 0 %2 = extractelement <2 x double> %x1, i64 0 @@ -4398,12 +4398,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2 -; CHECK-NEXT: vmovaps %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovaps %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = extractelement <4 x float> %x0, i64 0 %2 = extractelement <4 x float> %x1, i64 0 @@ -4503,12 +4503,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = extractelement <2 x double> %x0, i64 0 %2 = extractelement <2 x double> %x1, i64 0 @@ -4550,12 +4550,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 
{%k1} -; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = extractelement <4 x float> %x0, i64 0 %2 = extractelement <4 x float> %x1, i64 0 @@ -4723,12 +4723,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = fsub <2 x double> , %x2 %2 = extractelement <2 x double> %x0, i64 0 @@ -4778,12 +4778,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = fsub <4 x float> , %x2 %2 = extractelement <4 x float> %x0, i64 0 @@ -4833,12 +4833,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = fsub <2 x double> , %x0 %2 = fsub <2 x double> , %x2 @@ -4892,12 +4892,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %1 = fsub <4 x 
float> , %x0 %2 = fsub <4 x float> , %x2 diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll index 4463a00..d1eb47a 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -10,8 +10,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, ; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01] -; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] ; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca] +; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -21,8 +21,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, ; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01] -; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] ; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca] +; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll index c9bea58..2f97492 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -468,9 +468,9 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float ; X86-AVX512DQ-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x19,0x51,0xd1,0x04] ; X86-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x18,0x51,0xd9,0x04] +; X86-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe8,0x58,0xd3] ; X86-AVX512DQ-NEXT: vrangess $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x51,0xc1,0x04] -; X86-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x58,0xcb] -; X86-AVX512DQ-NEXT: vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1] +; X86-AVX512DQ-NEXT: vaddps %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc2] ; X86-AVX512DQ-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512DQVL-LABEL: test_int_x86_avx512_mask_range_ss: @@ -488,9 +488,9 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float ; X64-AVX512DQ-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x19,0x51,0xd1,0x04] ; X64-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x18,0x51,0xd9,0x04] +; X64-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm2 # 
encoding: [0xc5,0xe8,0x58,0xd3] ; X64-AVX512DQ-NEXT: vrangess $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x51,0xc1,0x04] -; X64-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x58,0xcb] -; X64-AVX512DQ-NEXT: vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1] +; X64-AVX512DQ-NEXT: vaddps %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc2] ; X64-AVX512DQ-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512DQVL-LABEL: test_int_x86_avx512_mask_range_ss: diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index fa3a995..f4ec74b 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -1620,8 +1620,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> % ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2] ; X86-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1] -; X86-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1630,8 +1630,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2] ; X64-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1] -; X64-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -1673,8 +1673,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2] ; X86-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1] -; X86-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1683,8 +1683,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2] ; X64-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1] -; X64-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: retq # 
encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -1726,8 +1726,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2] ; X86-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1] -; X86-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1736,8 +1736,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2] ; X64-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1] -; X64-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -1779,8 +1779,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> % ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2] ; X86-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1] -; X86-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -1790,8 +1790,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2] ; X64-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1] -; X64-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -1836,8 +1836,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2] ; X86-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1] -; X86-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: 
[0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -1847,8 +1847,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2] ; X64-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1] -; X64-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -1893,8 +1893,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2] ; X86-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1] -; X86-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -1904,8 +1904,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2] ; X64-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1] -; X64-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -1951,8 +1951,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> % ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1] ; X86-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2] -; X86-NEXT: vpmovqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1961,8 +1961,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2] ; X64-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1] -; X64-NEXT: vpmovqw %xmm0, %xmm0 # encoding: 
[0x62,0xf2,0x7e,0x08,0x34,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -2005,8 +2005,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1] ; X86-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2] -; X86-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2015,8 +2015,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2] ; X64-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1] -; X64-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -2059,8 +2059,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1] ; X86-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2] -; X86-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2069,8 +2069,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2] ; X64-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1] -; X64-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -2113,8 +2113,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> % ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1] ; X86-NEXT: 
vpmovqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2] -; X86-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2124,8 +2124,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2] ; X64-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1] -; X64-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -2171,8 +2171,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1] ; X86-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2] -; X86-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2182,8 +2182,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2] ; X64-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1] -; X64-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -2229,8 +2229,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1] ; X86-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2] -; X86-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2240,8 +2240,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; 
X64-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2] ; X64-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1] -; X64-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -2622,8 +2622,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> % ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2] ; X86-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1] -; X86-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2632,8 +2632,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2] ; X64-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1] -; X64-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) @@ -2675,8 +2675,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2] ; X86-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1] -; X86-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2685,8 +2685,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2] ; X64-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1] -; X64-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) @@ -2728,8 +2728,8 @@ define <16 x 
i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2] ; X86-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1] -; X86-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2738,8 +2738,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2] ; X64-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1] -; X64-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) @@ -2781,8 +2781,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> % ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2] ; X86-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1] -; X86-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2792,8 +2792,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2] ; X64-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1] -; X64-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -2838,8 +2838,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2] ; X86-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1] -; X86-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # 
encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2849,8 +2849,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2] ; X64-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1] -; X64-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -2895,8 +2895,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2] ; X86-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1] -; X86-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0] ; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X86-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0] ; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2906,8 +2906,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2] ; X64-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1] -; X64-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0] ; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca] +; X64-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0] ; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -2953,8 +2953,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> % ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1] ; X86-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2] -; X86-NEXT: vpmovdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2963,8 +2963,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2] ; X64-NEXT: vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1] -; X64-NEXT: vpmovdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovdw %xmm0, %xmm0 # encoding: 
[0x62,0xf2,0x7e,0x08,0x33,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) @@ -3007,8 +3007,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1] ; X86-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2] -; X86-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3017,8 +3017,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2] ; X64-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1] -; X64-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) @@ -3061,8 +3061,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1] ; X86-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2] -; X86-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3071,8 +3071,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2] ; X64-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1] -; X64-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) @@ -3115,8 +3115,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> % ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1] ; X86-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2] -; X86-NEXT: vpmovdw %ymm0, %xmm0 # encoding: 
[0x62,0xf2,0x7e,0x28,0x33,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -3126,8 +3126,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> % ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2] ; X64-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1] -; X64-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -3173,8 +3173,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1] ; X86-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2] -; X86-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -3184,8 +3184,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2] ; X64-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1] -; X64-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -3231,8 +3231,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1] ; X86-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2] -; X86-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0] ; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X86-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -3242,8 +3242,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2] ; X64-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # 
encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1] -; X64-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0] ; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca] +; X64-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -4296,8 +4296,8 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02] ; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02] -; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02] ; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] +; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4306,8 +4306,8 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02] ; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02] -; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02] ; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] +; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02] ; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) @@ -4327,8 +4327,8 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02] ; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02] -; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02] ; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] +; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02] ; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -4338,8 +4338,8 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02] ; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02] -; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02] ; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] +; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02] ; X64-NEXT: vpaddw %xmm1, %xmm0, 
%xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index eff8319..882814b 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -107,11 +107,11 @@ define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* % ; SSE2: # %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v4f32_load: diff --git a/llvm/test/CodeGen/X86/domain-reassignment.mir b/llvm/test/CodeGen/X86/domain-reassignment.mir index 4dffbbb..6cf7fa6 100644 --- a/llvm/test/CodeGen/X86/domain-reassignment.mir +++ b/llvm/test/CodeGen/X86/domain-reassignment.mir @@ -76,8 +76,8 @@ registers: - { id: 17, class: gr32, preferred-register: '' } - { id: 18, class: vk1wm, preferred-register: '' } - { id: 19, class: vr128x, preferred-register: '' } - - { id: 20, class: fr128, preferred-register: '' } - - { id: 21, class: fr128, preferred-register: '' } + - { id: 20, class: vr128, preferred-register: '' } + - { id: 21, class: vr128, preferred-register: '' } - { id: 22, class: fr32x, preferred-register: '' } liveins: - { reg: '$edi', virtual-reg: '%3' } @@ -141,8 +141,8 @@ body: | ; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]] ; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]] ; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]] - ; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF - ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]] + ; CHECK: [[DEF:%[0-9]+]]:vr128 = IMPLICIT_DEF + ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:vr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]] ; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]] ; CHECK: VMOVSSZmr [[COPY6]], 1, $noreg, 0, $noreg, killed [[COPY16]] :: (store 4 into %ir.fptr) ; CHECK: RET 0 diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll index 42c6a2d..92a393b 100644 --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -644,11 +644,11 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) { ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: pextrw $1, %xmm0, %ecx ; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE3-NEXT: pextrw $2, %xmm0, %eax ; SSE3-NEXT: pextrw $3, %xmm0, %ecx ; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE3-NEXT: pextrw $4, %xmm0, %eax ; SSE3-NEXT: pextrw $5, %xmm0, %r11d ; SSE3-NEXT: addl %eax, %r11d @@ -697,9 +697,9 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) { ; SSE3-NEXT: movd 
%r13d, %xmm4 ; SSE3-NEXT: movd %r15d, %xmm10 ; SSE3-NEXT: movd %r11d, %xmm7 -; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm11 # 4-byte Folded Reload +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload ; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero -; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movd %ecx, %xmm6 @@ -908,15 +908,15 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: subss %xmm3, %xmm2 ; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE-NEXT: subss %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] -; SSE-NEXT: subss %xmm4, %xmm3 -; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE-NEXT: subss %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] +; SSE-NEXT: subss %xmm3, %xmm2 +; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE-NEXT: subss %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; @@ -927,14 +927,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] -; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] ; AVX-NEXT: retq %vecext = extractelement <4 x float> %A, i32 2 %vecext1 = extractelement <4 x float> %A, i32 3 @@ -1289,11 +1289,11 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) { ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: pextrw $1, %xmm1, %ecx ; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE3-NEXT: pextrw $2, %xmm1, %eax ; SSE3-NEXT: pextrw $3, %xmm1, %ecx ; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE3-NEXT: pextrw $4, %xmm1, %eax ; SSE3-NEXT: pextrw $5, %xmm1, %r14d ; SSE3-NEXT: addl %eax, %r14d @@ -1338,9 +1338,9 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) { ; SSE3-NEXT: movd %r9d, %xmm5 ; SSE3-NEXT: movd %r15d, %xmm14 ; SSE3-NEXT: movd %r14d, %xmm2 -; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm15 # 4-byte Folded Reload +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 
4-byte Folded Reload ; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero -; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload ; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 3cbfe03..2bdd537 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -388,10 +388,10 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-NEXT: movq %rdi, %rbx ; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill @@ -399,8 +399,8 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-LIBCALL-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-LIBCALL-NEXT: addq $48, %rsp @@ -453,11 +453,11 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 { ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-I686-NEXT: addl $56, %esp ; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index c286c46..b7c8820 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -288,9 +288,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss 
{{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_012u: @@ -337,9 +337,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_019u: @@ -1198,11 +1198,11 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile: diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll index 832ebd1..fa3c098 100644 --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -10,7 +10,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK: # %bb.0: ; CHECK-NEXT: subq $88, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm8 ; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0],xmm1[0],xmm8[2,3] @@ -19,7 +19,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0],xmm1[1],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1] -; CHECK-NEXT: vmovaps %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4 ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3] ; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3] @@ -37,7 +37,8 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps 
{{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10 ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3] @@ -49,20 +50,19 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0] ; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2 ; CHECK-NEXT: vmovaps %xmm15, %xmm1 -; CHECK-NEXT: vmovaps %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm9 -; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0 ; CHECK-NEXT: vaddps %xmm15, %xmm15, %xmm8 -; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm0 ; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) -; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: addq $88, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index de95fae..90e31eb 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2024,16 +2024,16 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n ; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c] ; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08] -; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xf3,0x0f,0x10,0x5c,0x24,0x04] -; X86-SSE-NEXT: # xmm3 = mem[0],zero,zero,zero -; X86-SSE-NEXT: unpcklps %xmm3, %xmm2 # encoding: [0x0f,0x14,0xd3] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE-NEXT: unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE-NEXT: movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08] +; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04] +; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set_ps: @@ -2042,16 +2042,16 @@ define <4 x float> @test_mm_set_ps(float 
%a0, float %a1, float %a2, float %a3) n ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] ; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08] -; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04] -; X86-AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero ; X86-AVX1-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10] ; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X86-AVX1-NEXT: vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20] -; X86-AVX1-NEXT: # xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X86-AVX1-NEXT: vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30] -; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,2],xmm3[0] +; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08] +; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20] +; X86-AVX1-NEXT: # xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] +; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] +; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_set_ps: @@ -2353,20 +2353,20 @@ define void @test_mm_setcsr(i32 %a0) nounwind { define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; X86-SSE-LABEL: test_mm_setr_ps: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x10] +; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10] +; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c] ; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x0c] +; X86-SSE-NEXT: unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08] ; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xf3,0x0f,0x10,0x5c,0x24,0x08] -; X86-SSE-NEXT: # xmm3 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04] ; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: unpcklps %xmm1, %xmm2 # encoding: [0x0f,0x14,0xd1] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE-NEXT: unpcklps %xmm3, %xmm0 # encoding: [0x0f,0x14,0xc3] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X86-SSE-NEXT: movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_setr_ps: diff --git a/llvm/test/CodeGen/X86/sse1.ll 
b/llvm/test/CodeGen/X86/sse1.ll index 2859387..5e383a3 100644 --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -57,8 +57,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X86-NEXT: jne .LBB1_8 ; X86-NEXT: .LBB1_7: ; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: je .LBB1_10 ; X86-NEXT: jmp .LBB1_11 ; X86-NEXT: .LBB1_1: @@ -71,8 +71,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X86-NEXT: je .LBB1_7 ; X86-NEXT: .LBB1_8: # %entry ; X86-NEXT: xorps %xmm3, %xmm3 -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: jne .LBB1_11 ; X86-NEXT: .LBB1_10: ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -96,8 +96,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: jne .LBB1_8 ; X64-NEXT: .LBB1_7: ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi ; X64-NEXT: je .LBB1_10 ; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: @@ -110,8 +110,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: je .LBB1_7 ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 -; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi ; X64-NEXT: jne .LBB1_11 ; X64-NEXT: .LBB1_10: ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll index e7d83db..9f9fe23 100644 --- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -430,48 +430,31 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: subss %xmm5, %xmm4 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE-NEXT: addss %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: test16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] -; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] -; AVX1-NEXT: vsubss %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX1-NEXT: vaddss %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: retq -; -; AVX512-LABEL: test16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vsubss %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] -; AVX512-NEXT: vpermilpd 
{{.*#+}} xmm5 = xmm1[1,0] -; AVX512-NEXT: vsubss %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX512-NEXT: vaddss %xmm2, %xmm5, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512-NEXT: retq +; AVX-LABEL: test16: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX-NEXT: retq %1 = extractelement <4 x float> %A, i32 0 %2 = extractelement <4 x float> %B, i32 0 %sub = fsub float %1, 42.0 @@ -644,34 +627,34 @@ define <8 x double> @test18(<8 x double> %A, <8 x double> %B) { ; ; AVX512-LABEL: test18: ; AVX512: # %bb.0: -; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm8 +; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512-NEXT: vsubsd %xmm4, %xmm3, %xmm9 +; AVX512-NEXT: vsubsd %xmm4, %xmm3, %xmm5 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] ; AVX512-NEXT: vaddsd %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm5[0],xmm3[0] ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm4 -; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm7 -; AVX512-NEXT: vsubsd %xmm7, %xmm4, %xmm2 +; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5 +; AVX512-NEXT: vsubsd %xmm5, %xmm4, %xmm6 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 -; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm5 +; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm7 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] -; AVX512-NEXT: vaddsd %xmm7, %xmm4, %xmm4 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512-NEXT: vaddsd %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm6[0],xmm4[0] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm4[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm3[0] -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm8[0],xmm6[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, 
%zmm1, %zmm0 ; AVX512-NEXT: retq %1 = extractelement <8 x double> %A, i32 0 diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 661a478..39ccc5e 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -439,13 +439,13 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi ; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %ecx ; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE3-NEXT: andl $3, %edx -; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %esi -; SSE3-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v4f32: @@ -655,15 +655,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d @@ -742,15 +742,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: andl $31, %r11d ; SSE3-NEXT: movzbl 448(%rsp,%r11), %eax ; SSE3-NEXT: movd %eax, %xmm14 -; SSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $31, %eax ; SSE3-NEXT: movzbl 480(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $31, %eax ; SSE3-NEXT: movzbl 512(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $31, %eax ; SSE3-NEXT: movzbl 544(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 @@ -793,15 +793,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; 
SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
 ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
 ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
@@ -880,15 +880,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
 ; SSSE3-NEXT: andl $31, %r11d
 ; SSSE3-NEXT: movzbl 448(%rsp,%r11), %eax
 ; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; SSSE3-NEXT: andl $31, %eax
 ; SSSE3-NEXT: movzbl 480(%rsp,%rax), %eax
 ; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; SSSE3-NEXT: andl $31, %eax
 ; SSSE3-NEXT: movzbl 512(%rsp,%rax), %eax
 ; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; SSSE3-NEXT: andl $31, %eax
 ; SSSE3-NEXT: movzbl 544(%rsp,%rax), %eax
 ; SSSE3-NEXT: movd %eax, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 38dcce8..cc92f21 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4210,7 +4210,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX1-NEXT: testq %rax, %rax
 ; AVX1-NEXT: js .LBB80_4
 ; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
 ; AVX1-NEXT: jmp .LBB80_6
 ; AVX1-NEXT: .LBB80_4:
 ; AVX1-NEXT: movq %rax, %rcx
@@ -4218,22 +4218,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX1-NEXT: andl $1, %eax
 ; AVX1-NEXT: orq %rcx, %rax
 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4
 ; AVX1-NEXT: .LBB80_6:
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT: vmovq %xmm2, %rax
 ; AVX1-NEXT: testq %rax, %rax
 ; AVX1-NEXT: js .LBB80_7
 ; AVX1-NEXT: # %bb.8:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
 ; AVX1-NEXT: jmp .LBB80_9
 ; AVX1-NEXT: .LBB80_7:
 ; AVX1-NEXT: movq %rax, %rcx
 ; AVX1-NEXT: shrq %rcx
 ; AVX1-NEXT: andl $1, %eax
 ; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
-; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: .LBB80_9:
 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax
 ; AVX1-NEXT: testq %rax, %rax
@@ -4263,29 +4263,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT: .LBB80_15:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: testq %rax, %rax
 ; AVX1-NEXT: js .LBB80_16
 ; AVX1-NEXT: # %bb.17:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
 ; AVX1-NEXT: jmp .LBB80_18
 ; AVX1-NEXT: .LBB80_16:
 ; AVX1-NEXT: movq %rax, %rcx
 ; AVX1-NEXT: shrq %rcx
 ; AVX1-NEXT: andl $1, %eax
 ; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT: .LBB80_18:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vmovq %xmm3, %rax
 ; AVX1-NEXT: testq %rax, %rax
 ; AVX1-NEXT: js .LBB80_19
 ; AVX1-NEXT: # %bb.20:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
 ; AVX1-NEXT: jmp .LBB80_21
 ; AVX1-NEXT: .LBB80_19:
 ; AVX1-NEXT: movq %rax, %rcx
@@ -4293,25 +4293,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX1-NEXT: andl $1, %eax
 ; AVX1-NEXT: orq %rcx, %rax
 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: .LBB80_21:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: vpextrq $1, %xmm3, %rax
 ; AVX1-NEXT: testq %rax, %rax
 ; AVX1-NEXT: js .LBB80_22
 ; AVX1-NEXT: # %bb.23:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
 ; AVX1-NEXT: jmp .LBB80_24
 ; AVX1-NEXT: .LBB80_22:
 ; AVX1-NEXT: movq %rax, %rcx
 ; AVX1-NEXT: shrq %rcx
 ; AVX1-NEXT: andl $1, %eax
 ; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
-; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: .LBB80_24:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -4337,7 +4337,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX2-NEXT: testq %rax, %rax
 ; AVX2-NEXT: js .LBB80_4
 ; AVX2-NEXT: # %bb.5:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
 ; AVX2-NEXT: jmp .LBB80_6
 ; AVX2-NEXT: .LBB80_4:
 ; AVX2-NEXT: movq %rax, %rcx
@@ -4345,22 +4345,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX2-NEXT: andl $1, %eax
 ; AVX2-NEXT: orq %rcx, %rax
 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4
 ; AVX2-NEXT: .LBB80_6:
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT: vmovq %xmm2, %rax
 ; AVX2-NEXT: testq %rax, %rax
 ; AVX2-NEXT: js .LBB80_7
 ; AVX2-NEXT: # %bb.8:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
 ; AVX2-NEXT: jmp .LBB80_9
 ; AVX2-NEXT: .LBB80_7:
 ; AVX2-NEXT: movq %rax, %rcx
 ; AVX2-NEXT: shrq %rcx
 ; AVX2-NEXT: andl $1, %eax
 ; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
-; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT: .LBB80_9:
 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax
 ; AVX2-NEXT: testq %rax, %rax
@@ -4390,29 +4390,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
 ; AVX2-NEXT: .LBB80_15:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: testq %rax, %rax
 ; AVX2-NEXT: js .LBB80_16
 ; AVX2-NEXT: # %bb.17:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
 ; AVX2-NEXT: jmp .LBB80_18
 ; AVX2-NEXT: .LBB80_16:
 ; AVX2-NEXT: movq %rax, %rcx
 ; AVX2-NEXT: shrq %rcx
 ; AVX2-NEXT: andl $1, %eax
 ; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
 ; AVX2-NEXT: .LBB80_18:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vmovq %xmm3, %rax
 ; AVX2-NEXT: testq %rax, %rax
 ; AVX2-NEXT: js .LBB80_19
 ; AVX2-NEXT: # %bb.20:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
 ; AVX2-NEXT: jmp .LBB80_21
 ; AVX2-NEXT: .LBB80_19:
 ; AVX2-NEXT: movq %rax, %rcx
@@ -4420,25 +4420,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; AVX2-NEXT: andl $1, %eax
 ; AVX2-NEXT: orq %rcx, %rax
 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: .LBB80_21:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: vpextrq $1, %xmm3, %rax
 ; AVX2-NEXT: testq %rax, %rax
 ; AVX2-NEXT: js .LBB80_22
 ; AVX2-NEXT: # %bb.23:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
 ; AVX2-NEXT: jmp .LBB80_24
 ; AVX2-NEXT: .LBB80_22:
 ; AVX2-NEXT: movq %rax, %rcx
 ; AVX2-NEXT: shrq %rcx
 ; AVX2-NEXT: andl $1, %eax
 ; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
-; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: .LBB80_24:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
index f13c048..1243768 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -77,13 +77,13 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT: andl $3, %esi
 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
@@ -97,13 +97,13 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT: andl $3, %esi
 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT: andl $3, %edx
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT: andl $3, %ecx
-; SSSE3-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
diff --git a/llvm/test/CodeGen/X86/vector-sqrt.ll b/llvm/test/CodeGen/X86/vector-sqrt.ll
index 8eb22d3..87ae539 100644
--- a/llvm/test/CodeGen/X86/vector-sqrt.ll
+++ b/llvm/test/CodeGen/X86/vector-sqrt.ll
@@ -33,13 +33,13 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: vsqrtss %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT: vsqrtss %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; CHECK-NEXT: retq
 entry:
 %0 = load float, float* %v, align 4
-- 
2.7.4