From f2d2cedab48f07d63b219836fcb7b653c9aeb27b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 May 2018 17:56:43 +0000 Subject: [PATCH] [X86] Split WriteVecShift/WriteVarVecShift into MMX, XMM and YMM/ZMM scheduler classes This took a bit of extra work as on Intel targets the old (V)PSLLDrr/(V)PSLLDrm style instructions act differently - I ended up creating WriteVecShiftImm classes for XMM/YMM/ZMM vector shift by immediate and retaining WriteVecShift as the default (used only by MMX) plus WriteVecShiftX/WriteVecShiftY. X86SchedWriteWidths hides most of this thank goodness. llvm-svn: 331472 --- llvm/lib/Target/X86/X86InstrAVX512.td | 16 +- llvm/lib/Target/X86/X86InstrMMX.td | 29 ++- llvm/lib/Target/X86/X86InstrSSE.td | 39 ++-- llvm/lib/Target/X86/X86InstrXOP.td | 12 +- llvm/lib/Target/X86/X86SchedBroadwell.td | 76 ++----- llvm/lib/Target/X86/X86SchedHaswell.td | 88 ++------ llvm/lib/Target/X86/X86SchedSandyBridge.td | 47 +--- llvm/lib/Target/X86/X86SchedSkylakeClient.td | 84 +------ llvm/lib/Target/X86/X86SchedSkylakeServer.td | 320 +-------------------------- llvm/lib/Target/X86/X86Schedule.td | 29 ++- llvm/lib/Target/X86/X86ScheduleAtom.td | 5 + llvm/lib/Target/X86/X86ScheduleBtVer2.td | 5 + llvm/lib/Target/X86/X86ScheduleSLM.td | 10 +- llvm/lib/Target/X86/X86ScheduleZnver1.td | 7 +- llvm/test/CodeGen/X86/avx2-schedule.ll | 52 ++--- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 4 +- llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 16 +- llvm/test/CodeGen/X86/xop-schedule.ll | 56 ++--- 18 files changed, 234 insertions(+), 661 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index cea1e57..23bc1d8 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5587,24 +5587,24 @@ multiclass avx512_shift_rmi_dq opcd, bits<8> opcq, } defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli, - SchedWriteVecShift>, + SchedWriteVecShiftImm>, avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli, - SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, - SchedWriteVecShift>, + SchedWriteVecShiftImm>, avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli, - SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, - SchedWriteVecShift>, + SchedWriteVecShiftImm>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, - SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri, - SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, - SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SchedWriteVecShift>; diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index c21c00a..c410ddb 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -52,7 +52,8 @@ let Constraints = "$src1 = $dst" in { multiclass MMXI_binop_rmi_int opc, bits<8> opc2, Format ImmForm, string OpcodeStr, Intrinsic IntId, - Intrinsic IntId2, X86FoldableSchedWrite sched> { + Intrinsic IntId2, X86FoldableSchedWrite sched, + X86FoldableSchedWrite schedImm> { def rr : MMXI, - Sched<[sched]>; + Sched<[schedImm]>; } } @@ -412,30 +413,38 @@ defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_mmx_psll_w, int_x86_mmx_pslli_w, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_mmx_psll_d, int_x86_mmx_pslli_d, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_mmx_psll_q, int_x86_mmx_pslli_q, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_mmx_psra_w, int_x86_mmx_psrai_w, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_mmx_psra_d, int_x86_mmx_psrai_d, - SchedWriteVecShift.MMX>; + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; // Comparison Instructions defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 807a266..d668963b 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -3396,6 +3396,7 @@ multiclass PDI_binop_rmi opc, bits<8> opc2, Format ImmForm, string OpcodeStr, SDNode OpNode, SDNode OpNode2, RegisterClass RC, X86FoldableSchedWrite sched, + X86FoldableSchedWrite schedImm, ValueType DstVT, ValueType SrcVT, PatFrag ld_frag, bit Is2Addr = 1> { // src2 is always 128-bit @@ -3420,25 +3421,28 @@ multiclass PDI_binop_rmi opc, bits<8> opc2, Format ImmForm, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, - Sched<[sched]>; + Sched<[schedImm]>; } multiclass PDI_binop_rmi_all opc, bits<8> opc2, Format ImmForm, string OpcodeStr, SDNode OpNode, SDNode OpNode2, ValueType DstVT128, ValueType DstVT256, ValueType SrcVT, - X86SchedWriteWidths sched, Predicate prd> { + X86SchedWriteWidths sched, + X86SchedWriteWidths schedImm, Predicate prd> { let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rmi, VEX_4V, VEX_WIG; + OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, + DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rmi, VEX_4V, VEX_L, VEX_WIG; + OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, + DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L, + VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rmi; + VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, + memopv2i64>; } multiclass PDI_binop_ri opc, Format ImmForm, string OpcodeStr, @@ -3469,25 +3473,30 @@ let Constraints = "$src1 = $dst" in let ExeDomain = SSEPackedInt in { defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, v8i16, v16i16, v8i16, SchedWriteVecShift, - NoVLX_Or_NoBWI>; + SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, - v4i32, v8i32, v4i32, SchedWriteVecShift, NoVLX>; + v4i32, v8i32, v4i32, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, - v2i64, v4i64, v2i64, SchedWriteVecShift, NoVLX>; + v2i64, v4i64, v2i64, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, v8i16, v16i16, v8i16, SchedWriteVecShift, - NoVLX_Or_NoBWI>; + SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, - v4i32, v8i32, v4i32, SchedWriteVecShift, NoVLX>; + v4i32, v8i32, v4i32, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, - v2i64, v4i64, v2i64, SchedWriteVecShift, NoVLX>; + v2i64, v4i64, v2i64, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, v8i16, v16i16, v8i16, SchedWriteVecShift, - NoVLX_Or_NoBWI>; + SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, - v4i32, v8i32, v4i32, SchedWriteVecShift, NoVLX>; + v4i32, v8i32, v4i32, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, SchedWriteShuffle>; diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index 64a03a2..dd56759 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -155,10 +155,14 @@ multiclass xop3opimm opc, string OpcodeStr, SDNode OpNode, } let ExeDomain = SSEPackedInt in { - defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8, SchedWriteVecShift.XMM>; - defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32, SchedWriteVecShift.XMM>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64, SchedWriteVecShift.XMM>; - defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16, SchedWriteVecShift.XMM>; + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8, + SchedWriteVecShiftImm.XMM>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32, + SchedWriteVecShiftImm.XMM>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64, + SchedWriteVecShiftImm.XMM>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16, + SchedWriteVecShiftImm.XMM>; } // Instruction where second source can be memory, but third must be register diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 01a92dc..af7f2ac 100755 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -203,7 +203,6 @@ defm : BWWriteResPair; // Vector intege defm : BWWriteResPair; // Vector integer ALU op, no logicals (YMM/ZMM). defm : BWWriteResPair; // Vector integer and/or/xor. defm : BWWriteResPair; // Vector integer and/or/xor (YMM/ZMM). -defm : BWWriteResPair; // Vector integer shifts. defm : BWWriteResPair; // Vector integer multiply. defm : BWWriteResPair; // Vector integer multiply. defm : BWWriteResPair; // Vector PMULLD. @@ -222,6 +221,17 @@ defm : BWWriteResPair; // Vector PSADBW. defm : BWWriteResPair; // Vector PSADBW (YMM/ZMM). defm : BWWriteResPair; // Vector PHMINPOS. +// Vector integer shifts. +defm : BWWriteResPair; +defm : BWWriteResPair; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : BWWriteResPair; // Vector integer immediate shifts (XMM). +defm : BWWriteResPair; // Vector integer immediate shifts (YMM/ZMM). +defm : BWWriteResPair; // Variable vector shifts. +defm : BWWriteResPair; // Variable vector shifts (YMM/ZMM). + // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -347,7 +357,6 @@ defm : BWWriteResPair; // Fp 256-bit defm : BWWriteResPair; // Fp 256-bit width vector variable shuffles. defm : BWWriteResPair; // 256-bit width vector shuffles. defm : BWWriteResPair; // 256-bit width vector variable shuffles. -defm : BWWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -545,14 +554,6 @@ def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> { def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PS(Y?)rr", "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr", - "(V?)PSLLDrr", - "(V?)PSLLQrr", - "(V?)PSLLWrr", - "(V?)PSRADrr", - "(V?)PSRAWrr", - "(V?)PSRLDrr", - "(V?)PSRLQrr", - "(V?)PSRLWrr", "(V?)PTESTrr")>; def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> { @@ -676,15 +677,6 @@ def: InstRW<[BWWriteResGroup30], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr, XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, XCHG16ar, XCHG32ar, XCHG64ar)>; -def BWWriteResGroup31 : SchedWriteRes<[BWPort0,BWPort5]> { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [2,1]; -} -def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVD(Y?)rr", - "VPSRAVD(Y?)rr", - "VPSRLVD(Y?)rr")>; - def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -757,14 +749,6 @@ def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> { let ResourceCycles = [1,1]; } def: InstRW<[BWWriteResGroup40], (instregex "VCVTPS2PDYrr", - "VPSLLDYrr", - "VPSLLQYrr", - "VPSLLWYrr", - "VPSRADYrr", - "VPSRAWYrr", - "VPSRLDYrr", - "VPSRLQYrr", - "VPSRLWYrr", "VPTESTYrr")>; def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> { @@ -1066,16 +1050,8 @@ def BWWriteResGroup73 : SchedWriteRes<[BWPort0,BWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup73], (instregex "VPSLLDYrm", - "VPSLLQYrm", - "VPSLLVQYrm", - "VPSLLWYrm", - "VPSRADYrm", - "VPSRAWYrm", - "VPSRLDYrm", - "VPSRLQYrm", +def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm", "VPSRLVQYrm", - "VPSRLWYrm", "VTESTPDYrm", "VTESTPSYrm")>; @@ -1122,15 +1098,7 @@ def BWWriteResGroup81 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[BWWriteResGroup81], (instregex "(V?)PSLLDrm", - "(V?)PSLLQrm", - "(V?)PSLLWrm", - "(V?)PSRADrm", - "(V?)PSRAWrm", - "(V?)PSRLDrm", - "(V?)PSRLQrm", - "(V?)PSRLWrm", - "(V?)PTESTrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "(V?)PTESTrm")>; def BWWriteResGroup82 : SchedWriteRes<[BWPort0,BWPort01,BWPort23]> { let Latency = 7; @@ -1233,15 +1201,6 @@ def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPDYrm", "VPMASKMOVDYrm", "VPMASKMOVQYrm")>; -def BWWriteResGroup95 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [2,1,1]; -} -def: InstRW<[BWWriteResGroup95], (instregex "VPSLLVDrm", - "VPSRAVDrm", - "VPSRLVDrm")>; - def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> { let Latency = 8; let NumMicroOps = 5; @@ -1359,15 +1318,6 @@ def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> { def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm", "VPBROADCASTW(Y?)rm")>; -def BWWriteResGroup109 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [2,1,1]; -} -def: InstRW<[BWWriteResGroup109], (instregex "VPSLLVDYrm", - "VPSRAVDYrm", - "VPSRLVDYrm")>; - def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> { let Latency = 9; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 085034b..11ce9e9 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -194,7 +194,6 @@ def : WriteRes; def : WriteRes { let Latency = 5; } def : WriteRes; -defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; @@ -213,13 +212,23 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; +// Vector integer shifts. +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; + // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -834,16 +843,8 @@ def HWWriteResGroup11_2 : SchedWriteRes<[HWPort0,HWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLDYrm", - "VPSLLQYrm", - "VPSLLVQYrm", - "VPSLLWYrm", - "VPSRADYrm", - "VPSRAWYrm", - "VPSRLDYrm", - "VPSRLQYrm", +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm", "VPSRLVQYrm", - "VPSRLWYrm", "VTESTPDYrm", "VTESTPSYrm")>; @@ -943,12 +944,12 @@ def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", "MOVBE(16|32|64)rm", - "MMX_PABS(B|D|W)rm", - "MMX_P(ADD|SUB)(B|D|W|Q)irm", - "MMX_P(ADD|SUB)(U?)S(B|W)irm", - "MMX_PAVG(B|W)irm", - "MMX_PCMP(EQ|GT)(B|D|W)irm", - "MMX_P(MAX|MIN)(SW|UB)irm", + "MMX_PABS(B|D|W)rm", + "MMX_P(ADD|SUB)(B|D|W|Q)irm", + "MMX_P(ADD|SUB)(U?)S(B|W)irm", + "MMX_PAVG(B|W)irm", + "MMX_PCMP(EQ|GT)(B|D|W)irm", + "MMX_P(MAX|MIN)(SW|UB)irm", "MMX_PSIGN(B|D|W)rm")>; def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> { @@ -1082,14 +1083,6 @@ def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr", "VCVTPH2PSrr", "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr", - "(V?)PSLLDrr", - "(V?)PSLLQrr", - "(V?)PSLLWrr", - "(V?)PSRADrr", - "(V?)PSRAWrr", - "(V?)PSRLDrr", - "(V?)PSRLQrr", - "(V?)PSRLWrr", "(V?)PTESTrr")>; def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> { @@ -1176,15 +1169,7 @@ def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[HWWriteResGroup38], (instregex "(V?)PSLLDrm", - "(V?)PSLLQrm", - "(V?)PSLLWrm", - "(V?)PSRADrm", - "(V?)PSRAWrm", - "(V?)PSRLDrm", - "(V?)PSRLQrm", - "(V?)PSRLWrm", - "(V?)PTESTrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "(V?)PTESTrm")>; def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> { let Latency = 7; @@ -1338,15 +1323,6 @@ def: InstRW<[HWWriteResGroup54], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr, XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, XCHG16ar, XCHG32ar, XCHG64ar)>; -def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [2,1]; -} -def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVD(Y?)rr", - "VPSRAVD(Y?)rr", - "VPSRLVD(Y?)rr")>; - def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -1400,24 +1376,6 @@ def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> { def: InstRW<[HWWriteResGroup62], (instregex "IST(T?)_FP(16|32|64)m", "IST_F(16|32)m")>; -def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { - let Latency = 10; - let NumMicroOps = 4; - let ResourceCycles = [2,1,1]; -} -def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm", - "VPSRAVDYrm", - "VPSRLVDYrm")>; - -def HWWriteResGroup63_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [2,1,1]; -} -def: InstRW<[HWWriteResGroup63_1], (instregex "VPSLLVDrm", - "VPSRAVDrm", - "VPSRLVDrm")>; - def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { let Latency = 8; let NumMicroOps = 4; @@ -1491,14 +1449,6 @@ def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> { let ResourceCycles = [1,1]; } def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr", - "VPSLLDYrr", - "VPSLLQYrr", - "VPSLLWYrr", - "VPSRADYrr", - "VPSRAWYrr", - "VPSRLDYrr", - "VPSRLQYrr", - "VPSRLWYrr", "VPTESTYrr")>; def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> { diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 2d4985f..876afdb 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -174,7 +174,6 @@ def : WriteRes; def : WriteRes { let Latency = 6; } def : WriteRes; -defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; @@ -197,6 +196,15 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; +// Vector integer shifts. +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; + // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -336,7 +344,6 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; @@ -349,14 +356,6 @@ def SBWriteResGroup0 : SchedWriteRes<[SBPort0]> { let ResourceCycles = [1]; } def: InstRW<[SBWriteResGroup0], (instregex "(V?)CVTSS2SDrr", - "(V?)PSLLDri", - "(V?)PSLLQri", - "(V?)PSLLWri", - "(V?)PSRADri", - "(V?)PSRAWri", - "(V?)PSRLDri", - "(V?)PSRLQri", - "(V?)PSRLWri", "VTESTPD(Y?)rr", "VTESTPS(Y?)rr")>; @@ -496,20 +495,6 @@ def SBWriteResGroup13 : SchedWriteRes<[SBPort0,SBPort5]> { def: InstRW<[SBWriteResGroup13], (instregex "(V?)CVTPS2PD(Y?)rr", "(V?)PTEST(Y?)rr")>; -def SBWriteResGroup14 : SchedWriteRes<[SBPort0,SBPort15]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup14], (instregex "(V?)PSLLDrr", - "(V?)PSLLQrr", - "(V?)PSLLWrr", - "(V?)PSRADrr", - "(V?)PSRAWrr", - "(V?)PSRLDrr", - "(V?)PSRLQrr", - "(V?)PSRLWrr")>; - def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> { let Latency = 2; let NumMicroOps = 2; @@ -1073,20 +1058,6 @@ def SBWriteResGroup78 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> { } def: InstRW<[SBWriteResGroup78], (instregex "(V?)PTESTrm")>; -def SBWriteResGroup79 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> { - let Latency = 8; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SBWriteResGroup79], (instregex "(V?)PSLLDrm", - "(V?)PSLLQrm", - "(V?)PSLLWrm", - "(V?)PSRADrm", - "(V?)PSRAWrm", - "(V?)PSRLDrm", - "(V?)PSRLQrm", - "(V?)PSRLWrm")>; - def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> { let Latency = 8; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 402ae1f..0ef7938 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -199,7 +199,6 @@ defm : SKLWriteResPair; // Vector inte defm : SKLWriteResPair; // Vector integer ALU op, no logicals (YMM/ZMM). defm : SKLWriteResPair; // Vector integer and/or/xor. defm : SKLWriteResPair; // Vector integer and/or/xor (YMM/ZMM). -defm : SKLWriteResPair; // Vector integer shifts. defm : SKLWriteResPair; // Vector integer multiply. defm : SKLWriteResPair; // Vector integer multiply (YMM/ZMM). defm : SKLWriteResPair; // Vector PMULLD. @@ -218,6 +217,17 @@ defm : SKLWriteResPair; // Vector PSADBW defm : SKLWriteResPair; // Vector PSADBW. defm : SKLWriteResPair; // Vector PHMINPOS. +// Vector integer shifts. +defm : SKLWriteResPair; +defm : SKLWriteResPair; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : SKLWriteResPair; // Vector integer immediate shifts (XMM). +defm : SKLWriteResPair; // Vector integer immediate shifts (YMM/ZMM). +defm : SKLWriteResPair; // Variable vector shifts. +defm : SKLWriteResPair; // Variable vector shifts (YMM/ZMM). + // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -353,7 +363,6 @@ defm : SKLWriteResPair; // Fp 256-bi defm : SKLWriteResPair; // Fp 256-bit width vector variable shuffles. defm : SKLWriteResPair; // 256-bit width vector shuffles. defm : SKLWriteResPair; // 256-bit width vector variable shuffles. -defm : SKLWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -426,25 +435,6 @@ def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> { } def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>; -def SKLWriteResGroup5 : SchedWriteRes<[SKLPort01]> { - let Latency = 1; - let NumMicroOps = 1; - let ResourceCycles = [1]; -} -def: InstRW<[SKLWriteResGroup5], (instregex "(V?)PSLLD(Y?)ri", - "(V?)PSLLQ(Y?)ri", - "VPSLLVD(Y?)rr", - "VPSLLVQ(Y?)rr", - "(V?)PSLLW(Y?)ri", - "(V?)PSRAD(Y?)ri", - "VPSRAVD(Y?)rr", - "(V?)PSRAW(Y?)ri", - "(V?)PSRLD(Y?)ri", - "(V?)PSRLQ(Y?)ri", - "VPSRLVD(Y?)rr", - "VPSRLVQ(Y?)rr", - "(V?)PSRLW(Y?)ri")>; - def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> { let Latency = 1; let NumMicroOps = 1; @@ -619,20 +609,6 @@ def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPD(Y?)mr", "VPMASKMOVD(Y?)mr", "VPMASKMOVQ(Y?)mr")>; -def SKLWriteResGroup19 : SchedWriteRes<[SKLPort5,SKLPort01]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKLWriteResGroup19], (instregex "(V?)PSLLDrr", - "(V?)PSLLQrr", - "(V?)PSLLWrr", - "(V?)PSRADrr", - "(V?)PSRAWrr", - "(V?)PSRLDrr", - "(V?)PSRLQrr", - "(V?)PSRLWrr")>; - def SKLWriteResGroup20 : SchedWriteRes<[SKLPort6,SKLPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -895,20 +871,6 @@ def SKLWriteResGroup51_16 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { } def: InstRW<[SKLWriteResGroup51_16], (instrs IMUL16r, MUL16r)>; -def SKLWriteResGroup52 : SchedWriteRes<[SKLPort5,SKLPort01]> { - let Latency = 4; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLDYrr", - "VPSLLQYrr", - "VPSLLWYrr", - "VPSRADYrr", - "VPSRAWYrr", - "VPSRLDYrr", - "VPSRLQYrr", - "VPSRLWYrr")>; - def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> { let Latency = 4; let NumMicroOps = 3; @@ -1263,16 +1225,11 @@ def SKLWriteResGroup90 : SchedWriteRes<[SKLPort01,SKLPort23]> { } def: InstRW<[SKLWriteResGroup90], (instregex "(V?)PSLLDrm", "(V?)PSLLQrm", - "VPSLLVDrm", - "VPSLLVQrm", "(V?)PSLLWrm", "(V?)PSRADrm", - "VPSRAVDrm", "(V?)PSRAWrm", "(V?)PSRLDrm", "(V?)PSRLQrm", - "(V?)PSRLVDrm", - "VPSRLVQrm", "(V?)PSRLWrm")>; def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> { @@ -1431,25 +1388,6 @@ def: InstRW<[SKLWriteResGroup108], (instregex "FCOM32m", "VPMOVSXBQYrm", "VPMOVSXWQYrm")>; -def SKLWriteResGroup109 : SchedWriteRes<[SKLPort01,SKLPort23]> { - let Latency = 8; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLDYrm", - "VPSLLQYrm", - "VPSLLVDYrm", - "VPSLLVQYrm", - "VPSLLWYrm", - "VPSRADYrm", - "VPSRAVDYrm", - "VPSRAWYrm", - "VPSRLDYrm", - "VPSRLQYrm", - "VPSRLVDYrm", - "VPSRLVQYrm", - "VPSRLWYrm")>; - def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> { let Latency = 8; let NumMicroOps = 2; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index b1bd040..41b7582 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -199,7 +199,6 @@ defm : SKXWriteResPair; // Vector inte defm : SKXWriteResPair; // Vector integer ALU op, no logicals (YMM/ZMM). defm : SKXWriteResPair; // Vector integer and/or/xor. defm : SKXWriteResPair; // Vector integer and/or/xor (YMM/ZMM). -defm : SKXWriteResPair; // Vector integer shifts. defm : SKXWriteResPair; // Vector integer multiply. defm : SKXWriteResPair; // Vector integer multiply (YMM/ZMM). defm : SKXWriteResPair; // Vector PMULLD. @@ -218,6 +217,18 @@ defm : SKXWriteResPair; // Vector PSADBW defm : SKXWriteResPair; // Vector PSADBW. defm : SKXWriteResPair; // Vector PHMINPOS. +// Vector integer shifts. +defm : SKXWriteResPair; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : SKXWriteResPair; // Vector integer immediate shifts (XMM). +defm : SKXWriteResPair; // Vector integer immediate shifts (YMM/ZMM). +defm : SKXWriteResPair; // Variable vector shifts. +defm : SKXWriteResPair; // Variable vector shifts (YMM/ZMM). + // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -353,7 +364,6 @@ defm : SKXWriteResPair; // Fp 256-bi defm : SKXWriteResPair; // Fp 256-bit width vector variable shuffles. defm : SKXWriteResPair; // 256-bit width vector shuffles. defm : SKXWriteResPair; // 256-bit width vector variable shuffles. -defm : SKXWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -474,116 +484,6 @@ def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> { } def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>; -def SKXWriteResGroup5 : SchedWriteRes<[SKXPort01]> { - let Latency = 1; - let NumMicroOps = 1; - let ResourceCycles = [1]; -} -def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ128ri", - "VPROLDZ256ri", - "VPROLDZri", - "VPROLQZ128ri", - "VPROLQZ256ri", - "VPROLQZri", - "VPROLVDZ128rr", - "VPROLVDZ256rr", - "VPROLVDZrr", - "VPROLVQZ128rr", - "VPROLVQZ256rr", - "VPROLVQZrr", - "VPRORDZ128ri", - "VPRORDZ256ri", - "VPRORDZri", - "VPRORQZ128ri", - "VPRORQZ256ri", - "VPRORQZri", - "VPRORVDZ128rr", - "VPRORVDZ256rr", - "VPRORVDZrr", - "VPRORVQZ128rr", - "VPRORVQZ256rr", - "VPRORVQZrr", - "(V?)PSLLDYri", - "VPSLLDZ128ri", - "VPSLLDZ256ri", - "VPSLLDZri", - "(V?)PSLLDri", - "VPSLLQYri", - "VPSLLQZ128ri", - "VPSLLQZ256ri", - "VPSLLQZri", - "(V?)PSLLQri", - "VPSLLVDYrr", - "VPSLLVDZ128rr", - "VPSLLVDZ256rr", - "VPSLLVDZrr", - "VPSLLVDrr", - "VPSLLVQYrr", - "VPSLLVQZ128rr", - "VPSLLVQZ256rr", - "VPSLLVQZrr", - "VPSLLVQrr", - "VPSLLVWZ128rr", - "VPSLLVWZ256rr", - "VPSLLVWZrr", - "VPSLLWYri", - "VPSLLWZ128ri", - "VPSLLWZ256ri", - "VPSLLWZri", - "(V?)PSLLWri", - "VPSRADYri", - "VPSRADZ128ri", - "VPSRADZ256ri", - "VPSRADZri", - "(V?)PSRADri", - "VPSRAQZ128ri", - "VPSRAQZ256ri", - "VPSRAQZri", - "VPSRAVDYrr", - "VPSRAVDZ128rr", - "VPSRAVDZ256rr", - "VPSRAVDZrr", - "VPSRAVDrr", - "VPSRAVQZ128rr", - "VPSRAVQZ256rr", - "VPSRAVQZrr", - "VPSRAVWZ128rr", - "VPSRAVWZ256rr", - "VPSRAVWZrr", - "VPSRAWYri", - "VPSRAWZ128ri", - "VPSRAWZ256ri", - "VPSRAWZri", - "(V?)PSRAWri", - "VPSRLDYri", - "VPSRLDZ128ri", - "VPSRLDZ256ri", - "VPSRLDZri", - "(V?)PSRLDri", - "VPSRLQYri", - "VPSRLQZ128ri", - "VPSRLQZ256ri", - "VPSRLQZri", - "(V?)PSRLQri", - "VPSRLVDYrr", - "VPSRLVDZ128rr", - "VPSRLVDZ256rr", - "VPSRLVDZrr", - "VPSRLVDrr", - "VPSRLVQYrr", - "VPSRLVQZ128rr", - "VPSRLVQZ256rr", - "VPSRLVQZrr", - "VPSRLVQrr", - "VPSRLVWZ128rr", - "VPSRLVWZ256rr", - "VPSRLVWZrr", - "VPSRLWYri", - "VPSRLWZ128ri", - "VPSRLWZ256ri", - "VPSRLWZri", - "(V?)PSRLWri")>; - def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> { let Latency = 1; let NumMicroOps = 1; @@ -915,28 +815,6 @@ def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDYmr", "VPMASKMOVQYmr", "VPMASKMOVQmr")>; -def SKXWriteResGroup19 : SchedWriteRes<[SKXPort5,SKXPort01]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDZ128rr", - "(V?)PSLLDrr", - "VPSLLQZ128rr", - "(V?)PSLLQrr", - "VPSLLWZ128rr", - "(V?)PSLLWrr", - "VPSRADZ128rr", - "(V?)PSRADrr", - "VPSRAQZ128rr", - "VPSRAWZ128rr", - "(V?)PSRAWrr", - "VPSRLDZ128rr", - "(V?)PSRLDrr", - "VPSRLQZ128rr", - "(V?)PSRLQrr", - "(V?)PSRLWrr")>; - def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -1462,38 +1340,6 @@ def SKXWriteResGroup52_16 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { } def: InstRW<[SKXWriteResGroup52_16], (instrs IMUL16r, MUL16r)>; -def SKXWriteResGroup53 : SchedWriteRes<[SKXPort5,SKXPort01]> { - let Latency = 4; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDYrr", - "VPSLLDZ256rr", - "VPSLLDZrr", - "VPSLLQYrr", - "VPSLLQZ256rr", - "VPSLLQZrr", - "VPSLLWYrr", - "VPSLLWZ256rr", - "VPSLLWZrr", - "VPSRADYrr", - "VPSRADZ256rr", - "VPSRADZrr", - "VPSRAQZ256rr", - "VPSRAQZrr", - "VPSRAWYrr", - "VPSRAWZ256rr", - "VPSRAWZrr", - "VPSRLDYrr", - "VPSRLDZ256rr", - "VPSRLDZrr", - "VPSRLQYrr", - "VPSRLQZ256rr", - "VPSRLQZrr", - "VPSRLWYrr", - "VPSRLWZ256rr", - "VPSRLWZrr")>; - def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { let Latency = 4; let NumMicroOps = 3; @@ -2066,59 +1912,6 @@ def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr", "VCVTUQQ2PSZ256rr", "VCVTUQQ2PSZrr")>; -def SKXWriteResGroup94 : SchedWriteRes<[SKXPort01,SKXPort23]> { - let Latency = 7; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKXWriteResGroup94], (instregex "VPROLDZ128m(b?)i", - "VPROLQZ128m(b?)i", - "VPROLVDZ128rm(b?)", - "VPROLVQZ128rm(b?)", - "VPRORDZ128m(b?)i", - "VPRORQZ128m(b?)i", - "VPRORVDZ128rm(b?)", - "VPRORVQZ128rm(b?)", - "VPSLLDZ128m(b?)i", - "VPSLLDZ128rm(b?)", - "(V?)PSLLDrm", - "VPSLLQZ128m(b?)i", - "VPSLLQZ128rm(b?)", - "(V?)PSLLQrm", - "VPSLLVDZ128rm(b?)", - "VPSLLVDrm", - "VPSLLVQZ128rm(b?)", - "VPSLLVQrm", - "VPSLLVWZ128rm(b?)", - "VPSLLWZ128mi(b?)", - "VPSLLWZ128rm(b?)", - "(V?)PSLLWrm", - "VPSRADZ128m(b?)i", - "VPSRADZ128rm(b?)", - "(V?)PSRADrm", - "VPSRAQZ128m(b?)i", - "VPSRAQZ128rm(b?)", - "VPSRAVDZ128rm(b?)", - "VPSRAVDrm", - "VPSRAVQZ128rm(b?)", - "VPSRAVWZ128rm(b?)", - "VPSRAWZ128mi(b?)", - "VPSRAWZ128rm(b?)", - "(V?)PSRAWrm", - "VPSRLDZ128m(b?)i", - "VPSRLDZ128rm(b?)", - "(V?)PSRLDrm", - "VPSRLQZ128m(b?)i", - "VPSRLQZ128rm(b?)", - "(V?)PSRLQrm", - "VPSRLVDZ128rm(b?)", - "VPSRLVDrm", - "VPSRLVQZ128rm(b?)", - "VPSRLVQrm", - "VPSRLVWZ128rm(b?)", - "VPSRLWZ128mi(b?)", - "VPSRLWZ128rm(b?)", - "(V?)PSRLWrm")>; def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> { let Latency = 7; @@ -2406,95 +2199,6 @@ def: InstRW<[SKXWriteResGroup119], (instregex "FCOM32m", "VPMOVSXBQYrm", "VPMOVSXWQYrm")>; -def SKXWriteResGroup120 : SchedWriteRes<[SKXPort01,SKXPort23]> { - let Latency = 8; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZ256m(b?)i", - "VPROLDZm(b?)i", - "VPROLQZ256m(b?)i", - "VPROLQZm(b?)i", - "VPROLVDZ256rm(b?)", - "VPROLVDZrm(b?)", - "VPROLVQZ256rm(b?)", - "VPROLVQZrm(b?)", - "VPRORDZ256m(b?)i", - "VPRORDZm(b?)i", - "VPRORQZ256m(b?)i", - "VPRORQZm(b?)i", - "VPRORVDZ256rm(b?)", - "VPRORVDZrm(b?)", - "VPRORVQZ256rm(b?)", - "VPRORVQZrm(b?)", - "VPSLLDYrm", - "VPSLLDZ256m(b?)i", - "VPSLLDZ256rm(b?)", - "VPSLLDZm(b?)i", - "VPSLLDZrm(b?)", - "VPSLLQYrm", - "VPSLLQZ256m(b?)i", - "VPSLLQZ256rm(b?)", - "VPSLLQZm(b?)i", - "VPSLLQZrm(b?)", - "VPSLLVDYrm", - "VPSLLVDZ256rm(b?)", - "VPSLLVDZrm(b?)", - "VPSLLVQYrm", - "VPSLLVQZ256rm(b?)", - "VPSLLVQZrm(b?)", - "VPSLLVWZ256rm(b?)", - "VPSLLVWZrm(b?)", - "VPSLLWYrm", - "VPSLLWZ256mi(b?)", - "VPSLLWZ256rm(b?)", - "VPSLLWZmi(b?)", - "VPSLLWZrm(b?)", - "VPSRADYrm", - "VPSRADZ256m(b?)i", - "VPSRADZ256rm(b?)", - "VPSRADZm(b?)i", - "VPSRADZrm(b?)", - "VPSRAQZ256m(b?)i", - "VPSRAQZ256rm(b?)", - "VPSRAQZm(b?)i", - "VPSRAQZrm(b?)", - "VPSRAVDYrm", - "VPSRAVDZ256rm(b?)", - "VPSRAVDZrm(b?)", - "VPSRAVQZ256rm(b?)", - "VPSRAVQZrm(b?)", - "VPSRAVWZ256rm(b?)", - "VPSRAVWZrm(b?)", - "VPSRAWYrm", - "VPSRAWZ256mi(b?)", - "VPSRAWZ256rm(b?)", - "VPSRAWZmi(b?)", - "VPSRAWZrm(b?)", - "VPSRLDYrm", - "VPSRLDZ256m(b?)i", - "VPSRLDZ256rm(b?)", - "VPSRLDZm(b?)i", - "VPSRLDZrm(b?)", - "VPSRLQYrm", - "VPSRLQZ256m(b?)i", - "VPSRLQZ256rm(b?)", - "VPSRLQZm(b?)i", - "VPSRLQZrm(b?)", - "VPSRLVDYrm", - "VPSRLVDZ256rm(b?)", - "VPSRLVDZrm(b?)", - "VPSRLVQYrm", - "VPSRLVQZ256rm(b?)", - "VPSRLVQZrm(b?)", - "VPSRLVWZ256rm(b?)", - "VPSRLVWZrm(b?)", - "VPSRLWYrm", - "VPSRLWZ256mi(b?)", - "VPSRLWZ256rm(b?)", - "VPSRLWZmi(b?)", - "VPSRLWZrm(b?)")>; - def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> { let Latency = 8; let NumMicroOps = 2; diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index 7e984a5..7f6a38f 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -19,6 +19,17 @@ def ReadAfterLd : SchedRead; // load + WriteRMW. def WriteRMW : SchedWrite; +// Helper to set SchedWrite ExePorts/Latency/ResourceCycles/NumMicroOps. +multiclass X86WriteRes ExePorts, + int Lat, list Res, int UOps> { + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } +} + // Most instructions can fold loads, so almost every SchedWrite comes in two // variants: With and without a folded load. // An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite @@ -137,7 +148,11 @@ defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. defm WriteVecALUY : X86SchedWritePair; // Vector integer ALU op, no logicals (YMM/ZMM). defm WriteVecLogic : X86SchedWritePair; // Vector integer and/or/xor logicals. defm WriteVecLogicY: X86SchedWritePair; // Vector integer and/or/xor logicals (YMM/ZMM). -defm WriteVecShift : X86SchedWritePair; // Vector integer shifts. +defm WriteVecShift : X86SchedWritePair; // Vector integer shifts (default). +defm WriteVecShiftX : X86SchedWritePair; // Vector integer shifts (XMM). +defm WriteVecShiftY : X86SchedWritePair; // Vector integer shifts (YMM/ZMM). +defm WriteVecShiftImmX: X86SchedWritePair; // Vector integer immediate shifts (XMM). +defm WriteVecShiftImmY: X86SchedWritePair; // Vector integer immediate shifts (YMM/ZMM). defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply. defm WriteVecIMulY : X86SchedWritePair; // Vector integer multiply (YMM/ZMM). defm WritePMULLD : X86SchedWritePair; // Vector PMULLD. @@ -205,7 +220,8 @@ defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles. defm WriteFVarShuffle256 : X86SchedWritePair; // Fp 256-bit width variable shuffles. defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles. defm WriteVarShuffle256 : X86SchedWritePair; // 256-bit width vector variable shuffles. -defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts. +defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts. +defm WriteVarVecShiftY : X86SchedWritePair; // Variable vector shifts (YMM/ZMM). // Old microcoded instructions that nobody use. def WriteMicrocoded : SchedWrite; @@ -258,11 +274,14 @@ def SchedWriteVecLogic : X86SchedWriteWidths; def SchedWriteVecShift - : X86SchedWriteWidths; + : X86SchedWriteWidths; +def SchedWriteVecShiftImm + : X86SchedWriteWidths; def SchedWriteVarVecShift : X86SchedWriteWidths; + WriteVarVecShiftY, WriteVarVecShiftY>; def SchedWriteVecIMul : X86SchedWriteWidths; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 26bad49..da19ad7 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -256,6 +256,10 @@ defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; +defm : AtomWriteResPair; +defm : AtomWriteResPair; +defm : AtomWriteResPair; +defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; @@ -276,6 +280,7 @@ defm : AtomWriteResPair; // NOTE: defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. +defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. //////////////////////////////////////////////////////////////////////////////// // Vector insert/extract operations. diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index c35a53e..d930ed0 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -406,6 +406,10 @@ def : WriteRes; defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; +defm : JWriteResFpuPair; +defm : JWriteResFpuPair; +defm : JWriteResFpuPair; +defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; @@ -428,6 +432,7 @@ defm : JWriteResFpuPair; // NOTE: Doesn't defm : JWriteResFpuPair; defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. +defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. //////////////////////////////////////////////////////////////////////////////// // Vector insert/extract operations. diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 6e7a010..51ced28 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -162,7 +162,11 @@ def : WriteRes; def : WriteRes { let Latency = 3; } def : WriteRes; -defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; @@ -288,7 +292,6 @@ def : WriteRes { let ResourceCycles = [10, 1]; } - def : WriteRes { let Latency = 100; } def : WriteRes { let Latency = 100; } def : WriteRes; @@ -306,7 +309,8 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 2f4a3ef..8c4c960 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -235,6 +235,10 @@ def : WriteRes; def : WriteRes { let Latency = 8; } defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -258,7 +262,8 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; // Vector Shift Operations -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // Vector insert/extract operations. defm : ZnWriteResFpuPair; diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll index 4eb4e25..94c8fbf 100644 --- a/llvm/test/CodeGen/X86/avx2-schedule.ll +++ b/llvm/test/CodeGen/X86/avx2-schedule.ll @@ -5450,8 +5450,8 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn define <8 x i32> @test_pslld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_pslld: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5534,8 +5534,8 @@ define <32 x i8> @test_pslldq(<32 x i8> %a0) { define <4 x i64> @test_psllq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-LABEL: test_psllq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5585,7 +5585,7 @@ define <4 x i32> @test_psllvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_psllvd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvd: @@ -5628,7 +5628,7 @@ define <8 x i32> @test_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) ; GENERIC-LABEL: test_psllvd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvd_ymm: @@ -5671,7 +5671,7 @@ define <2 x i64> @test_psllvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-LABEL: test_psllvq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvq: @@ -5714,7 +5714,7 @@ define <4 x i64> @test_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) ; GENERIC-LABEL: test_psllvq_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvq_ymm: @@ -5756,8 +5756,8 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read define <16 x i16> @test_psllw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psllw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5806,8 +5806,8 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon define <8 x i32> @test_psrad(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_psrad: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5857,7 +5857,7 @@ define <4 x i32> @test_psravd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_psravd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psravd: @@ -5900,7 +5900,7 @@ define <8 x i32> @test_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) ; GENERIC-LABEL: test_psravd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psravd_ymm: @@ -5942,8 +5942,8 @@ declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind read define <16 x i16> @test_psraw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psraw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5992,8 +5992,8 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon define <8 x i32> @test_psrld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_psrld: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6076,8 +6076,8 @@ define <32 x i8> @test_psrldq(<32 x i8> %a0) { define <4 x i64> @test_psrlq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-LABEL: test_psrlq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6127,7 +6127,7 @@ define <4 x i32> @test_psrlvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_psrlvd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvd: @@ -6170,7 +6170,7 @@ define <8 x i32> @test_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) ; GENERIC-LABEL: test_psrlvd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvd_ymm: @@ -6213,7 +6213,7 @@ define <2 x i64> @test_psrlvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-LABEL: test_psrlvq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvq: @@ -6256,7 +6256,7 @@ define <4 x i64> @test_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) ; GENERIC-LABEL: test_psrlvq_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvq_ymm: @@ -6298,8 +6298,8 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read define <16 x i16> @test_psrlw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psrlw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index a5d424d..1e9ba6c 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -3404,8 +3404,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 @@ -3424,8 +3424,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 9306f61..ce99b58 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -2864,8 +2864,8 @@ declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x14,0xd1] ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x14,0xc1] ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] @@ -2884,8 +2884,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32>, <8 x i32>, <8 x i define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x14,0xd1] ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x14,0xc1] ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] @@ -2904,8 +2904,8 @@ declare <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64>, <2 x i64>, <2 x i define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x14,0xd1] ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x14,0xc1] ; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] @@ -2924,8 +2924,8 @@ declare <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64>, <4 x i64>, <4 x i define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x14,0xd1] ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x14,0xc1] ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] @@ -3024,8 +3024,8 @@ declare <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32>, <4 x i32>, <4 x i define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x15,0xd1] ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x15,0xc1] ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] @@ -3044,8 +3044,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32>, <8 x i32>, <8 x i define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x15,0xd1] ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x15,0xc1] ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] @@ -3064,8 +3064,8 @@ declare <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64>, <2 x i64>, <2 x i define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x15,0xd1] ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x15,0xc1] ; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] @@ -3084,8 +3084,8 @@ declare <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64>, <4 x i64>, <4 x i define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xd9] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x15,0xd1] ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x15,0xc1] ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] diff --git a/llvm/test/CodeGen/X86/xop-schedule.ll b/llvm/test/CodeGen/X86/xop-schedule.ll index f1db8bc..7179cd4 100644 --- a/llvm/test/CodeGen/X86/xop-schedule.ll +++ b/llvm/test/CodeGen/X86/xop-schedule.ll @@ -869,22 +869,22 @@ define void @test_vprot(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-NEXT: vprotd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vprotb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotb %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotd %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotq %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotw %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotb (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotb %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotd %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotq %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotw %xmm0, (%rdi), %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: vprotb $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotd $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotq $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotw $7, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vprotb $7, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotd $7, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotq $7, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vprotw $7, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotb $7, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotd $7, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotq $7, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotw $7, (%rdi), %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -925,14 +925,14 @@ define void @test_vpsha(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-NEXT: vpshad %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshaq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshaw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpshab (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshaq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshaw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshab %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshad %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshaq %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshaw %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshab (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshad (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshaq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshaw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshab %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshad %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshaq %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshaw %xmm0, (%rdi), %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -965,14 +965,14 @@ define void @test_vpshl(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-NEXT: vpshld %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshlq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshlw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpshlb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshlb %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshld %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshlq %xmm0, (%rdi), %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpshlw %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshlb (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshlq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshlw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshlb %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshld %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshlq %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshlw %xmm0, (%rdi), %xmm0 # sched: [7:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; -- 2.7.4