From 30c38c38497763d5660fde146e1185c0dbb082d5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 19 Mar 2018 14:46:07 +0000 Subject: [PATCH] [X86] Generalize schedule classes to support multiple stages Currently the WriteResPair style multi-classes take a single pipeline stage and latency; this patch generalizes them to make it easier to create complex schedules by allowing ResourceCycles and NumMicroOps to be overridden from their defaults. This has already been done for the Jaguar scheduler to remove a number of custom schedule classes, and adding it to the other x86 targets will make them much tidier as we add additional classes in the future to try and replace so many custom cases. I've converted some instructions but a lot of the models need a bit of cleanup after the patch has been committed - memory latencies not being consistent, the class not actually being used when we could remove some/all customs, etc. I'd prefer to keep this as close to NFC (no functional change) as possible so later patches can be smaller and target-specific. 
Differential Revision: https://reviews.llvm.org/D44612 llvm-svn: 327855 --- llvm/lib/Target/X86/X86SchedBroadwell.td | 146 +++++++++------------------ llvm/lib/Target/X86/X86SchedHaswell.td | 134 ++++++++---------------- llvm/lib/Target/X86/X86SchedSandyBridge.td | 129 ++++++++--------------- llvm/lib/Target/X86/X86SchedSkylakeClient.td | 124 ++++++++--------------- llvm/lib/Target/X86/X86SchedSkylakeServer.td | 124 ++++++++--------------- llvm/lib/Target/X86/X86ScheduleSLM.td | 113 ++++++++------------- llvm/lib/Target/X86/X86ScheduleZnver1.td | 105 ++++++++++--------- llvm/test/CodeGen/X86/avx2-schedule.ll | 2 +- 8 files changed, 309 insertions(+), 568 deletions(-) diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index f7b6ade..77e595b 100755 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -66,6 +66,9 @@ def BWPortAny : ProcResGroup<[BWPort0, BWPort1, BWPort2, BWPort3, BWPort4, let BufferSize=60; } +// Integer division issued on port 0. +def BWDivider : ProcResource<1>; // Integer division issued on port 0. + // Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. def : ReadAdvance; @@ -76,15 +79,21 @@ def : ReadAdvance; // This multiclass defines the resource usage for variants with and without // folded loads. multiclass BWWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -93,23 +102,15 @@ multiclass BWWriteResPair; // Arithmetic. 
-defm : BWWriteResPair; // Simple integer ALU op. -defm : BWWriteResPair; // Integer multiplication. +defm : BWWriteResPair; // Simple integer ALU op. +defm : BWWriteResPair; // Integer multiplication. +defm : BWWriteResPair; def : WriteRes { let Latency = 3; } // Integer multiplication, high part. -def BWDivider : ProcResource<1>; // Integer division issued on port 0. -def : WriteRes { // Integer division. - let Latency = 25; - let ResourceCycles = [1, 10]; -} -def : WriteRes { - let Latency = 29; - let ResourceCycles = [1, 1, 10]; -} def : WriteRes; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : BWWriteResPair; +defm : BWWriteResPair; // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } @@ -125,30 +126,23 @@ def : InstRW<[WriteMove], (instrs COPY)>; // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : BWWriteResPair; +defm : BWWriteResPair; // Floating point. This covers both scalar and vector operations. def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; -defm : BWWriteResPair; // Floating point add/sub/compare. -defm : BWWriteResPair; // Floating point multiplication. -defm : BWWriteResPair; // 10-14 cycles. // Floating point division. -defm : BWWriteResPair; // Floating point square root. -defm : BWWriteResPair; // Floating point reciprocal estimate. -defm : BWWriteResPair; // Floating point reciprocal square root estimate. -defm : BWWriteResPair; // Fused Multiply Add. -defm : BWWriteResPair; // Floating point vector shuffles. -defm : BWWriteResPair; // Floating point vector blends. -def : WriteRes { // Fp vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : BWWriteResPair; // Floating point add/sub/compare. +defm : BWWriteResPair; // Floating point multiplication. 
+defm : BWWriteResPair; // 10-14 cycles. // Floating point division. +defm : BWWriteResPair; // Floating point square root. +defm : BWWriteResPair; // Floating point reciprocal estimate. +defm : BWWriteResPair; // Floating point reciprocal square root estimate. +defm : BWWriteResPair; // Fused Multiply Add. +defm : BWWriteResPair; // Floating point vector shuffles. +defm : BWWriteResPair; // Floating point vector blends. +defm : BWWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -158,38 +152,22 @@ def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; -defm : BWWriteResPair; // Vector integer ALU op, no logicals. -defm : BWWriteResPair; // Vector integer shifts. -defm : BWWriteResPair; // Vector integer multiply. -defm : BWWriteResPair; // Vector shuffles. -defm : BWWriteResPair; // Vector blends. - -def : WriteRes { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : BWWriteResPair; // Vector integer ALU op, no logicals. +defm : BWWriteResPair; // Vector integer shifts. +defm : BWWriteResPair; // Vector integer multiply. +defm : BWWriteResPair; // Vector shuffles. +defm : BWWriteResPair; // Vector blends. +defm : BWWriteResPair; // Vector variable blends. +defm : BWWriteResPair; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : BWWriteResPair; // Vector and/or/xor. +defm : BWWriteResPair; // Vector and/or/xor. // Conversion between integer and float. -defm : BWWriteResPair; // Float -> Integer. -defm : BWWriteResPair; // Integer -> Float. -defm : BWWriteResPair; // Float -> Float size conversion. 
+defm : BWWriteResPair; // Float -> Integer. +defm : BWWriteResPair; // Integer -> Float. +defm : BWWriteResPair; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -257,29 +235,15 @@ def : WriteRes { } // Carry-less multiplication instructions. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 7; - let ResourceCycles = [2, 1, 1]; -} +defm : BWWriteResPair; // Catch-all for expensive system instructions. def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : BWWriteResPair; // Fp 256-bit width vector shuffles. -defm : BWWriteResPair; // 256-bit width vector shuffles. -def : WriteRes { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : BWWriteResPair; // Fp 256-bit width vector shuffles. +defm : BWWriteResPair; // 256-bit width vector shuffles. +defm : BWWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -293,27 +257,9 @@ def : WriteRes; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : BWWriteResPair; +defm : BWWriteResPair; // Remaining instrs. 
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 8a60c7b..8813e1f 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -80,15 +80,21 @@ def : ReadAdvance; // This multiclass defines the resource usage for variants with and without // folded loads. multiclass HWWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -103,11 +109,11 @@ def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; def : WriteRes { let Latency = 3; } -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on @@ -129,68 +135,36 @@ def : WriteRes; def : WriteRes { let Latency = 5; } def : WriteRes; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; // 10-14 cycles. -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; - -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // 10-14 cycles. 
+defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // Vector integer operations. def : WriteRes; def : WriteRes { let Latency = 5; } def : WriteRes; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; - -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} - -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -544,34 +518,8 @@ def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>; // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} - -// x,m / v,v,m. -def : WriteRes { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} -// v <- v,m. 
-def : WriteRes { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 2, 1]; -} +defm : HWWriteResPair; +defm : HWWriteResPair; //=== Floating Point XMM and YMM Instructions ===// diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index b7613f4..ee2d2c0 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -71,15 +71,21 @@ def : ReadAdvance; // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SBWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 4); + def : WriteRes { + let Latency = !add(Lat, 4); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -92,106 +98,57 @@ def : WriteRes { let Latency = 4; } def : WriteRes; def : WriteRes; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; def : WriteRes { let Latency = 3; } -defm : SBWriteResPair; -defm : SBWriteResPair; + +defm : SBWriteResPair; +defm : SBWriteResPair; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on // the port to read all inputs. We don't model that. def : WriteRes; -// This is quite rough, latency depends on the dividend. -def : WriteRes { - let Latency = 25; - let ResourceCycles = [1, 10]; -} -def : WriteRes { - let Latency = 29; - let ResourceCycles = [1, 1, 10]; -} - // Scalar and vector floating point. 
def : WriteRes; def : WriteRes { let Latency = 6; } def : WriteRes; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -def : WriteRes { - let Latency = 2; - let ResourceCycles = [1, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; -} +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; // Vector integer operations. def : WriteRes; def : WriteRes { let Latency = 6; } def : WriteRes; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -def : WriteRes { - let Latency = 2; - let ResourceCycles = [1, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; -} -def : WriteRes { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1,2]; -} -def : WriteRes { - let Latency = 11; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} - -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. 
-def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SBWriteResPair; +defm : SBWriteResPair; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -286,10 +243,10 @@ def : WriteRes; // AVX2/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; // Remaining SNB instrs. diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 0526a30..23ef6bf 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -77,15 +77,21 @@ def : ReadAdvance; // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SKLWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -94,8 +100,8 @@ multiclass SKLWriteResPair; // Arithmetic. -defm : SKLWriteResPair; // Simple integer ALU op. -defm : SKLWriteResPair; // Integer multiplication. +defm : SKLWriteResPair; // Simple integer ALU op. +defm : SKLWriteResPair; // Integer multiplication. def : WriteRes { let Latency = 3; } // Integer multiplication, high part. def SKLDivider : ProcResource<1>; // Integer division issued on port 0. def : WriteRes { // Integer division. 
@@ -110,7 +116,7 @@ def : WriteRes { def : WriteRes; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : SKLWriteResPair; +defm : SKLWriteResPair; // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } @@ -123,30 +129,23 @@ def : WriteRes; // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : SKLWriteResPair; +defm : SKLWriteResPair; // Floating point. This covers both scalar and vector operations. def : WriteRes { let Latency = 6; } def : WriteRes; def : WriteRes; -defm : SKLWriteResPair; // Floating point add/sub/compare. -defm : SKLWriteResPair; // Floating point multiplication. -defm : SKLWriteResPair; // 10-14 cycles. // Floating point division. -defm : SKLWriteResPair; // Floating point square root. -defm : SKLWriteResPair; // Floating point reciprocal estimate. -defm : SKLWriteResPair; // Floating point reciprocal square root estimate. -defm : SKLWriteResPair; // Fused Multiply Add. -defm : SKLWriteResPair; // Floating point vector shuffles. -defm : SKLWriteResPair; // Floating point vector blends. -def : WriteRes { // Fp vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : SKLWriteResPair; // Floating point add/sub/compare. +defm : SKLWriteResPair; // Floating point multiplication. +defm : SKLWriteResPair; // 10-14 cycles. // Floating point division. +defm : SKLWriteResPair; // Floating point square root. +defm : SKLWriteResPair; // Floating point reciprocal estimate. +defm : SKLWriteResPair; // Floating point reciprocal square root estimate. +defm : SKLWriteResPair; // Fused Multiply Add. +defm : SKLWriteResPair; // Floating point vector shuffles. +defm : SKLWriteResPair; // Floating point vector blends. +defm : SKLWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. 
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -156,38 +155,22 @@ def : WriteRes { let Latency = 6; } def : WriteRes; def : WriteRes; -defm : SKLWriteResPair; // Vector integer ALU op, no logicals. -defm : SKLWriteResPair; // Vector integer shifts. -defm : SKLWriteResPair; // Vector integer multiply. -defm : SKLWriteResPair; // Vector shuffles. -defm : SKLWriteResPair; // Vector blends. - -def : WriteRes { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : SKLWriteResPair; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair; // Vector integer shifts. +defm : SKLWriteResPair; // Vector integer multiply. +defm : SKLWriteResPair; // Vector shuffles. +defm : SKLWriteResPair; // Vector blends. +defm : SKLWriteResPair; // Vector variable blends. +defm : SKLWriteResPair; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : SKLWriteResPair; // Vector and/or/xor. +defm : SKLWriteResPair; // Vector and/or/xor. // Conversion between integer and float. -defm : SKLWriteResPair; // Float -> Integer. -defm : SKLWriteResPair; // Integer -> Float. -defm : SKLWriteResPair; // Float -> Float size conversion. +defm : SKLWriteResPair; // Float -> Integer. +defm : SKLWriteResPair; // Integer -> Float. +defm : SKLWriteResPair; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -268,16 +251,9 @@ def : WriteRes { def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKLWriteResPair; // Fp 256-bit width vector shuffles. -defm : SKLWriteResPair; // 256-bit width vector shuffles. -def : WriteRes { // Variable vector shifts. 
- let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : SKLWriteResPair; // Fp 256-bit width vector shuffles. +defm : SKLWriteResPair; // 256-bit width vector shuffles. +defm : SKLWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -291,27 +267,9 @@ def : WriteRes; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SKLWriteResPair; +defm : SKLWriteResPair; // Remaining instrs. diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 130d0d6..aa1ed437 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -77,15 +77,21 @@ def : ReadAdvance; // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SKXWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. 
- def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -94,8 +100,8 @@ multiclass SKXWriteResPair; // Arithmetic. -defm : SKXWriteResPair; // Simple integer ALU op. -defm : SKXWriteResPair; // Integer multiplication. +defm : SKXWriteResPair; // Simple integer ALU op. +defm : SKXWriteResPair; // Integer multiplication. def : WriteRes { let Latency = 3; } // Integer multiplication, high part. def SKXDivider : ProcResource<1>; // Integer division issued on port 0. def : WriteRes { // Integer division. @@ -110,7 +116,7 @@ def : WriteRes { def : WriteRes; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : SKXWriteResPair; +defm : SKXWriteResPair; // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } @@ -123,30 +129,23 @@ def : WriteRes; // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : SKXWriteResPair; +defm : SKXWriteResPair; // Floating point. This covers both scalar and vector operations. def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; -defm : SKXWriteResPair; // Floating point add/sub/compare. -defm : SKXWriteResPair; // Floating point multiplication. -defm : SKXWriteResPair; // 10-14 cycles. // Floating point division. -defm : SKXWriteResPair; // Floating point square root. -defm : SKXWriteResPair; // Floating point reciprocal estimate. -defm : SKXWriteResPair; // Floating point reciprocal square root estimate. -defm : SKXWriteResPair; // Fused Multiply Add. -defm : SKXWriteResPair; // Floating point vector shuffles. -defm : SKXWriteResPair; // Floating point vector blends. -def : WriteRes { // Fp vector variable blends. 
- let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : SKXWriteResPair; // Floating point add/sub/compare. +defm : SKXWriteResPair; // Floating point multiplication. +defm : SKXWriteResPair; // 10-14 cycles. // Floating point division. +defm : SKXWriteResPair; // Floating point square root. +defm : SKXWriteResPair; // Floating point reciprocal estimate. +defm : SKXWriteResPair; // Floating point reciprocal square root estimate. +defm : SKXWriteResPair; // Fused Multiply Add. +defm : SKXWriteResPair; // Floating point vector shuffles. +defm : SKXWriteResPair; // Floating point vector blends. +defm : SKXWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -156,38 +155,22 @@ def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; -defm : SKXWriteResPair; // Vector integer ALU op, no logicals. -defm : SKXWriteResPair; // Vector integer shifts. -defm : SKXWriteResPair; // Vector integer multiply. -defm : SKXWriteResPair; // Vector shuffles. -defm : SKXWriteResPair; // Vector blends. - -def : WriteRes { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : SKXWriteResPair; // Vector integer ALU op, no logicals. +defm : SKXWriteResPair; // Vector integer shifts. +defm : SKXWriteResPair; // Vector integer multiply. +defm : SKXWriteResPair; // Vector shuffles. +defm : SKXWriteResPair; // Vector blends. +defm : SKXWriteResPair; // Vector variable blends. +defm : SKXWriteResPair; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : SKXWriteResPair; // Vector and/or/xor. 
+defm : SKXWriteResPair; // Vector and/or/xor. // Conversion between integer and float. -defm : SKXWriteResPair; // Float -> Integer. -defm : SKXWriteResPair; // Integer -> Float. -defm : SKXWriteResPair; // Float -> Float size conversion. +defm : SKXWriteResPair; // Float -> Integer. +defm : SKXWriteResPair; // Integer -> Float. +defm : SKXWriteResPair; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -268,16 +251,9 @@ def : WriteRes { def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKXWriteResPair; // Fp 256-bit width vector shuffles. -defm : SKXWriteResPair; // 256-bit width vector shuffles. -def : WriteRes { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : SKXWriteResPair; // Fp 256-bit width vector shuffles. +defm : SKXWriteResPair; // 256-bit width vector shuffles. +defm : SKXWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -291,27 +267,9 @@ def : WriteRes; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SKXWriteResPair; +defm : SKXWriteResPair; // Remaining instrs. 
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 0292ce4..8ba2cbc 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -56,15 +56,21 @@ def : ReadAdvance; // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SMWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 3); + def : WriteRes { + let Latency = !add(Lat, 3); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -80,10 +86,10 @@ def : WriteRes; // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; // This is for simple LEAs with one or two input operands. 
// The complex ones can only execute on port 1, and they require two cycles on @@ -105,74 +111,37 @@ def : WriteRes; def : WriteRes { let Latency = 3; } def : WriteRes; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; - -// This is quite rough, latency depends on precision -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 8; - let ResourceCycles = [1, 1, 2]; -} - -def : WriteRes { - let Latency = 34; - let ResourceCycles = [1, 34]; -} -def : WriteRes { - let Latency = 37; - let ResourceCycles = [1, 1, 34]; -} +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; // Vector integer operations. def : WriteRes; def : WriteRes { let Latency = 3; } def : WriteRes; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD - -def : WriteRes { - let Latency = 3; - let ResourceCycles = [2]; -} - -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -// PHADD|PHSUB (S) W/D. 
-def : WriteRes { - let Latency = 1; - let ResourceCycles = [1]; -} - -def : WriteRes { - let Latency = 4; - let ResourceCycles = [1, 1]; -} +defm : SMWriteResPair; +defm : SMWriteResPair; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -262,10 +231,10 @@ def : WriteRes; // AVX/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. def : WriteRes; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 4ad05f3..c43dae4 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -99,30 +99,41 @@ def : ReadAdvance; // b. addpd // This multiclass is for folded loads for integer units. multiclass ZnWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on ZnAGU // adds 4 cycles to the latency. - def : WriteRes { - let NumMicroOps = 2; - let Latency = !add(Lat, 4); + def : WriteRes { + let Latency = !add(Lat, 4); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); } } // This multiclass is for folded loads for floating point units. multiclass ZnWriteResFpuPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. 
- def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on ZnAGU // adds 7 cycles to the latency. - def : WriteRes { - let Latency = !add(Lat, 7); + def : WriteRes { + let Latency = !add(Lat, 7); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -136,9 +147,10 @@ def : WriteRes { let Latency = 8; } def : WriteRes; def : WriteRes; -defm : ZnWriteResPair; -defm : ZnWriteResPair; -defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; @@ -154,67 +166,60 @@ def : WriteRes { let ResourceCycles = [1, 4, 41]; } -// IMUL +// IMULH def : WriteRes{ let Latency = 4; } -def : WriteRes { - let Latency = 4; -} - -def : WriteRes { - let Latency = 8; -} // Floating point operations def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 8; } -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // Vector integer operations which uses FPU units def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 8; } -defm : ZnWriteResFpuPair; -defm : 
ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // Vector Shift Operations -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // AES Instructions. -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; def : WriteRes; def : WriteRes; // Following instructions with latency=100 are microcoded. // We set long latency so as to block the entire pipeline. -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; //Microcoded Instructions let Latency = 100 in { diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll index e7152f8..92c7dfd 100644 --- a/llvm/test/CodeGen/X86/avx2-schedule.ll +++ b/llvm/test/CodeGen/X86/avx2-schedule.ll @@ -609,7 +609,7 @@ define <16 x i16> @test_mpsadbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_mpsadbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mpsadbw: -- 2.7.4