From 327fac4d75e1631d11d69184cdc3d80084e7e35b Mon Sep 17 00:00:00 2001
From: Clement Courbet
Date: Wed, 7 Mar 2018 08:14:02 +0000
Subject: [PATCH] [X86] Add IMUL scheduling info on sandybridge, fix it on >=haswell.

Summary:
Only IMUL16rri uses an extra P0156. IMUL32* and IMUL16rr only use P1.

This was computed using https://github.com/google/EXEgesis/blob/master/exegesis/tools/compute_itineraries.cc

This can easily be validated by running perf on the following code:

```
int main(int argc, char**argv) {
  int a = argc;
  int b = argc;
  int c = argc;
  int d = argc;
  for (int i = 0; i < LOOP_ITERATIONS; ++i) {
    asm volatile(
        R"(
          .rept 10000
          imull $0x2, %%edx, %%eax
          imull $0x2, %%ecx, %%ebx
          imull $0x2, %%eax, %%edx
          imull $0x2, %%ebx, %%ecx
          .endr
        )"
        : "+a"(a), "+b"(b), "+c"(c), "+d"(d)
        :
        :);
  }
  return a+b+c+d;
}
```
-> test.cc

perf stat -x, -e cycles --pfm-events=uops_executed_port:port_0:u,uops_executed_port:port_1:u,uops_executed_port:port_2:u,uops_executed_port:port_3:u,uops_executed_port:port_4:u,uops_executed_port:port_5:u,uops_executed_port:port_6:u,uops_executed_port:port_7:u test

Reviewers: craig.topper, RKSimon, gadi.haber

Subscribers: llvm-commits, gchatelet, chandlerc

Differential Revision: https://reviews.llvm.org/D43460

llvm-svn: 326877
---
 llvm/lib/Target/X86/X86SchedBroadwell.td     |  4 ++--
 llvm/lib/Target/X86/X86SchedHaswell.td       | 15 +++++----------
 llvm/lib/Target/X86/X86SchedSandyBridge.td   |  8 ++++++++
 llvm/lib/Target/X86/X86SchedSkylakeClient.td | 12 +++---------
 llvm/lib/Target/X86/X86SchedSkylakeServer.td | 11 +++--------
 5 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 79deb4b..6441fd5 100755
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -1217,7 +1217,7 @@ def: InstRW<[BWWriteResGroup27], (instregex "COMISSrr")>;
 def: InstRW<[BWWriteResGroup27], (instregex "CVTDQ2PSrr")>;
 def: InstRW<[BWWriteResGroup27], (instregex "CVTPS2DQrr")>;
 def: InstRW<[BWWriteResGroup27], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instrs IMUL32rr, IMUL32rri, IMUL32rri8, IMUL64rr, IMUL64rri32, IMUL64rri8)>;
+def: InstRW<[BWWriteResGroup27], (instrs IMUL16rr, IMUL32rr, IMUL32rri, IMUL32rri8, IMUL64rr, IMUL64rri32, IMUL64rri8)>;
 def: InstRW<[BWWriteResGroup27], (instrs IMUL8r)>;
 def: InstRW<[BWWriteResGroup27], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PDrr")>;
@@ -1298,7 +1298,7 @@ def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> {
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[BWWriteResGroup27_16], (instrs IMUL16rr, IMUL16rri, IMUL16rri8)>;
+def: InstRW<[BWWriteResGroup27_16], (instrs IMUL16rri, IMUL16rri8)>;
 
 def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
   let Latency = 3;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index ce30ee8..fbaf745 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -2498,7 +2498,7 @@ def: InstRW<[HWWriteResGroup50], (instregex "COMISSrr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instrs IMUL64rr, IMUL64rri32, IMUL64rri8)>;
+def: InstRW<[HWWriteResGroup50], (instrs IMUL16rr, IMUL32rr, IMUL32rri, IMUL32rri8, IMUL64rr, IMUL64rri32, IMUL64rri8)>;
 def: InstRW<[HWWriteResGroup50], (instrs IMUL8r)>;
 def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PDrr")>;
@@ -2574,17 +2574,12 @@ def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrr")>;
 
-def HWWriteResGroup50_16 : SchedWriteRes<[HWPort1, HWPort0156]> {
+def HWWriteResGroup50_16i : SchedWriteRes<[HWPort1, HWPort0156]> {
   let Latency = 3;
-  let NumMicroOps = 4;
-}
-def: InstRW<[HWWriteResGroup50_16], (instrs IMUL16rr, IMUL16rri, IMUL16rri8)>;
-
-def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1, HWPort0156]> {
-  let Latency = 3;
-  let NumMicroOps = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def: InstRW<[HWWriteResGroup50_32], (instrs IMUL32rr, IMUL32rri, IMUL32rri8)>;
+def: InstRW<[HWWriteResGroup50_16i], (instrs IMUL16rri, IMUL16rri8)>;
 
 def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> {
   let Latency = 3;
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index bd1a271..bec8f62 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -925,6 +925,7 @@ def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r64")>;
 def: InstRW<[SBWriteResGroup21], (instregex "CVTDQ2PSrr")>;
 def: InstRW<[SBWriteResGroup21], (instregex "CVTPS2DQrr")>;
 def: InstRW<[SBWriteResGroup21], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup21], (instrs IMUL16rr, IMUL32rr, IMUL32rri, IMUL32rri8, IMUL64rr, IMUL64rri32, IMUL64rri8)>;
 def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PDrr")>;
 def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PSrr")>;
 def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SDrr")>;
@@ -1000,6 +1001,13 @@ def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSrr")>;
 def: InstRW<[SBWriteResGroup21], (instregex "VSUBSDrr")>;
 def: InstRW<[SBWriteResGroup21], (instregex "VSUBSSrr")>;
 
+def SBWriteResGroup21_16i : SchedWriteRes<[SBPort1, SBPort015]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup21_16i], (instrs IMUL16rri, IMUL16rri8)>;
+
 def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
   let Latency = 3;
   let NumMicroOps = 2;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 1ff268a..abad9ce 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -1214,7 +1214,7 @@ def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> {
 }
 def: InstRW<[SKLWriteResGroup29], (instregex "BSF(16|32|64)rr")>;
 def: InstRW<[SKLWriteResGroup29], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instrs IMUL64rr, IMUL64rri32, IMUL64rri8)>;
+def: InstRW<[SKLWriteResGroup29], (instrs IMUL16rr, IMUL32rr, IMUL32rri, IMUL32rri8, IMUL64rr, IMUL64rri32, IMUL64rri8)>;
 def: InstRW<[SKLWriteResGroup29], (instrs IMUL8r)>;
 def: InstRW<[SKLWriteResGroup29], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[SKLWriteResGroup29], (instrs MUL8r)>;
@@ -1225,18 +1225,12 @@ def: InstRW<[SKLWriteResGroup29], (instregex "SHLD(16|32|64)rri8")>;
 def: InstRW<[SKLWriteResGroup29], (instregex "SHRD(16|32|64)rri8")>;
 def: InstRW<[SKLWriteResGroup29], (instregex "TZCNT(16|32|64)rr")>;
 
-def SKLWriteResGroup29_16 : SchedWriteRes<[SKLPort1, SKLPort0156]> {
+def SKLWriteResGroup29_16i : SchedWriteRes<[SKLPort1, SKLPort0156]> {
   let Latency = 3;
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[SKLWriteResGroup29_16], (instrs IMUL16rr, IMUL16rri, IMUL16rri8)>;
-
-def SKLWriteResGroup29_32 : SchedWriteRes<[SKLPort1]> {
-  let Latency = 3;
-  let NumMicroOps = 1;
-}
-def: InstRW<[SKLWriteResGroup29_32], (instrs IMUL32rr, IMUL32rri, IMUL32rri8)>;
+def: InstRW<[SKLWriteResGroup29_16i], (instrs IMUL16rri, IMUL16rri8)>;
 
 def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
   let Latency = 3;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 44ba5b4..bc5ecc6 100755
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1758,7 +1758,7 @@ def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> {
 }
 def: InstRW<[SKXWriteResGroup31], (instregex "BSF(16|32|64)rr")>;
 def: InstRW<[SKXWriteResGroup31], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instrs IMUL64rr, IMUL64rri32, IMUL64rri8)>;
+def: InstRW<[SKXWriteResGroup31], (instrs IMUL16rr, IMUL32rr, IMUL32rri, IMUL32rri8, IMUL64rr, IMUL64rri32, IMUL64rri8)>;
 def: InstRW<[SKXWriteResGroup31], (instrs IMUL8r)>;
 def: InstRW<[SKXWriteResGroup31], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[SKXWriteResGroup31], (instrs MUL8r)>;
@@ -1769,18 +1769,13 @@ def: InstRW<[SKXWriteResGroup31], (instregex "SHLD(16|32|64)rri8")>;
 def: InstRW<[SKXWriteResGroup31], (instregex "SHRD(16|32|64)rri8")>;
 def: InstRW<[SKXWriteResGroup31], (instregex "TZCNT(16|32|64)rr")>;
 
-def SKXWriteResGroup31_16 : SchedWriteRes<[SKXPort1, SKXPort0156]> {
+def SKXWriteResGroup31_16i : SchedWriteRes<[SKXPort1, SKXPort0156]> {
   let Latency = 3;
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[SKXWriteResGroup31_16], (instrs IMUL16rr, IMUL16rri, IMUL16rri8)>;
+def: InstRW<[SKXWriteResGroup31_16i], (instrs IMUL16rri, IMUL16rri8)>;
 
-def SKXWriteResGroup31_32 : SchedWriteRes<[SKXPort1]> {
-  let Latency = 3;
-  let NumMicroOps = 1;
-}
-def: InstRW<[SKXWriteResGroup31_32], (instrs IMUL32rr, IMUL32rri, IMUL32rri8)>;
 
 def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
   let Latency = 3;
-- 
2.7.4