From 05daae75ad353b050e58599fbebc68a950d23f67 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 19 Mar 2018 14:26:50 +0000
Subject: [PATCH] [x86] put nops into the WriteNop class and customize for
 Jaguar

1. Given that we already have a classification bucket with 'nop' in the name
   (WriteNop), that's where the nop instructions (NOOP*, FNOP) belong. Right now,
   that class is only used for prefix bytes and 'pause'.
2. Give this class a latency of '1' and an ALU resource (JALU01) for Jaguar to tell
   the scheduler (and presumably llvm-mca) how to model the resource requirements
   better, even though a nop has no dependencies.

Differential Revision: https://reviews.llvm.org/D44608

llvm-svn: 327853
---
 llvm/lib/Target/X86/X86InstrFPStack.td   |  6 +++---
 llvm/lib/Target/X86/X86InstrInfo.td      |  2 +-
 llvm/lib/Target/X86/X86ScheduleBtVer2.td |  4 +++-
 llvm/test/CodeGen/X86/schedule-x86_64.ll | 16 ++++++++--------
 llvm/test/CodeGen/X86/sse-schedule.ll    |  4 ++--
 llvm/test/CodeGen/X86/x87-schedule.ll    |  4 ++--
 6 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index 619b399..18071c6 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -667,10 +667,10 @@ def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
 } // Defs = [FPSW]
 } // SchedRW
 
-// Operandless floating-point instructions for the disassembler.
-let SchedRW = [WriteMicrocoded] in {
-def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+// Operand-less floating-point instructions for the disassembler.
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>, Sched<[WriteNop]>;
 
+let SchedRW = [WriteMicrocoded] in {
 let Defs = [FPSW] in {
 def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
 def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 72709fd..5291272 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -1133,7 +1133,7 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
 //
 
 // Nop
-let hasSideEffects = 0, SchedRW = [WriteZero] in {
+let hasSideEffects = 0, SchedRW = [WriteNop] in {
   def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
   def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
                 "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 8811a5d..c092e60 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -286,7 +286,9 @@ defm : JWriteResIntPair<WriteJump,  [JALU01], 1>;
 def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
 def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
 def : WriteRes<WriteFence,  [JSAGU]>;
-def : WriteRes<WriteNop, []>;
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Floating point. This covers both scalar and vector operations.
diff --git a/llvm/test/CodeGen/X86/schedule-x86_64.ll b/llvm/test/CodeGen/X86/schedule-x86_64.ll
index 72a764e..02c4685 100644
--- a/llvm/test/CodeGen/X86/schedule-x86_64.ll
+++ b/llvm/test/CodeGen/X86/schedule-x86_64.ll
@@ -8389,13 +8389,13 @@ define void @test_nop(i16 %a0, i32 %a1, i64 %a2, i16 *%p0, i32 *%p1, i64 *%p2) o
 ; BTVER2-LABEL: test_nop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    nop # sched: [1:?]
-; BTVER2-NEXT:    nopw %di # sched: [1:?]
-; BTVER2-NEXT:    nopw (%rcx) # sched: [1:?]
-; BTVER2-NEXT:    nopl %esi # sched: [1:?]
-; BTVER2-NEXT:    nopl (%r8) # sched: [1:?]
-; BTVER2-NEXT:    nopq %rdx # sched: [1:?]
-; BTVER2-NEXT:    nopq (%r9) # sched: [1:?]
+; BTVER2-NEXT:    nop # sched: [1:0.50]
+; BTVER2-NEXT:    nopw %di # sched: [1:0.50]
+; BTVER2-NEXT:    nopw (%rcx) # sched: [1:0.50]
+; BTVER2-NEXT:    nopl %esi # sched: [1:0.50]
+; BTVER2-NEXT:    nopl (%r8) # sched: [1:0.50]
+; BTVER2-NEXT:    nopq %rdx # sched: [1:0.50]
+; BTVER2-NEXT:    nopq (%r9) # sched: [1:0.50]
 ; BTVER2-NEXT:    #NO_APP
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -9500,7 +9500,7 @@ define void @test_pause() optsize {
 ; BTVER2-LABEL: test_pause:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    pause # sched: [1:?]
+; BTVER2-NEXT:    pause # sched: [1:0.50]
 ; BTVER2-NEXT:    #NO_APP
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll
index 3a123df..a58bdd1 100644
--- a/llvm/test/CodeGen/X86/sse-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse-schedule.ll
@@ -3763,7 +3763,7 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
   ret <4 x float> %7
 }
 
-; 'WriteZero' class instructions.
+; 'WriteZero' and 'WriteNop' class instructions.
 
 define <4 x float> @test_fnop() nounwind {
 ; GENERIC-LABEL: test_fnop:
@@ -3840,7 +3840,7 @@ define <4 x float> @test_fnop() nounwind {
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    nop # sched: [1:?]
+; BTVER2-NEXT:    nop # sched: [1:0.50]
 ; BTVER2-NEXT:    #NO_APP
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
diff --git a/llvm/test/CodeGen/X86/x87-schedule.ll b/llvm/test/CodeGen/X86/x87-schedule.ll
index 2950eb5..09001fb 100644
--- a/llvm/test/CodeGen/X86/x87-schedule.ll
+++ b/llvm/test/CodeGen/X86/x87-schedule.ll
@@ -3209,7 +3209,7 @@ define void @test_fnop() optsize {
 ; SLM-LABEL: test_fnop:
 ; SLM:       # %bb.0:
 ; SLM-NEXT:    #APP
-; SLM-NEXT:    fnop # sched: [100:1.00]
+; SLM-NEXT:    fnop # sched: [1:?]
 ; SLM-NEXT:    #NO_APP
 ; SLM-NEXT:    retl # sched: [4:1.00]
 ;
@@ -3251,7 +3251,7 @@ define void @test_fnop() optsize {
 ; BTVER2-LABEL: test_fnop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    fnop # sched: [100:0.50]
+; BTVER2-NEXT:    fnop # sched: [1:0.50]
 ; BTVER2-NEXT:    #NO_APP
 ; BTVER2-NEXT:    retl # sched: [4:1.00]
 ;
-- 
2.7.4