From ff630c2cdc7f805091e8d8cce40692bb9441fd3a Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Sun, 15 Jul 2018 11:01:38 +0000
Subject: [PATCH] [llvm-mca][BtVer2] teach how to identify false dependencies
 on partially written registers.

The goal of this patch is to improve the throughput analysis in llvm-mca for the
case where instructions perform partial register writes.

On x86, partial register writes are quite difficult to model, mainly because
different processors tend to implement different register merging schemes in
hardware.

When the code contains partial register writes, the IPC (instructions per
cycles) estimated by llvm-mca tends to diverge quite significantly from the
observed IPC (using perf).

Modern AMD processors (at least, from Bulldozer onwards) don't rename partial
registers. Quoting Agner Fog's microarchitecture.pdf:
" The processor always keeps the different parts of an integer register together.
For example, AL and AH are not treated as independent by the out-of-order
execution mechanism. An instruction that writes to part of a register will
therefore have a false dependence on any previous write to the same register or
any part of it."

This patch is a first important step towards improving the analysis of partial
register updates. It changes the semantic of RegisterFile descriptors in
tablegen, and teaches llvm-mca how to identify false dependences in the presence
of partial register writes (for more details: see the new code comments in
include/Target/TargetSchedule.h - class RegisterFile).

This patch doesn't address the case where a write to a part of a register is
followed by a read from the whole register.  On Intel chips, high8 registers
(AH/BH/CH/DH)) can be stored in separate physical registers. However, a later
(dirty) read of the full register (example: AX/EAX) triggers a merge uOp, which
adds extra latency (and potentially affects the pipe usage).
This is a very interesting article on the subject with a very informative answer
from Peter Cordes:
https://stackoverflow.com/questions/45660139/how-exactly-do-partial-registers-on-haswell-skylake-perform-writing-al-seems-to

In future, the definition of RegisterFile can be extended with extra information
that may be used to identify delays caused by merge opcodes triggered by a dirty
read of a partial write.

Differential Revision: https://reviews.llvm.org/D49196

llvm-svn: 337123
---
 llvm/include/llvm/Target/TargetSchedule.td         |  63 ++++++++--
 llvm/lib/Target/X86/X86ScheduleBtVer2.td           |   9 +-
 .../llvm-mca/X86/BtVer2/partial-reg-update-2.s     |  15 +--
 .../llvm-mca/X86/BtVer2/partial-reg-update-3.s     |  35 +++---
 .../llvm-mca/X86/BtVer2/partial-reg-update-4.s     |  30 ++---
 .../llvm-mca/X86/BtVer2/partial-reg-update-5.s     |  14 +--
 .../llvm-mca/X86/BtVer2/partial-reg-update-6.s     |  32 ++---
 llvm/tools/llvm-mca/Instruction.cpp                |  25 ++--
 llvm/tools/llvm-mca/Instruction.h                  |  12 +-
 llvm/tools/llvm-mca/RegisterFile.cpp               | 135 +++++++++++++++------
 llvm/tools/llvm-mca/RegisterFile.h                 |  35 ++++--
 11 files changed, 275 insertions(+), 130 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index 29daa02..6fd2d5b 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -453,15 +453,60 @@ class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
   SchedMachineModel SchedModel = ?;
 }
 
-// Allow the definition of processor register files.
-// Each processor register file declares the number of physical registers, as
-// well as a optional register cost information. The cost of a register R is the
-// number of physical registers used to rename R (at register renaming stage).
-// That value defaults to 1, to all the registers contained in the register
-// file. The set of target register files is inferred from the list of register
-// classes. Register costs are defined at register class granularity. An empty
-// list of register classes means that this register file contains all the
-// registers defined by the target.
+// Allow the definition of processor register files for register renaming
+// purposes.
+//
+// Each processor register file declares:
+//  - The set of registers that can be renamed.
+//  - The number of physical registers which can be used for register renaming
+//    purpose.
+//  - The cost of a register rename.
+//
+// The cost of a rename is the number of physical registers allocated by the
+// register alias table to map the new definition. By default, register can be
+// renamed at the cost of a single physical register.  Note that register costs
+// are defined at register class granularity (see field `Costs`).
+//
+// The set of registers that are subject to register renaming is declared using
+// a list of register classes (see field `RegClasses`). An empty list of
+// register classes means: all the logical registers defined by the target can
+// be fully renamed.
+//
+// A register R can be renamed if its register class appears in the `RegClasses`
+// set. When R is written, a new alias is allocated at the cost of one or more
+// physical registers; as a result, false dependencies on R are removed.
+//
+// A sub-register V of register R is implicitly part of the same register file.
+// However, V is only renamed if its register class is part of `RegClasses`.
+// Otherwise, the processor keeps it (as well as any other different part
+// of R) together with R, and a write of V always causes a compulsory read of R.
+//
+// This is what happens for example on AMD processors (at least from Bulldozer
+// onwards), where AL and AH are not treated as independent from AX, and AX is
+// not treated as independent from EAX. A write to AL has an implicity false
+// dependency on the last write to EAX (or a portion of EAX).  As a consequence,
+// a write to AL cannot go in parallel with a write to AH.
+//
+// There is no false dependency if the partial register write belongs to a
+// register class that is in `RegClasses`.
+// There is also no penalty for writes that "clear the content a super-register"
+// (see MC/MCInstrAnalysis.h - method MCInstrAnalysis::clearsSuperRegisters()).
+// On x86-64, 32-bit GPR writes implicitly zero the upper half of the underlying
+// physical register, effectively removing any false dependencies with the
+// previous register definition.
+//
+// TODO: This implementation assumes that there is no limit in the number of
+// renames per cycle, which might not be true for all hardware or register
+// classes. Also, there is no limit to how many times the same logical register
+// can be renamed during the same cycle.
+//
+// TODO: we don't currently model merge penalties for the case where a write to
+// a part of a register is followed by a read from a larger part of the same
+// register. On some Intel chips, different parts of a GPR can be stored in
+// different physical registers. However, there is a cost to pay for when the
+// partial write is combined with the previous super-register definition.  We
+// should add support for these cases, and correctly model merge problems with
+// partial register accesses.
 class RegisterFile<int numPhysRegs, list<RegisterClass> Classes = [],
                    list<int> Costs = []> {
   list<RegisterClass> RegClasses = Classes;
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index ce1d49a..177aa01 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -41,7 +41,14 @@ def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
 // The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
 // speculative version of the 64-bit integer registers.
 // Reference: www.realworldtech.com/jaguar/4/
-def JIntegerPRF : RegisterFile<64, [GR8, GR16, GR32, GR64, CCR]>;
+//
+// The processor always keeps the different parts of an integer register
+// together. An instruction that writes to a part of a register will therefore
+// have a false dependence on any previous write to the same register or any
+// part of it.
+// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
+// access" - Agner Fog's "microarchitecture.pdf".
+def JIntegerPRF : RegisterFile<64, [GR64, CCR]>;
 
 // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
 // registers. Operations on 256-bit data types are cracked into two COPs.
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s
index c8cc463..fb6eb23 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s
@@ -7,9 +7,9 @@ add    %ecx, %ebx
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      3
-# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total Cycles:      11
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Instruction Info:
@@ -26,11 +26,12 @@ add    %ecx, %ebx
 # CHECK-NEXT:  1      1     0.50                        addl	%ecx, %ebx
 
 # CHECK:      Timeline view:
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER.   imulq	%rax, %rbx
-# CHECK-NEXT: [0,1]     .DeE----R.   lzcntw	%ax, %bx
-# CHECK-NEXT: [0,2]     .D=====eER   addl	%ecx, %ebx
+# CHECK:      [0,0]     DeeeeeeER .   imulq	%rax, %rbx
+# CHECK-NEXT: [0,1]     .D=====eER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     .D======eER   addl	%ecx, %ebx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -40,5 +41,5 @@ add    %ecx, %ebx
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       imulq	%rax, %rbx
-# CHECK-NEXT: 1.     1     1.0    1.0    4.0       lzcntw	%ax, %bx
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       addl	%ecx, %ebx
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       addl	%ecx, %ebx
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s
index 2384528..0a9ffca 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s
@@ -12,9 +12,9 @@ xor %bx, %dx
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      2254
+# CHECK-NEXT: Total Cycles:      4503
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: IPC:               1.00
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Instruction Info:
@@ -52,22 +52,23 @@ xor %bx, %dx
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
-# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %dx
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     movw	%ax, %dx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %dx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     movw	%ax, %dx
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     xorw	%bx, %dx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     012345678
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .  .   addw	%cx, %dx
-# CHECK-NEXT: [0,1]     DeER .  .   movw	%ax, %dx
-# CHECK-NEXT: [0,2]     .DeER.  .   xorw	%bx, %dx
-# CHECK-NEXT: [1,0]     .D=eER  .   addw	%cx, %dx
-# CHECK-NEXT: [1,1]     . DeER  .   movw	%ax, %dx
-# CHECK-NEXT: [1,2]     . D=eER .   xorw	%bx, %dx
-# CHECK-NEXT: [2,0]     .  D=eER.   addw	%cx, %dx
-# CHECK-NEXT: [2,1]     .  DeE-R.   movw	%ax, %dx
-# CHECK-NEXT: [2,2]     .   DeE-R   xorw	%bx, %dx
+# CHECK:      [0,0]     DeER .    ..   addw	%cx, %dx
+# CHECK-NEXT: [0,1]     D=eER.    ..   movw	%ax, %dx
+# CHECK-NEXT: [0,2]     .D=eER    ..   xorw	%bx, %dx
+# CHECK-NEXT: [1,0]     .D==eER   ..   addw	%cx, %dx
+# CHECK-NEXT: [1,1]     . D==eER  ..   movw	%ax, %dx
+# CHECK-NEXT: [1,2]     . D===eER ..   xorw	%bx, %dx
+# CHECK-NEXT: [2,0]     .  D===eER..   addw	%cx, %dx
+# CHECK-NEXT: [2,1]     .  D====eER.   movw	%ax, %dx
+# CHECK-NEXT: [2,2]     .   D====eER   xorw	%bx, %dx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -76,6 +77,6 @@ xor %bx, %dx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.7    0.3    0.0       addw	%cx, %dx
-# CHECK-NEXT: 1.     3     1.0    1.0    0.3       movw	%ax, %dx
-# CHECK-NEXT: 2.     3     1.3    0.0    0.3       xorw	%bx, %dx
+# CHECK-NEXT: 0.     3     2.7    0.3    0.0       addw	%cx, %dx
+# CHECK-NEXT: 1.     3     3.3    0.0    0.0       movw	%ax, %dx
+# CHECK-NEXT: 2.     3     3.7    0.0    0.0       xorw	%bx, %dx
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s
index 6277ceb..cda173e 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s
@@ -12,9 +12,9 @@ add %cx, %bx
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      3006
+# CHECK-NEXT: Total Cycles:      7503
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               1.50
+# CHECK-NEXT: IPC:               0.60
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -57,18 +57,18 @@ add %cx, %bx
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %bx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
+# CHECK-NEXT:                     01234567
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    ..   imulw	%ax, %bx
-# CHECK-NEXT: [0,1]     .DeE-R    ..   lzcntw	%ax, %bx
-# CHECK-NEXT: [0,2]     .D=eE-R   ..   addw	%cx, %bx
-# CHECK-NEXT: [1,0]     . D=eeeER ..   imulw	%ax, %bx
-# CHECK-NEXT: [1,1]     .  DeE--R ..   lzcntw	%ax, %bx
-# CHECK-NEXT: [1,2]     .  D=eE--R..   addw	%cx, %bx
-# CHECK-NEXT: [2,0]     .   D=eeeER.   imulw	%ax, %bx
-# CHECK-NEXT: [2,1]     .    DeE--R.   lzcntw	%ax, %bx
-# CHECK-NEXT: [2,2]     .    D=eE--R   addw	%cx, %bx
+# CHECK:      [0,0]     DeeeER    .    . .   imulw	%ax, %bx
+# CHECK-NEXT: [0,1]     .D==eER   .    . .   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     .D===eER  .    . .   addw	%cx, %bx
+# CHECK-NEXT: [1,0]     . D===eeeER    . .   imulw	%ax, %bx
+# CHECK-NEXT: [1,1]     .  D=====eER   . .   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,2]     .  D======eER  . .   addw	%cx, %bx
+# CHECK-NEXT: [2,0]     .   D======eeeER .   imulw	%ax, %bx
+# CHECK-NEXT: [2,1]     .    D========eER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,2]     .    D=========eER   addw	%cx, %bx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -77,6 +77,6 @@ add %cx, %bx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.7    0.3    0.0       imulw	%ax, %bx
-# CHECK-NEXT: 1.     3     1.0    1.0    1.7       lzcntw	%ax, %bx
-# CHECK-NEXT: 2.     3     2.0    0.0    1.7       addw	%cx, %bx
+# CHECK-NEXT: 0.     3     4.0    0.3    0.0       imulw	%ax, %bx
+# CHECK-NEXT: 1.     3     6.0    0.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     3     7.0    0.0    0.0       addw	%cx, %bx
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s
index ca1419d..7669621 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s
@@ -7,9 +7,9 @@ lzcnt %ax, %bx  ## partial register stall.
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      1500
-# CHECK-NEXT: Total Cycles:      753
+# CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: IPC:               1.00
 # CHECK-NEXT: Block RThroughput: 0.5
 
 # CHECK:      Instruction Info:
@@ -48,11 +48,11 @@ lzcnt %ax, %bx  ## partial register stall.
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     01234
+# CHECK-NEXT: Index     012345
 
-# CHECK:      [0,0]     DeER.   lzcntw	%ax, %bx
-# CHECK-NEXT: [1,0]     DeER.   lzcntw	%ax, %bx
-# CHECK-NEXT: [2,0]     .DeER   lzcntw	%ax, %bx
+# CHECK:      [0,0]     DeER .   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,0]     D=eER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,0]     .D=eER   lzcntw	%ax, %bx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -61,4 +61,4 @@ lzcnt %ax, %bx  ## partial register stall.
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.0    1.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 0.     3     1.7    0.3    0.0       lzcntw	%ax, %bx
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s
index 3530f21..939af6d 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s
@@ -13,9 +13,9 @@ lzcnt 2(%rsp), %cx
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      4507
+# CHECK-NEXT: Total Cycles:      7504
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: IPC:               0.60
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -54,22 +54,22 @@ lzcnt 2(%rsp), %cx
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -     1.00    -      -      -      -      -      -     1.00    -      -      -      -      -     imull	%edx, %ecx
-# CHECK-NEXT: 0.99   0.01    -      -      -      -      -     1.00    -      -      -      -      -      -     lzcntw	(%rsp), %cx
+# CHECK-NEXT: 1.00    -      -      -      -      -      -     1.00    -      -      -      -      -      -     lzcntw	(%rsp), %cx
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -     lzcntw	2(%rsp), %cx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .   imull	%edx, %ecx
-# CHECK-NEXT: [0,1]     .DeeeeER  .    .   lzcntw	(%rsp), %cx
-# CHECK-NEXT: [0,2]     .D=eeeeER .    .   lzcntw	2(%rsp), %cx
-# CHECK-NEXT: [1,0]     . D====eeeER   .   imull	%edx, %ecx
-# CHECK-NEXT: [1,1]     .  DeeeeE--R   .   lzcntw	(%rsp), %cx
-# CHECK-NEXT: [1,2]     .  D=eeeeE--R  .   lzcntw	2(%rsp), %cx
-# CHECK-NEXT: [2,0]     .   D=====eeeER.   imull	%edx, %ecx
-# CHECK-NEXT: [2,1]     .    DeeeeE---R.   lzcntw	(%rsp), %cx
-# CHECK-NEXT: [2,2]     .    D=eeeeE---R   lzcntw	2(%rsp), %cx
+# CHECK:      [0,0]     DeeeER    .    .  .   imull	%edx, %ecx
+# CHECK-NEXT: [0,1]     .DeeeeER  .    .  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [0,2]     .D=eeeeER .    .  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [1,0]     . D====eeeER   .  .   imull	%edx, %ecx
+# CHECK-NEXT: [1,1]     .  D===eeeeER  .  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [1,2]     .  D====eeeeER .  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [2,0]     .   D=======eeeER .   imull	%edx, %ecx
+# CHECK-NEXT: [2,1]     .    D======eeeeER.   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [2,2]     .    D=======eeeeER   lzcntw	2(%rsp), %cx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -78,6 +78,6 @@ lzcnt 2(%rsp), %cx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     4.0    0.3    0.0       imull	%edx, %ecx
-# CHECK-NEXT: 1.     3     1.0    1.0    1.7       lzcntw	(%rsp), %cx
-# CHECK-NEXT: 2.     3     2.0    2.0    1.7       lzcntw	2(%rsp), %cx
+# CHECK-NEXT: 0.     3     4.7    0.3    0.0       imull	%edx, %ecx
+# CHECK-NEXT: 1.     3     4.0    0.3    0.0       lzcntw	(%rsp), %cx
+# CHECK-NEXT: 2.     3     5.0    0.0    0.0       lzcntw	2(%rsp), %cx
diff --git a/llvm/tools/llvm-mca/Instruction.cpp b/llvm/tools/llvm-mca/Instruction.cpp
index 53f0d67..0c84767 100644
--- a/llvm/tools/llvm-mca/Instruction.cpp
+++ b/llvm/tools/llvm-mca/Instruction.cpp
@@ -132,7 +132,22 @@ void Instruction::execute() {
 
 void Instruction::update() {
   assert(isDispatched() && "Unexpected instruction stage found!");
-  if (llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+
+  if (!llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+    return;
+
+  // A partial register write cannot complete before a dependent write.
+  auto IsDefReady = [&](const UniqueDef &Def) {
+    if (const WriteState *Write = Def->getDependentWrite()) {
+      int WriteLatency = Write->getCyclesLeft();
+      if (WriteLatency == UNKNOWN_CYCLES)
+        return false;
+      return static_cast<unsigned>(WriteLatency) < Desc.MaxLatency;
+    }
+    return true;
+  };
+
+  if (llvm::all_of(Defs, IsDefReady))
     Stage = IS_READY;
 }
 
@@ -141,14 +156,10 @@ void Instruction::cycleEvent() {
     return;
 
   if (isDispatched()) {
-    bool IsReady = true;
-    for (UniqueUse &Use : Uses) {
+    for (UniqueUse &Use : Uses)
       Use->cycleEvent();
-      IsReady &= Use->isReady();
-    }
 
-    if (IsReady)
-      Stage = IS_READY;
+    update();
     return;
   }
 
diff --git a/llvm/tools/llvm-mca/Instruction.h b/llvm/tools/llvm-mca/Instruction.h
index 3588fb0b..ddf5c3a 100644
--- a/llvm/tools/llvm-mca/Instruction.h
+++ b/llvm/tools/llvm-mca/Instruction.h
@@ -101,6 +101,12 @@ class WriteState {
   // super-registers.
   bool ClearsSuperRegs;
 
+  // This field is set if this is a partial register write, and it has a false
+  // dependency on any previous write of the same register (or a portion of it).
+  // DependentWrite must be able to complete before this write completes, so
+  // that we don't break the WAW, and the two writes can be merged together.
+  const WriteState *DependentWrite;
+
   // A list of dependent reads. Users is a set of dependent
   // reads. A dependent read is added to the set only if CyclesLeft
   // is "unknown". As soon as CyclesLeft is 'known', each user in the set
@@ -113,7 +119,7 @@ public:
   WriteState(const WriteDescriptor &Desc, unsigned RegID,
              bool clearsSuperRegs = false)
       : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
-        ClearsSuperRegs(clearsSuperRegs) {}
+        ClearsSuperRegs(clearsSuperRegs), DependentWrite(nullptr) {}
   WriteState(const WriteState &Other) = delete;
   WriteState &operator=(const WriteState &Other) = delete;
 
@@ -126,6 +132,9 @@ public:
   unsigned getNumUsers() const { return Users.size(); }
   bool clearsSuperRegisters() const { return ClearsSuperRegs; }
 
+  const WriteState *getDependentWrite() const { return DependentWrite; }
+  void setDependentWrite(const WriteState *Write) { DependentWrite = Write; }
+
   // On every cycle, update CyclesLeft and notify dependent users.
   void cycleEvent();
   void onInstructionIssued();
@@ -315,6 +324,7 @@ public:
   const VecUses &getUses() const { return Uses; }
   const InstrDesc &getDesc() const { return Desc; }
   unsigned getRCUTokenID() const { return RCUTokenID; }
+  int getCyclesLeft() const { return CyclesLeft; }
 
   unsigned getNumUsers() const {
     unsigned NumUsers = 0;
diff --git a/llvm/tools/llvm-mca/RegisterFile.cpp b/llvm/tools/llvm-mca/RegisterFile.cpp
index 63fe0d2..44de105 100644
--- a/llvm/tools/llvm-mca/RegisterFile.cpp
+++ b/llvm/tools/llvm-mca/RegisterFile.cpp
@@ -26,7 +26,8 @@ namespace mca {
 
 RegisterFile::RegisterFile(const llvm::MCSchedModel &SM,
                            const llvm::MCRegisterInfo &mri, unsigned NumRegs)
-    : MRI(mri), RegisterMappings(mri.getNumRegs(), {WriteRef(), {0, 0}}) {
+    : MRI(mri), RegisterMappings(mri.getNumRegs(),
+                                 {WriteRef(), {IndexPlusCostPairTy(0, 1), 0}}) {
   initialize(SM, NumRegs);
 }
 
@@ -71,34 +72,46 @@ void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
   // Special case where there is no register class identifier in the set.
   // An empty set of register classes means: this register file contains all
   // the physical registers specified by the target.
-  if (Entries.empty()) {
-    for (std::pair<WriteRef, IndexPlusCostPairTy> &Mapping : RegisterMappings)
-      Mapping.second = std::make_pair(RegisterFileIndex, 1U);
+  // We optimistically assume that a register can be renamed at the cost of a
+  // single physical register. The constructor of RegisterFile ensures that
+  // a RegisterMapping exists for each logical register defined by the Target.
+  if (Entries.empty())
     return;
-  }
 
   // Now update the cost of individual registers.
   for (const MCRegisterCostEntry &RCE : Entries) {
     const MCRegisterClass &RC = MRI.getRegClass(RCE.RegisterClassID);
     for (const MCPhysReg Reg : RC) {
-      IndexPlusCostPairTy &Entry = RegisterMappings[Reg].second;
-      if (Entry.first) {
+      RegisterRenamingInfo &Entry = RegisterMappings[Reg].second;
+      IndexPlusCostPairTy &IPC = Entry.IndexPlusCost;
+      if (IPC.first && IPC.first != RegisterFileIndex) {
         // The only register file that is allowed to overlap is the default
         // register file at index #0. The analysis is inaccurate if register
         // files overlap.
         errs() << "warning: register " << MRI.getName(Reg)
                << " defined in multiple register files.";
       }
-      Entry.first = RegisterFileIndex;
-      Entry.second = RCE.Cost;
+      IPC = std::make_pair(RegisterFileIndex, RCE.Cost);
+      Entry.RenameAs = Reg;
+
+      // Assume the same cost for each sub-register.
+      for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) {
+        RegisterRenamingInfo &OtherEntry = RegisterMappings[*I].second;
+        if (!OtherEntry.IndexPlusCost.first &&
+            (!OtherEntry.RenameAs ||
+             MRI.isSuperRegister(*I, OtherEntry.RenameAs))) {
+          OtherEntry.IndexPlusCost = IPC;
+          OtherEntry.RenameAs = Reg;
+        }
+      }
     }
   }
 }
 
-void RegisterFile::allocatePhysRegs(IndexPlusCostPairTy Entry,
+void RegisterFile::allocatePhysRegs(const RegisterRenamingInfo &Entry,
                                     MutableArrayRef<unsigned> UsedPhysRegs) {
-  unsigned RegisterFileIndex = Entry.first;
-  unsigned Cost = Entry.second;
+  unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+  unsigned Cost = Entry.IndexPlusCost.second;
   if (RegisterFileIndex) {
     RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
     RMT.NumUsedPhysRegs += Cost;
@@ -110,10 +123,10 @@ void RegisterFile::allocatePhysRegs(IndexPlusCostPairTy Entry,
   UsedPhysRegs[0] += Cost;
 }
 
-void RegisterFile::freePhysRegs(IndexPlusCostPairTy Entry,
+void RegisterFile::freePhysRegs(const RegisterRenamingInfo &Entry,
                                 MutableArrayRef<unsigned> FreedPhysRegs) {
-  unsigned RegisterFileIndex = Entry.first;
-  unsigned Cost = Entry.second;
+  unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+  unsigned Cost = Entry.IndexPlusCost.second;
   if (RegisterFileIndex) {
     RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
     RMT.NumUsedPhysRegs -= Cost;
@@ -128,12 +141,48 @@ void RegisterFile::freePhysRegs(IndexPlusCostPairTy Entry,
 void RegisterFile::addRegisterWrite(WriteRef Write,
                                     MutableArrayRef<unsigned> UsedPhysRegs,
                                     bool ShouldAllocatePhysRegs) {
-  const WriteState &WS = *Write.getWriteState();
+  WriteState &WS = *Write.getWriteState();
   unsigned RegID = WS.getRegisterID();
   assert(RegID && "Adding an invalid register definition?");
 
-  RegisterMapping &Mapping = RegisterMappings[RegID];
-  Mapping.first = Write;
+  LLVM_DEBUG({
+    dbgs() << "RegisterFile: addRegisterWrite [ " << Write.getSourceIndex()
+           << ", " << MRI.getName(RegID) << "]\n";
+  });
+
+  // If RenameAs is equal to RegID, then RegID is subject to register renaming
+  // and false dependencies on RegID are all eliminated.
+
+  // If RenameAs references the invalid register, then we optimistically assume
+  // that it can be renamed. In the absence of tablegen descriptors for register
+  // files, RenameAs is always set to the invalid register ID.  In all other
+  // cases, RenameAs must be either equal to RegID, or it must reference a
+  // super-register of RegID.
+
+  // If RenameAs is a super-register of RegID, then a write to RegID has always
+  // a false dependency on RenameAs. The only exception is for when the write
+  // implicitly clears the upper portion of the underlying register.
+  // If a write clears its super-registers, then it is renamed as `RenameAs`.
+  const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  if (RRI.RenameAs && RRI.RenameAs != RegID) {
+    RegID = RRI.RenameAs;
+    const WriteRef &OtherWrite = RegisterMappings[RegID].first;
+
+    if (!WS.clearsSuperRegisters()) {
+      // The processor keeps the definition of `RegID` together with register
+      // `RenameAs`. Since this partial write is not renamed, no physical
+      // register is allocated.
+      ShouldAllocatePhysRegs = false;
+
+      if (OtherWrite.getSourceIndex() != Write.getSourceIndex()) {
+        // This partial write has a false dependency on RenameAs.
+        WS.setDependentWrite(OtherWrite.getWriteState());
+      }
+    }
+  }
+
+  // Update the mapping for register RegID including its sub-registers.
+  RegisterMappings[RegID].first = Write;
   for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
     RegisterMappings[*I].first = Write;
 
@@ -141,9 +190,8 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
   // hardware. For example, zero-latency data-dependency breaking instructions
   // don't consume physical registers.
   if (ShouldAllocatePhysRegs)
-    allocatePhysRegs(Mapping.second, UsedPhysRegs);
+    allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
 
-  // If this is a partial update, then we are done.
   if (!WS.clearsSuperRegisters())
     return;
 
@@ -155,42 +203,50 @@ void RegisterFile::removeRegisterWrite(const WriteState &WS,
                                        MutableArrayRef<unsigned> FreedPhysRegs,
                                        bool ShouldFreePhysRegs) {
   unsigned RegID = WS.getRegisterID();
-  bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters();
 
   assert(RegID != 0 && "Invalidating an already invalid register?");
-  assert(WS.getCyclesLeft() != -512 &&
+  assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
          "Invalidating a write of unknown cycles!");
   assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
-  RegisterMapping &Mapping = RegisterMappings[RegID];
-  WriteRef &WR = Mapping.first;
-  if (!WR.isValid())
-    return;
+
+  unsigned RenameAs = RegisterMappings[RegID].second.RenameAs;
+  if (RenameAs && RenameAs != RegID) {
+    RegID = RenameAs;
+
+    if (!WS.clearsSuperRegisters()) {
+      // Keep the definition of `RegID` together with register `RenameAs`.
+      ShouldFreePhysRegs = false;
+    }
+  }
 
   if (ShouldFreePhysRegs)
-    freePhysRegs(Mapping.second, FreedPhysRegs);
+    freePhysRegs(RegisterMappings[RegID].second, FreedPhysRegs);
 
+  WriteRef &WR = RegisterMappings[RegID].first;
   if (WR.getWriteState() == &WS)
     WR.invalidate();
 
   for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
-    WR = RegisterMappings[*I].first;
-    if (WR.getWriteState() == &WS)
-      WR.invalidate();
+    WriteRef &OtherWR = RegisterMappings[*I].first;
+    if (OtherWR.getWriteState() == &WS)
+      OtherWR.invalidate();
   }
 
-  if (!ShouldInvalidateSuperRegs)
+  if (!WS.clearsSuperRegisters())
     return;
 
   for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) {
-    WR = RegisterMappings[*I].first;
-    if (WR.getWriteState() == &WS)
-      WR.invalidate();
+    WriteRef &OtherWR = RegisterMappings[*I].first;
+    if (OtherWR.getWriteState() == &WS)
+      OtherWR.invalidate();
   }
 }
 
 void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
                                  unsigned RegID) const {
   assert(RegID && RegID < RegisterMappings.size());
+  LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
+                    << MRI.getName(RegID) << '\n');
   const WriteRef &WR = RegisterMappings[RegID].first;
   if (WR.isValid())
     Writes.push_back(WR);
@@ -225,7 +281,8 @@ unsigned RegisterFile::isAvailable(ArrayRef<unsigned> Regs) const {
 
   // Find how many new mappings must be created for each register file.
   for (const unsigned RegID : Regs) {
-    const IndexPlusCostPairTy &Entry = RegisterMappings[RegID].second;
+    const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+    const IndexPlusCostPairTy &Entry = RRI.IndexPlusCost;
     if (Entry.first)
       NumPhysRegs[Entry.first] += Entry.second;
     NumPhysRegs[0] += Entry.second;
@@ -266,10 +323,10 @@ void RegisterFile::dump() const {
     const RegisterMapping &RM = RegisterMappings[I];
     if (!RM.first.getWriteState())
       continue;
-    const std::pair<unsigned, unsigned> &IndexPlusCost = RM.second;
-    dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << IndexPlusCost.first
-           << ", Cost=" << IndexPlusCost.second 
-           << ", ";
+    const RegisterRenamingInfo &RRI = RM.second;
+    dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << RRI.IndexPlusCost.first
+           << ", Cost=" << RRI.IndexPlusCost.second
+           << ", RenameAs=" << RRI.RenameAs << ", ";
     RM.first.dump();
     dbgs() << '\n';
   }
diff --git a/llvm/tools/llvm-mca/RegisterFile.h b/llvm/tools/llvm-mca/RegisterFile.h
index 8817b8b..349e978 100644
--- a/llvm/tools/llvm-mca/RegisterFile.h
+++ b/llvm/tools/llvm-mca/RegisterFile.h
@@ -61,24 +61,35 @@ class RegisterFile : public HardwareUnit {
   // regsiter file #0 specifying command line flag `-register-file-size=<uint>`.
   llvm::SmallVector<RegisterMappingTracker, 4> RegisterFiles;
 
-  // This pair is used to identify the owner of a register, as well as
-  // the "register cost". Register cost is defined as the number of physical
-  // registers required to allocate a user register.
+  // This type is used to propagate information about the owner of a register,
+  // and the cost of allocating it in the PRF. Register cost is defined as the
+  // number of physical registers consumed by the PRF to allocate a user
+  // register.
+  //
   // For example: on X86 BtVer2, a YMM register consumes 2 128-bit physical
   // registers. So, the cost of allocating a YMM register in BtVer2 is 2.
   using IndexPlusCostPairTy = std::pair<unsigned, unsigned>;
 
+  // Struct RegisterRenamingInfo maps registers to register files.
+  // There is a RegisterRenamingInfo object for every register defined by
+  // the target. RegisteRenamingInfo objects are stored into vector
+  // RegisterMappings, and register IDs can be used to reference them.
+  struct RegisterRenamingInfo {
+    IndexPlusCostPairTy IndexPlusCost;
+    llvm::MCPhysReg RenameAs;
+  };
+
   // RegisterMapping objects are mainly used to track physical register
   // definitions. There is a RegisterMapping for every register defined by the
   // Target. For each register, a RegisterMapping pair contains a descriptor of
   // the last register write (in the form of a WriteRef object), as well as a
-  // IndexPlusCostPairTy to quickly identify owning register files.
+  // RegisterRenamingInfo to quickly identify owning register files.
   //
   // This implementation does not allow overlapping register files. The only
   // register file that is allowed to overlap with other register files is
   // register file #0. If we exclude register #0, every register is "owned" by
   // at most one register file.
-  using RegisterMapping = std::pair<WriteRef, IndexPlusCostPairTy>;
+  using RegisterMapping = std::pair<WriteRef, RegisterRenamingInfo>;
 
   // This map contains one entry for each register defined by the target.
   std::vector<RegisterMapping> RegisterMappings;
@@ -103,13 +114,12 @@ class RegisterFile : public HardwareUnit {
 
   // Consumes physical registers in each register file specified by the
   // `IndexPlusCostPairTy`. This method is called from `addRegisterMapping()`.
-  void allocatePhysRegs(IndexPlusCostPairTy IPC,
+  void allocatePhysRegs(const RegisterRenamingInfo &Entry,
                         llvm::MutableArrayRef<unsigned> UsedPhysRegs);
 
-  // Releases previously allocated physical registers from the register file(s)
-  // referenced by the IndexPlusCostPairTy object. This method is called from
-  // `invalidateRegisterMapping()`.
-  void freePhysRegs(IndexPlusCostPairTy IPC,
+  // Releases previously allocated physical registers from the register file(s).
+  // This method is called from `invalidateRegisterMapping()`.
+  void freePhysRegs(const RegisterRenamingInfo &Entry,
                     llvm::MutableArrayRef<unsigned> FreedPhysRegs);
 
   // Create an instance of RegisterMappingTracker for every register file
@@ -140,8 +150,11 @@ public:
   // Returns a "response mask" where each bit represents the response from a
   // different register file.  A mask of all zeroes means that all register
   // files are available.  Otherwise, the mask can be used to identify which
-  // register file was busy.  This sematic allows us classify dispatch dispatch
+  // register file was busy.  This sematic allows us to classify dispatch
   // stalls caused by the lack of register file resources.
+  //
+  // Current implementation can simulate up to 32 register files (including the
+  // special register file at index #0).
   unsigned isAvailable(llvm::ArrayRef<unsigned> Regs) const;
   void collectWrites(llvm::SmallVectorImpl<WriteRef> &Writes,
                      unsigned RegID) const;
-- 
2.7.4