--- /dev/null
+llvm-mca - LLVM Machine Code Analyzer
+=====================================
+
+SYNOPSIS
+--------
+
+:program:`llvm-mca` [*options*] [input]
+
+DESCRIPTION
+-----------
+
+:program:`llvm-mca` is a performance analysis tool that uses information
+available in LLVM (e.g. scheduling models) to statically measure the performance
+of machine code in a specific CPU.
+
+Performance is measured in terms of throughput as well as processor resource
+consumption. The tool currently works for processors with an out-of-order
+backend, for which there is a scheduling model available in LLVM.
+
+The main goal of this tool is not just to predict the performance of the code
+when run on the target, but also to help with diagnosing potential performance
+issues.
+
+Given an assembly code sequence, llvm-mca estimates the IPC (Instructions Per
+Cycle), as well as hardware resource pressure. The analysis and reporting style
+were inspired by the IACA tool from Intel.
+
+OPTIONS
+-------
+
+If ``input`` is "``-``" or omitted, :program:`llvm-mca` reads from standard
+input. Otherwise, it will read from the specified filename.
+
+If the :option:`-o` option is omitted, then :program:`llvm-mca` will send its output
+to standard output if the input is from standard input. If the :option:`-o`
+option specifies "``-``", then the output will also be sent to standard output.
+
+
+.. option:: -help
+
+ Print a summary of command line options.
+
+.. option:: -mtriple=<target triple>
+
+ Specify a target triple string.
+
+.. option:: -march=<arch>
+
+ Specify the architecture for which to analyze the code. It defaults to the
+ host default target.
+
+.. option:: -mcpu=<cpuname>
+
+ Specify the processor for which to run the analysis.
+ If not specified, a "generic" processor is used. The processor is not
+ autodetected from the current host architecture.
+
+.. option:: -output-asm-variant=<variant id>
+
+ Specify the output assembly variant for the report generated by the tool.
+ On x86, possible values are [0, 1]. A value of 0 (resp. 1) for this flag enables
+ the AT&T (resp. Intel) assembly format for the code printed out by the tool in
+ the analysis report.
+
+.. option:: -dispatch=<width>
+
+ Specify a different dispatch width for the processor. The dispatch width
+ defaults to the 'IssueWidth' specified by the processor scheduling model.
+ If width is zero, then the default dispatch width is used.
+
+.. option:: -max-retire-per-cycle=<retire throughput>
+
+ Specify the retire throughput (i.e. how many instructions can be retired by the
+ retire control unit every cycle).
+
+.. option:: -register-file-size=<size>
+
+ Specify the size of the register file. When specified, this flag limits
+ how many temporary registers are available for register renaming purposes. By
+ default, the number of temporary registers is unlimited. A value of zero for
+ this flag means "unlimited number of temporary registers".
+
+.. option:: -iterations=<number of iterations>
+
+ Specify the number of iterations to run. If this flag is set to 0, then the
+ tool sets the number of iterations to a default value (i.e. 70).
+
+.. option:: -noalias=<bool>
+
+ If set, the tool assumes that loads and stores don't alias. This is the
+ default behavior.
+
+.. option:: -lqueue=<load queue size>
+
+ Specify the size of the load queue in the load/store unit emulated by the tool.
+ By default, the tool assumes an unbounded number of entries in the load queue.
+ A value of zero for this flag is ignored, and the default load queue size is
+ used instead.
+
+.. option:: -squeue=<store queue size>
+
+ Specify the size of the store queue in the load/store unit emulated by the
+ tool. By default, the tool assumes an unbounded number of entries in the store
+ queue. A value of zero for this flag is ignored, and the default store queue
+ size is used instead.
+
+.. option:: -verbose
+
+ Enable verbose output. In particular, this flag enables a number of extra
+ statistics and performance counters for the dispatch logic, the reorder
+ buffer, the retire control unit and the register file.
+
+.. option:: -timeline
+
+ Enable the timeline view.
+
+.. option:: -timeline-max-iterations=<iterations>
+
+ Limit the number of iterations to print in the timeline view. By default, the
+ timeline view prints information for up to 10 iterations.
+
+.. option:: -timeline-max-cycles=<cycles>
+
+ Limit the number of cycles in the timeline view. By default, the number of
+ cycles is set to 80.
+
+
+EXIT STATUS
+-----------
+
+:program:`llvm-mca` returns 0 on success. Otherwise, an error message is printed
+to standard error, and the tool returns 1.
+
--- /dev/null
+if not 'ARM' in config.root.targets:
+ config.unsupported = True
--- /dev/null
+# RUN: llvm-mca -mtriple=arm-eabi -mcpu=cortex-a9 -iterations=100 < %s | FileCheck %s
+
+vadd.f32 s0, s2, s2
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 100
+# CHECK-NEXT: Total Cycles: 105
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC: 0.95
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - A9UnitAGU
+# CHECK-NEXT: [1.0] - A9UnitALU
+# CHECK-NEXT: [1.1] - A9UnitALU
+# CHECK-NEXT: [2] - A9UnitB
+# CHECK-NEXT: [3] - A9UnitFP
+# CHECK-NEXT: [4] - A9UnitLS
+# CHECK-NEXT: [5] - A9UnitMul
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5]
+# CHECK-NEXT: 1.00 - - - 1.00 - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] Instructions:
+# CHECK-NEXT: 1.00 - - - 1.00 - - vadd.f32 s0, s2, s2
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 1.00 vadd.f32 s0, s2, s2
--- /dev/null
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+vmulps %xmm0, %xmm1, %xmm2
+vhaddps %xmm2, %xmm2, %xmm3
+vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: Iterations: 300
+# CHECK-NEXT: Instructions: 900
+# CHECK-NEXT: Total Cycles: 610
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC: 1.48
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - 2.00 1.00 - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - 1.00 - - - - - - - vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - 1.00 - - - - - - - - vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: - - - - - 1.00 - - - - - - - - vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4
+
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: [1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1] . D=eeeE--R . vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,2] . D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: [2,0] . DeeE----R . vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [2,1] . D====eeeER . vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,2] . D======eeeER vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 3 1.0 1.0 3.0 vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1. 3 3.3 0.7 0.7 vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 2. 3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
--- /dev/null
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 800
+# CHECK-NEXT: Total Cycles: 2403
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC: 0.33
+
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - 4.00 - 4.00 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps (%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 48(%rdi)
+
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
+
+# CHECK: [0,0] DeeeeeER . . . .. vmovaps (%rsi), %xmm0
+# CHECK-NEXT: [0,1] D=====eER . . . .. vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: [0,2] .D=====eeeeeER . . .. vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: [0,3] .D==========eER. . .. vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4] . D==========eeeeeER. .. vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: [0,5] . D===============eER .. vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6] . D===============eeeeeER. vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: [0,7] . D====================eER vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 3. 1 11.0 0.0 0.0 vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 4. 1 11.0 0.0 0.0 vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
--- /dev/null
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 800
+# CHECK-NEXT: Total Cycles: 408
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC: 1.96
+
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - 4.00 - 4.00 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps (%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 48(%rdi)
+
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi)
+
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeER .. vmovaps (%rsi), %xmm0
+# CHECK-NEXT: [0,1] D=====eER .. vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: [0,2] .DeeeeeER .. vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: [0,3] .D=====eER.. vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4] . DeeeeeER.. vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: [0,5] . D=====eER. vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6] . DeeeeeER. vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: [0,7] . D=====eER vmovaps %xmm0, 48(%rdi)
+
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 3. 1 6.0 0.0 0.0 vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 5. 1 6.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 7. 1 6.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
--- /dev/null
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 < %s | FileCheck %s
+
+add %edi, %eax
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 100
+# CHECK-NEXT: Total Cycles: 103
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC: 0.97
+
+# CHECK-LABEL: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addl %edi, %eax
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 1 0.50 addl %edi, %eax
--- /dev/null
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=ivybridge < %s | FileCheck --check-prefix=ALL --check-prefix=IVYBRIDGE %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=haswell < %s | FileCheck --check-prefix=ALL --check-prefix=HASWELL %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=broadwell < %s | FileCheck --check-prefix=ALL --check-prefix=BROADWELL %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=knl < %s | FileCheck --check-prefix=ALL --check-prefix=KNL %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake < %s | FileCheck --check-prefix=ALL --check-prefix=SKX %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 < %s | FileCheck --check-prefix=ALL --check-prefix=SKX-AVX512 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=slm < %s | FileCheck --check-prefix=ALL --check-prefix=SLM %s
+
+add %edi, %eax
+
+# ALL: Iterations: 70
+# ALL-NEXT: Instructions: 70
+
+# BTVER2: Dispatch Width: 2
+# ZNVER1: Dispatch Width: 4
+# SANDYBRIDGE: Dispatch Width: 4
+# IVYBRIDGE: Dispatch Width: 4
+# HASWELL: Dispatch Width: 4
+# BROADWELL: Dispatch Width: 4
+# KNL: Dispatch Width: 4
+# SKX: Dispatch Width: 6
+# SKX-AVX512: Dispatch Width: 6
+# SLM: Dispatch Width: 2
+
--- /dev/null
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s
+
+add %eax, %eax
+
+# DEFAULT: Iterations: 70
+# DEFAULT-NEXT: Instructions: 70
+
+# CUSTOM: Iterations: 1
+# CUSTOM-NEXT: Instructions: 1
--- /dev/null
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s
+
+add %eax, %eax
+
+# DEFAULT: Dispatch Width: 2
+# CUSTOM: Dispatch Width: 1
--- /dev/null
+# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o %t1 2>&1 | FileCheck %s
+
+# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu.
--- /dev/null
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s
+
+invalid_instruction_mnemonic
--- /dev/null
+# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=foo -o %t1 2>&1 | FileCheck %s
+
+# CHECK: 'foo' is not a recognized processor for this target (ignoring processor)
--- /dev/null
+# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -o %t1 2>&1 | FileCheck %s
+
+# CHECK: error: no assembly instructions found.
--- /dev/null
+if not 'X86' in config.root.targets:
+ config.unsupported = True
+
--- /dev/null
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=generic < %s 2>&1 | FileCheck %s
+
+# CHECK: error: unable to find instruction-level scheduling information for target triple 'x86_64-unknown-unknown' and cpu 'generic'.
--- /dev/null
+# RUN: not llvm-mca %t.blah -o %t2 2>&1 | FileCheck --check-prefix=ENOENT %s
+
+# ENOENT: {{.*}}.blah: {{[Nn]}}o such file or directory
--- /dev/null
+# Requires a non-empty default triple for these tests
+if 'default_triple' not in config.available_features:
+ config.unsupported = True
+
llvm-link
llvm-lto
llvm-mc
+ llvm-mca
llvm-mcmarkup
llvm-modextract
llvm-mt
--- /dev/null
+//===--------------------- Backend.cpp --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Implementation of class Backend which emulates a hardware OoO backend.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Backend.h"
+#include "HWEventListener.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+using namespace llvm;
+
+void Backend::addEventListener(HWEventListener *Listener) {
+ if (Listener)
+ Listeners.insert(Listener);
+}
+
+void Backend::runCycle(unsigned Cycle) {
+ notifyCycleBegin(Cycle);
+
+ if (!SM->hasNext()) {
+ notifyCycleEnd(Cycle);
+ return;
+ }
+
+ InstRef IR = SM->peekNext();
+ const InstrDesc *Desc = &IB->getOrCreateInstrDesc(STI, *IR.second);
+ while (DU->isAvailable(Desc->NumMicroOps) && DU->canDispatch(*Desc)) {
+ Instruction *NewIS = IB->createInstruction(STI, *DU, IR.first, *IR.second);
+ Instructions[IR.first] = std::unique_ptr<Instruction>(NewIS);
+ NewIS->setRCUTokenID(DU->dispatch(IR.first, NewIS));
+
+ // If this is a zero latency instruction, then we don't need to dispatch
+ // it. Instead, we can mark it as executed.
+ if (NewIS->isZeroLatency())
+ notifyInstructionExecuted(IR.first);
+
+ // Check if we have dispatched all the instructions.
+ SM->updateNext();
+ if (!SM->hasNext())
+ break;
+
+ // Prepare for the next round.
+ IR = SM->peekNext();
+ Desc = &IB->getOrCreateInstrDesc(STI, *IR.second);
+ }
+
+ notifyCycleEnd(Cycle);
+}
+
+void Backend::notifyCycleBegin(unsigned Cycle) {
+ DEBUG(dbgs() << "[E] Cycle begin: " << Cycle << '\n');
+ for (HWEventListener *Listener : Listeners)
+ Listener->onCycleBegin(Cycle);
+
+ DU->cycleEvent(Cycle);
+ HWS->cycleEvent(Cycle);
+}
+
+void Backend::notifyInstructionDispatched(unsigned Index) {
+ DEBUG(dbgs() << "[E] Instruction Dispatched: " << Index << '\n');
+ for (HWEventListener *Listener : Listeners)
+ Listener->onInstructionDispatched(Index);
+}
+
+void Backend::notifyInstructionReady(unsigned Index) {
+ DEBUG(dbgs() << "[E] Instruction Ready: " << Index << '\n');
+ for (HWEventListener *Listener : Listeners)
+ Listener->onInstructionReady(Index);
+}
+
+void Backend::notifyInstructionIssued(
+ unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+ DEBUG(
+ dbgs() << "[E] Instruction Issued: " << Index << '\n';
+ for (const std::pair<ResourceRef, unsigned> &Resource : Used) {
+ dbgs() << "[E] Resource Used: [" << Resource.first.first << '.'
+ << Resource.first.second << "]\n";
+ dbgs() << " cycles: " << Resource.second << '\n';
+ }
+ );
+
+ for (HWEventListener *Listener : Listeners)
+ Listener->onInstructionIssued(Index, Used);
+}
+
+void Backend::notifyInstructionExecuted(unsigned Index) {
+ DEBUG(dbgs() << "[E] Instruction Executed: " << Index << '\n');
+ for (HWEventListener *Listener : Listeners)
+ Listener->onInstructionExecuted(Index);
+
+ const Instruction &IS = *Instructions[Index];
+ DU->onInstructionExecuted(IS.getRCUTokenID());
+}
+
+void Backend::notifyInstructionRetired(unsigned Index) {
+ DEBUG(dbgs() << "[E] Instruction Retired: " << Index << '\n');
+ for (HWEventListener *Listener : Listeners)
+ Listener->onInstructionRetired(Index);
+
+ const Instruction &IS = *Instructions[Index];
+ DU->invalidateRegisterMappings(IS);
+ Instructions.erase(Index);
+}
+
+void Backend::notifyResourceAvailable(const ResourceRef &RR) {
+ DEBUG(dbgs() << "[E] Resource Available: [" << RR.first << '.' << RR.second
+ << "]\n");
+ for (HWEventListener *Listener : Listeners)
+ Listener->onResourceAvailable(RR);
+}
+
+void Backend::notifyCycleEnd(unsigned Cycle) {
+ DEBUG(dbgs() << "[E] Cycle end: " << Cycle << "\n\n");
+ for (HWEventListener *Listener : Listeners)
+ Listener->onCycleEnd(Cycle);
+}
+
+} // namespace mca.
--- /dev/null
+//===--------------------- Backend.h ----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements an OoO backend for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKEND_H
+#define LLVM_TOOLS_LLVM_MCA_BACKEND_H
+
+#include "Dispatch.h"
+#include "InstrBuilder.h"
+#include "Scheduler.h"
+#include "SourceMgr.h"
+
+namespace mca {
+
+struct HWEventListener;
+
+/// \brief An out-of-order backend for a specific subtarget.
+///
+/// It emulates an out-of-order execution of instructions. Instructions are
+/// fetched from a MCInst sequence managed by an object of class SourceMgr.
+/// Instructions are firstly dispatched to the schedulers and then executed.
+/// This class tracks the lifetime of an instruction from the moment where
+/// it gets dispatched to the schedulers, to the moment where it finishes
+/// executing and register writes are architecturally committed.
+/// In particular, it monitors changes in the state of every instruction
+/// in flight.
+/// Instructions are executed in a loop of iterations. The number of iterations
+/// is defined by the SourceMgr object.
+/// The Backend entrypoint is method 'run()', which executes cycles in a loop
+/// until there are no new instructions to dispatch, and every dispatched
+/// instruction has been retired.
+/// Internally, the Backend collects statistical information in the form of
+/// histograms. For example, it tracks how the dispatch group size changes
+/// over time.
+class Backend {
+ const llvm::MCSubtargetInfo &STI;
+
+ std::unique_ptr<InstrBuilder> IB;
+ std::unique_ptr<Scheduler> HWS;
+ std::unique_ptr<DispatchUnit> DU;
+ std::unique_ptr<SourceMgr> SM;
+ unsigned Cycles;
+
+ llvm::DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;
+ std::set<HWEventListener *> Listeners;
+
+ void runCycle(unsigned Cycle);
+
+public:
+ Backend(const llvm::MCSubtargetInfo &Subtarget, const llvm::MCInstrInfo &MCII,
+ const llvm::MCRegisterInfo &MRI, std::unique_ptr<SourceMgr> Source,
+ unsigned DispatchWidth = 0, unsigned RegisterFileSize = 0,
+ unsigned MaxRetirePerCycle = 0, unsigned LoadQueueSize = 0,
+ unsigned StoreQueueSize = 0, bool AssumeNoAlias = false)
+ : STI(Subtarget),
+ HWS(llvm::make_unique<Scheduler>(this, Subtarget.getSchedModel(),
+ LoadQueueSize, StoreQueueSize,
+ AssumeNoAlias)),
+ DU(llvm::make_unique<DispatchUnit>(
+ this, MRI, Subtarget.getSchedModel().MicroOpBufferSize,
+ RegisterFileSize, MaxRetirePerCycle, DispatchWidth, HWS.get())),
+ SM(std::move(Source)), Cycles(0) {
+ IB = llvm::make_unique<InstrBuilder>(MCII, getProcResourceMasks());
+ }
+
+ void run() {
+ while (SM->hasNext() || !DU->isRCUEmpty())
+ runCycle(Cycles++);
+ }
+
+ unsigned getNumIterations() const { return SM->getNumIterations(); }
+ unsigned getNumInstructions() const { return SM->size(); }
+ unsigned getNumCycles() const { return Cycles; }
+ unsigned getTotalRegisterMappingsCreated() const {
+ return DU->getTotalRegisterMappingsCreated();
+ }
+ unsigned getMaxUsedRegisterMappings() const {
+ return DU->getMaxUsedRegisterMappings();
+ }
+ unsigned getDispatchWidth() const { return DU->getDispatchWidth(); }
+
+ const llvm::MCSubtargetInfo &getSTI() const { return STI; }
+ const llvm::MCSchedModel &getSchedModel() const {
+ return STI.getSchedModel();
+ }
+ const llvm::ArrayRef<uint64_t> getProcResourceMasks() const {
+ return HWS->getProcResourceMasks();
+ }
+
+ double getRThroughput(const InstrDesc &ID) const {
+ return HWS->getRThroughput(ID);
+ }
+ void getBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+ return HWS->getBuffersUsage(Usage);
+ }
+
+ unsigned getNumRATStalls() const { return DU->getNumRATStalls(); }
+ unsigned getNumRCUStalls() const { return DU->getNumRCUStalls(); }
+ unsigned getNumSQStalls() const { return DU->getNumSQStalls(); }
+ unsigned getNumLDQStalls() const { return DU->getNumLDQStalls(); }
+ unsigned getNumSTQStalls() const { return DU->getNumSTQStalls(); }
+ unsigned getNumDispatchGroupStalls() const {
+ return DU->getNumDispatchGroupStalls();
+ }
+
+ const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
+ return SM->getMCInstFromIndex(Index);
+ }
+
+ const InstrDesc &getInstrDesc(const llvm::MCInst &Inst) const {
+ return IB->getOrCreateInstrDesc(STI, Inst);
+ }
+
+ const SourceMgr &getSourceMgr() const { return *SM; }
+
+ void addEventListener(HWEventListener *Listener);
+ void notifyCycleBegin(unsigned Cycle);
+ void notifyInstructionDispatched(unsigned Index);
+ void notifyInstructionReady(unsigned Index);
+ void notifyInstructionIssued(
+ unsigned Index,
+ const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used);
+ void notifyInstructionExecuted(unsigned Index);
+ void notifyResourceAvailable(const ResourceRef &RR);
+ void notifyInstructionRetired(unsigned Index);
+ void notifyCycleEnd(unsigned Cycle);
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===--------------------- BackendPrinter.cpp -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the BackendPrinter interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "BackendPrinter.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+namespace mca {
+
+using namespace llvm;
+
+std::unique_ptr<ToolOutputFile>
+BackendPrinter::getOutputStream(std::string OutputFile) {
+ if (OutputFile == "")
+ OutputFile = "-";
+ std::error_code EC;
+ auto Out = llvm::make_unique<ToolOutputFile>(OutputFile, EC, sys::fs::F_None);
+ if (!EC)
+ return Out;
+ errs() << EC.message() << '\n';
+ return nullptr;
+}
+
+// Prints the report header: iteration count, total instructions executed,
+// total cycles, dispatch width, and the computed IPC.
+void BackendPrinter::printGeneralStatistics(unsigned Iterations,
+                                            unsigned Cycles,
+                                            unsigned Instructions,
+                                            unsigned DispatchWidth) const {
+  unsigned TotalInstructions = Instructions * Iterations;
+  // NOTE(review): assumes Cycles is non-zero -- confirm the simulation always
+  // runs at least one cycle before this is called.
+  double IPC = (double)TotalInstructions / Cycles;
+
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "Iterations: " << Iterations;
+  TempStream << "\nInstructions: " << TotalInstructions;
+  TempStream << "\nTotal Cycles: " << Cycles;
+  TempStream << "\nDispatch Width: " << DispatchWidth;
+  TempStream << "\nIPC: " << format("%.2f", IPC) << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+// Prints the register-alias-table summary: total mappings created over the
+// whole run and the peak number of mappings live at any one time.
+void BackendPrinter::printRATStatistics(unsigned TotalMappings,
+                                        unsigned MaxUsedMappings) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nRegister Alias Table:";
+  TempStream << "\nTotal number of mappings created: " << TotalMappings;
+  TempStream << "\nMax number of mappings used: " << MaxUsedMappings
+             << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+// Prints one line per dynamic dispatch stall category, showing the number of
+// cycles the dispatch stage was stalled for that reason.
+void BackendPrinter::printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
+                                         unsigned SCHEDQStalls,
+                                         unsigned LDQStalls, unsigned STQStalls,
+                                         unsigned DGStalls) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nDynamic Dispatch Stall Cycles:\n";
+  TempStream << "RAT - Register unavailable: "
+             << RATStalls;
+  TempStream << "\nRCU - Retire tokens unavailable: "
+             << RCUStalls;
+  TempStream << "\nSCHEDQ - Scheduler full: "
+             << SCHEDQStalls;
+  TempStream << "\nLQ - Load queue full: "
+             << LDQStalls;
+  TempStream << "\nSQ - Store queue full: "
+             << STQStalls;
+  TempStream << "\nGROUP - Static restrictions on the dispatch group: "
+             << DGStalls;
+  TempStream << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+// For every buffered processor resource in the scheduling model, prints the
+// observed queue usage against the buffer size declared by the model.
+void BackendPrinter::printSchedulerUsage(
+    const MCSchedModel &SM, const ArrayRef<BufferUsageEntry> &Usage) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nScheduler's queue usage:\n";
+  const ArrayRef<uint64_t> ResourceMasks = B.getProcResourceMasks();
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+    // Skip unbuffered resources: they have no queue to report on.
+    if (!ProcResource.BufferSize)
+      continue;
+
+    // Usage entries are keyed by resource mask, not by resource index.
+    for (const BufferUsageEntry &Entry : Usage)
+      if (ResourceMasks[I] == Entry.first)
+        TempStream << ProcResource.Name << ", " << Entry.second << '/'
+                   << ProcResource.BufferSize << '\n';
+  }
+
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+// Prints a per-instruction table: micro-opcode count, latency, reciprocal
+// throughput, the MayLoad/MayStore/HasSideEffects flags, and finally the
+// disassembled instruction text.
+void BackendPrinter::printInstructionInfo() const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nInstruction Info:\n";
+  TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n"
+             << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects\n\n";
+
+  TempStream << "[1] [2] [3] [4] [5] [6]\tInstructions:\n";
+  for (unsigned I = 0, E = B.getNumInstructions(); I < E; ++I) {
+    const MCInst &Inst = B.getMCInstFromIndex(I);
+    const InstrDesc &ID = B.getInstrDesc(Inst);
+    unsigned NumMicroOpcodes = ID.NumMicroOps;
+    unsigned Latency = ID.MaxLatency;
+    double RThroughput = B.getRThroughput(ID);
+    // The conditionals below emit extra padding so columns stay aligned for
+    // one-, two- and three-digit values.
+    TempStream << ' ' << NumMicroOpcodes << " ";
+    if (NumMicroOpcodes < 10)
+      TempStream << " ";
+    else if (NumMicroOpcodes < 100)
+      TempStream << ' ';
+    TempStream << Latency << " ";
+    if (Latency < 10.0)
+      TempStream << " ";
+    else if (Latency < 100.0)
+      TempStream << ' ';
+    // A zero RThroughput is treated as "unknown" and printed as a dash.
+    if (RThroughput) {
+      TempStream << format("%.2f", RThroughput) << ' ';
+      if (RThroughput < 10.0)
+        TempStream << " ";
+      else if (RThroughput < 100.0)
+        TempStream << ' ';
+    } else {
+      TempStream << " - ";
+    }
+    TempStream << (ID.MayLoad ? " * " : " ");
+    TempStream << (ID.MayStore ? " * " : " ");
+    TempStream << (ID.HasSideEffects ? " * " : " ");
+    MCIP->printInst(&Inst, TempStream, "", B.getSTI());
+    TempStream << '\n';
+  }
+
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+// Top-level report driver. Always prints the general statistics; verbose mode
+// adds dispatch stalls, RAT statistics, the event histograms, and scheduler
+// queue usage. The resource-pressure and timeline sections are only printed
+// when the corresponding views were added.
+void BackendPrinter::printReport() const {
+  assert(isFileValid());
+  unsigned Cycles = B.getNumCycles();
+  printGeneralStatistics(B.getNumIterations(), Cycles, B.getNumInstructions(),
+                         B.getDispatchWidth());
+  if (EnableVerboseOutput) {
+    printDispatchStalls(B.getNumRATStalls(), B.getNumRCUStalls(),
+                        B.getNumSQStalls(), B.getNumLDQStalls(),
+                        B.getNumSTQStalls(), B.getNumDispatchGroupStalls());
+    printRATStatistics(B.getTotalRegisterMappingsCreated(),
+                       B.getMaxUsedRegisterMappings());
+    BS->printHistograms(File->os());
+
+    std::vector<BufferUsageEntry> Usage;
+    B.getBuffersUsage(Usage);
+    printSchedulerUsage(B.getSchedModel(), Usage);
+  }
+
+  if (RPV) {
+    RPV->printResourcePressure(getOStream(), Cycles);
+    printInstructionInfo();
+  }
+
+  if (TV) {
+    TV->printTimeline(getOStream());
+    TV->printAverageWaitTimes(getOStream());
+  }
+}
+
+// Lazily creates the resource pressure view and registers it as a hardware
+// event listener on the backend. Calling this more than once is a no-op.
+void BackendPrinter::addResourcePressureView() {
+  if (!RPV) {
+    RPV = llvm::make_unique<ResourcePressureView>(
+        B.getSTI(), *MCIP, B.getSourceMgr(), B.getProcResourceMasks());
+    B.addEventListener(RPV.get());
+  }
+}
+
+// Lazily creates the timeline view (limited to MaxIterations iterations and
+// MaxCycles cycles) and registers it as a hardware event listener.
+// Calling this more than once is a no-op.
+void BackendPrinter::addTimelineView(unsigned MaxIterations,
+                                     unsigned MaxCycles) {
+  if (!TV) {
+    TV = llvm::make_unique<TimelineView>(B.getSTI(), *MCIP, B.getSourceMgr(),
+                                         MaxIterations, MaxCycles);
+    B.addEventListener(TV.get());
+  }
+}
+
+// Opens the output file and, in verbose mode, attaches a BackendStatistics
+// listener to the backend.
+// NOTE(review): getOutputStream may return nullptr on error, while later
+// printReport() calls assert on a valid file -- confirm callers check
+// isFileValid() first.
+void BackendPrinter::initialize(std::string OutputFileName) {
+  File = getOutputStream(OutputFileName);
+  MCIP->setPrintImmHex(false);
+  if (EnableVerboseOutput) {
+    BS = llvm::make_unique<BackendStatistics>();
+    B.addEventListener(BS.get());
+  }
+}
+
+} // namespace mca
--- /dev/null
+//===--------------------- BackendPrinter.h ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements class BackendPrinter.
+/// BackendPrinter is able to collect statistics related to the code executed
+/// by the Backend class. Information is then printed out with the help of
+/// a MCInstPrinter (to pretty print MCInst objects) and other helper classes.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H
+#define LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H
+
+#include "Backend.h"
+#include "BackendStatistics.h"
+#include "ResourcePressureView.h"
+#include "TimelineView.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+class ResourcePressureView;
+class TimelineView;
+
+/// \brief A printer class that knows how to collects statistics on the
+/// code analyzed by the llvm-mca tool.
+///
+/// This class knows how to print out the analysis information collected
+/// during the execution of the code. Internally, it delegates to other
+/// classes the task of printing out timeline information as well as
+/// resource pressure.
+class BackendPrinter {
+  Backend &B;
+  bool EnableVerboseOutput;
+
+  std::unique_ptr<llvm::MCInstPrinter> MCIP;
+  std::unique_ptr<llvm::ToolOutputFile> File;
+
+  // Optional report sections; created on demand by the add*View() methods.
+  std::unique_ptr<ResourcePressureView> RPV;
+  std::unique_ptr<TimelineView> TV;
+  std::unique_ptr<BackendStatistics> BS;
+
+  using Histogram = std::map<unsigned, unsigned>;
+  void printDUStatistics(const Histogram &Stats, unsigned Cycles) const;
+  void printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
+                           unsigned SQStalls, unsigned LDQStalls,
+                           unsigned STQStalls, unsigned DGStalls) const;
+  void printRATStatistics(unsigned Mappings, unsigned MaxUsedMappings) const;
+  void printRCUStatistics(const Histogram &Histogram, unsigned Cycles) const;
+  void printIssuePerCycle(const Histogram &IssuePerCycle,
+                          unsigned TotalCycles) const;
+  void printSchedulerUsage(const llvm::MCSchedModel &SM,
+                           const llvm::ArrayRef<BufferUsageEntry> &Usage) const;
+  void printGeneralStatistics(unsigned Iterations, unsigned Cycles,
+                              unsigned Instructions,
+                              unsigned DispatchWidth) const;
+  void printInstructionInfo() const;
+
+  // Opens OutputFile ("" or "-" means stdout); returns nullptr on error.
+  std::unique_ptr<llvm::ToolOutputFile> getOutputStream(std::string OutputFile);
+  // Opens the output stream and registers verbose-mode listeners.
+  // (Parameter name fixed; it was previously misspelled "OputputFileName".)
+  void initialize(std::string OutputFileName);
+
+public:
+  BackendPrinter(Backend &backend, std::string OutputFileName,
+                 std::unique_ptr<llvm::MCInstPrinter> IP, bool EnableVerbose)
+      : B(backend), EnableVerboseOutput(EnableVerbose), MCIP(std::move(IP)) {
+    initialize(OutputFileName);
+  }
+
+  // Keep the output file on destruction so ToolOutputFile does not delete it.
+  ~BackendPrinter() {
+    if (File)
+      File->keep();
+  }
+
+  bool isFileValid() const { return File.get(); }
+  llvm::raw_ostream &getOStream() const {
+    assert(isFileValid());
+    return File->os();
+  }
+
+  llvm::MCInstPrinter &getMCInstPrinter() const { return *MCIP; }
+
+  void addResourcePressureView();
+  void addTimelineView(unsigned MaxIterations = 3, unsigned MaxCycles = 80);
+
+  // Prints the full analysis report to the output stream.
+  void printReport() const;
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===--------------------- BackendStatistics.cpp ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Functionalities used by the BackendPrinter to print out histograms
+/// related to number of {dispatch/issue/retire} per number of cycles.
+///
+//===----------------------------------------------------------------------===//
+
+#include "BackendStatistics.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace mca {
+
+// Prints a histogram of how many instructions retired per cycle, with each
+// bucket's share of the total cycle count shown as a percentage.
+void BackendStatistics::printRetireUnitStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nRetire Control Unit - "
+             << "number of cycles where we saw N instructions retired:\n";
+  TempStream << "[# retired], [# cycles]\n";
+
+  for (const std::pair<unsigned, unsigned> &Entry : RetiredPerCycle) {
+    TempStream << " " << Entry.first;
+    // NOTE(review): both branches below print the same separator; presumably
+    // they were meant to emit different padding widths for column alignment
+    // -- confirm the intended spacing.
+    if (Entry.first < 10)
+      TempStream << ", ";
+    else
+      TempStream << ", ";
+    TempStream << Entry.second << " ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+// Prints a histogram of how many instructions were dispatched per cycle, with
+// each bucket's share of the total cycle count shown as a percentage.
+void BackendStatistics::printDispatchUnitStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nDispatch Logic - "
+             << "number of cycles where we saw N instructions dispatched:\n";
+  TempStream << "[# dispatched], [# cycles]\n";
+  for (const std::pair<unsigned, unsigned> &Entry : DispatchGroupSizePerCycle) {
+    TempStream << " " << Entry.first << ", " << Entry.second
+               << " ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+// Prints a histogram of how many instructions were issued per cycle, with
+// each bucket's share of the total cycle count shown as a percentage.
+void BackendStatistics::printSchedulerStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nSchedulers - number of cycles where we saw N instructions "
+                "issued:\n";
+  TempStream << "[# issued], [# cycles]\n";
+  for (const std::pair<unsigned, unsigned> &Entry : IssuedPerCycle) {
+    TempStream << " " << Entry.first << ", " << Entry.second << " ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+} // namespace mca
+
--- /dev/null
+//===--------------------- BackendStatistics.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a printer class for printing generic Backend
+/// statistics related to the dispatch logic, scheduler and retire unit.
+///
+/// Example:
+/// ========
+///
+/// Dispatch Logic - number of cycles where we saw N instructions dispatched:
+/// [# dispatched], [# cycles]
+/// 0, 15 (11.5%)
+/// 5, 4 (3.1%)
+///
+/// Schedulers - number of cycles where we saw N instructions issued:
+/// [# issued], [# cycles]
+/// 0, 7 (5.4%)
+/// 1, 4 (3.1%)
+/// 2, 8 (6.2%)
+///
+/// Retire Control Unit - number of cycles where we saw N instructions retired:
+/// [# retired], [# cycles]
+/// 0, 9 (6.9%)
+/// 1, 6 (4.6%)
+/// 2, 1 (0.8%)
+/// 4, 3 (2.3%)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H
+
+#include "HWEventListener.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace mca {
+
+class BackendStatistics : public HWEventListener {
+  using Histogram = std::map<unsigned, unsigned>;
+  // Buckets: event count in a cycle -> number of cycles with that count.
+  Histogram DispatchGroupSizePerCycle;
+  Histogram RetiredPerCycle;
+  Histogram IssuedPerCycle;
+
+  // Per-cycle counters, folded into the histograms at the end of each cycle.
+  unsigned NumDispatched;
+  unsigned NumIssued;
+  unsigned NumRetired;
+  // Total number of simulated cycles; the denominator for all percentages
+  // printed by the statistics methods.
+  unsigned NumCycles;
+
+  // Records the current cycle's counts and resets them for the next cycle.
+  void updateHistograms() {
+    DispatchGroupSizePerCycle[NumDispatched]++;
+    IssuedPerCycle[NumIssued]++;
+    RetiredPerCycle[NumRetired]++;
+    NumDispatched = 0;
+    NumIssued = 0;
+    NumRetired = 0;
+  }
+
+  void printRetireUnitStatistics(llvm::raw_ostream &OS) const;
+  void printDispatchUnitStatistics(llvm::raw_ostream &OS) const;
+  void printSchedulerStatistics(llvm::raw_ostream &OS) const;
+
+public:
+  // Bug fix: NumCycles was previously left uninitialized, which made every
+  // percentage computed against it garbage. Initialize all counters to zero.
+  BackendStatistics()
+      : NumDispatched(0), NumIssued(0), NumRetired(0), NumCycles(0) {}
+
+  void onInstructionDispatched(unsigned Index) override { NumDispatched++; }
+  void
+  onInstructionIssued(unsigned Index,
+                      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>>
+                          & /* unused */) override {
+    NumIssued++;
+  }
+  void onInstructionRetired(unsigned Index) override { NumRetired++; }
+
+  void onCycleBegin(unsigned Cycle) override { NumCycles++; }
+
+  void onCycleEnd(unsigned Cycle) override { updateHistograms(); }
+
+  // Prints all three histograms (dispatch, issue, retire) to OS.
+  void printHistograms(llvm::raw_ostream &OS) {
+    printDispatchUnitStatistics(OS);
+    printSchedulerStatistics(OS);
+    printRetireUnitStatistics(OS);
+  }
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+# Link against every registered LLVM target so llvm-mca can analyze assembly
+# for any architecture this LLVM build supports.
+set(LLVM_LINK_COMPONENTS
+  AllTargetsAsmPrinters
+  AllTargetsAsmParsers
+  AllTargetsDescs
+  AllTargetsDisassemblers
+  AllTargetsInfos
+  MC
+  MCParser
+  Support
+  )
+
+# Source files that make up the llvm-mca tool.
+add_llvm_tool(llvm-mca
+  Backend.cpp
+  BackendPrinter.cpp
+  BackendStatistics.cpp
+  Dispatch.cpp
+  HWEventListener.cpp
+  InstrBuilder.cpp
+  Instruction.cpp
+  LSUnit.cpp
+  llvm-mca.cpp
+  ResourcePressureView.cpp
+  Scheduler.cpp
+  TimelineView.cpp
+  )
--- /dev/null
+//===--------------------- Dispatch.cpp -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods declared by class RegisterFile, DispatchUnit
+/// and RetireControlUnit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Dispatch.h"
+#include "Backend.h"
+#include "Scheduler.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+// Creates a mapping for the register defined by WS (and for each of its
+// sub-registers), updating the statistics counters.
+void RegisterFile::addRegisterMapping(WriteState &WS) {
+  unsigned RegID = WS.getRegisterID();
+  assert(RegID && "Adding an invalid register definition?");
+
+  RegisterMappings[RegID] = &WS;
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I] = &WS;
+  // Track the high-water mark of simultaneously live mappings.
+  if (MaxUsedMappings == NumUsedMappings)
+    MaxUsedMappings++;
+  NumUsedMappings++;
+  TotalMappingsCreated++;
+  // If this is a partial update, then we are done.
+  if (!WS.fullyUpdatesSuperRegs())
+    return;
+
+  // A full update also redefines every super-register.
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I] = &WS;
+}
+
+// Removes the mappings created for WS once its write has committed, freeing
+// the associated temporary register.
+void RegisterFile::invalidateRegisterMapping(const WriteState &WS) {
+  unsigned RegID = WS.getRegisterID();
+  bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs();
+
+  assert(RegID != 0 && "Invalidating an already invalid register?");
+  // NOTE(review): -512 looks like a sentinel for "cycles unknown" -- confirm
+  // against WriteState's definition.
+  assert(WS.getCyclesLeft() != -512 &&
+         "Invalidating a write of unknown cycles!");
+  assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
+  if (!RegisterMappings[RegID])
+    return;
+
+  assert(NumUsedMappings);
+  NumUsedMappings--;
+
+  // Only clear entries that still point at this write; a younger write may
+  // have already replaced some of them.
+  if (RegisterMappings[RegID] == &WS)
+    RegisterMappings[RegID] = nullptr;
+
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    if (RegisterMappings[*I] == &WS)
+      RegisterMappings[*I] = nullptr;
+
+  if (!ShouldInvalidateSuperRegs)
+    return;
+
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    if (RegisterMappings[*I] == &WS)
+      RegisterMappings[*I] = nullptr;
+}
+
+// Updates the number of used mappings when an instruction is retired.
+// This method delegates to the register file the task of invalidating
+// register mappings that were created for instruction IS.
+// Asks the register file to invalidate the mappings created for every
+// register definition of the retired instruction IS.
+void DispatchUnit::invalidateRegisterMappings(const Instruction &IS) {
+  for (const std::unique_ptr<WriteState> &WS : IS.getDefs()) {
+    DEBUG(dbgs() << "[RAT] Invalidating mapping for: ");
+    DEBUG(WS->dump());
+    // Dereference the unique_ptr directly; the extra .get() was redundant.
+    RAT->invalidateRegisterMapping(*WS);
+  }
+}
+
+// Collects the writes a read of RegID depends on: the last write to the
+// register itself, plus any distinct writes to its sub-registers.
+void RegisterFile::collectWrites(SmallVectorImpl<WriteState *> &Writes,
+                                 unsigned RegID) const {
+  assert(RegID && RegID < RegisterMappings.size());
+  WriteState *WS = RegisterMappings[RegID];
+  if (WS) {
+    DEBUG(dbgs() << "Found a dependent use of RegID=" << RegID << '\n');
+    Writes.push_back(WS);
+  }
+
+  // Handle potential partial register updates.
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+    WS = RegisterMappings[*I];
+    // Skip writes already collected so each dependency is reported once.
+    if (WS && std::find(Writes.begin(), Writes.end(), WS) == Writes.end()) {
+      DEBUG(dbgs() << "Found a dependent use of subReg " << *I << " (part of "
+                   << RegID << ")\n");
+      Writes.push_back(WS);
+    }
+  }
+}
+
+// Returns true if the register file can accept NumRegWrites new mappings.
+// A TotalMappings of zero means the number of temporaries is unlimited.
+bool RegisterFile::isAvailable(unsigned NumRegWrites) {
+  if (!TotalMappings)
+    return true;
+  if (NumRegWrites > TotalMappings) {
+    // The user specified a too small number of registers.
+    // Artificially set the number of temporaries to NumRegWrites.
+    errs() << "warning: not enough temporaries in the register file. "
+           << "The register file size has been automatically increased to "
+           << NumRegWrites << '\n';
+    TotalMappings = NumRegWrites;
+  }
+
+  return NumRegWrites + NumUsedMappings <= TotalMappings;
+}
+
+#ifndef NDEBUG
+// Debug dump of every live register mapping and the mapping counters.
+void RegisterFile::dump() const {
+  for (unsigned I = 0, E = MRI.getNumRegs(); I < E; ++I)
+    if (RegisterMappings[I]) {
+      dbgs() << MRI.getName(I) << ", " << I << ", ";
+      RegisterMappings[I]->dump();
+    }
+
+  dbgs() << "TotalMappingsCreated: " << TotalMappingsCreated
+         << ", MaxUsedMappings: " << MaxUsedMappings
+         << ", NumUsedMappings: " << NumUsedMappings << '\n';
+}
+#endif
+
+// Reserves a number of slots, and returns a new token.
+unsigned RetireControlUnit::reserveSlot(unsigned Index, unsigned NumMicroOps) {
+  assert(isAvailable(NumMicroOps));
+  // Cap the reservation to the queue size, mirroring isAvailable().
+  unsigned NormalizedQuantity =
+      std::min(NumMicroOps, static_cast<unsigned>(Queue.size()));
+  // Zero latency instructions may have zero mOps. Artificially bump this
+  // value to 1. Although zero latency instructions don't consume scheduler
+  // resources, they still consume one slot in the retire queue.
+  NormalizedQuantity = std::max(NormalizedQuantity, 1U);
+  // The token is the index of the first reserved slot; the queue is circular.
+  unsigned TokenID = NextAvailableSlotIdx;
+  Queue[NextAvailableSlotIdx] = {Index, NormalizedQuantity, false};
+  NextAvailableSlotIdx += NormalizedQuantity;
+  NextAvailableSlotIdx %= Queue.size();
+  AvailableSlots -= NormalizedQuantity;
+  return TokenID;
+}
+
+// Forwards the dispatch event to the Backend that owns this unit.
+void DispatchUnit::notifyInstructionDispatched(unsigned Index) {
+  Owner->notifyInstructionDispatched(Index);
+}
+
+// Forwards the retire event to the Backend that owns this unit.
+void DispatchUnit::notifyInstructionRetired(unsigned Index) {
+  Owner->notifyInstructionRetired(Index);
+}
+
+// Retires instructions in program order, stopping at the first instruction
+// that has not yet executed, or when the per-cycle retire limit is reached.
+void RetireControlUnit::cycleEvent() {
+  if (isEmpty())
+    return;
+
+  unsigned NumRetired = 0;
+  while (!isEmpty()) {
+    // A MaxRetirePerCycle of zero means "no limit".
+    if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle)
+      break;
+    RUToken &Current = Queue[CurrentInstructionSlotIdx];
+    assert(Current.NumSlots && "Reserved zero slots?");
+    if (!Current.Executed)
+      break;
+    Owner->notifyInstructionRetired(Current.Index);
+    // Release the instruction's slots in the circular queue.
+    CurrentInstructionSlotIdx += Current.NumSlots;
+    CurrentInstructionSlotIdx %= Queue.size();
+    AvailableSlots += Current.NumSlots;
+    NumRetired++;
+  }
+}
+
+// Marks the token as executed so that cycleEvent() may retire it.
+void RetireControlUnit::onInstructionExecuted(unsigned TokenID) {
+  assert(Queue.size() > TokenID);
+  assert(Queue[TokenID].Executed == false && Queue[TokenID].Index != ~0U);
+  Queue[TokenID].Executed = true;
+}
+
+#ifndef NDEBUG
+// Debug dump of the retire queue occupancy.
+void RetireControlUnit::dump() const {
+  dbgs() << "Retire Unit: { Total Slots=" << Queue.size()
+         << ", Available Slots=" << AvailableSlots << " }\n";
+}
+#endif
+
+// Returns true if the register file has enough temporaries for the register
+// writes of Desc; otherwise records a RAT dispatch stall.
+bool DispatchUnit::checkRAT(const InstrDesc &Desc) {
+  unsigned NumWrites = Desc.Writes.size();
+  if (RAT->isAvailable(NumWrites))
+    return true;
+  DispatchStalls[DS_RAT_REG_UNAVAILABLE]++;
+  return false;
+}
+
+// Returns true if the retire control unit has enough reorder-buffer slots
+// for the micro-opcodes of Desc; otherwise records an RCU dispatch stall.
+bool DispatchUnit::checkRCU(const InstrDesc &Desc) {
+  unsigned NumMicroOps = Desc.NumMicroOps;
+  if (RCU->isAvailable(NumMicroOps))
+    return true;
+  DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE]++;
+  return false;
+}
+
+// Asks the scheduler whether Desc can be dispatched. On failure, records a
+// dispatch stall counter for the specific unavailable resource.
+bool DispatchUnit::checkScheduler(const InstrDesc &Desc) {
+  switch (SC->canBeDispatched(Desc)) {
+  case Scheduler::HWS_AVAILABLE:
+    return true;
+  case Scheduler::HWS_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_LD_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_ST_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_DISPATCH_GROUP_RESTRICTION:
+    DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION]++;
+  }
+
+  return false;
+}
+
+// Dispatches NewInst: consumes dispatch-width entries, reserves its
+// reorder-buffer slots, notifies the owner, and hands the instruction to the
+// scheduler. Returns the RCU token identifying the instruction.
+unsigned DispatchUnit::dispatch(unsigned IID, Instruction *NewInst) {
+  assert(!CarryOver && "Cannot dispatch another instruction!");
+  unsigned NumMicroOps = NewInst->getDesc().NumMicroOps;
+  if (NumMicroOps > DispatchWidth) {
+    // The instruction does not fit in one dispatch group; the excess uOps
+    // carry over into the following cycles.
+    assert(AvailableEntries == DispatchWidth);
+    AvailableEntries = 0;
+    CarryOver = NumMicroOps - DispatchWidth;
+  } else {
+    assert(AvailableEntries >= NumMicroOps);
+    AvailableEntries -= NumMicroOps;
+  }
+
+  // Reserve slots in the RCU.
+  unsigned RCUTokenID = RCU->reserveSlot(IID, NumMicroOps);
+  Owner->notifyInstructionDispatched(IID);
+
+  SC->scheduleInstruction(IID, NewInst);
+  return RCUTokenID;
+}
+
+#ifndef NDEBUG
+// Debug dump of the register file, the retire unit, and every dispatch stall
+// counter.
+void DispatchUnit::dump() const {
+  RAT->dump();
+  RCU->dump();
+
+  unsigned DSRAT = DispatchStalls[DS_RAT_REG_UNAVAILABLE];
+  unsigned DSRCU = DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
+  unsigned DSSCHEDQ = DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
+  unsigned DSLQ = DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
+  unsigned DSSQ = DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
+  // The dispatch-group restriction counter was tracked but never dumped;
+  // print it for parity with the other stall counters.
+  unsigned DSDG = DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION];
+
+  dbgs() << "STALLS --- RAT: " << DSRAT << ", RCU: " << DSRCU
+         << ", SCHED_QUEUE: " << DSSCHEDQ << ", LOAD_QUEUE: " << DSLQ
+         << ", STORE_QUEUE: " << DSSQ << ", GROUP: " << DSDG << '\n';
+}
+#endif
+
+} // namespace mca
--- /dev/null
+//===----------------------- Dispatch.h -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements classes that are used to model register files,
+/// reorder buffers and the hardware dispatch logic.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_H
+#define LLVM_TOOLS_LLVM_MCA_DISPATCH_H
+
+#include "Instruction.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include <map>
+
+namespace mca {
+
+class WriteState;
+class DispatchUnit;
+class Scheduler;
+class Backend;
+
+/// \brief Keeps track of register definitions.
+///
+/// This class tracks register definitions, and performs register renaming
+/// to break anti dependencies.
+/// By default, there is no limit in the number of register aliases which
+/// can be created for the purpose of register renaming. However, users can
+/// specify at object construction time a limit in the number of temporary
+/// registers which can be used by the register renaming logic.
+class RegisterFile {
+  const llvm::MCRegisterInfo &MRI;
+  // Currently used mappings and maximum used mappings.
+  // These are to generate statistics only.
+  unsigned NumUsedMappings;
+  unsigned MaxUsedMappings;
+  // Total number of mappings created over time.
+  unsigned TotalMappingsCreated;
+
+  // The maximum number of register aliases which can be used by the
+  // register renamer. Default value for this field is zero.
+  // A value of zero for this field means that there is no limit in the
+  // amount of register mappings which can be created. That is equivalent
+  // to having a theoretically infinite number of temporary registers.
+  unsigned TotalMappings;
+
+  // This map contains an entry for every physical register.
+  // A register index is used as a key value to access a WriteState.
+  // This is how we track RAW dependencies for dispatched
+  // instructions. For every register, we track the last seen write only.
+  // This assumes that all writes fully update both super and sub registers.
+  // We need a flag in MCInstrDesc to check if a write also updates super
+  // registers. We can then have an extra tablegen flag to set for
+  // instructions. This is a separate patch on its own.
+  std::vector<WriteState *> RegisterMappings;
+  // Assumptions are:
+  // a) false dependencies are always removed by the register renamer.
+  // b) the register renamer can create an "infinite" number of mappings.
+  // Since we track the number of mappings created, in future we may
+  // introduce constraints on the number of mappings that can be created.
+  // For example, the maximum number of registers that are available for
+  // register renaming purposes may default to the size of the register file.
+
+  // In future, we can extend this design to allow multiple register files, and
+  // apply different restrictions on the register mappings and the number of
+  // temporary registers used by mappings.
+
+public:
+  RegisterFile(const llvm::MCRegisterInfo &mri, unsigned Mappings = 0)
+      : MRI(mri), NumUsedMappings(0), MaxUsedMappings(0),
+        TotalMappingsCreated(0), TotalMappings(Mappings),
+        RegisterMappings(MRI.getNumRegs(), nullptr) {}
+
+  // Creates a new register mapping for RegID.
+  // This reserves a temporary register in the register file.
+  void addRegisterMapping(WriteState &WS);
+
+  // Invalidates register mappings associated to the input WriteState object.
+  // This releases temporary registers in the register file.
+  void invalidateRegisterMapping(const WriteState &WS);
+
+  // Returns true if NumRegWrites new mappings can be created.
+  bool isAvailable(unsigned NumRegWrites);
+  // Collects the writes that a read of RegID depends on.
+  void collectWrites(llvm::SmallVectorImpl<WriteState *> &Writes,
+                     unsigned RegID) const;
+  void updateOnRead(ReadState &RS, unsigned RegID);
+  unsigned getMaxUsedRegisterMappings() const { return MaxUsedMappings; }
+  unsigned getTotalRegisterMappingsCreated() const {
+    return TotalMappingsCreated;
+  }
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+/// \brief tracks which instructions are in-flight (i.e. dispatched but not
+/// retired) in the OoO backend.
+///
+/// This class checks on every cycle if/which instructions can be retired.
+/// Instructions are retired in program order.
+/// In the event of instruction retired, the DispatchUnit object that owns
+/// this RetireControlUnit gets notified.
+/// On instruction retired, register updates are all architecturally
+/// committed, and any temporary registers originally allocated for the
+/// retired instruction are freed.
+struct RetireControlUnit {
+  // A "token" (object of class RUToken) is created by the retire unit for
+  // every instruction dispatched to the schedulers. Flag 'Executed' is used
+  // to quickly check if an instruction has reached the write-back stage. A
+  // token also carries information related to the number of entries consumed
+  // by the instruction in the reorder buffer. The idea is that those entries
+  // will become available again once the instruction is retired. On every
+  // cycle, the RCU (Retire Control Unit) scans every token starting to search
+  // for instructions that are ready to retire. Instructions are retired in
+  // program order. Only 'Executed' instructions are eligible for retire.
+  // Note that the size of the reorder buffer is defined by the scheduling
+  // model via field 'NumMicroOpBufferSize'.
+  struct RUToken {
+    unsigned Index;    // Instruction index.
+    unsigned NumSlots; // Slots reserved to this instruction.
+    bool Executed;     // True if the instruction is past the WB stage.
+  };
+
+private:
+  unsigned NextAvailableSlotIdx;
+  unsigned CurrentInstructionSlotIdx;
+  unsigned AvailableSlots;
+  unsigned MaxRetirePerCycle; // 0 means no limit.
+  std::vector<RUToken> Queue;
+  DispatchUnit *Owner; // Notified on every retired instruction.
+
+public:
+  RetireControlUnit(unsigned NumSlots, unsigned RPC, DispatchUnit *DU)
+      : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0),
+        AvailableSlots(NumSlots), MaxRetirePerCycle(RPC), Owner(DU) {
+    assert(NumSlots && "Expected at least one slot!");
+    Queue.resize(NumSlots);
+  }
+
+  bool isFull() const { return !AvailableSlots; }
+  bool isEmpty() const { return AvailableSlots == Queue.size(); }
+  bool isAvailable(unsigned Quantity = 1) const {
+    // Some instructions may declare a number of uOps which exceeds the size
+    // of the reorder buffer. To avoid problems, cap the amount of slots to
+    // the size of the reorder buffer.
+    Quantity = std::min(Quantity, static_cast<unsigned>(Queue.size()));
+    return AvailableSlots >= Quantity;
+  }
+
+  // Reserves a number of slots, and returns a new token.
+  unsigned reserveSlot(unsigned Index, unsigned NumMicroOps);
+
+  /// Retires instructions in program order.
+  void cycleEvent();
+
+  // Marks the instruction identified by TokenID as executed (past WB stage).
+  void onInstructionExecuted(unsigned TokenID);
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+// \brief Implements the hardware dispatch logic.
+//
+// This class is responsible for the dispatch stage, in which instructions are
+// dispatched in groups to the Scheduler. An instruction can be dispatched if
+// functional units are available.
+// To be more specific, an instruction can be dispatched to the Scheduler if:
+// 1) There are enough entries in the reorder buffer (implemented by class
+// RetireControlUnit) to accommodate all opcodes.
+// 2) There are enough temporaries to rename output register operands.
+// 3) There are enough entries available in the used buffered resource(s).
+//
+// The number of micro opcodes that can be dispatched in one cycle is limited by
+// the value of field 'DispatchWidth'. A "dynamic dispatch stall" occurs when
+// processor resources are not available (i.e. at least one of the
+// abovementioned checks fails). Dispatch stall events are counted during the
+// entire execution of the code, and displayed by the performance report when
+// flag '-verbose' is specified.
+//
+// If the number of micro opcodes of an instruction is bigger than
+// DispatchWidth, then it can only be dispatched at the beginning of one cycle.
+// The DispatchUnit will still have to wait for a number of cycles (depending on
+// the DispatchWidth and the number of micro opcodes) before it can serve other
+// instructions.
+// Models the dispatch stage described by the comment above: tracks the
+// per-cycle dispatch budget, owns the register-rename and retire structures,
+// and counts every kind of dispatch stall.
+class DispatchUnit {
+  // Maximum number of micro opcodes that can be dispatched in one cycle.
+  unsigned DispatchWidth;
+  // Dispatch slots still available in the current cycle.
+  unsigned AvailableEntries;
+  // Micro opcodes carried over from an instruction wider than DispatchWidth;
+  // they consume dispatch slots on the following cycle(s).
+  unsigned CarryOver;
+  Scheduler *SC;
+
+  std::unique_ptr<RegisterFile> RAT;
+  std::unique_ptr<RetireControlUnit> RCU;
+  Backend *Owner;
+
+  /// Dispatch stall event identifiers.
+  ///
+  /// The naming convention is:
+  /// * Event names start with the "DS_" prefix
+  /// * For dynamic dispatch stalls, the "DS_" prefix is followed by the
+  ///   acronym of the unavailable resource/functional unit (example: RAT)
+  /// * The last substring is the event reason (example: REG_UNAVAILABLE means
+  ///   that register renaming couldn't find enough spare registers in the
+  ///   register file).
+  ///
+  /// List of acronyms used for processor resources:
+  /// RAT - Register Alias Table (used by the register renaming logic)
+  /// RCU - Retire Control Unit
+  /// SQ  - Scheduler's Queue
+  /// LDQ - Load Queue
+  /// STQ - Store Queue
+  enum {
+    DS_RAT_REG_UNAVAILABLE,
+    DS_RCU_TOKEN_UNAVAILABLE,
+    DS_SQ_TOKEN_UNAVAILABLE,
+    DS_LDQ_TOKEN_UNAVAILABLE,
+    DS_STQ_TOKEN_UNAVAILABLE,
+    DS_DISPATCH_GROUP_RESTRICTION,
+    DS_LAST
+  };
+
+  // The DispatchUnit tracks dispatch stall events caused by the
+  // unavailability of hardware resources. Events are classified based on the
+  // stall kind, so we have a counter for every source of dispatch stall.
+  // Counters are stored in the vector `DispatchStalls`, which always has
+  // size DS_LAST.
+  std::vector<unsigned> DispatchStalls;
+
+  // Per-resource admission checks combined by canDispatch().
+  bool checkRAT(const InstrDesc &Desc);
+  bool checkRCU(const InstrDesc &Desc);
+  bool checkScheduler(const InstrDesc &Desc);
+
+  void notifyInstructionDispatched(unsigned IID);
+
+public:
+  DispatchUnit(Backend *B, const llvm::MCRegisterInfo &MRI,
+               unsigned MicroOpBufferSize, unsigned RegisterFileSize,
+               unsigned MaxRetirePerCycle, unsigned MaxDispatchWidth,
+               Scheduler *Sched)
+      : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
+        CarryOver(0U), SC(Sched),
+        RAT(llvm::make_unique<RegisterFile>(MRI, RegisterFileSize)),
+        RCU(llvm::make_unique<RetireControlUnit>(MicroOpBufferSize,
+                                                 MaxRetirePerCycle, this)),
+        Owner(B), DispatchStalls(DS_LAST, 0) {}
+
+  unsigned getDispatchWidth() const { return DispatchWidth; }
+
+  // True if NumEntries micro opcodes fit in this cycle's budget. The second
+  // clause lets an instruction wider than DispatchWidth start at the
+  // beginning of a cycle (see the class comment above).
+  bool isAvailable(unsigned NumEntries) const {
+    return NumEntries <= AvailableEntries || AvailableEntries == DispatchWidth;
+  }
+
+  bool isRCUEmpty() const { return RCU->isEmpty(); }
+
+  // All three hardware checks from the class comment must pass.
+  bool canDispatch(const InstrDesc &Desc) {
+    assert(isAvailable(Desc.NumMicroOps));
+    return checkRCU(Desc) && checkRAT(Desc) && checkScheduler(Desc);
+  }
+
+  unsigned dispatch(unsigned IID, Instruction *NewInst);
+
+  void collectWrites(llvm::SmallVectorImpl<WriteState *> &Vec,
+                     unsigned RegID) const {
+    return RAT->collectWrites(Vec, RegID);
+  }
+  unsigned getNumRATStalls() const {
+    return DispatchStalls[DS_RAT_REG_UNAVAILABLE];
+  }
+  unsigned getNumRCUStalls() const {
+    return DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumSQStalls() const {
+    return DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumLDQStalls() const {
+    return DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumSTQStalls() const {
+    return DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumDispatchGroupStalls() const {
+    return DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION];
+  }
+  unsigned getMaxUsedRegisterMappings() const {
+    return RAT->getMaxUsedRegisterMappings();
+  }
+  unsigned getTotalRegisterMappingsCreated() const {
+    return RAT->getTotalRegisterMappingsCreated();
+  }
+  void addNewRegisterMapping(WriteState &WS) { RAT->addRegisterMapping(WS); }
+
+  // Start a new cycle: replenish the dispatch budget, minus any micro
+  // opcodes carried over from a wide instruction dispatched earlier.
+  void cycleEvent(unsigned Cycle) {
+    RCU->cycleEvent();
+    AvailableEntries =
+        CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
+    CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U;
+  }
+
+  void notifyInstructionRetired(unsigned Index);
+
+  void onInstructionExecuted(unsigned TokenID) {
+    RCU->onInstructionExecuted(TokenID);
+  }
+
+  void invalidateRegisterMappings(const Instruction &Inst);
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===----------------------- HWEventListener.cpp ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a vtable anchor for struct HWEventListener.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HWEventListener.h"
+
+namespace mca {
+
+// Out-of-line definition of the anchor pins HWEventListener's vtable to this
+// translation unit instead of emitting it in every user of the header.
+void HWEventListener::anchor() {}
+
+} // namespace mca
--- /dev/null
+
+//===----------------------- HWEventListener.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the main interface for hardware event listeners.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include <utility>
+
+namespace mca {
+
+// Listener interface for hardware events generated by the simulated
+// pipeline. Every callback defaults to a no-op, so concrete listeners only
+// override the notifications they care about.
+struct HWEventListener {
+  // Events generated by the Retire Control Unit.
+  virtual void onInstructionRetired(unsigned Index) {}
+
+  // Events generated by the Scheduler.
+  // A ResourceRef identifies a processor resource (mask) and a unit of it.
+  using ResourceRef = std::pair<uint64_t, uint64_t>;
+  virtual void
+  onInstructionIssued(unsigned Index,
+                      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {}
+  virtual void onInstructionExecuted(unsigned Index) {}
+  virtual void onInstructionReady(unsigned Index) {}
+  virtual void onResourceAvailable(const ResourceRef &RRef) {}
+
+  // Events generated by the Dispatch logic.
+  virtual void onInstructionDispatched(unsigned Index) {}
+
+  // Generic events generated by the Backend.
+  virtual void onCycleBegin(unsigned Cycle) {}
+  virtual void onCycleEnd(unsigned Cycle) {}
+
+  virtual ~HWEventListener() = default;
+  virtual void anchor();
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the InstrBuilder interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstrBuilder.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+using namespace llvm;
+
+// Computes, for instruction descriptor ID, the set of processor resources
+// consumed (ID.Resources), the buffered resources used (ID.Buffers), and the
+// per-resource busy cycles, starting from the scheduling class information.
+static void
+initializeUsedResources(InstrDesc &ID, const MCSchedClassDesc &SCDesc,
+                        const MCSubtargetInfo &STI,
+                        const ArrayRef<uint64_t> ProcResourceMasks) {
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Populate resources consumed.
+  using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
+  std::vector<ResourcePlusCycles> Worklist;
+  for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) {
+    const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I;
+    const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
+    uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
+    // BufferSize == -1 marks an unbuffered (in-order) resource; everything
+    // else consumes an entry in a buffered resource.
+    if (PR.BufferSize != -1)
+      ID.Buffers.push_back(Mask);
+    CycleSegment RCy(0, PRE->Cycles, false);
+    Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
+  }
+
+  // Sort elements by mask popcount, so that we prioritize resource units over
+  // resource groups, and smaller groups over larger groups.
+  std::sort(Worklist.begin(), Worklist.end(),
+            [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) {
+              unsigned popcntA = countPopulation(A.first);
+              unsigned popcntB = countPopulation(B.first);
+              if (popcntA < popcntB)
+                return true;
+              if (popcntA > popcntB)
+                return false;
+              return A.first < B.first;
+            });
+
+  uint64_t UsedResourceUnits = 0;
+
+  // Remove cycles contributed by smaller resources: a group's cycle count is
+  // reduced by the cycles already accounted to its member units/subgroups.
+  for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
+    ResourcePlusCycles &A = Worklist[I];
+    if (!A.second.size()) {
+      // All of this group's cycles were absorbed by members; keep it only as
+      // a reserved marker.
+      A.second.NumUnits = 0;
+      A.second.setReserved();
+      ID.Resources.emplace_back(A);
+      continue;
+    }
+
+    ID.Resources.emplace_back(A);
+    uint64_t NormalizedMask = A.first;
+    if (countPopulation(A.first) == 1) {
+      UsedResourceUnits |= A.first;
+    } else {
+      // Remove the leading 1 from the resource group mask.
+      NormalizedMask ^= PowerOf2Floor(NormalizedMask);
+    }
+
+    // Propagate A's cycles to every larger group that contains it.
+    for (unsigned J = I + 1; J < E; ++J) {
+      ResourcePlusCycles &B = Worklist[J];
+      if ((NormalizedMask & B.first) == NormalizedMask) {
+        B.second.CS.Subtract(A.second.size());
+        if (countPopulation(B.first) > 1)
+          B.second.NumUnits++;
+      }
+    }
+  }
+
+  // A SchedWrite may specify a number of cycles in which a resource group
+  // is reserved. For example (on target x86; cpu Haswell):
+  //
+  //  SchedWriteRes<[HWPort0, HWPort1, HWPort01]> {
+  //    let ResourceCycles = [2, 2, 3];
+  //  }
+  //
+  // This means:
+  // Resource units HWPort0 and HWPort1 are both used for 2cy.
+  // Resource group HWPort01 is the union of HWPort0 and HWPort1.
+  // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01
+  // will not be usable for 2 entire cycles from instruction issue.
+  //
+  // On top of those 2cy, SchedWriteRes explicitly specifies an extra latency
+  // of 3 cycles for HWPort01. This tool assumes that the 3cy latency is an
+  // extra delay on top of the 2 cycles latency.
+  // During those extra cycles, HWPort01 is not usable by other instructions.
+  for (ResourcePlusCycles &RPC : ID.Resources) {
+    if (countPopulation(RPC.first) > 1 && !RPC.second.isReserved()) {
+      // Remove the leading 1 from the resource group mask.
+      uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
+      // If every unit of the group is individually consumed, the whole group
+      // is effectively reserved.
+      if ((Mask & UsedResourceUnits) == Mask)
+        RPC.second.setReserved();
+    }
+  }
+
+  DEBUG(
+    for (const std::pair<uint64_t, ResourceUsage> &R : ID.Resources)
+      dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
+    for (const uint64_t R : ID.Buffers)
+      dbgs() << "\t\tBuffer Mask=" << R << '\n';
+  );
+}
+
+// Derives ID.MaxLatency from the scheduling class write-latency entries,
+// falling back to resource busy-cycles (and finally 1cy for variant opcodes)
+// when the model gives no usable latency.
+static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
+                              const MCSchedClassDesc &SCDesc,
+                              const MCSubtargetInfo &STI) {
+  // Scan every write latency entry and keep the largest cycle count.
+  unsigned Longest = 0;
+  for (unsigned Idx = 0, NumEntries = SCDesc.NumWriteLatencyEntries;
+       Idx != NumEntries; ++Idx) {
+    int Cycles = STI.getWriteLatencyEntry(&SCDesc, Idx)->Cycles;
+    // A value of -1 means the latency is unknown. Conservatively
+    // (pessimistically) assume 100cy in that case.
+    if (Cycles == -1)
+      Cycles = 100;
+    Longest = std::max(Longest, static_cast<unsigned>(Cycles));
+  }
+
+  // Calls cannot be modeled; force an arbitrarily high latency (100cy).
+  if (MCDesc.isCall())
+    Longest = std::max(100U, Longest);
+
+  // Still unknown? If the instruction consumes processor resources, use the
+  // longest busy time of any consumed resource as the latency.
+  if (!Longest && ID.Resources.size()) {
+    for (const std::pair<uint64_t, ResourceUsage> &Resource : ID.Resources)
+      Longest = std::max(Longest, Resource.second.size());
+  }
+
+  // Variant opcodes with no latency information default to 1cy.
+  if (SCDesc.isVariant() && Longest == 0) {
+    errs() << "note: unknown latency for a variant opcode. Conservatively"
+           << " assume a default latency of 1cy.\n";
+    Longest = 1;
+  }
+
+  ID.MaxLatency = Longest;
+}
+
+// Fills ID.Writes with one WriteDescriptor per register definition of MCI:
+// explicit defs first, then implicit defs, then (if present) the optional
+// definition, which is assumed to be the last MCInst operand.
+static void populateWrites(InstrDesc &ID, const MCInst &MCI,
+                           const MCInstrDesc &MCDesc,
+                           const MCSchedClassDesc &SCDesc,
+                           const MCSubtargetInfo &STI) {
+  computeMaxLatency(ID, MCDesc, SCDesc, STI);
+
+  // Set if writes through this opcode may update super registers.
+  // TODO: on x86-64, a 4 byte write of a general purpose register always
+  // fully updates the super-register.
+  // More in general, (at least on x86) not all register writes perform
+  // a partial (super-)register update.
+  // For example, an AVX instruction that writes on a XMM register implicitly
+  // zeroes the upper half of every aliasing super-register.
+  //
+  // For now, we pessimistically assume that writes are all potentially
+  // partial register updates. This is a good default for most targets, except
+  // for those like x86 which implement a special semantic for certain opcodes.
+  // At least on x86, this may lead to an inaccurate prediction of the
+  // instruction level parallelism.
+  bool FullyUpdatesSuperRegisters = false;
+
+  // Now Populate Writes.
+
+  // This algorithm currently works under the strong (and potentially incorrect)
+  // assumption that information related to register def/uses can be obtained
+  // from MCInstrDesc.
+  //
+  // However class MCInstrDesc is used to describe MachineInstr objects and not
+  // MCInst objects. To be more specific, MCInstrDesc objects are opcode
+  // descriptors that are automatically generated via tablegen based on the
+  // instruction set information available from the target .td files. That
+  // means, the number of (explicit) definitions according to MCInstrDesc always
+  // matches the cardinality of the `(outs)` set in tablegen.
+  //
+  // By construction, definitions must appear first in the operand sequence of
+  // a MachineInstr. Also, the (outs) sequence is preserved (example: the first
+  // element in the outs set is the first operand in the corresponding
+  // MachineInstr). That's the reason why MCInstrDesc only needs to declare the
+  // total number of register definitions, and not where those definitions are
+  // in the machine operand sequence.
+  //
+  // Unfortunately, it is not safe to use the information from MCInstrDesc to
+  // also describe MCInst objects. An MCInst object can be obtained from a
+  // MachineInstr through a lowering step which may restructure the operand
+  // sequence (and even remove or introduce new operands). So, there is a high
+  // risk that the lowering step breaks the assumptions that register
+  // definitions are always at the beginning of the machine operand sequence.
+  //
+  // This is a fundamental problem, and it is still an open problem. Essentially
+  // we have to find a way to correlate def/use operands of a MachineInstr to
+  // operands of an MCInst. Otherwise, we cannot correctly reconstruct data
+  // dependencies, nor we can correctly interpret the scheduling model, which
+  // heavily uses machine operand indices to define processor read-advance
+  // information, and to identify processor write resources. Essentially, we
+  // either need something like a MCInstrDesc, but for MCInst, or a way
+  // to map MCInst operands back to MachineInstr operands.
+  //
+  // Unfortunately, we don't have that information now. So, this prototype
+  // currently works under the strong assumption that we can always safely trust
+  // the content of an MCInstrDesc. For example, we can query a MCInstrDesc to
+  // obtain the number of explicit and implicit register definitions. We also
+  // assume that register definitions always come first in the operand sequence.
+  // This last assumption usually makes sense for MachineInstr, where register
+  // definitions always appear at the beginning of the operands sequence. In
+  // reality, these assumptions could be broken by the lowering step, which can
+  // decide to lay out operands in a different order than the original order of
+  // operand as specified by the MachineInstr.
+  //
+  // Things get even more complicated in the presence of "optional" register
+  // definitions. For MachineInstr, optional register definitions are always at
+  // the end of the operand sequence. Some ARM instructions that may update the
+  // status flags specify that register as an optional operand. Since we don't
+  // have operand descriptors for MCInst, we assume for now that the optional
+  // definition is always the last operand of a MCInst. Again, this assumption
+  // may be okay for most targets. However, there is no guarantee that targets
+  // would respect that.
+  //
+  // In conclusion: these are for now the strong assumptions made by the tool:
+  //  * The number of explicit and implicit register definitions in a MCInst
+  //    matches the number of explicit and implicit definitions according to
+  //    the opcode descriptor (MCInstrDesc).
+  //  * Register definitions take precedence over register uses in the operands
+  //    list.
+  //  * If an opcode specifies an optional definition, then the optional
+  //    definition is always the last operand in the sequence, and it can be
+  //    set to zero (i.e. "no register").
+  //
+  // These assumptions work quite well for most out-of-order in-tree targets
+  // like x86. This is mainly because the vast majority of instructions is
+  // expanded to MCInst using a straightforward lowering logic that preserves
+  // the ordering of the operands.
+  //
+  // In the longer term, we need to find a proper solution for this issue.
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
+  unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
+  unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
+  if (MCDesc.hasOptionalDef())
+    TotalDefs++;
+  ID.Writes.resize(TotalDefs);
+  // Iterate over the operands list, and skip non-register operands.
+  // The first NumExplicitDefs register operands are expected to be register
+  // definitions.
+  unsigned CurrentDef = 0;
+  unsigned i = 0;
+  for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) {
+    const MCOperand &Op = MCI.getOperand(i);
+    if (!Op.isReg())
+      continue;
+
+    WriteDescriptor &Write = ID.Writes[CurrentDef];
+    Write.OpIndex = i;
+    if (CurrentDef < NumWriteLatencyEntries) {
+      const MCWriteLatencyEntry &WLE =
+          *STI.getWriteLatencyEntry(&SCDesc, CurrentDef);
+      // Conservatively default to MaxLatency.
+      Write.Latency = WLE.Cycles == -1 ? ID.MaxLatency : WLE.Cycles;
+      Write.SClassOrWriteResourceID = WLE.WriteResourceID;
+    } else {
+      // Assign a default latency for this write.
+      Write.Latency = ID.MaxLatency;
+      Write.SClassOrWriteResourceID = 0;
+    }
+    Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters;
+    Write.IsOptionalDef = false;
+    DEBUG(
+      dbgs() << "\t\tOpIdx=" << Write.OpIndex
+             << ", Latency=" << Write.Latency << ", WriteResourceID="
+             << Write.SClassOrWriteResourceID << '\n';
+    );
+    CurrentDef++;
+  }
+
+  if (CurrentDef != NumExplicitDefs)
+    llvm::report_fatal_error(
+        "error: Expected more register operand definitions. ");
+
+  CurrentDef = 0;
+  for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) {
+    unsigned Index = NumExplicitDefs + CurrentDef;
+    WriteDescriptor &Write = ID.Writes[Index];
+    Write.OpIndex = -1;
+    Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef];
+    Write.Latency = ID.MaxLatency;
+    Write.SClassOrWriteResourceID = 0;
+    Write.IsOptionalDef = false;
+    assert(Write.RegisterID != 0 && "Expected a valid phys register!");
+    DEBUG(dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", PhysReg="
+                 << Write.RegisterID << ", Latency=" << Write.Latency
+                 << ", WriteResourceID=" << Write.SClassOrWriteResourceID
+                 << '\n');
+  }
+
+  if (MCDesc.hasOptionalDef()) {
+    // Always assume that the optional definition is the last operand of the
+    // MCInst sequence.
+    // Verify the operand count before indexing into the operand list: the
+    // explicit-def scan above may already have consumed every operand, and
+    // MCI could even have no operands at all.
+    if (i == MCI.getNumOperands() ||
+        !MCI.getOperand(MCI.getNumOperands() - 1).isReg())
+      llvm::report_fatal_error(
+          "error: expected a register operand for an optional "
+          "definition. Instruction has not been correctly analyzed.\n",
+          false);
+
+    WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
+    Write.OpIndex = MCI.getNumOperands() - 1;
+    // Assign a default latency for this write.
+    Write.Latency = ID.MaxLatency;
+    Write.SClassOrWriteResourceID = 0;
+    Write.IsOptionalDef = true;
+  }
+}
+
+// Fills ID.Reads with one ReadDescriptor per register use of MCI: explicit
+// uses (operands after the explicit definitions) first, then implicit uses.
+static void populateReads(InstrDesc &ID, const MCInst &MCI,
+                          const MCInstrDesc &MCDesc,
+                          const MCSchedClassDesc &SCDesc,
+                          const MCSubtargetInfo &STI) {
+  unsigned SchedClassID = MCDesc.getSchedClass();
+  bool HasReadAdvanceEntries = SCDesc.NumReadAdvanceEntries > 0;
+
+  unsigned i = 0;
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  // Skip explicit definitions; the operands that follow are the uses.
+  for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) {
+    const MCOperand &Op = MCI.getOperand(i);
+    if (Op.isReg())
+      NumExplicitDefs--;
+  }
+
+  if (NumExplicitDefs)
+    llvm::report_fatal_error(
+        "error: Expected more register operand definitions. ", false);
+
+  unsigned NumExplicitUses = MCI.getNumOperands() - i;
+  unsigned NumImplicitUses = MCDesc.getNumImplicitUses();
+  // The optional definition (if any) occupies the last operand slot and is
+  // not a use.
+  if (MCDesc.hasOptionalDef()) {
+    assert(NumExplicitUses);
+    NumExplicitUses--;
+  }
+  unsigned TotalUses = NumExplicitUses + NumImplicitUses;
+  if (!TotalUses)
+    return;
+
+  ID.Reads.resize(TotalUses);
+  for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) {
+    ReadDescriptor &Read = ID.Reads[CurrentUse];
+    Read.OpIndex = i + CurrentUse;
+    Read.HasReadAdvanceEntries = HasReadAdvanceEntries;
+    Read.SchedClassID = SchedClassID;
+    // Terminate the debug line with a newline, matching the implicit-use
+    // loop below.
+    DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex << '\n');
+  }
+
+  for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) {
+    ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse];
+    Read.OpIndex = -1;
+    Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse];
+    Read.HasReadAdvanceEntries = false;
+    Read.SchedClassID = SchedClassID;
+    DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex
+                 << ", RegisterID=" << Read.RegisterID << '\n');
+  }
+}
+
+// Builds the InstrDesc for MCI's opcode from the scheduling model and caches
+// it in the Descriptors map. Assumes the target provides a machine scheduling
+// model (itineraries are not supported).
+void InstrBuilder::createInstrDescImpl(const MCSubtargetInfo &STI,
+                                       const MCInst &MCI) {
+  assert(STI.getSchedModel().hasInstrSchedModel() &&
+         "Itineraries are not yet supported!");
+
+  unsigned short Opcode = MCI.getOpcode();
+  // Obtain the instruction descriptor from the opcode.
+  const MCInstrDesc &MCDesc = MCII.get(Opcode);
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Then obtain the scheduling class information from the instruction.
+  const MCSchedClassDesc &SCDesc =
+      *SM.getSchedClassDesc(MCDesc.getSchedClass());
+
+  // Create a new empty descriptor. Ownership is transferred to the
+  // Descriptors map at the end of this function.
+  InstrDesc *ID = new InstrDesc();
+
+  if (SCDesc.isVariant()) {
+    // Variant scheduling classes cannot be resolved statically; fall back to
+    // a single micro opcode.
+    errs() << "warning: don't know how to model variant opcodes.\n"
+           << "note: assume 1 micro opcode.\n";
+    ID->NumMicroOps = 1U;
+  } else {
+    ID->NumMicroOps = SCDesc.NumMicroOps;
+  }
+
+  if (MCDesc.isCall()) {
+    // We don't correctly model calls.
+    errs() << "warning: found a call in the input assembly sequence.\n"
+           << "note: call instructions are not correctly modeled. Assume a "
+              "latency of 100cy.\n";
+  }
+
+  if (MCDesc.isReturn()) {
+    errs() << "warning: found a return instruction in the input assembly "
+              "sequence.\n"
+           << "note: program counter updates are ignored.\n";
+  }
+
+  ID->MayLoad = MCDesc.mayLoad();
+  ID->MayStore = MCDesc.mayStore();
+  ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
+
+  // Resource usage, then register writes, then register reads.
+  initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
+  populateWrites(*ID, MCI, MCDesc, SCDesc, STI);
+  populateReads(*ID, MCI, MCDesc, SCDesc, STI);
+
+  DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
+  DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
+
+  // Now add the new descriptor.
+  Descriptors[Opcode] = std::unique_ptr<const InstrDesc>(ID);
+}
+
+// Returns the cached InstrDesc for MCI's opcode, building it on first use.
+const InstrDesc &InstrBuilder::getOrCreateInstrDesc(const MCSubtargetInfo &STI,
+                                                    const MCInst &MCI) {
+  auto it = Descriptors.find(MCI.getOpcode());
+  if (it == Descriptors.end()) {
+    createInstrDescImpl(STI, MCI);
+    // Re-find: the insertion above may have rehashed the DenseMap, which
+    // invalidates iterators.
+    it = Descriptors.find(MCI.getOpcode());
+  }
+  return *it->second;
+}
+
+// Creates a new Instruction for MCI: builds ReadState objects for every
+// register use (wiring them to the in-flight writes tracked by the
+// DispatchUnit), then WriteState objects for every register definition.
+// Ownership of the returned Instruction passes to the caller.
+Instruction *InstrBuilder::createInstruction(const MCSubtargetInfo &STI,
+                                             DispatchUnit &DU, unsigned Idx,
+                                             const MCInst &MCI) {
+  const InstrDesc &D = getOrCreateInstrDesc(STI, MCI);
+  Instruction *NewIS = new Instruction(D);
+
+  // Populate Reads first.
+  const MCSchedModel &SM = STI.getSchedModel();
+  SmallVector<WriteState *, 4> DependentWrites;
+  for (const ReadDescriptor &RD : D.Reads) {
+    int RegID = -1;
+    if (RD.OpIndex != -1) {
+      // explicit read.
+      const MCOperand &Op = MCI.getOperand(RD.OpIndex);
+      // Skip non-register operands.
+      if (!Op.isReg())
+        continue;
+      RegID = Op.getReg();
+    } else {
+      // Implicit read.
+      RegID = RD.RegisterID;
+    }
+
+    // Skip invalid register operands.
+    if (!RegID)
+      continue;
+
+    // Okay, this is a register operand. Create a ReadState for it.
+    assert(RegID > 0 && "Invalid register ID found!");
+    ReadState *NewRDS = new ReadState(RD);
+    NewIS->getUses().emplace_back(std::unique_ptr<ReadState>(NewRDS));
+    // Ask the register file for the writes this read depends on.
+    DU.collectWrites(DependentWrites, RegID);
+    NewRDS->setDependentWrites(DependentWrites.size());
+    DEBUG(dbgs() << "Found " << DependentWrites.size()
+                 << " dependent writes\n");
+
+    // We know that this read depends on all the writes in DependentWrites.
+    // For each write, check if we have ReadAdvance information, and use it
+    // to figure out after how many cycles this read becomes available.
+    if (!RD.HasReadAdvanceEntries) {
+      for (WriteState *WS : DependentWrites)
+        WS->addUser(NewRDS, /* ReadAdvance */ 0);
+      // Prepare the set for another round.
+      DependentWrites.clear();
+      continue;
+    }
+
+    const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
+    for (WriteState *WS : DependentWrites) {
+      unsigned WriteResID = WS->getWriteResourceID();
+      int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.OpIndex, WriteResID);
+      WS->addUser(NewRDS, ReadAdvance);
+    }
+
+    // Prepare the set for another round.
+    DependentWrites.clear();
+  }
+
+  // Now populate writes.
+  for (const WriteDescriptor &WD : D.Writes) {
+    unsigned RegID =
+        WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg();
+    assert((RegID || WD.IsOptionalDef) && "Expected a valid register ID!");
+    // Special case where this is a optional definition, and the actual register
+    // is 0.
+    if (WD.IsOptionalDef && !RegID)
+      continue;
+
+    WriteState *NewWS = new WriteState(WD);
+    NewIS->getDefs().emplace_back(std::unique_ptr<WriteState>(NewWS));
+    NewWS->setRegisterID(RegID);
+    // Register the new write with the rename logic so later reads see it.
+    DU.addNewRegisterMapping(*NewWS);
+  }
+
+  // Update Latency.
+  NewIS->setCyclesLeft(D.MaxLatency);
+  return NewIS;
+}
+
+} // namespace mca
--- /dev/null
+//===--------------------- InstrBuilder.h -----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A builder class for instructions that are statically analyzed by llvm-mca.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
+
+#include "Dispatch.h"
+#include "Instruction.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace mca {
+
+class DispatchUnit;
+
+/// \brief A builder class that knows how to construct Instruction objects.
+///
+/// Every llvm-mca Instruction is described by an object of class InstrDesc.
+/// An InstrDesc describes which registers are read/written by the instruction,
+/// as well as the instruction latency and hardware resources consumed.
+///
+/// This class is used by the tool to construct Instructions and instruction
+/// descriptors (i.e. InstrDesc objects).
+/// Information from the machine scheduling model is used to identify processor
+/// resources that are consumed by an instruction.
+class InstrBuilder {
+  const llvm::MCInstrInfo &MCII;
+  // One mask per processor resource, indexed by ProcResourceIdx.
+  const llvm::ArrayRef<uint64_t> ProcResourceMasks;
+
+  // Cache of opcode descriptors, keyed by opcode.
+  llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
+  // NOTE(review): this map appears unused in the visible implementation file
+  // (createInstruction returns a raw owning pointer instead) — confirm
+  // whether it is needed.
+  llvm::DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;
+
+  // Builds and caches the InstrDesc for MCI's opcode.
+  void createInstrDescImpl(const llvm::MCSubtargetInfo &STI,
+                           const llvm::MCInst &MCI);
+
+public:
+  InstrBuilder(const llvm::MCInstrInfo &mcii,
+               const llvm::ArrayRef<uint64_t> Masks)
+      : MCII(mcii), ProcResourceMasks(Masks) {}
+
+  // Returns (building on first use) the descriptor for MCI's opcode.
+  const InstrDesc &getOrCreateInstrDesc(const llvm::MCSubtargetInfo &STI,
+                                        const llvm::MCInst &MCI);
+
+  // Creates a new Instruction for MCI; the caller owns the returned pointer.
+  Instruction *createInstruction(const llvm::MCSubtargetInfo &STI,
+                                 DispatchUnit &DU, unsigned Idx,
+                                 const llvm::MCInst &MCI);
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===--------------------- Instruction.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines abstractions used by the Backend to model register reads,
+// register writes and instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+// Called once per dependent write when that write's latency becomes known
+// (Cycles = cycles until the written value is available). When the last
+// dependent write has reported in, the read's own countdown starts from the
+// slowest write.
+void ReadState::writeStartEvent(unsigned Cycles) {
+  assert(DependentWrites);
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+
+  // This read may be dependent on more than one write. This typically occurs
+  // when a definition is the result of multiple writes where at least one
+  // write does a partial register update.
+  // The HW is forced to do some extra bookkeeping to track of all the
+  // dependent writes, and implement a merging scheme for the partial writes.
+  --DependentWrites;
+  TotalCycles = std::max(TotalCycles, Cycles);
+
+  if (!DependentWrites)
+    CyclesLeft = TotalCycles;
+}
+
+// Invoked when the owning instruction is issued: the write-back distance
+// becomes known, so broadcast the (read-advance adjusted) start cycle to
+// every registered reader.
+void WriteState::onInstructionIssued() {
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+  // Take the number of cycles before write-back from the descriptor.
+  CyclesLeft = WD.Latency;
+
+  // Notify every user now that the time left before write-back is known.
+  for (const std::pair<ReadState *, int> &Entry : Users) {
+    unsigned ReadCycles = std::max(0, CyclesLeft - Entry.second);
+    Entry.first->writeStartEvent(ReadCycles);
+  }
+}
+
+// Registers a reader of this write. ReadAdvance is the number of cycles the
+// reader may start early (it can be negative).
+void WriteState::addUser(ReadState *User, int ReadAdvance) {
+  // If the write-back time is already known, there is nothing to record:
+  // immediately tell the reader how many cycles are left (possibly zero).
+  if (CyclesLeft != UNKNOWN_CYCLES) {
+    User->writeStartEvent(std::max(0, CyclesLeft - ReadAdvance));
+    return;
+  }
+
+  // Otherwise remember the reader; it is notified at issue time.
+  Users.insert(std::make_pair(User, ReadAdvance));
+}
+
+// Advances this write by one cycle once its latency is known.
+void WriteState::cycleEvent() {
+  // CyclesLeft is deliberately a signed quantity and may go negative: users
+  // of this write can specify a negative ReadAdvance.
+  if (CyclesLeft == UNKNOWN_CYCLES)
+    return;
+  --CyclesLeft;
+}
+
+// Advances this read by one cycle.
+void ReadState::cycleEvent() {
+  // Nothing to update while the countdown has not been established yet.
+  if (CyclesLeft == UNKNOWN_CYCLES)
+    return;
+
+  // Only count down once every dependent write has reported in, and stop
+  // at zero.
+  if (!DependentWrites && CyclesLeft != 0)
+    CyclesLeft--;
+}
+
+#ifndef NDEBUG
+// Debug-only: print the descriptor fields and current countdown state.
+void WriteState::dump() const {
+  dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << WD.Latency << ", RegID "
+         << getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }\n";
+}
+#endif
+
+// Returns true when every input read is ready. On the first successful
+// check, the instruction transitions from IS_AVAILABLE to IS_READY.
+bool Instruction::isReady() {
+  if (Stage == IS_READY)
+    return true;
+
+  assert(Stage == IS_AVAILABLE);
+  for (const UniqueUse &U : Uses) {
+    if (!U->isReady())
+      return false;
+  }
+
+  setReady();
+  return true;
+}
+
+// Transitions the instruction to the executing stage. Issuing fixes the
+// write-back time of every definition, which is propagated to readers.
+void Instruction::execute() {
+  assert(Stage == IS_READY);
+  Stage = IS_EXECUTING;
+  for (UniqueDef &D : Defs)
+    D->onInstructionIssued();
+}
+
+// A zero-latency instruction has no latency and neither register defs nor
+// register uses.
+bool Instruction::isZeroLatency() const {
+  return Desc.MaxLatency == 0 && Defs.empty() && Uses.empty();
+}
+
+// Advances the instruction's state machine by one cycle.
+void Instruction::cycleEvent() {
+  // While dispatched (waiting on operands), age the input reads so dependent
+  // writes can eventually mark them ready.
+  if (isDispatched()) {
+    for (UniqueUse &Use : Uses)
+      Use->cycleEvent();
+    return;
+  }
+  // While executing, age the output writes and the instruction's own
+  // countdown.
+  if (isExecuting()) {
+    for (UniqueDef &Def : Defs)
+      Def->cycleEvent();
+    CyclesLeft--;
+  }
+  // NOTE(review): this check also fires for stages other than IS_EXECUTING
+  // whenever CyclesLeft is zero (e.g. zero-latency instructions) — confirm
+  // that is intended for every stage that can reach this point.
+  if (!CyclesLeft)
+    Stage = IS_EXECUTED;
+}
+
+} // namespace mca
--- /dev/null
+//===--------------------- Instruction.h ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines abstractions used by the Backend to model register reads,
+/// register writes and instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+
+#include "llvm/Support/MathExtras.h"
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace mca {
+
+struct WriteDescriptor;
+struct ReadDescriptor;
+class WriteState;
+class ReadState;
+
+constexpr int UNKNOWN_CYCLES = -512;
+
+/// \brief A register write descriptor.
+struct WriteDescriptor {
+ int OpIndex; // Operand index. -1 if this is an implicit write.
+ // Write latency. Number of cycles before write-back stage.
+ int Latency;
+ // This field is set to a value different than zero only if this
+ // is an implicit definition.
+ unsigned RegisterID;
+ // True if this write generates a partial update of a super-register.
+ // On X86, this flag is set by byte/word writes on GPR registers. Also,
+ // a write of an XMM register only partially updates the corresponding
+ // YMM super-register if the write is associated to a legacy SSE instruction.
+ bool FullyUpdatesSuperRegs;
+ // Instruction itineraries would set this field to the SchedClass ID.
+ // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
+ // element associated to this write.
+ // When computing read latencies, this value is matched against the
+ // "ReadAdvance" information. The hardware backend may implement
+ // dedicated forwarding paths to quickly propagate write results to dependent
+ // instructions waiting in the reservation station (effectively bypassing the
+ // write-back stage).
+ unsigned SClassOrWriteResourceID;
+ // True only if this is a write obtained from an optional definition.
+ // Optional definitions are allowed to reference regID zero (i.e. "no
+ // register").
+ bool IsOptionalDef;
+};
+
+/// \brief A register read descriptor.
+struct ReadDescriptor {
+ // Operand index. This field defaults to -1 if this is an implicit read.
+ int OpIndex;
+ // This field is only set if this is an implicit read.
+ unsigned RegisterID;
+ // Scheduling Class Index. It is used to query the scheduling model for the
+ // MCSchedClassDesc object.
+ unsigned SchedClassID;
+ // True if there may be a local forwarding logic in hardware to serve a
+ // write used by this read. This information, along with SchedClassID, is
+ // used to dynamically check at Instruction creation time, if the input
+ // operands can benefit from a ReadAdvance bonus.
+ bool HasReadAdvanceEntries;
+};
+
+/// \brief Tracks uses of a register definition (e.g. register write).
+///
+/// Each implicit/explicit register write is associated with an instance of
+/// this class. A WriteState object tracks the dependent users of a
+/// register write. It also tracks how many cycles are left before the write
+/// back stage.
+class WriteState {
+ const WriteDescriptor &WD;
+ // On instruction issue, this field is set equal to the write latency.
+ // Before instruction issue, this field defaults to -512, a special
+ // value that represents an "unknown" number of cycles.
+ int CyclesLeft;
+
+ // Actual register defined by this write. This field is only used
+ // to speedup queries on the register file.
+ // For implicit writes, this field always matches the value of
+ // field RegisterID from WD.
+ unsigned RegisterID;
+
+ // The set of dependent reads ("users") of this write. A dependent read
+ // is added to the set only if CyclesLeft is "unknown". As soon as
+ // CyclesLeft becomes 'known', each user in the set gets notified with
+ // the actual CyclesLeft.
+
+ // The 'second' element of a pair is a "ReadAdvance" number of cycles.
+ std::set<std::pair<ReadState *, int>> Users;
+
+public:
+ WriteState(const WriteDescriptor &Desc)
+ : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(Desc.RegisterID) {}
+ WriteState(const WriteState &Other) = delete;
+ WriteState &operator=(const WriteState &Other) = delete;
+
+ int getCyclesLeft() const { return CyclesLeft; }
+ unsigned getWriteResourceID() const { return WD.SClassOrWriteResourceID; }
+ unsigned getRegisterID() const { return RegisterID; }
+ void setRegisterID(unsigned ID) { RegisterID = ID; }
+
+ // Registers a dependent read; notified immediately if CyclesLeft is known.
+ void addUser(ReadState *Use, int ReadAdvance);
+ bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; }
+ bool isWrittenBack() const { return CyclesLeft == 0; }
+
+ // On every cycle, update CyclesLeft and notify dependent users.
+ void cycleEvent();
+ void onInstructionIssued();
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+/// \brief Tracks register operand latency in cycles.
+///
+/// A read may be dependent on more than one write. This occurs when some
+/// writes only partially update the register associated to this read.
+class ReadState {
+ // The read descriptor for this register operand.
+ const ReadDescriptor &RD;
+ // Number of writes this read still depends on.
+ unsigned DependentWrites;
+ // Cycles left before the value becomes available. Defaults to
+ // UNKNOWN_CYCLES until dependent writes report their latencies.
+ int CyclesLeft;
+ // NOTE(review): presumably accumulates the total latency contributed by
+ // dependent writes — confirm against writeStartEvent's definition.
+ unsigned TotalCycles;
+
+public:
+ // A read is ready when all dependent writes have notified it and no
+ // wait cycles remain (UNKNOWN_CYCLES means no known dependency).
+ bool isReady() const {
+ if (DependentWrites)
+ return false;
+ return (CyclesLeft == UNKNOWN_CYCLES || CyclesLeft == 0);
+ }
+
+ ReadState(const ReadDescriptor &Desc)
+ : RD(Desc), DependentWrites(0), CyclesLeft(UNKNOWN_CYCLES),
+ TotalCycles(0) {}
+ ReadState(const ReadState &Other) = delete;
+ ReadState &operator=(const ReadState &Other) = delete;
+
+ const ReadDescriptor &getDescriptor() const { return RD; }
+ unsigned getSchedClass() const { return RD.SchedClassID; }
+ void cycleEvent();
+ void writeStartEvent(unsigned Cycles);
+ void setDependentWrites(unsigned Writes) { DependentWrites = Writes; }
+};
+
+/// \brief A sequence of cycles.
+///
+/// This class can be used as a building block to construct ranges of cycles.
+/// The range is half-open: [Begin, End).
+class CycleSegment {
+ unsigned Begin; // Inclusive.
+ unsigned End; // Exclusive.
+ bool Reserved; // Resources associated to this segment must be reserved.
+
+public:
+ CycleSegment(unsigned StartCycle, unsigned EndCycle, bool IsReserved = false)
+ : Begin(StartCycle), End(EndCycle), Reserved(IsReserved) {}
+
+ bool contains(unsigned Cycle) const { return Cycle >= Begin && Cycle < End; }
+ bool startsAfter(const CycleSegment &CS) const { return End <= CS.Begin; }
+ bool endsBefore(const CycleSegment &CS) const { return Begin >= CS.End; }
+ bool overlaps(const CycleSegment &CS) const {
+ return !startsAfter(CS) && !endsBefore(CS);
+ }
+ bool isExecuting() const { return Begin == 0 && End != 0; }
+ bool isExecuted() const { return End == 0; }
+ bool operator<(const CycleSegment &Other) const {
+ return Begin < Other.Begin;
+ }
+ // Shift the whole segment one cycle towards zero (saturating at 0).
+ CycleSegment &operator--(void) {
+ if (Begin)
+ Begin--;
+ if (End)
+ End--;
+ return *this;
+ }
+
+ bool isValid() const { return Begin <= End; }
+ unsigned size() const { return End - Begin; };
+ // Shrink the segment from the end by the given number of cycles.
+ void Subtract(unsigned Cycles) {
+ assert(End >= Cycles);
+ End -= Cycles;
+ }
+
+ unsigned begin() const { return Begin; }
+ unsigned end() const { return End; }
+ void setEnd(unsigned NewEnd) { End = NewEnd; }
+ bool isReserved() const { return Reserved; }
+ void setReserved() { Reserved = true; }
+};
+
+/// \brief Helper used by class InstrDesc to describe how hardware resources
+/// are used.
+///
+/// This class describes how many resource units of a specific resource kind
+/// (and how many cycles) are "used" by an instruction.
+struct ResourceUsage {
+ CycleSegment CS; // The range of cycles during which units are consumed.
+ unsigned NumUnits; // Number of resource units consumed.
+ ResourceUsage(CycleSegment Cycles, unsigned Units = 1)
+ : CS(Cycles), NumUnits(Units) {}
+ unsigned size() const { return CS.size(); }
+ bool isReserved() const { return CS.isReserved(); }
+ void setReserved() { CS.setReserved(); }
+};
+
+/// \brief An instruction descriptor. Describes the static properties of an
+/// instruction kind: its reads, writes, resource usage and latency.
+struct InstrDesc {
+ std::vector<WriteDescriptor> Writes; // Implicit writes are at the end.
+ std::vector<ReadDescriptor> Reads; // Implicit reads are at the end.
+
+ // For every resource used by an instruction of this kind, this vector
+ // reports the number of "consumed cycles".
+ std::vector<std::pair<uint64_t, ResourceUsage>> Resources;
+
+ // A list of buffered resources consumed by this instruction.
+ std::vector<uint64_t> Buffers;
+ // Maximum write latency across all the definitions.
+ unsigned MaxLatency;
+ // Number of MicroOps for this instruction.
+ unsigned NumMicroOps;
+
+ bool MayLoad;
+ bool MayStore;
+ bool HasSideEffects;
+};
+
+/// An instruction dispatched to the out-of-order backend.
+///
+/// This class is used to monitor changes in the internal state of instructions
+/// that are dispatched by the DispatchUnit to the hardware schedulers.
+class Instruction {
+ const InstrDesc &Desc;
+
+ enum InstrStage {
+ IS_INVALID, // Instruction in an invalid state.
+ IS_AVAILABLE, // Instruction dispatched but operands are not ready.
+ IS_READY, // Instruction dispatched and operands ready.
+ IS_EXECUTING, // Instruction issued.
+ IS_EXECUTED, // Instruction executed. Values are written back.
+ IS_RETIRED // Instruction retired.
+ };
+
+ // The current instruction stage.
+ enum InstrStage Stage;
+
+ // This value defaults to the instruction latency. This instruction is
+ // considered executed when field CyclesLeft goes to zero.
+ int CyclesLeft;
+
+ // Retire Unit token ID for this instruction.
+ unsigned RCUTokenID;
+
+ using UniqueDef = std::unique_ptr<WriteState>;
+ using UniqueUse = std::unique_ptr<ReadState>;
+ using VecDefs = std::vector<UniqueDef>;
+ using VecUses = std::vector<UniqueUse>;
+
+ // Output dependencies.
+ // One entry per each implicit and explicit register definition.
+ VecDefs Defs;
+
+ // Input dependencies.
+ // One entry per each implicit and explicit register use.
+ VecUses Uses;
+
+ // This instruction has already been dispatched, and all operands are ready.
+ void setReady() {
+ assert(Stage == IS_AVAILABLE);
+ Stage = IS_READY;
+ }
+
+public:
+ Instruction(const InstrDesc &D)
+ : Desc(D), Stage(IS_INVALID), CyclesLeft(-1) {}
+ Instruction(const Instruction &Other) = delete;
+ Instruction &operator=(const Instruction &Other) = delete;
+
+ VecDefs &getDefs() { return Defs; }
+ const VecDefs &getDefs() const { return Defs; }
+ VecUses &getUses() { return Uses; }
+ const VecUses &getUses() const { return Uses; }
+ const InstrDesc &getDesc() const { return Desc; }
+
+ unsigned getRCUTokenID() const { return RCUTokenID; }
+ int getCyclesLeft() const { return CyclesLeft; }
+ void setCyclesLeft(int Cycles) { CyclesLeft = Cycles; }
+ void setRCUTokenID(unsigned TokenID) { RCUTokenID = TokenID; }
+
+ // Transition to the dispatch stage.
+ // No definition is updated because the instruction is not "executing".
+ void dispatch() {
+ assert(Stage == IS_INVALID);
+ Stage = IS_AVAILABLE;
+ }
+
+ // Instruction issued. Transition to the IS_EXECUTING state, and update
+ // all the definitions.
+ void execute();
+
+ // Force a direct transition to IS_EXECUTED. Only valid for zero-latency
+ // instructions, or ready instructions with a max latency of zero.
+ void forceExecuted() {
+ assert((Stage == IS_INVALID && isZeroLatency()) ||
+ (Stage == IS_READY && Desc.MaxLatency == 0));
+ Stage = IS_EXECUTED;
+ }
+
+ // Checks if operands are available. If all operands are ready,
+ // then this forces a transition from IS_AVAILABLE to IS_READY.
+ bool isReady();
+
+ bool isDispatched() const { return Stage == IS_AVAILABLE; }
+ bool isExecuting() const { return Stage == IS_EXECUTING; }
+ bool isExecuted() const { return Stage == IS_EXECUTED; }
+ bool isZeroLatency() const;
+
+ void retire() {
+ assert(Stage == IS_EXECUTED);
+ Stage = IS_RETIRED;
+ }
+
+ // Advances the instruction's internal state by one simulated cycle.
+ void cycleEvent();
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+;===- ./tools/llvm-mca/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-mca
+parent = Tools
+required_libraries = MC MCParser Support all-targets
--- /dev/null
+//===----------------------- LSUnit.cpp --------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load-Store Unit for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#include "LSUnit.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+#ifndef NDEBUG
+// Debug-only dump of the configured queue sizes and current occupancy.
+void LSUnit::dump() const {
+ dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
+ dbgs() << "[LSUnit] SQ_Size = " << SQ_Size << '\n';
+ dbgs() << "[LSUnit] NextLQSlotIdx = " << LoadQueue.size() << '\n';
+ dbgs() << "[LSUnit] NextSQSlotIdx = " << StoreQueue.size() << '\n';
+}
+#endif
+
+// Reserves a load queue entry for the instruction at the given index.
+// The queue must not be full, and the index must not already be queued.
+void LSUnit::assignLQSlot(unsigned Index) {
+ assert(!isLQFull());
+ assert(LoadQueue.count(Index) == 0);
+
+ DEBUG(dbgs() << "[LSUnit] - AssignLQSlot <Idx=" << Index
+ << ",slot=" << LoadQueue.size() << ">\n");
+ LoadQueue.insert(Index);
+}
+
+// Reserves a store queue entry for the instruction at the given index.
+// The queue must not be full, and the index must not already be queued.
+void LSUnit::assignSQSlot(unsigned Index) {
+ assert(!isSQFull());
+ assert(StoreQueue.count(Index) == 0);
+
+ DEBUG(dbgs() << "[LSUnit] - AssignSQSlot <Idx=" << Index
+ << ",slot=" << StoreQueue.size() << ">\n");
+ StoreQueue.insert(Index);
+}
+
+// Returns true if the memory operation at the given index is allowed to
+// issue, according to the ordering rules documented in LSUnit.h (barriers,
+// store-store/store-load ordering, and the optional NoAlias assumption).
+bool LSUnit::isReady(unsigned Index) const {
+ bool IsALoad = LoadQueue.count(Index) != 0;
+ bool IsAStore = StoreQueue.count(Index) != 0;
+ // Barrier indices of zero mean "no active barrier" below.
+ unsigned LoadBarrierIndex = LoadBarriers.empty() ? 0 : *LoadBarriers.begin();
+ unsigned StoreBarrierIndex = StoreBarriers.empty() ? 0 : *StoreBarriers.begin();
+
+ // A load may not pass an older load barrier; the barrier itself may only
+ // issue once it is the oldest entry in the load queue.
+ if (IsALoad && LoadBarrierIndex) {
+ if (Index > LoadBarrierIndex)
+ return false;
+ if (Index == LoadBarrierIndex && Index != *LoadQueue.begin())
+ return false;
+ }
+
+ // Same rule for stores with respect to store barriers.
+ if (IsAStore && StoreBarrierIndex) {
+ if (Index > StoreBarrierIndex)
+ return false;
+ if (Index == StoreBarrierIndex && Index != *StoreQueue.begin())
+ return false;
+ }
+
+ if (NoAlias && IsALoad)
+ return true;
+
+ if (StoreQueue.size()) {
+ // Check if this memory operation is younger than the oldest store.
+ if (Index > *StoreQueue.begin())
+ return false;
+ }
+
+ // Okay, we are older than the oldest store in the queue.
+ // If there are no pending loads, then we can say for sure that this
+ // instruction is ready.
+ if (isLQEmpty())
+ return true;
+
+ // Check if there are no older loads.
+ if (Index <= *LoadQueue.begin())
+ return true;
+
+ // There is at least one younger load.
+ return !IsAStore;
+}
+
+// Releases any load/store queue entry held by the instruction at the given
+// index, and retires the corresponding barrier if this instruction was the
+// oldest active load/store barrier.
+void LSUnit::onInstructionExecuted(unsigned Index) {
+ std::set<unsigned>::iterator it = LoadQueue.find(Index);
+ if (it != LoadQueue.end()) {
+ DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+ << " has been removed from the load queue.\n");
+ LoadQueue.erase(it);
+ }
+
+ it = StoreQueue.find(Index);
+ if (it != StoreQueue.end()) {
+ DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+ << " has been removed from the store queue.\n");
+ StoreQueue.erase(it);
+ }
+
+ if (!StoreBarriers.empty() && Index == *StoreBarriers.begin())
+ StoreBarriers.erase(StoreBarriers.begin());
+ if (!LoadBarriers.empty() && Index == *LoadBarriers.begin())
+ LoadBarriers.erase(LoadBarriers.begin());
+}
+} // namespace mca
--- /dev/null
+//===------------------------- LSUnit.h --------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load/Store unit class that models load/store queues and that implements
+/// a simple weak memory consistency model.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+/// \brief A Load/Store Unit implementing a load and store queues.
+///
+/// This class implements a load queue and a store queue to emulate the
+/// out-of-order execution of memory operations.
+/// Each load (or store) consumes an entry in the load (or store) queue.
+///
+/// Rules are:
+/// 1) A younger load is allowed to pass an older load only if there are no
+/// stores nor barriers in between the two loads.
+/// 2) A younger store is not allowed to pass an older store.
+/// 3) A younger store is not allowed to pass an older load.
+/// 4) A younger load is allowed to pass an older store only if the load does
+/// not alias with the store.
+///
+/// This class optimistically assumes that loads don't alias store operations.
+/// Under this assumption, younger loads are always allowed to pass older
+/// stores (this would only affect rule 4).
+/// Essentially, this LSUnit doesn't attempt to run any sort of alias analysis
+/// to predict when loads and stores don't alias with each other.
+///
+/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
+/// set to `false` by the constructor of LSUnit.
+///
+/// In the case of write-combining memory, rule 2. could be relaxed to allow
+/// reordering of non-aliasing store operations. At the moment, this is not
+/// allowed.
+/// To put it in another way, there is no option to specify a different memory
+/// type for memory operations (example: write-through, write-combining, etc.).
+/// Also, there is no way to weaken the memory model, and this unit currently
+/// doesn't support write-combining behavior.
+///
+/// No assumptions are made on the size of the store buffer.
+/// As mentioned before, this class doesn't perform alias analysis.
+/// Consequently, LSUnit doesn't know how to identify cases where
+/// store-to-load forwarding may occur.
+///
+/// LSUnit doesn't attempt to predict whether a load or store hits or misses
+/// the L1 cache. To be more specific, LSUnit doesn't know anything about
+/// the cache hierarchy and memory types.
+/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
+/// scheduling model provides an "optimistic" load-to-use latency (which usually
+/// matches the load-to-use latency for when there is a hit in the L1D).
+///
+/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
+/// memory-barrier like instructions.
+/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
+/// `unmodeled side effects` behaves like a "soft" load-barrier. That means, it
+/// serializes loads without forcing a flush of the load queue.
+/// Similarly, instructions that both `mayStore` and have `unmodeled side
+/// effects` are treated like store barriers. A full memory
+/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
+/// effects. This is obviously inaccurate, but this is the best that we can do
+/// at the moment.
+///
+/// Each load/store barrier consumes one entry in the load/store queue. A
+/// load/store barrier enforces ordering of loads/stores:
+/// - A younger load cannot pass a load barrier.
+/// - A younger store cannot pass a store barrier.
+///
+/// A younger load has to wait for the memory load barrier to execute.
+/// A load/store barrier is "executed" when it becomes the oldest entry in
+/// the load/store queue(s). That also means, all the older loads/stores have
+/// already been executed.
+class LSUnit {
+ // Load queue size.
+ // LQ_Size == 0 means that there are infinite slots in the load queue.
+ unsigned LQ_Size;
+
+ // Store queue size.
+ // SQ_Size == 0 means that there are infinite slots in the store queue.
+ unsigned SQ_Size;
+
+ // If true, loads will never alias with stores. This is the default.
+ bool NoAlias;
+
+ // Queued instruction indices; std::set keeps them ordered, so begin()
+ // is always the oldest entry.
+ std::set<unsigned> LoadQueue;
+ std::set<unsigned> StoreQueue;
+
+ void assignLQSlot(unsigned Index);
+ void assignSQSlot(unsigned Index);
+ // NOTE(review): declared but no definition is visible in this file —
+ // confirm it is defined elsewhere or remove the dead declaration.
+ bool isReadyNoAlias(unsigned Index) const;
+
+ // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
+ // conservatively treated as a store barrier. It forces older store to be
+ // executed before newer stores are issued.
+ std::set<unsigned> StoreBarriers;
+
+ // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
+ // conservatively treated as a load barrier. It forces older loads to execute
+ // before newer loads are issued.
+ std::set<unsigned> LoadBarriers;
+
+public:
+ LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
+ : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+
+ bool isSQEmpty() const { return StoreQueue.empty(); }
+ bool isLQEmpty() const { return LoadQueue.empty(); }
+ bool isSQFull() const { return SQ_Size != 0 && StoreQueue.size() == SQ_Size; }
+ bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
+
+ // Allocates queue entries (and barrier markers) for a new memory
+ // operation identified by Index.
+ void reserve(unsigned Index, bool MayLoad, bool MayStore, bool IsMemBarrier) {
+ if (!MayLoad && !MayStore)
+ return;
+ if (MayLoad) {
+ if (IsMemBarrier)
+ LoadBarriers.insert(Index);
+ assignLQSlot(Index);
+ }
+ if (MayStore) {
+ if (IsMemBarrier)
+ StoreBarriers.insert(Index);
+ assignSQSlot(Index);
+ }
+ }
+
+ // The rules are:
+ // 1. A store may not pass a previous store.
+ // 2. A load may not pass a previous store unless flag 'NoAlias' is set.
+ // 3. A load may pass a previous load.
+ // 4. A store may not pass a previous load (regardless of flag 'NoAlias').
+ // 5. A load has to wait until an older load barrier is fully executed.
+ // 6. A store has to wait until an older store barrier is fully executed.
+ bool isReady(unsigned Index) const;
+ void onInstructionExecuted(unsigned Index);
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+llvm-mca - LLVM Machine Code Analyzer
+-------------------------------------
+
+llvm-mca is a performance analysis tool that uses information which is already
+available in LLVM (e.g. scheduling models) to statically measure the performance
+of machine code in a specific CPU.
+
+Performance is measured in terms of throughput as well as processor resource
+consumption. The tool currently works for processors with an out-of-order
+backend, for which there is a scheduling model available in LLVM.
+
+The main goal of this tool is not just to predict the performance of the code
+when run on the target, but also help with diagnosing potential performance
+issues.
+
+Given an assembly code sequence, llvm-mca estimates the IPC (Instructions Per
+Cycle), as well as hardware resource pressure. The analysis and reporting style
+were inspired by the IACA tool from Intel.
+
+The presence of long data dependency chains, as well as poor usage of hardware
+resources may lead to bottlenecks in the back-end. The tool is able to generate
+a detailed report which should help with identifying and analyzing sources of
+bottlenecks.
+
+Scheduling models are mostly used to compute instruction latencies, to obtain
+read-advance information, and understand how processor resources are used by
+instructions. By design, the quality of the performance analysis conducted by
+the tool is inevitably affected by the quality of the target scheduling models.
+
+However, scheduling models intentionally do not describe all processors details,
+since the goal is just to enable the scheduling of machine instructions during
+compilation. That means, there are processor details which are not important for
+the purpose of scheduling instructions (and therefore not described by the
+scheduling model), but are very important for this tool.
+
+A few examples of details that are missing in scheduling models are:
+ - Maximum number of instructions retired per cycle.
+ - Actual dispatch width (it often differs from the issue width).
+ - Number of temporary registers available for renaming.
+ - Number of read/write ports in the register file(s).
+ - Length of the load/store queue in the LSUnit.
+
+It is also very difficult to find a "good" abstract model to describe the
+behavior of out-of-order processors. So, we have to keep in mind that all of
+these aspects are going to affect the quality of the static analysis performed
+by the tool.
+
+An extensive list of known limitations is reported in one of the last sections
+of this document. There is also a section related to design problems which must
+be addressed (hopefully with the help of the community). At the moment, the
+tool has been mostly tested for x86 targets, but there are still several
+limitations, some of which could be overcome by integrating extra information
+into the scheduling models.
+
+How the tool works
+------------------
+
+The tool takes assembly code as input. Assembly code is parsed into a sequence
+of MCInst with the help of the existing LLVM target assembly parsers. The parsed
+sequence of MCInst is then analyzed by a 'Backend' module to generate a
+performance report.
+
+The Backend module internally emulates the execution of the machine code
+sequence in a loop of iterations (which by default is 70). At the end of this
+process, the backend collects a number of statistics which are then printed out
+in the form of a report.
+
+Here is an example of performance report generated by the tool for a dot-product
+of two packed float vectors of four elements. The analysis is conducted for
+target x86, cpu btver2:
+
+///////////////////
+
+Iterations: 300
+Instructions: 900
+Total Cycles: 610
+Dispatch Width: 2
+IPC: 1.48
+
+
+Resources:
+[0] - JALU0
+[1] - JALU1
+[2] - JDiv
+[3] - JFPM
+[4] - JFPU0
+[5] - JFPU1
+[6] - JLAGU
+[7] - JSAGU
+[8] - JSTC
+[9] - JVIMUL
+
+
+Resource pressure per iteration:
+[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
+ - - - - 2.00 1.00 - - - -
+
+Resource pressure by instruction:
+[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+ - - - - - 1.00 - - - - vmulps %xmm0, %xmm1, %xmm2
+ - - - - 1.00 - - - - - vhaddps %xmm2, %xmm2, %xmm3
+ - - - - 1.00 - - - - - vhaddps %xmm3, %xmm3, %xmm4
+
+
+Instruction Info:
+[1]: #uOps
+[2]: Latency
+[3]: RThroughput
+[4]: MayLoad
+[5]: MayStore
+[6]: HasSideEffects
+
+[1] [2] [3] [4] [5] [6] Instructions:
+ 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2
+ 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3
+ 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4
+
+///////////////////
+
+According to this report, the dot-product kernel has been executed 300 times,
+for a total of 900 instructions dynamically executed.
+
+The report is structured in three main sections. A first section collects a few
+performance numbers; the goal of this section is to give a very quick overview
+of the performance throughput. In this example, the two important performance
+indicators are a) the predicted total number of cycles, and b) the IPC.
+IPC is probably the most important throughput indicator. A big delta between the
+Dispatch Width and the computed IPC is an indicator of potential performance
+issues.
+
+The second section is the so-called "resource pressure view". This view reports
+the average number of resource cycles consumed every iteration by instructions
+for every processor resource unit available on the target. Information is
+structured in two tables. The first table reports the number of resource cycles
+spent on average every iteration. The second table correlates the resource
+cycles to the machine instruction in the sequence. For example, every iteration
+of the dot-product, instruction 'vmulps' always executes on resource unit [5]
+(JFPU1 - floating point pipeline #1), consuming an average of 1 resource cycle
+per iteration. Note that on Jaguar, vector FP multiply can only be issued to
+pipeline JFPU1, while horizontal FP adds can only be issued to pipeline JFPU0.
+
+The third (and last) section of the report shows the latency and reciprocal
+throughput of every instruction in the sequence. That section also reports extra
+information related to the number of micro opcodes, and opcode properties (i.e.
+'MayLoad', 'MayStore' and 'UnmodeledSideEffects').
+
+The resource pressure view helps with identifying bottlenecks caused by high
+usage of specific hardware resources. Situations with resource pressure mainly
+concentrated on a few resources should, in general, be avoided. Ideally,
+pressure should be uniformly distributed between multiple resources.
+
+Timeline View
+-------------
+
+A detailed report of each instruction's state transitions over time can be
+enabled using the command line flag '-timeline'. This prints an extra section
+in the report which contains the so-called "timeline view". Below is the
+timeline view for the dot-product example from the previous section.
+
+///////////////
+Timeline view:
+ 012345
+Index 0123456789
+
+[0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2
+[0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3
+[0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
+
+[1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
+[1,1] . D=eeeE---R . vhaddps %xmm2, %xmm2, %xmm3
+[1,2] . D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
+
+[2,0] . DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
+[2,1] . D====eeeER . vhaddps %xmm2, %xmm2, %xmm3
+[2,2] . D======eeeER vhaddps %xmm3, %xmm3, %xmm4
+
+
+Average Wait times (based on the timeline view):
+[0]: Executions
+[1]: Average time spent waiting in a scheduler's queue
+[2]: Average time spent waiting in a scheduler's queue while ready
+[3]: Average time elapsed from WB until retire stage
+
+ [0] [1] [2] [3]
+0. 3 1.0 1.0 3.3 vmulps %xmm0, %xmm1, %xmm2
+1. 3 3.3 0.7 1.0 vhaddps %xmm2, %xmm2, %xmm3
+2. 3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
+///////////////
+
+The timeline view is very interesting because it shows how instructions changed
+in state during execution. It also gives an idea of how the tool "sees"
+instructions executed on the target.
+
+The timeline view is structured in two tables. The first table shows how
+instructions change in state over time (measured in cycles); the second table
+(named "Average Wait times") reports useful timing statistics which should help
+diagnose performance bottlenecks caused by long data dependencies and
+sub-optimal usage of hardware resources.
+
+An instruction in the timeline view is identified by a pair of indices, where
+the 'first' index identifies an iteration, and the 'second' index is the actual
+instruction index (i.e. where it appears in the code sequence).
+
+Excluding the first and last column, the remaining columns are in cycles. Cycles
+are numbered sequentially starting from 0. The following characters are used to
+describe the state of an instruction:
+
+ D : Instruction dispatched.
+ e : Instruction executing.
+ E : Instruction executed.
+ R : Instruction retired.
+ = : Instruction already dispatched, waiting to be executed.
+ - : Instruction executed, waiting to be retired.
+
+Based on the timeline view from the example, we know that:
+ - Instruction [1, 0] was dispatched at cycle 1.
+ - Instruction [1, 0] started executing at cycle 2.
+ - Instruction [1, 0] reached the write back stage at cycle 4.
+ - Instruction [1, 0] was retired at cycle 10.
+
+Instruction [1, 0] (i.e. the vmulps from iteration #1) doesn't have to wait in
+the scheduler's queue for the operands to become available. By the time the
+vmulps is dispatched, operands are already available, and pipeline JFPU1 is
+ready to serve another instruction. So the instruction can be immediately
+issued on the JFPU1 pipeline. That is demonstrated by the fact that the
+instruction only spent 1cy in the scheduler's queue.
+
+There is a gap of 5 cycles between the write-back stage and the retire event.
+That is because instructions must retire in program order, so [1,0] has to wait
+for [0, 2] to be retired first (i.e. it has to wait until cycle 10).
+
+In the dot-product example, all instructions are in a RAW (Read After Write)
+dependency chain. Register %xmm2 written by the vmulps is immediately used by
+the first vhaddps, and register %xmm3 written by the first vhaddps is used by
+the second vhaddps. Long data dependencies negatively affect the ILP
+(Instruction Level Parallelism).
+
+In the dot-product example, there are anti-dependencies introduced by
+instructions from different iterations. However, those dependencies can be
+removed at register renaming stage (at the cost of allocating register aliases,
+and therefore consuming temporary registers).
+
+Table "Average Wait times" helps diagnose performance issues that are caused by
+the presence of long latency instructions and potentially long data dependencies
+which may limit the ILP. Note that the tool by default assumes at least 1cy
+between the dispatch event and the issue event.
+
+When the performance is limited by data dependencies and/or long latency
+instructions, the number of cycles spent while in the "ready" state is expected
+to be very small when compared with the total number of cycles spent in the
+scheduler's queue. So the difference between the two counters is a good
+indicator of how big of an impact data dependencies had on the execution of
+instructions. When performance is mostly limited by the lack of hardware
+resources, the delta between the two counters is small. However, the number of
+cycles spent in the queue tends to be bigger (i.e. more than 1-3cy) especially
+when compared with other low latency instructions.
+
+Extra statistics to further diagnose performance issues.
+--------------------------------------------------------
+
+Flag '-verbose' enables extra statistics and performance counters for the
+dispatch logic, the reorder buffer, the retire control unit and the register
+file.
+
+Below is an example of verbose output generated by the tool for the dot-product
+example discussed in the previous sections.
+
+///////////////////
+Iterations: 300
+Instructions: 900
+Total Cycles: 610
+Dispatch Width: 2
+IPC: 1.48
+
+
+Dynamic Dispatch Stall Cycles:
+RAT - Register unavailable: 0
+RCU - Retire tokens unavailable: 0
+SCHEDQ - Scheduler full: 272
+LQ - Load queue full: 0
+SQ - Store queue full: 0
+GROUP - Static restrictions on the dispatch group: 0
+
+
+Register Alias Table:
+Total number of mappings created: 900
+Max number of mappings used: 35
+
+
+Dispatch Logic - number of cycles where we saw N instructions dispatched:
+[# dispatched], [# cycles]
+ 0, 24 (3.9%)
+ 1, 272 (44.6%)
+ 2, 314 (51.5%)
+
+
+Schedulers - number of cycles where we saw N instructions issued:
+[# issued], [# cycles]
+ 0, 7 (1.1%)
+ 1, 306 (50.2%)
+ 2, 297 (48.7%)
+
+
+Retire Control Unit - number of cycles where we saw N instructions retired:
+[# retired], [# cycles]
+ 0, 109 (17.9%)
+ 1, 102 (16.7%)
+ 2, 399 (65.4%)
+
+
+Scheduler's queue usage:
+JALU01, 0/20
+JFPU01, 18/18
+JLSAGU, 0/12
+///////////////////
+
+Based on the verbose report, the backend was only able to dispatch two
+instructions 51.5% of the time. The dispatch group was limited to one
+instruction 44.6% of the cycles, which corresponds to 272 cycles.
+
+If we look at section "Dynamic Dispatch Stall Cycles", we can see how counter
+SCHEDQ reports 272 cycles. Counter SCHEDQ is incremented every time the dispatch
+logic is unable to dispatch a full group of two instructions because the
+scheduler's queue is full.
+
+Section "Scheduler's queue usage" shows how the maximum number of buffer entries
+(i.e. scheduler's queue entries) used at runtime for resource JFPU01 reached its
+maximum. Note that AMD Jaguar implements three schedulers:
+ * JALU01 - A scheduler for ALU instructions
+ * JLSAGU - A scheduler for address generation
+ * JFPU01 - A scheduler for floating point operations.
+
+The dot-product is a kernel of three floating point instructions (a vector
+multiply followed by two horizontal adds). That explains why only the floating
+point scheduler appears to be used according to section "Scheduler's queue
+usage".
+
+A full scheduler's queue is either caused by data dependency chains, or by a
+sub-optimal usage of hardware resources. Sometimes, resource pressure can be
+mitigated by rewriting the kernel using different instructions that consume
+different scheduler resources. Schedulers with a small queue are less resilient
+to bottlenecks caused by the presence of long data dependencies.
+
+In this example, we can conclude that the IPC is mostly limited by data
+dependencies, and not by resource pressure.
+
+LLVM-MCA instruction flow
+-------------------------
+
+This section describes the instruction flow through the out-of-order backend, as
+well as the functional units involved in the process.
+
+An instruction goes through a default sequence of stages:
+ - Dispatch (Instruction is dispatched to the schedulers).
+ - Issue (Instruction is issued to the processor pipelines).
+ - Write Back (Instruction is executed, and results are written back).
+ - Retire (Instruction is retired; writes are architecturally committed).
+
+The tool only models the out-of-order portion of a processor. Therefore, the
+instruction fetch and decode stages are not modeled. Performance bottlenecks in
+the frontend are not diagnosed by this tool. The tool assumes that instructions
+have all been decoded and placed in a queue. Also, the tool doesn't know
+anything about branch prediction.
+
+The long term plan is to make the process customizable, so that processors can
+define their own. This is a future work.
+
+Instruction Dispatch
+--------------------
+
+During the Dispatch stage, instructions are picked in program order from a queue
+of already decoded instructions, and dispatched in groups to the hardware
+schedulers. The dispatch logic is implemented by class DispatchUnit in file
+Dispatch.h.
+
+The size of a dispatch group depends on the availability of hardware resources,
+and it cannot exceed the value of field 'DispatchWidth' in class DispatchUnit.
+Note that field DispatchWidth defaults to the value of field 'IssueWidth' from
+the scheduling model.
+
+Users can override the DispatchWidth value with flag "-dispatch=<N>" (where 'N'
+is an unsigned quantity).
+
+An instruction can be dispatched if:
+ - The size of the dispatch group is smaller than DispatchWidth
+ - There are enough entries in the reorder buffer
+ - There are enough temporary registers to do register renaming
+ - Schedulers are not full.
+
+Scheduling models don't describe register files, and therefore the tool doesn't
+know if there is more than one register file, and how many temporaries are
+available for register renaming.
+
+By default, the tool (optimistically) assumes a single register file with an
+unbounded number of temporary registers. Users can limit the number of
+temporary registers available for register renaming using flag
+`-register-file-size=<N>`, where N is the number of temporaries. A value of
+zero for N means 'unbounded'. Knowing how many temporaries are available for
+register renaming, the tool can predict dispatch stalls caused by the lack of
+temporaries.
+
+The number of reorder buffer entries consumed by an instruction depends on the
+number of micro-opcodes it specifies in the target scheduling model (see field
+'NumMicroOpcodes' of tablegen class ProcWriteResources and its derived classes;
+TargetSchedule.td).
+
+The reorder buffer is implemented by class RetireControlUnit (see Dispatch.h).
+Its goal is to track the progress of instructions that are "in-flight", and
+retire instructions in program order. The number of entries in the reorder
+buffer defaults to the value of field 'MicroOpBufferSize' from the target
+scheduling model.
+
+Instructions that are dispatched to the schedulers consume scheduler buffer
+entries. The tool queries the scheduling model to figure out the set of
+buffered resources consumed by an instruction. Buffered resources are treated
+like "scheduler" resources, and the field 'BufferSize' (from the processor
+resource tablegen definition) defines the size of the scheduler's queue.
+
+Zero latency instructions (for example NOP instructions) don't consume scheduler
+resources. However, those instructions still reserve a number of slots in the
+reorder buffer.
+
+Instruction Issue
+-----------------
+
+As mentioned in the previous section, each scheduler resource implements a queue
+of instructions. An instruction has to wait in the scheduler's queue until
+input register operands become available. Only at that point does the
+instruction become eligible for execution and may be issued (potentially
+out-of-order) to a pipeline for execution.
+
+Instruction latencies can be computed by the tool with the help of the
+scheduling model; latency values are defined by the scheduling model through
+ProcWriteResources objects.
+
+Class Scheduler (see file Scheduler.h) knows how to emulate multiple processor
+schedulers. A Scheduler is responsible for tracking data dependencies, and
+dynamically selecting which processor resources are consumed/used by instructions.
+
+Internally, the Scheduler class delegates the management of processor resource
+units and resource groups to the ResourceManager class. ResourceManager is also
+responsible for selecting resource units that are effectively consumed by
+instructions. For example, if an instruction consumes 1cy of a resource group,
+the ResourceManager object selects one of the available units from the group; by
+default, it uses a round-robin selector to guarantee that resource usage is
+uniformly distributed between all units of a group.
+
+Internally, class Scheduler implements three instruction queues:
+ - WaitQueue: a queue of instructions whose operands are not ready yet.
+ - ReadyQueue: a queue of instructions ready to execute.
+ - IssuedQueue: a queue of instructions executing.
+
+Depending on the operands availability, instructions that are dispatched to the
+Scheduler are either placed into the WaitQueue or into the ReadyQueue.
+
+Every cycle, class Scheduler checks if instructions can be moved from the
+WaitQueue to the ReadyQueue, and if instructions from the ReadyQueue can be
+issued to the underlying pipelines. The algorithm prioritizes older
+instructions over younger instructions.
+
+Objects of class ResourceState (see Scheduler.h) describe processor resources.
+There is an instance of class ResourceState for each single processor resource
+specified by the scheduling model. A ResourceState object for a processor
+resource with multiple units dynamically tracks the availability of every single
+unit. For example, the ResourceState of a resource group tracks the
+availability of every resource in that group. Internally, ResourceState
+implements a round-robin selector to dynamically pick the next unit to use from
+the group.
+
+Write-Back and Retire Stage
+---------------------------
+
+Issued instructions are moved from the ReadyQueue to the IssuedQueue. There,
+instructions wait until they reach the write-back stage. At that point, they
+get removed from the queue and the retire control unit is notified.
+
+On the event of "instruction executed", the retire control unit flags the
+instruction as "ready to retire".
+
+Instructions are retired in program order; an "instruction retired" event is sent
+to the register file which frees the temporary registers allocated for the
+instruction at register renaming stage.
+
+Load/Store Unit and Memory Consistency Model
+--------------------------------------------
+
+The tool attempts to emulate out-of-order execution of memory operations. Class
+LSUnit (see file LSUnit.h) emulates a load/store unit implementing queues for
+speculative execution of loads and stores.
+
+Each load (or store) consumes an entry in the load (or store) queue. The number
+of slots in the load/store queues is unknown by the tool, since there is no
+mention of it in the scheduling model. In practice, users can specify flag
+`-lqueue=N` (or `-squeue=N`) to limit the number of entries in the queue to be
+equal to exactly N (an unsigned value). If N is zero, then the tool assumes an
+unbounded queue (this is the default).
+
+LSUnit implements a relaxed consistency model for memory loads and stores. The
+rules are:
+1) A younger load is allowed to pass an older load only if there is no
+ intervening store in between the two loads.
+2) A younger store is not allowed to pass an older store.
+3) A younger store is not allowed to pass an older load.
+4) A younger load is allowed to pass an older store provided that the load does
+ not alias with the store.
+
+By default, this class conservatively (i.e. pessimistically) assumes that loads
+always may-alias store operations. Essentially, this LSUnit doesn't perform any
+sort of alias analysis to rule out cases where loads and stores don't overlap
+with each other. The downside of this approach however is that younger loads are
+never allowed to pass older stores. To make it possible for a younger load to
+pass an older store, users can use the command line flag -noalias. Under
+'noalias', a younger load is always allowed to pass an older store.
+
+Note that, in the case of write-combining memory, rule 2. could be relaxed a bit
+to allow reordering of non-aliasing store operations. That being said, at the
+moment, there is no way to further relax the memory model (flag -noalias is the
+only option). Essentially, there is no option to specify a different memory
+type (for example: write-back, write-combining, write-through; etc.) and
+consequently to weaken or strengthen the memory model.
+
+Other limitations are:
+ * LSUnit doesn't know when store-to-load forwarding may occur.
+ * LSUnit doesn't know anything about the cache hierarchy and memory types.
+ * LSUnit doesn't know how to identify serializing operations and memory fences.
+
+No assumption is made on the store buffer size. As mentioned before, LSUnit
+conservatively assumes a may-alias relation between loads and stores, and it
+doesn't attempt to identify cases where store-to-load forwarding would occur in
+practice.
+
+LSUnit doesn't attempt to predict whether a load or store hits or misses the L1
+cache. It only knows if an instruction "MayLoad" and/or "MayStore". For loads,
+the scheduling model provides an "optimistic" load-to-use latency (which usually
+matches the load-to-use latency for when there is a hit in the L1D).
+
+Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
+memory-barrier like instructions. LSUnit conservatively assumes that an
+instruction which has both 'MayLoad' and 'UnmodeledSideEffects' behaves like a
+"soft" load-barrier. That means, it serializes loads without forcing a flush of
+the load queue. Similarly, instructions flagged with both 'MayStore' and
+'UnmodeledSideEffects' are treated like store barriers. A full memory barrier
+is a 'MayLoad' and 'MayStore' instruction with 'UnmodeledSideEffects'. This is
+inaccurate, but it is the best that we can do at the moment with the current
+information available in LLVM.
+
+A load/store barrier consumes one entry of the load/store queue. A load/store
+barrier enforces ordering of loads/stores. A younger load cannot pass a load
+barrier. Also, a younger store cannot pass a store barrier. A younger load has
+to wait for the memory/load barrier to execute. A load/store barrier is
+"executed" when it becomes the oldest entry in the load/store queue(s). That
+also means, by construction, all the older loads/stores have been executed.
+
+In conclusion the full set of rules is:
+ 1. A store may not pass a previous store.
+ 2. A load may not pass a previous store unless flag 'NoAlias' is set.
+ 3. A load may pass a previous load.
+ 4. A store may not pass a previous load (regardless of flag 'NoAlias').
+ 5. A load has to wait until an older load barrier is fully executed.
+ 6. A store has to wait until an older store barrier is fully executed.
+
+Known limitations
+-----------------
+Previous sections described cases where the tool is missing information to give
+an accurate report. For example, the first sections of this document explained
+how the lack of knowledge about the processor negatively affects the performance
+analysis. The lack of knowledge is often a consequence of how scheduling models
+are defined; as mentioned before, scheduling models intentionally don't describe
+processors in fine details. That being said, the LLVM machine model can be
+extended to expose more details, as long as they are opt-in for targets.
+
+The accuracy of the performance analysis is also affected by assumptions made by
+the processor model used by the tool.
+
+Most recent Intel and AMD processors implement dedicated LoopBuffer/OpCache in
+the hardware frontend to speed up the throughput in the presence of tight loops.
+The presence of these buffers complicates the decoding logic, and requires
+knowledge on the branch predictor too. Class 'SchedMachineModel' in tablegen
+provides a field named 'LoopMicroOpBufferSize' which is used to describe loop
+buffers. However, the purpose of that field is to enable loop unrolling of
+tight loops; essentially, it affects the cost model used by pass loop-unroll.
+
+At the current state, the tool only describes the out-of-order portion of a
+processor, and consequently doesn't try to predict the frontend throughput. That
+being said, this tool could be definitely extended in future to also account for
+the hardware frontend when doing performance analysis. This would inevitably
+require extra (extensive) processor knowledge related to all the available
+decoding paths in the hardware frontend, as well as branch prediction.
+
+Currently, the tool assumes a zero-latency "perfect" fetch&decode
+stage; the full sequence of decoded instructions is immediately visible to the
+dispatch logic from the start.
+
+The tool doesn't know about simultaneous multithreading. According to the tool,
+processor resources are not statically/dynamically partitioned. Processor
+resources are fully available to the hardware thread executing the
+microbenchmark.
+
+The execution model implemented by this tool assumes that instructions are
+firstly dispatched in groups to hardware schedulers, and then issued to
+pipelines for execution. The model assumes dynamic scheduling of instructions.
+Instructions are placed in a queue and potentially executed out-of-order (based
+on the operand availability). The dispatch stage is definitely distinct from the
+issue stage. This will change in future; as mentioned in the first section, the
+end goal is to let processors customize the process.
+
+This model doesn't correctly describe processors where the dispatch/issue is a
+single stage. This is what happens for example in VLIW processors, where
+instructions are packaged and statically scheduled at compile time; it is up to
+the compiler to predict the latency of instructions and package issue groups
+accordingly. For such targets, there is no dynamic scheduling done by the
+hardware.
+
+Existing classes (DispatchUnit, Scheduler, etc.) could be extended/adapted to
+support processors with a single dispatch/issue stage. The execution flow would
+require some changes in the way how existing components (i.e. DispatchUnit,
+Scheduler, etc.) interact. This can be a future development.
+
+The following sections describe other known limitations. The goal is not to
+provide an extensive list of limitations; we want to report what we believe are
+the most important limitations, and suggest possible methods to overcome them.
+
+Load/Store barrier instructions and serializing operations
+----------------------------------------------------------
+Section "Load/Store Unit and Memory Consistency Model" already mentioned how
+LLVM doesn't know about serializing operations and memory barriers. Most of it
+boils down to the fact that class MCInstrDesc (intentionally) doesn't expose
+those properties. Instead, both serializing operations and memory barriers
+"have side-effects" according to MCInstrDesc. That is because, at least for
+scheduling purposes, knowing that an instruction has unmodeled side effects is
+often enough to treat the instruction like a compiler scheduling barrier.
+
+A performance analysis tool could use the extra knowledge on barriers and
+serializing operations to generate a more accurate performance report. One way
+to improve this is by reserving a couple of bits in field 'Flags' from class
+MCInstrDesc: one bit for barrier operations, and another bit to mark
+instructions as serializing operations.
+
+Lack of support for instruction itineraries
+-------------------------------------------
+The current version of the tool doesn't know how to process instruction
+itineraries. This is probably one of the most important limitations, since it
+affects a few out-of-order processors in LLVM.
+
+As mentioned in section 'Instruction Issue', class Scheduler delegates to an
+instance of class ResourceManager the handling of processor resources.
+ResourceManager is where most of the scheduling logic is implemented.
+
+Adding support for instruction itineraries requires that we teach
+ResourceManager how to handle functional units and instruction stages. This
+development can be a future extension, and it would probably require a few
+changes to the ResourceManager interface.
+
+Instructions that affect control flow are not correctly modeled
+---------------------------------------------------------------
+Examples of instructions that affect the control flow are: return, indirect
+branches, calls, etc. The tool doesn't try to predict/evaluate branch targets.
+In particular, the tool doesn't model any sort of branch prediction, nor does it
+attempt to track changes to the program counter. The tool always assumes that
+the input assembly sequence is the body of a microbenchmark (a simple loop
+executed for a number of iterations). The "next" instruction in sequence is
+always the next instruction to dispatch.
+
+Call instructions default to an arbitrary high latency of 100cy. A warning is
+generated if the tool encounters a call instruction in the sequence. Return
+instructions are not evaluated, and therefore control flow is not affected.
+However, the tool still queries the processor scheduling model to obtain latency
+information for instructions that affect the control flow.
+
+Possible extensions to the scheduling model
+-------------------------------------------
+Section "Instruction Dispatch" explained how the tool doesn't know about the
+register files, and temporaries available in each register file for register
+renaming purposes.
+
+The LLVM scheduling model could be extended to better describe register files.
+Ideally, scheduling model should be able to define:
+ - The size of each register file
+ - How many temporary registers are available for register renaming
+ - How register classes map to register files
+
+The scheduling model doesn't specify the retire throughput (i.e. how many
+instructions can be retired every cycle). Users can specify flag
+`-max-retire-per-cycle=<uint>` to limit how many instructions the retire control
+unit can retire every cycle. Ideally, every processor should be able to specify
+the retire throughput (for example, by adding an extra field to the scheduling
+model tablegen class).
+
+Known limitations on X86 processors
+-----------------------------------
+
+1) Partial register updates versus full register updates.
+
+On x86-64, a 32-bit GPR write fully updates the super-register. Example:
+ add %edi, %eax ## eax += edi
+
+Here, register %eax aliases the lower half of 64-bit register %rax. On x86-64,
+register %rax is fully updated by the 'add' (the upper half of %rax is zeroed).
+Essentially, it "kills" any previous definition of (the upper half of) register
+%rax.
+
+On the other hand, 8/16 bit register writes only perform a so-called "partial
+register update". Example:
+ add %di, %ax ## ax += di
+
+Here, register %eax is only partially updated. To be more specific, the lower
+half of %eax is set, and the upper half is left unchanged. There is also no
+change in the upper 48 bits of register %rax.
+
+To get accurate performance analysis, the tool has to know which instructions
+perform a partial register update, and which instructions fully update the
+destination's super-register.
+
+One way to expose this information is (again) via tablegen. For example, we
+could add a flag in the tablegen instruction class to tag instructions that
+perform partial register updates. Something like this: 'bit
+hasPartialRegisterUpdate = 1'. However, this would force a `let
+hasPartialRegisterUpdate = 0` on several instruction definitions.
+
+Another approach is to have a MCSubtargetInfo hook similar to this:
+ virtual bool updatesSuperRegisters(unsigned short opcode) { return false; }
+
+Targets will be able to override this method if needed. Again, this is just an
+idea. But the plan is to have this fixed as a future development.
+
+2) Macro Op fusion.
+
+The tool doesn't know about macro-op fusion. On modern x86 processors, a
+'cmp/test' followed by a 'jmp' is fused into a single macro operation. The
+advantage is that the fused pair only consumes a single slot in the dispatch
+group.
+
+As a future development, the tool should be extended to address macro-fusion.
+Ideally, we could have LLVM generate a table enumerating all the opcode pairs
+that can be fused together. That table could be exposed to the tool via the
+MCSubtargetInfo interface. This is just an idea; there may be better ways to
+implement this.
+
+3) Intel processors: mixing legacy SSE with AVX instructions.
+
+On modern Intel processors with AVX, mixing legacy SSE code with AVX code
+negatively impacts the performance. The tool is not aware of this issue, and
+the performance penalty is not accounted when doing the analysis. This is
+something that we would like to improve in future.
+
+4) Zero-latency register moves and Zero-idioms.
+
+Most modern AMD/Intel processors know how to optimize out register-register
+moves and zero idioms at register renaming stage. The tool doesn't know
+about these patterns, and this may negatively impact the performance analysis.
+
+Known design problems
+---------------------
+This section describes two design issues that are currently affecting the tool.
+The long term plan is to "fix" these issues.
+Both limitations would be easily fixed if we teach the tool how to directly
+manipulate MachineInstr objects (instead of MCInst objects).
+
+1) Variant instructions not correctly modeled.
+
+The tool doesn't know how to analyze instructions with a "variant" scheduling
+class descriptor. A variant scheduling class needs to be resolved dynamically.
+The "actual" scheduling class often depends on the subtarget, as well as
+properties of the specific MachineInstr object.
+
+Unfortunately, the tool manipulates MCInst, and it doesn't know anything about
+MachineInstr. As a consequence, the tool cannot use the existing machine
+subtarget hooks that are normally used to resolve the variant scheduling class.
+This is a major design issue which mostly affects ARM/AArch64 targets. It
+mostly boils down to the fact that the existing scheduling framework was meant
+to work for MachineInstr.
+
+When the tool encounters a "variant" instruction, it assumes a generic 1cy
+latency. However, the tool would not be able to tell which processor resources
+are effectively consumed by the variant instruction.
+
+2) MCInst and MCInstrDesc.
+
+Performance analysis tools require data dependency information to correctly
+predict the runtime performance of the code. This tool must always be able to
+obtain the set of implicit/explicit register defs/uses for every instruction of
+the input assembly sequence.
+
+In the first section of this document, it was mentioned how the tool takes as
+input an assembly sequence. That sequence is parsed into a MCInst sequence with
+the help of assembly parsers available from the targets.
+
+A MCInst is a very low-level instruction representation. The tool can inspect
+the MCOperand sequence of an MCInst to identify register operands. However,
+there is no way to tell register operands that are definitions from register
+operands that are uses.
+
+In LLVM, class MCInstrDesc is used to fully describe target instructions and
+their operands. The opcode of a machine instruction (a MachineInstr object) can
+be used to query the instruction set through method `MCInstrInfo::get' to obtain
+the associated MCInstrDesc object.
+
+However class MCInstrDesc describes properties and operands of MachineInstr
+objects. Essentially, MCInstrDesc is not meant to be used to describe MCInst
+objects. To be more specific, MCInstrDesc objects are automatically generated
+via tablegen from the instruction set description in the target .td files. For
+example, field `MCInstrDesc::NumDefs' is always equal to the cardinality of the
+`(outs)` set from the tablegen instruction definition.
+
+By construction, register definitions always appear at the beginning of the
+MachineOperands list in MachineInstr. Basically, the (outs) are the first
+operands of a MachineInstr, and the (ins) will come after in the machine operand
+list. Knowing the number of register definitions is enough to identify
+all the register operands that are definitions.
+
+In a normal compilation process, MCInst objects are generated from MachineInstr
+objects through a lowering step. By default the lowering logic simply iterates
+over the machine operands of a MachineInstr, and converts/expands them into
+equivalent MCOperand objects.
+
+The default lowering strategy has the advantage of preserving all of the
+above mentioned assumptions on the machine operand sequence. That means, register
+definitions would still be at the beginning of the MCOperand sequence, and
+register uses would come after.
+
+Targets may still define custom lowering routines for specific opcodes. Some of
+these routines may lower operands in a way that potentially breaks (some of) the
+assumptions on the machine operand sequence which were valid for MachineInstr.
+Luckily, this is not the most common form of lowering done by the targets, and
+the vast majority of the MachineInstr are lowered based on the default strategy
+which preserves the original machine operand sequence. This is especially true
+for x86, where the custom lowering logic always preserves the original (i.e.
+from the MachineInstr) operand sequence.
+
+This tool currently works under the strong (and potentially incorrect)
+assumption that register def/uses in a MCInst can always be identified by
+querying the machine instruction descriptor for the opcode. This assumption made
+it possible to develop this tool and get good numbers at least for the
+processors available in the x86 backend.
+
+That being said, the analysis is still potentially incorrect for other targets.
+So we plan (with the help of the community) to find a proper mechanism to map
+when possible MCOperand indices back to MachineOperand indices of the equivalent
+MachineInstr. This would be equivalent to describing changes made by the
+lowering step which affected the operand sequence. For example, we could have an
+index for every register MCOperand (or -1, if the operand didn't exist in the
+original MachineInstr). The mapping could look like this <0,1,3,2>. Here,
+MCOperand #2 was obtained from the lowering of MachineOperand #3. etc.
+
+This information could be automatically generated via tablegen for all the
+instructions whose custom lowering step breaks assumptions made by the tool on
+the register operand sequence (In general, these instructions should be the
+minority of a target's instruction set). Unfortunately, we don't have that
+information now. As a consequence, we assume that the number of explicit
+register definitions is the same number specified in MCInstrDesc. We also
+assume that register definitions always come first in the operand sequence.
+
+In conclusion: these are for now the strong assumptions made by the tool:
+ * The number of explicit and implicit register definitions in a MCInst
+ matches the number of explicit and implicit definitions specified by the
+ MCInstrDesc object.
+ * Register uses always come after register definitions.
+ * If an opcode specifies an optional definition, then the optional
+ definition is always the last register operand in the sequence.
+
+Note that some of the information accessible from the MCInstrDesc is always
+valid for MCInst. For example: implicit register defs, implicit register uses
+and 'MayLoad/MayStore/HasUnmodeledSideEffects' opcode properties still apply to
+MCInst. The tool knows about this, and uses that information during its
+analysis.
+
+Future work
+-----------
+ * Address limitations (described in section "Known limitations").
+ * Integrate extra description in the processor models, and make it opt-in for
+ the targets (see section "Possible extensions to the scheduling model").
+ * Let processors specify the selection strategy for processor resource groups
+ and resources with multiple units. The tool currently uses a round-robin
+ selector to pick the next resource to use.
+ * Address limitations specifically described in section "Known limitations on
+ X86 processors".
+ * Address design issues identified in section "Known design problems".
+ * Define a standard interface for "Views". This would let users customize the
+ performance report generated by the tool.
+ * Simplify the Backend interface.
+
+When interfaces are mature/stable:
+ * Move the logic into a library. This will enable a number of other
+ interesting use cases.
--- /dev/null
+//===--------------------- ResourcePressureView.cpp -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods in the ResourcePressureView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ResourcePressureView.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+// Builds the mapping from processor resource masks to dense column indices
+// into the ResourceUsage table, then sizes and zero-fills that table.
+// ProcResourceMasks is indexed by processor resource ID; one table column is
+// reserved per resource *unit*, and one extra row holds cumulative totals.
+void ResourcePressureView::initialize(
+    const ArrayRef<uint64_t> ProcResourceMasks) {
+  // Populate the map of resource descriptors.
+  unsigned R2VIndex = 0;
+  const MCSchedModel &SM = STI.getSchedModel();
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+    if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+      continue;
+
+    // Map this resource's mask to the first of its NumUnits table columns.
+    uint64_t ResourceMask = ProcResourceMasks[I];
+    Resource2VecIndex.insert(
+        std::pair<uint64_t, unsigned>(ResourceMask, R2VIndex));
+    R2VIndex += ProcResource.NumUnits;
+  }
+
+  NumResourceUnits = R2VIndex;
+  // One row per source instruction, plus a final row of cumulative totals.
+  ResourceUsage.resize(NumResourceUnits * (Source.size() + 1));
+  std::fill(ResourceUsage.begin(), ResourceUsage.end(), 0);
+}
+
+// Event handler: accumulates the resource cycles consumed by the issued
+// instruction. 'Used' pairs a (resource, unit-mask) reference with the number
+// of cycles consumed. Updates both the instruction's row and the totals row.
+void ResourcePressureView::onInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  // 'Index' counts issued instructions across all iterations; fold it back
+  // into the source sequence.
+  unsigned SourceIdx = Index % Source.size();
+  for (const std::pair<ResourceRef, unsigned> &Use : Used) {
+    const ResourceRef &RR = Use.first;
+    assert(Resource2VecIndex.find(RR.first) != Resource2VecIndex.end());
+    unsigned R2VIndex = Resource2VecIndex[RR.first];
+    // RR.second selects the unit within the resource (single bit set).
+    R2VIndex += countTrailingZeros(RR.second);
+    ResourceUsage[R2VIndex + NumResourceUnits * SourceIdx] += Use.second;
+    ResourceUsage[R2VIndex + NumResourceUnits * Source.size()] += Use.second;
+  }
+}
+
+// Prints the table header: one "[N]" column per single-unit resource, and
+// "[N.J]" columns for each unit J of a multi-unit resource. The trailing
+// spaces pad columns to line up with the pressure values printed later.
+static void printColumnNames(raw_string_ostream &OS, const MCSchedModel &SM) {
+  for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds();
+       I < E; ++I) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+    if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+      continue;
+
+    if (NumUnits == 1) {
+      OS << '[' << ResourceIndex << ']';
+      // NOTE(review): both branches appear to emit the same padding string;
+      // the intent is presumably one fewer space for two-digit indices --
+      // verify against the rendered table alignment.
+      if (ResourceIndex < 10)
+        OS << " ";
+      else
+        OS << " ";
+      ResourceIndex++;
+      continue;
+    }
+
+    for (unsigned J = 0; J < NumUnits; ++J) {
+      OS << "[" << ResourceIndex << '.' << J << ']';
+      if (ResourceIndex < 10)
+        OS << " ";
+      else
+        OS << ' ';
+    }
+    ResourceIndex++;
+  }
+}
+
+// Prints the resource legend followed by the per-iteration pressure table:
+// for each resource unit, the average number of resource cycles consumed
+// per iteration ('-' when unused).
+void ResourcePressureView::printResourcePressurePerIteration(
+    raw_ostream &OS, unsigned Executions) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nResources:\n";
+  const MCSchedModel &SM = STI.getSchedModel();
+  for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds();
+       I < E; ++I) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+    if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+      continue;
+
+    // Legend entry: "[N] - Name", or "[N.J] - Name" per unit for
+    // multi-unit resources.
+    for (unsigned J = 0; J < NumUnits; ++J) {
+      TempStream << '[' << ResourceIndex;
+      if (NumUnits > 1)
+        TempStream << '.' << J;
+      TempStream << "] - " << ProcResource.Name << '\n';
+    }
+
+    ResourceIndex++;
+  }
+
+  TempStream << "\n\nResource pressure per iteration:\n";
+  printColumnNames(TempStream, SM);
+  TempStream << '\n';
+
+  // The last row of ResourceUsage holds cumulative totals across all
+  // iterations; divide by Executions to get the per-iteration average.
+  for (unsigned I = 0; I < NumResourceUnits; ++I) {
+    unsigned Usage = ResourceUsage[I + Source.size() * NumResourceUnits];
+    if (!Usage) {
+      TempStream << " - ";
+      continue;
+    }
+
+    double Pressure = (double)Usage / Executions;
+    TempStream << format("%.2f", Pressure);
+    if (Pressure < 10.0)
+      TempStream << " ";
+    else if (Pressure < 100.0)
+      TempStream << " ";
+    else
+      TempStream << ' ';
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+// Prints the per-instruction pressure table: one row per source instruction
+// (average resource cycles per execution for every unit), followed by the
+// printed form of the instruction itself.
+void ResourcePressureView::printResourcePressurePerInstruction(
+    raw_ostream &OS, unsigned Executions) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nResource pressure by instruction:\n";
+  printColumnNames(TempStream, STI.getSchedModel());
+  TempStream << "\tInstructions:\n";
+
+  for (unsigned I = 0, E = Source.size(); I < E; ++I) {
+    for (unsigned J = 0; J < NumResourceUnits; ++J) {
+      unsigned Usage = ResourceUsage[J + I * NumResourceUnits];
+      if (Usage == 0) {
+        TempStream << " - ";
+      } else {
+        double Pressure = (double)Usage / Executions;
+        // Values that would render as "0.00" are shown as '-' instead.
+        if (Pressure < 0.005) {
+          TempStream << " - ";
+        } else {
+          TempStream << format("%.2f", Pressure);
+          if (Pressure < 10.0)
+            TempStream << " ";
+          else if (Pressure < 100.0)
+            TempStream << " ";
+          else
+            TempStream << ' ';
+        }
+      }
+    }
+
+    MCIP.printInst(&Source.getMCInstFromIndex(I), TempStream, "", STI);
+    TempStream << '\n';
+    // Flush each completed row to the output stream and reuse the buffer.
+    TempStream.flush();
+    OS << Buffer;
+    Buffer = "";
+  }
+}
+
+} // namespace mca
--- /dev/null
+//===--------------------- ResourcePressureView.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the ResourcePressureView class.
+/// Class ResourcePressureView observes hardware events generated by
+/// the Backend object and collects statistics related to resource usage at
+/// instruction granularity.
+/// Resource pressure information is then printed out to a stream in the
+/// form of a table like the one from the example below:
+///
+/// Resources:
+/// [0] - JALU0
+/// [1] - JALU1
+/// [2] - JDiv
+/// [3] - JFPM
+/// [4] - JFPU0
+/// [5] - JFPU1
+/// [6] - JLAGU
+/// [7] - JSAGU
+/// [8] - JSTC
+/// [9] - JVIMUL
+///
+/// Resource pressure per iteration:
+/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
+/// 0.00 0.00 0.00 0.00 2.00 2.00 0.00 0.00 0.00 0.00
+///
+/// Resource pressure by instruction:
+/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+/// - - - - - 1.00 - - - - vpermilpd $1, %xmm0,
+/// %xmm1
+/// - - - - 1.00 - - - - - vaddps %xmm0, %xmm1,
+/// %xmm2
+/// - - - - - 1.00 - - - - vmovshdup %xmm2, %xmm3
+/// - - - - 1.00 - - - - - vaddss %xmm2, %xmm3,
+/// %xmm4
+///
+/// In this example, we have AVX code executed on AMD Jaguar (btver2).
+/// Both shuffles and vector floating point add operations on XMM registers have
+/// a reciprocal throughput of 1cy.
+/// Each add is issued to pipeline JFPU0, while each shuffle is issued to
+/// pipeline JFPU1. The overall pressure per iteration is reported by two
+/// tables: the first smaller table is the resource pressure per iteration;
+/// the second table reports resource pressure per instruction. Values are the
+/// average resource cycles consumed by an instruction.
+/// Every vector add from the example uses resource JFPU0 for an average of 1cy
+/// per iteration. Consequently, the resource pressure on JFPU0 is of 2cy per
+/// iteration.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+
+#include "HWEventListener.h"
+#include "SourceMgr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+class Backend;
+
+/// This class collects resource pressure statistics and prints all the
+/// collected information as a table to an output stream.
+///
+/// It listens to HWEventListener issue events, and accumulates the number
+/// of resource cycles consumed by every source instruction.
+class ResourcePressureView : public HWEventListener {
+  const llvm::MCSubtargetInfo &STI;
+  llvm::MCInstPrinter &MCIP;
+  const SourceMgr &Source;
+
+  // Map to quickly get a resource descriptor from a mask.
+  std::map<uint64_t, unsigned> Resource2VecIndex;
+
+  // Table of resources used by instructions. Laid out as one row of
+  // NumResourceUnits counters per source instruction, plus a final row of
+  // cumulative totals. Indexed as [Unit + NumResourceUnits * Row].
+  std::vector<unsigned> ResourceUsage;
+  // Total number of resource unit columns; computed by initialize().
+  unsigned NumResourceUnits = 0;
+
+  // Prints the cumulative (per-iteration) pressure table.
+  void printResourcePressurePerIteration(llvm::raw_ostream &OS,
+                                         unsigned Executions) const;
+  // Prints the per-instruction pressure table.
+  void printResourcePressurePerInstruction(llvm::raw_ostream &OS,
+                                           unsigned Executions) const;
+  // Builds Resource2VecIndex and sizes/zeroes the ResourceUsage table.
+  void initialize(const llvm::ArrayRef<uint64_t> ProcResourceMasks);
+
+public:
+  ResourcePressureView(const llvm::MCSubtargetInfo &ST,
+                       llvm::MCInstPrinter &Printer, const SourceMgr &SM,
+                       const llvm::ArrayRef<uint64_t> ProcResourceMasks)
+      : STI(ST), MCIP(Printer), Source(SM) {
+    initialize(ProcResourceMasks);
+  }
+
+  void onInstructionIssued(
+      unsigned Index,
+      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used) override;
+
+  void printResourcePressure(llvm::raw_ostream &OS, unsigned Cycles) const {
+    unsigned Executions = Source.getNumIterations();
+    printResourcePressurePerIteration(OS, Executions);
+    printResourcePressurePerInstruction(OS, Executions);
+  }
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===--------------------- Scheduler.cpp ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A scheduler for processor resource units and processor resource groups.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Scheduler.h"
+#include "Backend.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+using namespace llvm;
+
+// Round-robin selection: returns the mask of the next ready sub-resource,
+// advancing the selector past units that are currently in use. Requires at
+// least one ready unit (asserted).
+uint64_t ResourceState::selectNextInSequence() {
+  assert(isReady());
+  uint64_t Next = getNextInSequence();
+  while (!isSubResourceReady(Next)) {
+    updateNextInSequence();
+    Next = getNextInSequence();
+  }
+  return Next;
+}
+
+#ifndef NDEBUG
+// Debug-only dump of the full internal state of this resource.
+void ResourceState::dump() const {
+  dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask
+         << ", NEXT: " << NextInSequenceMask << ", RDYMASK: " << ReadyMask
+         << ", BufferSize=" << BufferSize
+         << ", AvailableSlots=" << AvailableSlots
+         << ", Reserved=" << Unavailable << '\n';
+}
+#endif
+
+// Adds a new resource state in Resources, as well as a new descriptor in
+// ResourceDescriptor. Map 'Resources' allows to quickly obtain ResourceState
+// objects from resource mask identifiers.
+void ResourceManager::addResource(const MCProcResourceDesc &Desc,
+                                  uint64_t Mask) {
+  assert(Resources.find(Mask) == Resources.end() && "Resource already added!");
+  Resources[Mask] = llvm::make_unique<ResourceState>(Desc, Mask);
+}
+
+// Populate vector ProcResID2Mask with resource masks. One per each processor
+// resource declared by the scheduling model.
+void ResourceManager::computeProcResourceMasks(const MCSchedModel &SM) {
+  unsigned ProcResourceID = 0;
+
+  // Create a unique bitmask for every processor resource unit.
+  // Skip resource at index 0, since it always references 'InvalidUnit'.
+  ProcResID2Mask.resize(SM.getNumProcResourceKinds());
+  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+    if (Desc.SubUnitsIdxBegin)
+      continue;
+    ProcResID2Mask[I] = 1ULL << ProcResourceID;
+    ProcResourceID++;
+  }
+
+  // Create a unique bitmask for every processor resource group.
+  // A group mask is its own unique bit OR'ed with the masks of all of its
+  // sub-units, so set-membership tests reduce to a bitwise AND.
+  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+    if (!Desc.SubUnitsIdxBegin)
+      continue;
+    ProcResID2Mask[I] |= 1ULL << ProcResourceID;
+    for (unsigned U = 0; U < Desc.NumUnits; ++U) {
+      uint64_t OtherMask = ProcResID2Mask[Desc.SubUnitsIdxBegin[U]];
+      ProcResID2Mask[I] |= OtherMask;
+    }
+    ProcResourceID++;
+  }
+}
+
+// Returns the actual resource consumed by this Use.
+// First, is the primary resource ID.
+// Second, is the specific sub-resource ID.
+// Recurses through group members until a concrete (non-group) unit is found.
+std::pair<uint64_t, uint64_t> ResourceManager::selectPipe(uint64_t ResourceID) {
+  ResourceState &RS = *Resources[ResourceID];
+  uint64_t SubResourceID = RS.selectNextInSequence();
+  if (RS.isAResourceGroup())
+    return selectPipe(SubResourceID);
+  return std::pair<uint64_t, uint64_t>(ResourceID, SubResourceID);
+}
+
+// Removes unit 'ID' from the round-robin selector. If the unit's bit is above
+// the selector's current position (i.e. it was statically selected before its
+// round-robin turn), it is also remembered in RemovedFromNextInSequence so it
+// is skipped again when the selector mask is refilled.
+void ResourceState::removeFromNextInSequence(uint64_t ID) {
+  assert(NextInSequenceMask);
+  assert(countPopulation(ID) == 1);
+  if (ID > getNextInSequence())
+    RemovedFromNextInSequence |= ID;
+  NextInSequenceMask = NextInSequenceMask & (~ID);
+  if (!NextInSequenceMask) {
+    // Refill the selector, excluding units removed out-of-turn.
+    NextInSequenceMask = ResourceSizeMask;
+    assert(NextInSequenceMask != RemovedFromNextInSequence);
+    NextInSequenceMask ^= RemovedFromNextInSequence;
+    RemovedFromNextInSequence = 0;
+  }
+}
+
+// Marks the sub-resource referenced by RR as used and, when the resource runs
+// out of ready units, propagates the change to every group containing it.
+void ResourceManager::use(ResourceRef RR) {
+  // Mark the sub-resource referenced by RR as used.
+  ResourceState &RS = *Resources[RR.first];
+  RS.markSubResourceAsUsed(RR.second);
+  // If there are still available units in RR.first,
+  // then we are done.
+  if (RS.isReady())
+    return;
+
+  // Notify to other resources that RR.first is no longer available.
+  for (const std::pair<uint64_t, UniqueResourceState> &Res : Resources) {
+    ResourceState &Current = *Res.second.get();
+    if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+      continue;
+
+    if (Current.containsResource(RR.first)) {
+      Current.markSubResourceAsUsed(RR.first);
+      Current.removeFromNextInSequence(RR.first);
+    }
+  }
+}
+
+// Releases the sub-resource referenced by RR. If the resource transitions
+// from fully-used back to ready, groups containing it are updated too.
+void ResourceManager::release(ResourceRef RR) {
+  ResourceState &RS = *Resources[RR.first];
+  bool WasFullyUsed = !RS.isReady();
+  RS.releaseSubResource(RR.second);
+  if (!WasFullyUsed)
+    return;
+
+  for (const std::pair<uint64_t, UniqueResourceState> &Res : Resources) {
+    ResourceState &Current = *Res.second.get();
+    if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+      continue;
+
+    if (Current.containsResource(RR.first))
+      Current.releaseSubResource(RR.first);
+  }
+}
+
+// Reserves every buffered resource with BufferSize=0 consumed by this
+// instruction. Such resources model in-order issue/dispatch hazards: while
+// reserved, no other instruction may be dispatched.
+void ResourceManager::reserveDispatchHazardResources(const InstrDesc &Desc) {
+  for (const uint64_t R : Desc.Buffers) {
+    ResourceState &Resource = *Resources[R];
+    if (Resource.isADispatchHazard()) {
+      assert(!Resource.isReserved());
+      Resource.setReserved();
+    }
+  }
+}
+
+// Returns true if every resource consumed by 'Desc' currently has enough
+// ready units. A reserved usage requires zero units (the group is taken as
+// a whole).
+bool ResourceManager::canBeIssued(const InstrDesc &Desc) const {
+  return std::all_of(Desc.Resources.begin(), Desc.Resources.end(),
+                     [&](const std::pair<uint64_t, const ResourceUsage> &E) {
+                       unsigned NumUnits =
+                           E.second.isReserved() ? 0U : E.second.NumUnits;
+                       return isReady(E.first, NumUnits);
+                     });
+}
+
+// Returns true if all resources are in-order, and there is at least one
+// resource which is a dispatch hazard (BufferSize = 0).
+bool ResourceManager::mustIssueImmediately(const InstrDesc &Desc) {
+  if (!canBeIssued(Desc))
+    return false;
+  bool AllInOrderResources = std::all_of(
+      Desc.Buffers.begin(), Desc.Buffers.end(), [&](const unsigned BufferMask) {
+        const ResourceState &Resource = *Resources[BufferMask];
+        return Resource.isInOrder() || Resource.isADispatchHazard();
+      });
+  if (!AllInOrderResources)
+    return false;
+
+  return std::any_of(Desc.Buffers.begin(), Desc.Buffers.end(),
+                     [&](const unsigned BufferMask) {
+                       return Resources[BufferMask]->isADispatchHazard();
+                     });
+}
+
+// Computes the reciprocal throughput (in cycles) for instruction class 'ID':
+// the bottleneck over all consumed resources of (cycles / available units).
+double ResourceManager::getRThroughput(const InstrDesc &ID) const {
+  double RThroughput = 0;
+  for (const std::pair<uint64_t, ResourceUsage> &Usage : ID.Resources) {
+    const CycleSegment &CS = Usage.second.CS;
+    assert(CS.begin() == 0);
+
+    // A reserved resource is unavailable for the whole cycle segment.
+    if (Usage.second.isReserved()) {
+      RThroughput = std::max(RThroughput, (double)CS.size());
+      continue;
+    }
+
+    // For groups, countPopulation(Mask) - 1 is the number of members (the
+    // extra bit identifies the group itself); for unit resources it is zero,
+    // hence the clamp to 1.
+    unsigned Population = std::max(1U, countPopulation(Usage.first) - 1);
+    unsigned NumUnits = Population * getNumUnits(Usage.first);
+    // NOTE(review): this discounts units consumed by this same usage from
+    // the effective parallelism -- confirm the formula against the intended
+    // scheduling-model semantics.
+    NumUnits -= Usage.second.NumUnits - 1;
+    unsigned Cycles = CS.size();
+    RThroughput = std::max(RThroughput, (double)Cycles / NumUnits);
+  }
+  return RThroughput;
+}
+
+// Issues the instruction at 'Index' to the pipelines: releases its buffer
+// entries, selects a concrete unit for every resource usage, and records
+// (pipeline, cycles) pairs into 'Pipes' for event reporting.
+void ResourceManager::issueInstruction(
+    unsigned Index, const InstrDesc &Desc,
+    SmallVectorImpl<std::pair<ResourceRef, unsigned>> &Pipes) {
+  releaseBuffers(Desc);
+  for (const std::pair<uint64_t, ResourceUsage> &R : Desc.Resources) {
+    const CycleSegment &CS = R.second.CS;
+    // A zero-cycle usage only releases the resource.
+    if (!CS.size()) {
+      releaseResource(R.first);
+      continue;
+    }
+
+    assert(CS.begin() == 0 && "Invalid {Start, End} cycles!");
+    if (!R.second.isReserved()) {
+      // Pick a concrete unit and mark it busy for CS.size() cycles.
+      ResourceRef Pipe = selectPipe(R.first);
+      use(Pipe);
+      BusyResources[Pipe] += CS.size();
+      Pipes.emplace_back(std::pair<ResourceRef, unsigned>(Pipe, CS.size()));
+    } else {
+      assert((countPopulation(R.first) > 1) && "Expected a group!");
+      // Mark this group as reserved.
+      assert(R.second.isReserved());
+      reserveResource(R.first);
+      BusyResources[ResourceRef(R.first, R.first)] += CS.size();
+    }
+  }
+}
+
+// Advances busy-resource countdowns by one cycle, releasing resources whose
+// cycles are exhausted. Freed references are returned in 'ResourcesFreed'.
+void ResourceManager::cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed) {
+  for (std::pair<ResourceRef, unsigned> &BR : BusyResources) {
+    if (BR.second)
+      BR.second--;
+    if (!BR.second) {
+      // Release this resource.
+      const ResourceRef &RR = BR.first;
+
+      // Only unit resources (single-bit masks) were acquired through use();
+      // reserved groups are released via releaseResource() below.
+      if (countPopulation(RR.first) == 1)
+        release(RR);
+
+      releaseResource(RR.first);
+      ResourcesFreed.push_back(RR);
+    }
+  }
+
+  // Erase outside the loop to avoid invalidating the iteration above.
+  for (const ResourceRef &RF : ResourcesFreed)
+    BusyResources.erase(RF);
+}
+
+// Dispatches the instruction at index 'Idx' to this scheduler. Depending on
+// operand/resource availability, the instruction is either executed right
+// away (zero-latency or in-order dispatch hazard), placed in the ready
+// queue, or parked in the wait queue until its operands become available.
+Instruction *Scheduler::scheduleInstruction(unsigned Idx, Instruction *MCIS) {
+  assert(WaitQueue.find(Idx) == WaitQueue.end());
+  assert(ReadyQueue.find(Idx) == ReadyQueue.end());
+  assert(IssuedQueue.find(Idx) == IssuedQueue.end());
+
+  // Special case where MCIS is a zero-latency instruction. A zero-latency
+  // instruction doesn't consume any scheduler resources. That is because it
+  // doesn't need to be executed. Most of the times, zero latency instructions
+  // are removed at register renaming stage. For example, register-register
+  // moves can be removed at register renaming stage by creating new aliases.
+  // Zero-idiom instruction (for example: a `xor reg, reg`) can also be
+  // eliminated at register renaming stage, since we know in advance that those
+  // clear their output register.
+  if (MCIS->isZeroLatency()) {
+    MCIS->forceExecuted();
+    notifyInstructionIssued(Idx, {});
+    notifyInstructionExecuted(Idx);
+    return MCIS;
+  }
+
+  // Consume entries in the reservation stations.
+  const InstrDesc &Desc = MCIS->getDesc();
+  Resources->reserveBuffers(Desc);
+
+  // Mark units with BufferSize=0 as reserved. These resources will only
+  // be released after MCIS is issued, and all the ResourceCycles for
+  // those units have been consumed.
+  Resources->reserveDispatchHazardResources(Desc);
+
+  bool MayLoad = Desc.MayLoad;
+  bool MayStore = Desc.MayStore;
+  if (MayLoad || MayStore)
+    LSU->reserve(Idx, MayLoad, MayStore, Desc.HasSideEffects);
+
+  MCIS->dispatch();
+  bool IsReady = MCIS->isReady();
+  // Memory operations are ready only when the load/store unit agrees.
+  if (IsReady && (MayLoad || MayStore))
+    IsReady &= LSU->isReady(Idx);
+
+  if (!IsReady) {
+    DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Wait Queue\n");
+    WaitQueue[Idx] = MCIS;
+    return MCIS;
+  }
+  notifyInstructionReady(Idx);
+
+  // Special case where the instruction is ready, and it uses an in-order
+  // dispatch/issue processor resource. The instruction is issued immediately to
+  // the pipelines. Any other in-order buffered resources (i.e. BufferSize=1)
+  // are consumed.
+  if (Resources->mustIssueImmediately(Desc)) {
+    DEBUG(dbgs() << "[SCHEDULER] Instruction " << Idx
+                 << " issued immediately\n");
+    issueInstruction(MCIS, Idx);
+    return MCIS;
+  }
+
+  DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Ready Queue\n");
+  ReadyQueue[Idx] = MCIS;
+  return MCIS;
+}
+
+// Advances the scheduler by one cycle: ages busy resources, notifies newly
+// available ones, retires executed instructions, promotes newly ready ones,
+// and finally attempts to issue from the ready queue.
+void Scheduler::cycleEvent(unsigned /* unused */) {
+  SmallVector<ResourceRef, 8> ResourcesFreed;
+  Resources->cycleEvent(ResourcesFreed);
+
+  for (const ResourceRef &RR : ResourcesFreed)
+    notifyResourceAvailable(RR);
+
+  updateIssuedQueue();
+  updatePendingQueue();
+  issue();
+}
+
+#ifndef NDEBUG
+// Debug-only dump of the scheduler queues and the resource manager state.
+void Scheduler::dump() const {
+  dbgs() << "[SCHEDULER]: WaitQueue size is: " << WaitQueue.size() << '\n';
+  dbgs() << "[SCHEDULER]: ReadyQueue size is: " << ReadyQueue.size() << '\n';
+  dbgs() << "[SCHEDULER]: IssuedQueue size is: " << IssuedQueue.size() << '\n';
+  Resources->dump();
+}
+#endif
+
+// Checks whether the instruction class described by 'Desc' can be dispatched
+// in the current cycle, mapping load/store queue and buffer availability to
+// the corresponding scheduler event.
+Scheduler::Event Scheduler::canBeDispatched(const InstrDesc &Desc) const {
+  if (Desc.MayLoad && LSU->isLQFull())
+    return HWS_LD_QUEUE_UNAVAILABLE;
+  if (Desc.MayStore && LSU->isSQFull())
+    return HWS_ST_QUEUE_UNAVAILABLE;
+
+  Scheduler::Event Event;
+  switch (Resources->canBeDispatched(Desc)) {
+  case ResourceStateEvent::RS_BUFFER_AVAILABLE:
+    Event = HWS_AVAILABLE;
+    break;
+  case ResourceStateEvent::RS_BUFFER_UNAVAILABLE:
+    Event = HWS_QUEUE_UNAVAILABLE;
+    break;
+  case ResourceStateEvent::RS_RESERVED:
+    Event = HWS_DISPATCH_GROUP_RESTRICTION;
+  }
+  return Event;
+}
+
+// Issues instruction 'IS' (source index 'InstrIndex') to the underlying
+// pipelines and notifies listeners. Instructions with a non-zero latency
+// are moved to the issued queue until they complete.
+void Scheduler::issueInstruction(Instruction *IS, unsigned InstrIndex) {
+  // Issue the instruction and collect all the consumed resources
+  // into a vector. That vector is then used to notify the listener.
+  // Most instructions consume very few resources (typically one or
+  // two resources). We use a small vector here, and conservatively
+  // initialize its capacity to 4. This should address the majority of
+  // the cases.
+  SmallVector<std::pair<ResourceRef, unsigned>, 4> UsedResources;
+
+  const InstrDesc &D = IS->getDesc();
+  Resources->issueInstruction(InstrIndex, D, UsedResources);
+  // Notify the instruction that it started executing.
+  // This updates the internal state of each write.
+  IS->execute();
+
+  if (D.MaxLatency) {
+    IssuedQueue[InstrIndex] = IS;
+    notifyInstructionIssued(InstrIndex, UsedResources);
+  } else {
+    // A zero latency instruction which reads and/or updates registers.
+    notifyInstructionIssued(InstrIndex, UsedResources);
+    notifyInstructionExecuted(InstrIndex);
+  }
+}
+
+// Issues every instruction in the ready queue whose resources are available.
+void Scheduler::issue() {
+  std::vector<unsigned> ToRemove;
+  for (const QueueEntryTy QueueEntry : ReadyQueue) {
+    // Give priority to older instructions in ReadyQueue. The ready queue is
+    // ordered by key, and therefore older instructions are visited first.
+    Instruction *IS = QueueEntry.second;
+    const InstrDesc &D = IS->getDesc();
+    if (!Resources->canBeIssued(D))
+      continue;
+    unsigned InstrIndex = QueueEntry.first;
+    issueInstruction(IS, InstrIndex);
+    // Defer erasure so the iteration above is not invalidated.
+    ToRemove.emplace_back(InstrIndex);
+  }
+
+  for (const unsigned InstrIndex : ToRemove)
+    ReadyQueue.erase(InstrIndex);
+}
+
+void Scheduler::updatePendingQueue() {
+  // Scan the set of waiting instructions and promote them to the
+  // ready queue if operands are all ready.
+  for (auto I = WaitQueue.begin(), E = WaitQueue.end(); I != E;) {
+    const QueueEntryTy Entry = *I;
+    // Age the instruction by one cycle before checking readiness.
+    Entry.second->cycleEvent();
+
+    const InstrDesc &Desc = Entry.second->getDesc();
+    bool IsMemOp = Desc.MayLoad || Desc.MayStore;
+    bool IsReady = Entry.second->isReady();
+    // Memory operations additionally need the load/store unit's consent.
+    if (IsReady && IsMemOp)
+      IsReady &= LSU->isReady(Entry.first);
+
+    if (IsReady) {
+      notifyInstructionReady(Entry.first);
+      ReadyQueue[Entry.first] = Entry.second;
+      // Advance the iterator before erasing the current entry.
+      auto ToRemove = I;
+      ++I;
+      WaitQueue.erase(ToRemove);
+    } else {
+      ++I;
+    }
+  }
+}
+
+// Retires instructions from the issued queue once they finish executing.
+void Scheduler::updateIssuedQueue() {
+  for (auto I = IssuedQueue.begin(), E = IssuedQueue.end(); I != E;) {
+    const QueueEntryTy Entry = *I;
+    Entry.second->cycleEvent();
+    if (Entry.second->isExecuted()) {
+      notifyInstructionExecuted(Entry.first);
+      // Advance the iterator before erasing the current entry.
+      auto ToRemove = I;
+      ++I;
+      IssuedQueue.erase(ToRemove);
+    } else {
+      DEBUG(dbgs() << "[SCHEDULER]: Instruction " << Entry.first
+                   << " is still executing.\n");
+      ++I;
+    }
+  }
+}
+
+// The hooks below forward scheduler events to the Backend that owns this
+// Scheduler; executed instructions are also reported to the load/store unit.
+void Scheduler::notifyInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  Owner->notifyInstructionIssued(Index, Used);
+}
+
+void Scheduler::notifyInstructionExecuted(unsigned Index) {
+  LSU->onInstructionExecuted(Index);
+  Owner->notifyInstructionExecuted(Index);
+}
+
+void Scheduler::notifyInstructionReady(unsigned Index) {
+  Owner->notifyInstructionReady(Index);
+}
+
+void Scheduler::notifyResourceAvailable(const ResourceRef &RR) {
+  Owner->notifyResourceAvailable(RR);
+}
+} // namespace mca
--- /dev/null
+//===--------------------- Scheduler.h ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A scheduler for Processor Resource Units and Processor Resource Groups.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+#define LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+
+#include "Instruction.h"
+#include "LSUnit.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+class Backend;
+/// Used to notify the internal state of a processor resource.
+///
+/// A processor resource is available if it is not reserved, and there are
+/// available slots in the buffer. A processor resource is unavailable if it
+/// is either reserved, or the associated buffer is full. A processor resource
+/// with a buffer size of -1 is always available if it is not reserved.
+///
+/// Values of type ResourceStateEvent are returned by method
+/// ResourceState::isBufferAvailable(), which is used to query the internal
+/// state of a resource.
+///
+/// The naming convention for resource state events is:
+/// * Event names start with prefix RS_
+/// * Prefix RS_ is followed by a string describing the actual resource state.
+enum ResourceStateEvent {
+  RS_BUFFER_AVAILABLE,   // Not reserved, and buffer slots are available.
+  RS_BUFFER_UNAVAILABLE, // The associated buffer is full.
+  RS_RESERVED            // The resource is currently reserved.
+};
+
+/// \brief A descriptor for processor resources.
+///
+/// Each object of class ResourceState is associated to a specific processor
+/// resource. There is an instance of this class for every processor resource
+/// defined by the scheduling model.
+/// A ResourceState dynamically tracks the availability of units of a processor
+/// resource. For example, the ResourceState of a ProcResGroup tracks the
+/// availability of resource units which are part of the group.
+///
+/// Internally, ResourceState uses a round-robin selector to identify
+/// which unit of the group shall be used next.
+class ResourceState {
+ // A resource unique identifier generated by the tool.
+  // For processor resource groups, the number of bits set in this
+ // mask is equivalent to the cardinality of the group plus one.
+ // Excluding the most significant bit, the remaining bits in the resource mask
+ // identify resources that are part of the group.
+ //
+ // Example, lets assume that this ResourceState describes a
+ // group containing ResourceA and ResourceB:
+ //
+ // ResourceA -- Mask: 0b001
+ // ResourceB -- Mask: 0b010
+ // ResourceAB -- Mask: 0b100 U (ResourceA::Mask | ResourceB::Mask) == 0b111
+ //
+ // There is only one bit set for non-group resources.
+ // A ResourceMask can be used to solve set membership problems with simple bit
+ // manipulation operations.
+ uint64_t ResourceMask;
+
+ // A ProcResource can specify a number of units. For the purpose of dynamic
+ // scheduling, a processor resource with more than one unit behaves like a
+ // group. This field has one bit set for every unit/resource that is part of
+ // the group.
+ // For groups, this field defaults to 'ResourceMask'. For non-group
+ // resources, the number of bits set in this mask is equivalent to the
+ // number of units (i.e. field 'NumUnits' in 'ProcResourceUnits').
+ uint64_t ResourceSizeMask;
+
+ // A simple round-robin selector for processor resources.
+ // Each bit of the mask identifies a sub resource within this group.
+ //
+ // As an example, lets assume that this ResourceState describes a
+ // processor resource group composed of the following three units:
+ // ResourceA -- 0b001
+ // ResourceB -- 0b010
+ // ResourceC -- 0b100
+ //
+ // Each unit is identified by a ResourceMask which always contains a
+ // single bit set. Field NextInSequenceMask is initially set to value
+  // 0b111. That value is obtained by OR'ing the resource masks of
+ // processor resource that are part of the group.
+ //
+ // NextInSequenceMask -- 0b111
+ //
+ // Field NextInSequenceMask is used by the resource manager (i.e.
+ // an object of class ResourceManager) to select the "next available resource"
+ // from the set. The algorithm would prioritize resources with a bigger
+ // ResourceMask value.
+ //
+ // In this example, there are three resources in the set, and 'ResourceC'
+ // has the highest mask value. The round-robin selector would firstly select
+ // 'ResourceC', then 'ResourceB', and eventually 'ResourceA'.
+ //
+ // When a resource R is used, its corresponding bit is cleared from the set.
+ //
+ // Back to the example:
+ // If 'ResourceC' is selected, then the new value of NextInSequenceMask
+  // becomes 0b011.
+ //
+ // When NextInSequenceMask becomes zero, it is reset to its original value
+ // (in this example, that value would be 0b111).
+ uint64_t NextInSequenceMask;
+
+ // Some instructions can only be issued on very specific pipeline resources.
+ // For those instructions, we know exactly which resource would be consumed
+ // without having to dynamically select it using field 'NextInSequenceMask'.
+ //
+ // The resource mask bit associated to the (statically) selected
+ // processor resource is still cleared from the 'NextInSequenceMask'.
+ // If that bit was already zero in NextInSequenceMask, then we update
+ // mask 'RemovedFromNextInSequence'.
+ //
+ // When NextInSequenceMask is reset back to its initial value, the algorithm
+  // removes any bits which are set in RemovedFromNextInSequence.
+ uint64_t RemovedFromNextInSequence;
+
+ // A mask of ready units.
+ uint64_t ReadyMask;
+
+ // Buffered resources will have this field set to a positive number bigger
+ // than 0. A buffered resource behaves like a separate reservation station
+ // implementing its own buffer for out-of-order execution.
+ // A buffer of 1 is for units that force in-order execution.
+ // A value of 0 is treated specially. In particular, a resource with
+ // a BufferSize = 0 is for an in-order issue/dispatch resource.
+ // That means, this resource is reserved starting from the dispatch event,
+ // until all the "resource cycles" are consumed after the issue event.
+ // While this resource is reserved, no other instruction may be dispatched.
+ int BufferSize;
+
+ // Available slots in the buffer (zero, if this is not a buffered resource).
+ unsigned AvailableSlots;
+
+ // Maximum number of buffer slots seen used during one cycle.
+ // This helps tracking dynamic dispatch stalls caused by the lack of
+ // entries in the scheduler's queue.
+ unsigned MaxUsedSlots;
+
+ // True if this resource is currently unavailable.
+ // An instruction may "reserve" a resource for a number of cycles.
+ // During those cycles, the reserved resource cannot be used for other
+ // instructions, even if the ReadyMask is set.
+ bool Unavailable;
+
+ bool isSubResourceReady(uint64_t ID) const { return ReadyMask & ID; }
+
+ /// Returns the mask identifier of the next available resource in the set.
+ uint64_t getNextInSequence() const {
+ assert(NextInSequenceMask);
+ return llvm::PowerOf2Floor(NextInSequenceMask);
+ }
+
+ /// Returns the mask of the next available resource within the set,
+ /// and updates the resource selector.
+ void updateNextInSequence() {
+ NextInSequenceMask ^= getNextInSequence();
+ if (!NextInSequenceMask)
+ NextInSequenceMask = ResourceSizeMask;
+ }
+
+ uint64_t computeResourceSizeMaskForGroup(uint64_t ResourceMask) {
+ assert(llvm::countPopulation(ResourceMask) > 1);
+ return ResourceMask ^ llvm::PowerOf2Floor(ResourceMask);
+ }
+
+public:
+ ResourceState(const llvm::MCProcResourceDesc &Desc, uint64_t Mask)
+ : ResourceMask(Mask) {
+ bool IsAGroup = llvm::countPopulation(ResourceMask) > 1;
+ ResourceSizeMask = IsAGroup ? computeResourceSizeMaskForGroup(ResourceMask)
+ : ((1ULL << Desc.NumUnits) - 1);
+ NextInSequenceMask = ResourceSizeMask;
+ RemovedFromNextInSequence = 0;
+ ReadyMask = ResourceSizeMask;
+ BufferSize = Desc.BufferSize;
+ AvailableSlots = BufferSize == -1 ? 0U : static_cast<unsigned>(BufferSize);
+ MaxUsedSlots = 0;
+ Unavailable = false;
+ }
+
+ uint64_t getResourceMask() const { return ResourceMask; }
+ int getBufferSize() const { return BufferSize; }
+ unsigned getMaxUsedSlots() const { return MaxUsedSlots; }
+
+ bool isBuffered() const { return BufferSize > 0; }
+ bool isInOrder() const { return BufferSize == 1; }
+ bool isADispatchHazard() const { return BufferSize == 0; }
+ bool isReserved() const { return Unavailable; }
+
+ void setReserved() { Unavailable = true; }
+ void clearReserved() { Unavailable = false; }
+
+ // A resource is ready if it is not reserved, and if there are enough
+ // available units.
+ // If a resource is also a dispatch hazard, then we don't check if
+ // it is reserved because that check would always return true.
+ // A resource marked as "dispatch hazard" is always reserved at
+ // dispatch time. When this method is called, the assumption is that
+ // the user of this resource has been already dispatched.
+ bool isReady(unsigned NumUnits = 1) const {
+ return (!isReserved() || isADispatchHazard()) &&
+ llvm::countPopulation(ReadyMask) >= NumUnits;
+ }
+ bool isAResourceGroup() const {
+ return llvm::countPopulation(ResourceMask) > 1;
+ }
+
+ bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
+
+ void markSubResourceAsUsed(uint64_t ID) {
+ assert(isSubResourceReady(ID));
+ ReadyMask ^= ID;
+ }
+
+ void releaseSubResource(uint64_t ID) {
+ assert(!isSubResourceReady(ID));
+ ReadyMask ^= ID;
+ }
+
+ unsigned getNumUnits() const {
+ return isAResourceGroup() ? 1U : llvm::countPopulation(ResourceSizeMask);
+ }
+
+ uint64_t selectNextInSequence();
+ void removeFromNextInSequence(uint64_t ID);
+
+ ResourceStateEvent isBufferAvailable() const {
+ if (isADispatchHazard() && isReserved())
+ return RS_RESERVED;
+ if (!isBuffered() || AvailableSlots)
+ return RS_BUFFER_AVAILABLE;
+ return RS_BUFFER_UNAVAILABLE;
+ }
+
+ void reserveBuffer() {
+ if (AvailableSlots)
+ AvailableSlots--;
+ unsigned UsedSlots = static_cast<unsigned>(BufferSize) - AvailableSlots;
+ MaxUsedSlots = std::max(MaxUsedSlots, UsedSlots);
+ }
+
+ void releaseBuffer() {
+ if (BufferSize > 0)
+ AvailableSlots++;
+ assert(AvailableSlots <= static_cast<unsigned>(BufferSize));
+ }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+/// \brief A resource unit identifier.
+///
+/// This is used to identify a specific processor resource unit using a pair
+/// of indices where the 'first' index is a processor resource mask, and the
+/// 'second' index is an index for a "sub-resource" (i.e. unit).
+typedef std::pair<uint64_t, uint64_t> ResourceRef;
+
+// First: a resource mask identifying a buffered resource.
+// Second: max number of buffer entries used in this resource.
+typedef std::pair<uint64_t, unsigned> BufferUsageEntry;
+
+/// A resource manager for processor resource units and groups.
+///
+/// This class owns all the ResourceState objects, and it is responsible for
+/// acting on requests from a Scheduler by updating the internal state of
+/// ResourceState objects.
+/// This class doesn't know about instruction itineraries and functional units.
+/// In future, it can be extended to support itineraries too through the same
+/// public interface.
+class ResourceManager {
+ // The resource manager owns all the ResourceState.
+ using UniqueResourceState = std::unique_ptr<ResourceState>;
+ llvm::SmallDenseMap<uint64_t, UniqueResourceState> Resources;
+
+ // Keeps track of which resources are busy, and how many cycles are left
+ // before those become usable again.
+ llvm::SmallDenseMap<ResourceRef, unsigned> BusyResources;
+
+ // A table to map processor resource IDs to processor resource masks.
+ llvm::SmallVector<uint64_t, 8> ProcResID2Mask;
+
+ // Adds a new resource state in Resources, as well as a new descriptor in
+ // ResourceDescriptor.
+ void addResource(const llvm::MCProcResourceDesc &Desc, uint64_t Mask);
+
+ // Compute processor resource masks for each processor resource declared by
+ // the scheduling model.
+ void computeProcResourceMasks(const llvm::MCSchedModel &SM);
+
+ // Populate resource descriptors.
+ void initialize(const llvm::MCSchedModel &SM) {
+ computeProcResourceMasks(SM);
+ for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I)
+ addResource(*SM.getProcResource(I), ProcResID2Mask[I]);
+ }
+
+ // Returns the actual resource unit that will be used.
+ ResourceRef selectPipe(uint64_t ResourceID);
+
+ void use(ResourceRef RR);
+ void release(ResourceRef RR);
+
+ unsigned getNumUnits(uint64_t ResourceID) const {
+ assert(Resources.find(ResourceID) != Resources.end());
+ return Resources.find(ResourceID)->getSecond()->getNumUnits();
+ }
+
+ // Reserve a specific Resource kind.
+ void reserveBuffer(uint64_t ResourceID) {
+ assert(isBufferAvailable(ResourceID) ==
+ ResourceStateEvent::RS_BUFFER_AVAILABLE);
+ ResourceState &Resource = *Resources[ResourceID];
+ Resource.reserveBuffer();
+ }
+
+ void releaseBuffer(uint64_t ResourceID) {
+ Resources[ResourceID]->releaseBuffer();
+ }
+
+ ResourceStateEvent isBufferAvailable(uint64_t ResourceID) const {
+ const ResourceState &Resource = *Resources.find(ResourceID)->second;
+ return Resource.isBufferAvailable();
+ }
+
+ bool isReady(uint64_t ResourceID, unsigned NumUnits) const {
+ const ResourceState &Resource = *Resources.find(ResourceID)->second;
+ return Resource.isReady(NumUnits);
+ }
+
+public:
+ ResourceManager(const llvm::MCSchedModel &SM) { initialize(SM); }
+
+ ResourceStateEvent canBeDispatched(const InstrDesc &Desc) const {
+ ResourceStateEvent Result = ResourceStateEvent::RS_BUFFER_AVAILABLE;
+ for (uint64_t Buffer : Desc.Buffers) {
+ Result = isBufferAvailable(Buffer);
+ if (Result != ResourceStateEvent::RS_BUFFER_AVAILABLE)
+ break;
+ }
+
+ return Result;
+ }
+
+ void reserveBuffers(const InstrDesc &Desc) {
+ for (const uint64_t R : Desc.Buffers)
+ reserveBuffer(R);
+ }
+
+ void releaseBuffers(const InstrDesc &Desc) {
+ for (const uint64_t R : Desc.Buffers)
+ releaseBuffer(R);
+ }
+
+ void reserveResource(uint64_t ResourceID) {
+ ResourceState &Resource = *Resources[ResourceID];
+ assert(!Resource.isReserved());
+ Resource.setReserved();
+ }
+
+ void releaseResource(uint64_t ResourceID) {
+ ResourceState &Resource = *Resources[ResourceID];
+ Resource.clearReserved();
+ }
+
+ void reserveDispatchHazardResources(const InstrDesc &Desc);
+
+ // Returns true if all resources are in-order, and there is at least one
+ // resource which is a dispatch hazard (BufferSize = 0).
+ bool mustIssueImmediately(const InstrDesc &Desc);
+
+ bool canBeIssued(const InstrDesc &Desc) const;
+ double getRThroughput(const InstrDesc &Desc) const;
+
+ void issueInstruction(
+ unsigned Index, const InstrDesc &Desc,
+ llvm::SmallVectorImpl<std::pair<ResourceRef, unsigned>> &Pipes);
+
+ void cycleEvent(llvm::SmallVectorImpl<ResourceRef> &ResourcesFreed);
+
+ void getBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+ for (const std::pair<uint64_t, UniqueResourceState> &Resource : Resources) {
+ const ResourceState &RS = *Resource.second;
+ if (RS.isBuffered())
+ Usage.emplace_back(std::pair<uint64_t, unsigned>(RS.getResourceMask(),
+ RS.getMaxUsedSlots()));
+ }
+ }
+
+ const llvm::ArrayRef<uint64_t> getProcResMasks() const {
+ return ProcResID2Mask;
+ }
+
+#ifndef NDEBUG
+ void dump() const {
+ for (const std::pair<uint64_t, UniqueResourceState> &Resource : Resources)
+ Resource.second->dump();
+ }
+#endif
+}; // class ResourceManager
+
+/// Class Scheduler is responsible for issuing instructions to pipeline
+/// resources. Internally, it delegates to a ResourceManager the management of
+/// processor resources.
+/// This class is also responsible for tracking the progress of instructions
+/// from the dispatch stage, until the write-back stage.
+///
+/// An instruction dispatched to the Scheduler is initially placed into either
+/// the 'WaitQueue' or the 'ReadyQueue' depending on the availability of the
+/// input operands. Instructions in the WaitQueue are ordered by instruction
+/// index. An instruction is moved from the WaitQueue to the ReadyQueue when
+/// register operands become available, and all memory dependencies are met.
+/// Instructions that are moved from the WaitQueue to the ReadyQueue transition
+/// from state 'IS_AVAILABLE' to state 'IS_READY'.
+///
+/// At the beginning of each cycle, the Scheduler checks if there are
+/// instructions in the WaitQueue that can be moved to the ReadyQueue. If the
+/// ReadyQueue is not empty, then older instructions from the queue are issued
+/// to the processor pipelines, and the underlying ResourceManager is updated
+/// accordingly. The ReadyQueue is ordered by instruction index to guarantee
+/// that the first instructions in the set are also the oldest.
+///
+/// An Instruction is moved from the ReadyQueue to the `IssuedQueue` when it is
+/// issued to a (one or more) pipeline(s). This event also causes an instruction
+/// state transition (i.e. from state IS_READY, to state IS_EXECUTING).
+/// An Instruction leaves the IssuedQueue when it reaches the write-back stage.
+class Scheduler {
+ const llvm::MCSchedModel &SM;
+
+ // Hardware resources that are managed by this scheduler.
+ std::unique_ptr<ResourceManager> Resources;
+ std::unique_ptr<LSUnit> LSU;
+
+ // The Backend gets notified when instructions are ready/issued/executed.
+ Backend *Owner;
+
+ using QueueEntryTy = std::pair<unsigned, Instruction *>;
+ std::map<unsigned, Instruction *> WaitQueue;
+ std::map<unsigned, Instruction *> ReadyQueue;
+ std::map<unsigned, Instruction *> IssuedQueue;
+
+ void notifyInstructionIssued(
+ unsigned Index,
+ const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used);
+ void notifyInstructionExecuted(unsigned Index);
+ void notifyInstructionReady(unsigned Index);
+ void notifyResourceAvailable(const ResourceRef &RR);
+
+ /// Issue instructions from the ready queue by giving priority to older
+ /// instructions.
+ void issue();
+
+ /// Issue an instruction without updating the ready queue.
+ void issueInstruction(Instruction *IS, unsigned InstrIndex);
+ void updatePendingQueue();
+ void updateIssuedQueue();
+
+public:
+ Scheduler(Backend *B, const llvm::MCSchedModel &Model, unsigned LoadQueueSize,
+ unsigned StoreQueueSize, bool AssumeNoAlias)
+ : SM(Model), Resources(llvm::make_unique<ResourceManager>(SM)),
+ LSU(llvm::make_unique<LSUnit>(LoadQueueSize, StoreQueueSize,
+ AssumeNoAlias)),
+ Owner(B) {}
+
+ /// Scheduling events.
+ ///
+ /// The DispatchUnit is responsible for querying the Scheduler before
+ /// dispatching new instructions. Queries are performed through method
+/// `Scheduler::canBeDispatched`, which returns an instance of this enum to
+ /// tell if the dispatch would fail or not. If scheduling resources are
+ /// available, and the instruction can be dispatched, then the query returns
+/// HWS_AVAILABLE. A value different from HWS_AVAILABLE means that the
+ /// instruction cannot be dispatched during this cycle.
+ ///
+ /// Each event name starts with prefix "HWS_", and it is followed by
+ /// a substring which describes the reason why the Scheduler was unavailable
+ /// (or "AVAILABLE" if the instruction is allowed to be dispatched).
+ ///
+ /// HWS_QUEUE_UNAVAILABLE is returned if there are not enough available slots
+ /// in the scheduler's queue. That means, one (or more) buffered resources
+ /// consumed by the instruction were full.
+ ///
+ /// HWS_LD_QUEUE_UNAVAILABLE is returned when an instruction 'mayLoad', and
+ /// the load queue in the load/store unit (implemented by class LSUnit) is
+ /// full. Similarly, HWS_ST_QUEUE_UNAVAILABLE is returned when the store
+ /// queue is full, and the instruction to be dispatched 'mayStore'.
+ ///
+ /// HWS_DISPATCH_GROUP_RESTRICTION is only returned in special cases where the
+ /// instruction consumes an in-order issue/dispatch resource (i.e. a resource
+ /// with `BufferSize=0`), and the pipeline resource is not immediately
+ /// available.
+ enum Event {
+ HWS_AVAILABLE,
+ HWS_QUEUE_UNAVAILABLE,
+ HWS_DISPATCH_GROUP_RESTRICTION,
+ HWS_LD_QUEUE_UNAVAILABLE,
+ HWS_ST_QUEUE_UNAVAILABLE
+ };
+
+ Event canBeDispatched(const InstrDesc &Desc) const;
+ Instruction *scheduleInstruction(unsigned Idx, Instruction *MCIS);
+
+ double getRThroughput(const InstrDesc &Desc) const {
+ return Resources->getRThroughput(Desc);
+ }
+
+ void cycleEvent(unsigned Cycle);
+
+ void getBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+ Resources->getBuffersUsage(Usage);
+ }
+
+ const llvm::ArrayRef<uint64_t> getProcResourceMasks() const {
+ return Resources->getProcResMasks();
+ }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+} // Namespace mca
+
+#endif
--- /dev/null
+//===--------------------- SourceMgr.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements class SourceMgr. Class SourceMgr abstracts the input
+/// code sequence (a sequence of MCInst), and assigns unique identifiers to
+/// every instruction in the sequence.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+#define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+
+#include "llvm/MC/MCInst.h"
+#include <vector>
+
+namespace mca {
+
+typedef std::pair<unsigned, const llvm::MCInst *> InstRef;
+
+class SourceMgr {
+  using InstVec = std::vector<std::unique_ptr<const llvm::MCInst>>;
+  InstVec Sequence;
+  // Index of the next instruction, counted across all the iterations.
+  // It identifies instruction (Current % Sequence.size()) of iteration
+  // (Current / Sequence.size()).
+  unsigned Current;
+  // Number of times the input sequence is (logically) replayed.
+  unsigned Iterations;
+  static const unsigned DefaultIterations = 70;
+
+public:
+  // A NumIterations of zero selects DefaultIterations.
+  SourceMgr(unsigned NumIterations)
+      : Current(0),
+        Iterations(NumIterations ? NumIterations : DefaultIterations) {}
+
+  // NOTE(review): the accessors below divide (or take a modulo) by
+  // Sequence.size(); callers are presumably expected to check isEmpty()
+  // first -- confirm at the call sites.
+  unsigned getCurrentIteration() const { return Current / Sequence.size(); }
+  unsigned getNumIterations() const { return Iterations; }
+  unsigned size() const { return Sequence.size(); }
+  const InstVec &getSequence() const { return Sequence; }
+  InstVec &getSequence() { return Sequence; }
+
+  // True if there are still instructions to consume from the (replayed)
+  // sequence. Does not advance Current; use updateNext() for that.
+  bool hasNext() { return Current < (Iterations * size()); }
+  void updateNext() { Current++; }
+
+  // Returns the unique identifier and the MCInst of the next instruction,
+  // without advancing the sequence.
+  const InstRef peekNext() const {
+    unsigned Index = getCurrentInstructionIndex();
+    return InstRef(Current, Sequence[Index].get());
+  }
+
+  unsigned getCurrentInstructionIndex() const {
+    return Current % Sequence.size();
+  }
+
+  const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
+    return *Sequence[Index % size()];
+  }
+
+  bool isEmpty() const { return size() == 0; }
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===--------------------- TimelineView.cpp ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \brief
+///
+/// This file implements the TimelineView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "TimelineView.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void TimelineView::initialize(unsigned MaxIterations) {
+ unsigned NumInstructions =
+ AsmSequence.getNumIterations() * AsmSequence.size();
+ if (!MaxIterations)
+ MaxIterations = DEFAULT_ITERATIONS;
+ unsigned NumEntries =
+ std::min(NumInstructions, MaxIterations * AsmSequence.size());
+ Timeline.resize(NumEntries);
+ TimelineViewEntry NullTVEntry = {0, 0, 0, 0, 0};
+ std::fill(Timeline.begin(), Timeline.end(), NullTVEntry);
+
+ WaitTime.resize(AsmSequence.size());
+ WaitTimeEntry NullWTEntry = {0, 0, 0, 0};
+ std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry);
+}
+
+void TimelineView::onInstructionDispatched(unsigned Index) {
+ if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+ return;
+ Timeline[Index].CycleDispatched = CurrentCycle;
+ LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionReady(unsigned Index) {
+ if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+ return;
+ Timeline[Index].CycleReady = CurrentCycle;
+ LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionIssued(
+ unsigned Index,
+ const ArrayRef<std::pair<ResourceRef, unsigned>> & /* Unused */) {
+ if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+ return;
+ Timeline[Index].CycleIssued = CurrentCycle;
+ LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionExecuted(unsigned Index) {
+ if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+ return;
+ Timeline[Index].CycleExecuted = CurrentCycle;
+ LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionRetired(unsigned Index) {
+  // Ignore events which fall outside of the tracked cycle window, or whose
+  // instruction was never given a timeline entry.
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  TimelineViewEntry &TVEntry = Timeline[Index];
+  TVEntry.CycleRetired = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+
+  // Update the WaitTime entry which corresponds to this Index.
+  // WaitTime is keyed by the position in the source sequence, so timings
+  // from different iterations of the same instruction accumulate here and
+  // are later averaged over WTEntry.Executions when printed.
+  WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()];
+  WTEntry.Executions++;
+  WTEntry.CyclesSpentInSchedulerQueue +=
+      TVEntry.CycleIssued - TVEntry.CycleDispatched;
+  assert(TVEntry.CycleDispatched <= TVEntry.CycleReady);
+  WTEntry.CyclesSpentInSQWhileReady += TVEntry.CycleIssued - TVEntry.CycleReady;
+  // The '-1' excludes the retire cycle itself from the post-writeback wait
+  // (assumes CycleExecuted <= CycleRetired - 1; not asserted here).
+  WTEntry.CyclesSpentAfterWBAndBeforeRetire +=
+      (TVEntry.CycleRetired - 1) - TVEntry.CycleExecuted;
+}
+
+void TimelineView::printWaitTimeEntry(raw_string_ostream &OS,
+ const WaitTimeEntry &Entry,
+ unsigned SourceIndex) const {
+ OS << SourceIndex << '.';
+ if (SourceIndex < 10)
+ OS << " ";
+ else if (SourceIndex < 100)
+ OS << " ";
+ else if (SourceIndex < 1000)
+ OS << " ";
+ else
+ OS << ' ';
+
+ if (Entry.Executions == 0) {
+ OS << " - - - - ";
+ } else {
+ double AverageTime1, AverageTime2, AverageTime3;
+ unsigned Executions = Entry.Executions;
+ AverageTime1 = (double)Entry.CyclesSpentInSchedulerQueue / Executions;
+ AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / Executions;
+ AverageTime3 = (double)Entry.CyclesSpentAfterWBAndBeforeRetire / Executions;
+ if (Executions < 10)
+ OS << ' ' << Executions << " ";
+ else if (Executions < 100)
+ OS << ' ' << Executions << " ";
+ else
+ OS << Executions << " ";
+
+ OS << format("%.1f", AverageTime1);
+ if (AverageTime1 < 10.0)
+ OS << " ";
+ else if (AverageTime1 < 100.0)
+ OS << " ";
+ else
+ OS << " ";
+
+ OS << format("%.1f", AverageTime2);
+ if (AverageTime2 < 10.0)
+ OS << " ";
+ else if (AverageTime2 < 100.0)
+ OS << " ";
+ else
+ OS << " ";
+
+ OS << format("%.1f", AverageTime3);
+ if (AverageTime3 < 10.0)
+ OS << " ";
+ else if (AverageTime3 < 100.0)
+ OS << ' ';
+ }
+}
+
+void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
+ if (WaitTime.empty())
+ return;
+
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+
+ TempStream
+ << "\n\nAverage Wait times (based on the timeline view):\n"
+ << "[0]: Executions\n"
+ << "[1]: Average time spent waiting in a scheduler's queue\n"
+ << "[2]: Average time spent waiting in a scheduler's queue while ready\n"
+ << "[3]: Average time elapsed from WB until retire stage\n\n";
+ TempStream << " [0] [1] [2] [3]\n";
+
+ for (unsigned I = 0, E = WaitTime.size(); I < E; ++I) {
+ printWaitTimeEntry(TempStream, WaitTime[I], I);
+ // Append the instruction info at the end of the line.
+ const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
+ MCIP.printInst(&Inst, TempStream, "", STI);
+ TempStream << '\n';
+ TempStream.flush();
+ OS << Buffer;
+ Buffer = "";
+ }
+}
+
+void TimelineView::printTimelineViewEntry(raw_string_ostream &OS,
+ const TimelineViewEntry &Entry,
+ unsigned Iteration,
+ unsigned SourceIndex) const {
+ if (SourceIndex == 0)
+ OS << '\n';
+ OS << '[' << Iteration << ',' << SourceIndex << "]\t";
+ for (unsigned I = 0, E = Entry.CycleDispatched; I < E; ++I)
+ OS << ((I % 5 == 0) ? '.' : ' ');
+ OS << 'D';
+ if (Entry.CycleDispatched != Entry.CycleExecuted) {
+ // Zero latency instructions have the same value for CycleDispatched,
+ // CycleIssued and CycleExecuted.
+ for (unsigned I = Entry.CycleDispatched + 1, E = Entry.CycleIssued; I < E; ++I)
+ OS << '=';
+ if (Entry.CycleIssued == Entry.CycleExecuted)
+ OS << 'E';
+ else {
+ if (Entry.CycleDispatched != Entry.CycleIssued)
+ OS << 'e';
+ for (unsigned I = Entry.CycleIssued + 1, E = Entry.CycleExecuted; I < E; ++I)
+ OS << 'e';
+ OS << 'E';
+ }
+ }
+
+ for (unsigned I = Entry.CycleExecuted + 1, E = Entry.CycleRetired; I < E; ++I)
+ OS << '-';
+ OS << 'R';
+
+ // Skip other columns.
+ for (unsigned I = Entry.CycleRetired + 1, E = LastCycle; I <= E; ++I)
+ OS << ((I % 5 == 0 || I == LastCycle) ? '.' : ' ');
+}
+
+static void printTimelineHeader(raw_string_ostream &OS, unsigned Cycles) {
+ OS << "\n\nTimeline view:\n";
+ OS << " \t";
+ for (unsigned I = 0; I <= Cycles; ++I) {
+ if (((I / 10) & 1) == 0)
+ OS << ' ';
+ else
+ OS << I % 10;
+ }
+
+ OS << "\nIndex\t";
+ for (unsigned I = 0; I <= Cycles; ++I) {
+ if (((I / 10) & 1) == 0)
+ OS << I % 10;
+ else
+ OS << ' ';
+ }
+ OS << '\n';
+}
+
+void TimelineView::printTimeline(raw_ostream &OS) const {
+ std::string Buffer;
+ raw_string_ostream TempStream(Buffer);
+
+ printTimelineHeader(TempStream, LastCycle);
+ TempStream.flush();
+ OS << Buffer;
+
+ for (unsigned I = 0, E = Timeline.size(); I < E; ++I) {
+ Buffer = "";
+ const TimelineViewEntry &Entry = Timeline[I];
+ if (Entry.CycleRetired == 0)
+ return;
+
+ unsigned Iteration = I / AsmSequence.size();
+ unsigned SourceIndex = I % AsmSequence.size();
+ printTimelineViewEntry(TempStream, Entry, Iteration, SourceIndex);
+ // Append the instruction info at the end of the line.
+ const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
+ MCIP.printInst(&Inst, TempStream, "", STI);
+ TempStream << '\n';
+ TempStream.flush();
+ OS << Buffer;
+ }
+}
+
+} // namespace mca
--- /dev/null
+//===--------------------- TimelineView.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \brief
+///
+/// This file implements a timeline view for the llvm-mca tool.
+///
+/// Class TimelineView observes events generated by the backend. For every
+/// instruction executed by the backend, it stores information related to
+/// state transition. It then plots that information in the form of a table
+/// as reported by the example below:
+///
+/// Timeline view:
+/// 0123456
+/// Index 0123456789
+///
+/// [0,0] DeER . . .. vmovshdup %xmm0, %xmm1
+/// [0,1] DeER . . .. vpermilpd $1, %xmm0, %xmm2
+/// [0,2] .DeER. . .. vpermilps $231, %xmm0, %xmm5
+/// [0,3] .DeeeER . .. vaddss %xmm1, %xmm0, %xmm3
+/// [0,4] . D==eeeER. .. vaddss %xmm3, %xmm2, %xmm4
+/// [0,5] . D=====eeeER .. vaddss %xmm4, %xmm5, %xmm6
+///
+/// [1,0] . DeE------R .. vmovshdup %xmm0, %xmm1
+/// [1,1] . DeE------R .. vpermilpd $1, %xmm0, %xmm2
+/// [1,2] . DeE-----R .. vpermilps $231, %xmm0, %xmm5
+/// [1,3] . D=eeeE--R .. vaddss %xmm1, %xmm0, %xmm3
+/// [1,4] . D===eeeER .. vaddss %xmm3, %xmm2, %xmm4
+/// [1,5] . D======eeeER vaddss %xmm4, %xmm5, %xmm6
+///
+/// There is an entry for every instruction in the input assembly sequence.
+/// The first field is a pair of numbers obtained from the instruction index.
+/// The first element of the pair is the iteration index, while the second
+/// element of the pair is a sequence number (i.e. a position in the assembly
+/// sequence).
+/// The second field of the table is the actual timeline information; each
+/// column is the information related to a specific cycle of execution.
+/// The timeline of an instruction is described by a sequence of character
+/// where each character represents the instruction state at a specific cycle.
+///
+/// Possible instruction states are:
+/// D: Instruction Dispatched
+/// e: Instruction Executing
+/// E: Instruction Executed (write-back stage)
+/// R: Instruction retired
+/// =: Instruction waiting in the Scheduler's queue
+/// -: Instruction executed, waiting to retire in order.
+///
+/// dots ('.') and empty spaces are cycles where the instruction is not
+/// in-flight.
+///
+/// The last column is the assembly instruction associated to the entry.
+///
+/// Based on the timeline view information from the example, instruction 0
+/// at iteration 0 was dispatched at cycle 0, and was retired at cycle 3.
+/// Instruction [0,1] was also dispatched at cycle 0, and it retired at
+/// the same cycle as instruction [0,0].
+/// Instruction [0,4] has been dispatched at cycle 2. However, it had to
+/// wait for two cycles before being issued. That is because operands
+/// became ready only at cycle 5.
+///
+/// This view helps further understanding bottlenecks and the impact of
+/// resource pressure on the code.
+///
+/// To better understand why instructions had to wait for multiple cycles in
+/// the scheduler's queue, class TimelineView also reports extra timing info
+/// in another table named "Average Wait times" (see example below).
+///
+///
+/// Average Wait times (based on the timeline view):
+/// [0]: Executions
+/// [1]: Average time spent waiting in a scheduler's queue
+/// [2]: Average time spent waiting in a scheduler's queue while ready
+/// [3]: Average time elapsed from WB until retire stage
+///
+/// [0] [1] [2] [3]
+/// 0. 2 1.0 1.0 3.0 vmovshdup %xmm0, %xmm1
+/// 1. 2 1.0 1.0 3.0 vpermilpd $1, %xmm0, %xmm2
+/// 2. 2 1.0 1.0 2.5 vpermilps $231, %xmm0, %xmm5
+/// 3. 2 1.5 0.5 1.0 vaddss %xmm1, %xmm0, %xmm3
+/// 4. 2 3.5 0.0 0.0 vaddss %xmm3, %xmm2, %xmm4
+/// 5. 2 6.5 0.0 0.0 vaddss %xmm4, %xmm5, %xmm6
+///
+/// By comparing column [2] with column [1], we get an idea about how many
+/// cycles were spent in the scheduler's queue due to data dependencies.
+///
+/// In this example, instruction 5 spent an average of ~6 cycles in the
+/// scheduler's queue. As soon as operands became ready, the instruction
+/// was immediately issued to the pipeline(s).
+/// That is expected because instruction 5 cannot transition to the "ready"
+/// state until %xmm4 is written by instruction 4.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+
+#include "HWEventListener.h"
+#include "SourceMgr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace mca {
+
+/// \brief This class listens to instruction state transition events
+/// in order to construct timeline information.
+///
+/// For every instruction executed by the Backend, this class constructs
+/// a TimelineViewEntry object. TimelineViewEntry objects are then used
+/// to print the timeline information, as well as the "average wait times"
+/// for every instruction in the input assembly sequence.
+class TimelineView : public HWEventListener {
+ const llvm::MCSubtargetInfo &STI;
+ llvm::MCInstPrinter &MCIP;
+ const SourceMgr &AsmSequence;
+
+ unsigned CurrentCycle;
+ unsigned MaxCycle;
+ unsigned LastCycle;
+
+ struct TimelineViewEntry {
+ unsigned CycleDispatched;
+ unsigned CycleReady;
+ unsigned CycleIssued;
+ unsigned CycleExecuted;
+ unsigned CycleRetired;
+ };
+ std::vector<TimelineViewEntry> Timeline;
+
+ struct WaitTimeEntry {
+ unsigned Executions;
+ unsigned CyclesSpentInSchedulerQueue;
+ unsigned CyclesSpentInSQWhileReady;
+ unsigned CyclesSpentAfterWBAndBeforeRetire;
+ };
+ std::vector<WaitTimeEntry> WaitTime;
+
+ void printTimelineViewEntry(llvm::raw_string_ostream &OS,
+ const TimelineViewEntry &E, unsigned Iteration,
+ unsigned SourceIndex) const;
+ void printWaitTimeEntry(llvm::raw_string_ostream &OS, const WaitTimeEntry &E,
+ unsigned Index) const;
+
+ const unsigned DEFAULT_ITERATIONS = 10;
+
+public:
+ TimelineView(const llvm::MCSubtargetInfo &sti, llvm::MCInstPrinter &Printer,
+ const SourceMgr &Sequence, unsigned MaxIterations,
+ unsigned Cycles)
+ : STI(sti), MCIP(Printer), AsmSequence(Sequence), CurrentCycle(0),
+ MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0) {
+ initialize(MaxIterations);
+ }
+
+ void initialize(unsigned MaxIterations);
+
+ // Event handlers.
+ void onInstructionDispatched(unsigned Index) override;
+ void onInstructionReady(unsigned Index) override;
+ void onInstructionIssued(
+ unsigned Index,
+ const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used) override;
+ void onInstructionExecuted(unsigned Index) override;
+ void onInstructionRetired(unsigned Index) override;
+ void onCycleBegin(unsigned Cycle) override { CurrentCycle = Cycle; }
+
+ // print functionalities.
+ void printTimeline(llvm::raw_ostream &OS) const;
+ void printAverageWaitTimes(llvm::raw_ostream &OS) const;
+};
+
+} // namespace mca
+
+#endif
--- /dev/null
+//===-- llvm-mca.cpp - Machine Code Analyzer -------------------*- C++ -* -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This utility is a simple driver that allows static performance analysis on
+// machine code similarly to how IACA (Intel Architecture Code Analyzer) works.
+//
+// llvm-mca [options] <file-name>
+// -march <type>
+// -mcpu <cpu>
+// -o <file>
+//
+// The target defaults to the host target.
+// The cpu defaults to 'generic'.
+// The output defaults to standard output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BackendPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+using namespace llvm;
+
+// Command-line flags for the llvm-mca driver. All of these are file-scope
+// cl::opt globals populated by cl::ParseCommandLineOptions() in main().
+//
+// Convention: for the unsigned knobs below, cl::init(0) means "not
+// specified"; main() (or the consuming component) substitutes a default in
+// that case (e.g. DispatchWidth falls back to the subtarget's IssueWidth).
+
+// Positional input file; "-" (the default) reads from standard input.
+static cl::opt<std::string>
+    InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+// Report destination; "-" (the default) writes to standard output.
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
+                                           cl::init("-"),
+                                           cl::value_desc("filename"));
+
+static cl::opt<std::string>
+    ArchName("march", cl::desc("Target arch to assemble for, "
+                               "see -version for available targets"));
+
+static cl::opt<std::string>
+    TripleName("mtriple", cl::desc("Target triple to assemble for, "
+                                   "see -version for available targets"));
+
+static cl::opt<std::string>
+    MCPU("mcpu",
+         cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+         cl::value_desc("cpu-name"), cl::init("generic"));
+
+static cl::opt<unsigned>
+    OutputAsmVariant("output-asm-variant",
+                     cl::desc("Syntax variant to use for output printing"));
+
+// Number of times the input sequence is (notionally) repeated; forwarded to
+// mca::SourceMgr. 0 presumably selects a default iteration count inside the
+// simulator -- see mca::SourceMgr for the actual handling.
+static cl::opt<unsigned> Iterations("iterations",
+                                    cl::desc("Number of iterations to run"),
+                                    cl::init(0));
+
+// 0 means "use the scheduling model's IssueWidth" (see main()).
+static cl::opt<unsigned> DispatchWidth(
+    "dispatch",
+    cl::desc("Dispatch Width. By default it is set equal to IssueWidth"),
+    cl::init(0));
+
+static cl::opt<unsigned> MaxRetirePerCycle(
+    "max-retire-per-cycle",
+    cl::desc("Maximum number of instructions that can be retired in one cycle"),
+    cl::init(0));
+
+static cl::opt<unsigned>
+    RegisterFileSize("register-file-size",
+                     cl::desc("Maximum number of temporary registers which can "
+                              "be used for register mappings"),
+                     cl::init(0));
+
+static cl::opt<bool> PrintTimelineView("timeline",
+                                       cl::desc("Print the timeline view"),
+                                       cl::init(false));
+
+static cl::opt<unsigned> TimelineMaxIterations(
+    "timeline-max-iterations",
+    cl::desc("Maximum number of iterations to print in timeline view"),
+    cl::init(0));
+
+// Note: TimelineView itself also maps 0 to 80 cycles, so the cl::init(80)
+// here and the view's internal fallback agree.
+static cl::opt<unsigned> TimelineMaxCycles(
+    "timeline-max-cycles",
+    cl::desc(
+        "Maximum number of cycles in the timeline view. Defaults to 80 cycles"),
+    cl::init(80));
+
+static cl::opt<bool> PrintModeVerbose("verbose",
+                                      cl::desc("Enable verbose output"),
+                                      cl::init(false));
+
+// NOTE: defaults to true -- loads/stores are assumed not to alias unless the
+// user passes -noalias=false.
+static cl::opt<bool>
+    AssumeNoAlias("noalias",
+                  cl::desc("If set, it assumes that loads and stores do not alias"),
+                  cl::init(true));
+
+// Load/store queue sizes; 0 leaves the choice to the simulated backend.
+static cl::opt<unsigned>
+    LoadQueueSize("lqueue", cl::desc("Size of the load queue"), cl::init(0));
+static cl::opt<unsigned>
+    StoreQueueSize("squeue", cl::desc("Size of the store queue"), cl::init(0));
+
+// Returns the target selected by -mtriple/-march, falling back to the host's
+// default triple when -mtriple was not given. Prints an error and returns
+// nullptr when no registered target matches.
+//
+// Side effect: rewrites the global TripleName with the normalized triple, so
+// later consumers (main()) see a canonical value.
+static const Target *getTarget(const char *ProgName) {
+  // Test the *raw* flag value for emptiness before normalizing. The previous
+  // code normalized first, which risks turning an empty -mtriple into a
+  // non-empty string and silently skipping the host-triple fallback.
+  if (TripleName.empty())
+    TripleName = Triple::normalize(sys::getDefaultTargetTriple());
+  else
+    TripleName = Triple::normalize(TripleName);
+  Triple TheTriple(TripleName);
+
+  // Get the target specific parser.
+  std::string Error;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
+  if (!TheTarget) {
+    errs() << ProgName << ": " << Error;
+    return nullptr;
+  }
+
+  // Return the found target.
+  return TheTarget;
+}
+
+// Drives the target assembly parser over the buffers registered in SrcMgr,
+// emitting parsed instructions into the streamer Str. Returns 0 on success
+// and non-zero on failure (mirroring MCAsmParser::Run).
+static int AssembleInput(const char *ProgName, const Target *TheTarget,
+                         SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str,
+                         MCAsmInfo &MAI, MCSubtargetInfo &STI,
+                         MCInstrInfo &MCII, MCTargetOptions &MCOptions) {
+  std::unique_ptr<MCAsmParser> AsmParser(
+      createMCAsmParser(SrcMgr, Ctx, Str, MAI));
+  std::unique_ptr<MCTargetAsmParser> TargetParser(
+      TheTarget->createMCAsmParser(STI, *AsmParser, MCII, MCOptions));
+
+  // Not every registered target ships an assembly parser; bail out early.
+  if (!TargetParser) {
+    errs() << ProgName
+           << ": error: this target does not support assembly parsing.\n";
+    return 1;
+  }
+
+  AsmParser->setTargetParser(*TargetParser);
+  return AsmParser->Run(false);
+}
+
+namespace {
+
+// A streamer that records every emitted MCInst into an externally-owned
+// vector instead of producing object code or textual assembly. All other
+// streamer callbacks are deliberate no-ops: the analysis only needs the
+// instruction sequence.
+class MCStreamerWrapper final : public MCStreamer {
+  using InstVec = std::vector<std::unique_ptr<const MCInst>>;
+  InstVec &Insts; // Not owned; populated by EmitInstruction().
+
+public:
+  MCStreamerWrapper(MCContext &Context, InstVec &Vec)
+      : MCStreamer(Context), Insts(Vec) {}
+
+  // We only want to intercept the emission of new instructions.
+  // (Dropped the redundant 'virtual' next to 'override', and replaced the
+  // naked 'new' with make_unique so no allocation can leak if the vector's
+  // growth throws.)
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                       bool /* unused */) override {
+    Insts.push_back(llvm::make_unique<const MCInst>(Inst));
+  }
+
+  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
+    return true;
+  }
+
+  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                        unsigned ByteAlignment) override {}
+  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+                    uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
+  void EmitGPRel32Value(const MCExpr *Value) override {}
+  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+  void EmitCOFFSymbolType(int Type) override {}
+  void EndCOFFSymbolDef() override {}
+
+  const InstVec &GetInstructionSequence() const { return Insts; }
+};
+
+} // end of anonymous namespace
+
+int main(int argc, char **argv) {
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  PrettyStackTraceProgram X(argc, argv);
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  // Initialize targets and assembly parsers.
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+
+  // Enable printing of available targets when flag --version is specified.
+  cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
+
+  // Parse flags and initialize target options.
+  cl::ParseCommandLineOptions(argc, argv,
+                              "llvm machine code performance analyzer.\n");
+  MCTargetOptions MCOptions;
+  MCOptions.PreserveAsmComments = false;
+
+  // Get the target from the triple. If a triple is not specified, then select
+  // the default triple for the host. If the triple doesn't correspond to any
+  // registered target, then exit with an error message.
+  const char *ProgName = argv[0];
+  const Target *TheTarget = getTarget(ProgName);
+  if (!TheTarget)
+    return 1;
+
+  // getTarget() may have replaced TripleName with a default triple.
+  // For safety, reconstruct the Triple object.
+  Triple TheTriple(TripleName);
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
+      MemoryBuffer::getFileOrSTDIN(InputFilename);
+  if (std::error_code EC = BufferPtr.getError()) {
+    errs() << InputFilename << ": " << EC.message() << '\n';
+    return 1;
+  }
+
+  SourceMgr SrcMgr;
+
+  // Tell SrcMgr about this buffer, which is what the parser will pick up.
+  SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc());
+
+  std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+  assert(MRI && "Unable to create target register info!");
+
+  std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+  assert(MAI && "Unable to create target asm info!");
+
+  MCObjectFileInfo MOFI;
+  MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr);
+  MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx);
+
+  // The streamer deposits parsed instructions straight into the mca source
+  // manager's sequence. (An unused 'buffer_ostream BOS' local was removed.)
+  std::unique_ptr<mca::SourceMgr> S =
+      llvm::make_unique<mca::SourceMgr>(Iterations);
+  MCStreamerWrapper Str(Ctx, S->getSequence());
+
+  std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
+  std::unique_ptr<MCSubtargetInfo> STI(
+      TheTarget->createMCSubtargetInfo(TripleName, MCPU, /* FeaturesStr */ ""));
+  if (!STI->isCPUStringValid(MCPU)) {
+    // Previously this exited silently; tell the user why the tool stopped.
+    errs() << "error: unrecognized cpu '" << MCPU << "' for target triple '"
+           << TheTriple.normalize() << "'.\n";
+    return 1;
+  }
+
+  // The simulated pipeline only models out-of-order cores.
+  if (!STI->getSchedModel().isOutOfOrder()) {
+    errs() << "error: please specify an out-of-order cpu. '" << MCPU
+           << "' is an in-order cpu.\n";
+    return 1;
+  }
+
+  if (!STI->getSchedModel().hasInstrSchedModel()) {
+    errs()
+        << "error: unable to find instruction-level scheduling information for"
+        << " target triple '" << TheTriple.normalize() << "' and cpu '" << MCPU
+        << "'.\n";
+
+    if (STI->getSchedModel().InstrItineraries)
+      errs() << "note: cpu '" << MCPU << "' provides itineraries. However, "
+             << "instruction itineraries are currently unsupported.\n";
+    return 1;
+  }
+
+  std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
+      Triple(TripleName), OutputAsmVariant, *MAI, *MCII, *MRI));
+  if (!IP) {
+    errs() << "error: unable to create instruction printer for target triple '"
+           << TheTriple.normalize() << "' with assembly variant "
+           << OutputAsmVariant << ".\n";
+    return 1;
+  }
+
+  int Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, Str, *MAI, *STI,
+                          *MCII, MCOptions);
+  if (Res)
+    return Res;
+
+  if (S->isEmpty()) {
+    errs() << "error: no assembly instructions found.\n";
+    return 1;
+  }
+
+  // The dispatch width defaults to the subtarget's issue width unless the
+  // -dispatch flag was given (0 means "not specified").
+  const MCSchedModel &SM = STI->getSchedModel();
+  unsigned Width = DispatchWidth ? DispatchWidth : SM.IssueWidth;
+
+  // Build the simulated backend and run the analysis.
+  std::unique_ptr<mca::Backend> B = llvm::make_unique<mca::Backend>(
+      *STI, *MCII, *MRI, std::move(S), Width, RegisterFileSize,
+      MaxRetirePerCycle, LoadQueueSize, StoreQueueSize, AssumeNoAlias);
+
+  std::unique_ptr<mca::BackendPrinter> Printer =
+      llvm::make_unique<mca::BackendPrinter>(*B, OutputFilename, std::move(IP),
+                                             PrintModeVerbose);
+  Printer->addResourcePressureView();
+  if (PrintTimelineView)
+    Printer->addTimelineView(TimelineMaxIterations, TimelineMaxCycles);
+
+  B->run();
+  Printer->printReport();
+  return 0;
+}