v3d: add shader-db stat to count SFU stalls
author Jose Maria Casanova Crespo <jmcasanova@igalia.com>
Tue, 2 Jul 2019 16:31:09 +0000 (18:31 +0200)
committer Jose Maria Casanova Crespo <jmcasanova@igalia.com>
Mon, 22 Jul 2019 01:00:50 +0000 (03:00 +0200)
SFU operations have a latency of 2 cycles, so if their results
are used in the cycle immediately following an SFU instruction,
the GPU stalls for an extra cycle until the result is available.

This adds the number of stalls to the shader-db debug mode, along
with the sum of instructions + stalls, to evaluate optimizations that
schedule instructions so as to avoid generating sfu-stalls.

v2: Rename v3d_qpu_generates_sfu_stalls to v3d_qpu_instr_is_sfu (Eric)

Reviewed-by: Eric Anholt <eric@anholt.net>
src/broadcom/compiler/qpu_schedule.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir.c
src/broadcom/qpu/qpu_instr.c
src/broadcom/qpu/qpu_instr.h

index b8e04f6..370881b 100644 (file)
@@ -440,6 +440,8 @@ struct choose_scoreboard {
         struct dag *dag;
         int tick;
         int last_magic_sfu_write_tick;
+        int last_stallable_sfu_reg;
+        int last_stallable_sfu_tick;
         int last_ldvary_tick;
         int last_uniforms_reset_tick;
         int last_thrsw_tick;
@@ -531,6 +533,33 @@ pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
         return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
 }
 
+static bool
+qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+                        uint32_t waddr) {
+
+        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+           return false;
+
+        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+            inst->raddr_a == waddr)
+              return true;
+
+        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+            !inst->sig.small_imm && (inst->raddr_b == waddr))
+              return true;
+
+        return false;
+}
+
+static bool
+mux_read_stalls(struct choose_scoreboard *scoreboard,
+                const struct v3d_qpu_instr *inst)
+{
+        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
+                qpu_instruction_uses_rf(inst,
+                                        scoreboard->last_stallable_sfu_reg);
+}
+
 static int
 get_instruction_priority(const struct v3d_qpu_instr *inst)
 {
@@ -852,6 +881,16 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
 }
 
 static void
+update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
+                                      const struct v3d_qpu_instr *inst)
+{
+        if (v3d_qpu_instr_is_sfu(inst)) {
+                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
+                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
+        }
+}
+
+static void
 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                              const struct v3d_qpu_instr *inst)
 {
@@ -864,6 +903,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                 if (inst->alu.add.magic_write) {
                         update_scoreboard_for_magic_waddr(scoreboard,
                                                           inst->alu.add.waddr);
+                } else {
+                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
+                                                              inst);
                 }
         }
 
@@ -1298,6 +1340,8 @@ schedule_instructions(struct v3d_compile *c,
                                         fprintf(stderr, "\n");
                                 }
                         }
+                        if (mux_read_stalls(scoreboard, inst))
+                                c->qpu_inst_stalled_count++;
                 }
 
                 /* Update the uniform index for the rewritten location --
@@ -1481,6 +1525,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
         scoreboard.last_magic_sfu_write_tick = -10;
         scoreboard.last_uniforms_reset_tick = -10;
         scoreboard.last_thrsw_tick = -10;
+        scoreboard.last_stallable_sfu_tick = -10;
 
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");
index da32d47..b61119f 100644 (file)
@@ -613,6 +613,7 @@ struct v3d_compile {
         uint64_t *qpu_insts;
         uint32_t qpu_inst_count;
         uint32_t qpu_inst_size;
+        uint32_t qpu_inst_stalled_count;
 
         /* For the FS, the number of varying inputs not counting the
          * point/line varyings payload
index 04129fa..eed3fc1 100644 (file)
@@ -947,7 +947,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
         char *shaderdb;
         int ret = asprintf(&shaderdb,
                            "%s shader: %d inst, %d threads, %d loops, "
-                           "%d uniforms, %d max-temps, %d:%d spills:fills",
+                           "%d uniforms, %d max-temps, %d:%d spills:fills, "
+                           "%d sfu-stalls, %d inst-and-stalls",
                            vir_get_stage_name(c),
                            c->qpu_inst_count,
                            c->threads,
@@ -955,7 +956,9 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                            c->num_uniforms,
                            vir_get_max_temps(c),
                            c->spills,
-                           c->fills);
+                           c->fills,
+                           c->qpu_inst_stalled_count,
+                           c->qpu_inst_count + c->qpu_inst_stalled_count);
         if (ret >= 0) {
                 if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
                         fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
index 66e53a6..09d06b3 100644 (file)
@@ -645,19 +645,10 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
 bool
 v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
 {
-        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
-                switch (inst->alu.add.op) {
-                case V3D_QPU_A_RECIP:
-                case V3D_QPU_A_RSQRT:
-                case V3D_QPU_A_EXP:
-                case V3D_QPU_A_LOG:
-                case V3D_QPU_A_SIN:
-                case V3D_QPU_A_RSQRT2:
-                        return true;
-                default:
-                        break;
-                }
+        if (v3d_qpu_instr_is_sfu(inst))
+                return true;
 
+        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                 if (inst->alu.add.magic_write &&
                     v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) {
                         return true;
@@ -673,6 +664,25 @@ v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
 }
 
 bool
+v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst)
+{
+        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                switch (inst->alu.add.op) {
+                case V3D_QPU_A_RECIP:
+                case V3D_QPU_A_RSQRT:
+                case V3D_QPU_A_EXP:
+                case V3D_QPU_A_LOG:
+                case V3D_QPU_A_SIN:
+                case V3D_QPU_A_RSQRT2:
+                        return true;
+                default:
+                        return false;
+                }
+        }
+        return false;
+}
+
+bool
 v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst)
 {
         return (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
index 968d0f6..ad2d37b 100644 (file)
@@ -447,6 +447,7 @@ bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;