aco: end reduce tmp after control flow, when used within control flow
author: Rhys Perry <pendingchaos02@gmail.com>
Tue, 10 Jan 2023 15:29:15 +0000 (15:29 +0000)
committer: Eric Engestrom <eric@engestrom.ch>
Wed, 8 Feb 2023 20:34:42 +0000 (20:34 +0000)
In the case of:

v0 = start_linear_vgpr
if (...) {

} else {
   use_linear_vgpr(v0)
}
v0 = phi

We need a p_end_linear_vgpr after the control flow (placed after the phis
of the following top-level block) to ensure that the phi does not use the
same VGPR as the linear VGPR.

fossil-db (gfx1100):
Totals from 3763 (2.80% of 134574) affected shaders:
MaxWaves: 90296 -> 90164 (-0.15%)
Instrs: 6857726 -> 6856608 (-0.02%); split: -0.03%, +0.01%
CodeSize: 35382188 -> 35377688 (-0.01%); split: -0.02%, +0.01%
VGPRs: 234864 -> 235692 (+0.35%); split: -0.01%, +0.36%
Latency: 47471923 -> 47474965 (+0.01%); split: -0.03%, +0.04%
InvThroughput: 5640320 -> 5639736 (-0.01%); split: -0.04%, +0.03%
VClause: 93098 -> 93107 (+0.01%); split: -0.01%, +0.02%
SClause: 214137 -> 214130 (-0.00%); split: -0.00%, +0.00%
Copies: 369895 -> 369305 (-0.16%); split: -0.31%, +0.15%
Branches: 164996 -> 164504 (-0.30%); split: -0.30%, +0.00%
PreVGPRs: 210655 -> 211438 (+0.37%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20621>
(cherry picked from commit 44fdd2ebcb271011665dd100ba9ef6852cddb22e)

.pick_status.json
src/amd/compiler/aco_reduce_assign.cpp

index ef5350f..ae537c8 100644 (file)
         "description": "aco: end reduce tmp after control flow, when used within control flow",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null
     },
index 3c31b46..535e031 100644 (file)
@@ -63,31 +63,36 @@ setup_reduce_temp(Program* program)
    Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
    int inserted_at = -1;
    int vtmp_inserted_at = -1;
-   bool reduceTmp_in_loop = false;
    bool vtmp_in_loop = false;
 
    for (Block& block : program->blocks) {
 
-      /* insert p_end_linear_vgpr after the outermost loop */
-      if (reduceTmp_in_loop && block.loop_nest_depth == 0) {
-         assert(inserted_at == (int)last_top_level_block_idx);
-
-         aco_ptr<Instruction> end{create_instruction<Instruction>(
-            aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)};
-         end->operands[0] = Operand(reduceTmp);
-         if (vtmp_in_loop)
-            end->operands[1] = Operand(vtmp);
-         /* insert after the phis of the loop exit block */
-         std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
-         while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
-            ++it;
-         block.instructions.insert(it, std::move(end));
-         reduceTmp_in_loop = false;
-      }
-
-      if (block.kind & block_kind_top_level)
+      if (block.kind & block_kind_top_level) {
          last_top_level_block_idx = block.index;
 
+         /* TODO: this could be improved in this case:
+          *    start_linear_vgpr
+          *    if (...) {
+          *       use_linear_vgpr
+          *    }
+          *    end_linear_vgpr
+          * Here, the linear vgpr is used before any phi copies, so this isn't necessary.
+          */
+         if (inserted_at >= 0) {
+            aco_ptr<Instruction> end{create_instruction<Instruction>(
+               aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)};
+            end->operands[0] = Operand(reduceTmp);
+            if (vtmp_inserted_at >= 0)
+               end->operands[1] = Operand(vtmp);
+            /* insert after the phis of the block */
+            std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
+            while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
+               ++it;
+            block.instructions.insert(it, std::move(end));
+            inserted_at = vtmp_inserted_at = -1;
+         }
+      }
+
       if (!hasReductions[block.index])
          continue;
 
@@ -98,8 +103,6 @@ setup_reduce_temp(Program* program)
              instr->opcode != aco_opcode::p_interp_gfx11)
             continue;
 
-         reduceTmp_in_loop |= block.loop_nest_depth > 0;
-
          if ((int)last_top_level_block_idx != inserted_at) {
             reduceTmp = program->allocateTmp(reduceTmp.regClass());
             aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(