From 947bf0bd67b7047f247fc10874ced2db8d4f6527 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= <daniel@schuermann.dev>
Date: Fri, 5 Feb 2021 14:38:08 +0100
Subject: [PATCH] aco: don't decrease the vgpr_limit when encountering bpermute

Instead, we recalculate vgpr_limit on demand, depending on
the number of shared VGPRs needed.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>
---
 src/amd/compiler/aco_instruction_selection.cpp | 12 +++++-------
 src/amd/compiler/aco_instruction_selection.h   |  1 -
 src/amd/compiler/aco_live_var_analysis.cpp     |  4 +++-
 src/amd/compiler/aco_scheduler.cpp             |  1 +
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 89d1b0e..75c632e 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -192,14 +192,8 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
 
       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
    } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
-      /* GFX10 wave64 mode: emulate full-wave bpermute */
-      if (!ctx->has_gfx10_wave64_bpermute) {
-         ctx->has_gfx10_wave64_bpermute = true;
-         /* Shared VGPRs are allocated in groups of 8/16 */
-         ctx->program->config->num_shared_vgprs = ctx->program->chip_class >= GFX10_3 ? 16 : 8;
-         ctx->program->vgpr_limit -= ctx->program->chip_class >= GFX10_3 ? 8 : 4;
-      }
 
+      /* GFX10 wave64 mode: emulate full-wave bpermute */
       Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
       Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
@@ -211,6 +205,10 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
       input_data.setLateKill(true);
       same_half.setLateKill(true);
 
+      /* We need one pair of shared VGPRs.
+       * Note that these have twice the allocation granularity of normal VGPRs. */
+      ctx->program->config->num_shared_vgprs = 2 * ctx->program->vgpr_alloc_granule;
+
       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
    } else {
       /* GFX8-9 or GFX10 wave32: bpermute works normally */
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index d520045..dd478c4 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -60,7 +60,6 @@ struct isel_context {
    uint32_t first_temp_id;
    std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec;
    Stage stage;
-   bool has_gfx10_wave64_bpermute = false;
    struct {
       bool has_branch;
       uint16_t loop_nest_depth = 0;
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 56d88e0..06827c4 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -302,6 +302,7 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
 {
    uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1);
+   vgprs -= program->config->num_shared_vgprs / 2;
    return std::min(vgprs, program->vgpr_limit);
 }
 
@@ -342,7 +343,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
       program->max_reg_demand = new_demand;
    } else {
       program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
-      program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / get_vgpr_alloc(program, new_demand.vgpr));
+      uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
+      program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / vgpr_demand);
       program->max_waves = max_waves_per_simd;
 
       /* adjust max_waves for workgroup and LDS limits */
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index ad85259..f84544d 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -892,6 +892,7 @@ void schedule_program(Program *program, live& live_vars)
    RegisterDemand demand;
    for (Block& block : program->blocks)
       demand.update(block.register_demand);
+   demand.vgpr += program->config->num_shared_vgprs / 2;
 
    sched_ctx ctx;
    ctx.mv.depends_on.resize(program->peekAllocationId());
-- 
2.7.4