aco: don't decrease the vgpr_limit when encountering bpermute
author Daniel Schürmann <daniel@schuermann.dev>
Fri, 5 Feb 2021 13:38:08 +0000 (14:38 +0100)
committer Marge Bot <eric+marge@anholt.net>
Fri, 12 Feb 2021 19:00:18 +0000 (19:00 +0000)
Instead, we recalculate vgpr_limit on demand, depending on
the number of shared VGPRs needed.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>
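
For illustration only (not part of the change): a minimal standalone sketch of
the on-demand calculation that the aco_live_var_analysis.cpp hunk below performs.
The helper name and all numbers are assumptions for the example, not the
driver's actual values.

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  /* hypothetical helper mirroring get_addr_vgpr_from_waves() after this change */
  static uint16_t addressable_vgprs(uint16_t physical_vgprs, uint16_t waves,
                                    uint16_t vgpr_alloc_granule,
                                    uint16_t num_shared_vgprs, uint16_t vgpr_limit)
  {
     /* per-wave share, rounded down to the allocation granule */
     uint16_t vgprs = (physical_vgprs / waves) & ~(vgpr_alloc_granule - 1);
     /* reserved shared VGPRs cost half their count of the normal VGPR budget */
     vgprs -= num_shared_vgprs / 2;
     return std::min(vgprs, vgpr_limit);
  }

  int main()
  {
     /* illustrative values: 512 physical VGPRs, 4 waves, granule 8,
      * 16 shared VGPRs reserved for the wave64 bpermute, limit 256 */
     std::printf("%u\n", (unsigned)addressable_vgprs(512, 4, 8, 16, 256)); /* 120 */
     return 0;
  }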

src/amd/compiler/aco_instruction_selection.cpp
src/amd/compiler/aco_instruction_selection.h
src/amd/compiler/aco_live_var_analysis.cpp
src/amd/compiler/aco_scheduler.cpp

index 89d1b0e..75c632e 100644 (file)
@@ -192,14 +192,8 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
 
       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
    } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
-      /* GFX10 wave64 mode: emulate full-wave bpermute */
-      if (!ctx->has_gfx10_wave64_bpermute) {
-         ctx->has_gfx10_wave64_bpermute = true;
-         /* Shared VGPRs are allocated in groups of 8/16 */
-         ctx->program->config->num_shared_vgprs = ctx->program->chip_class >= GFX10_3 ? 16 : 8;
-         ctx->program->vgpr_limit -= ctx->program->chip_class >= GFX10_3 ? 8 : 4;
-      }
 
+      /* GFX10 wave64 mode: emulate full-wave bpermute */
       Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
       Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
@@ -211,6 +205,10 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
       input_data.setLateKill(true);
       same_half.setLateKill(true);
 
+      /* We need one pair of shared VGPRs:
+       * Note that these have twice the allocation granularity of normal VGPRs */
+      ctx->program->config->num_shared_vgprs = 2 * ctx->program->vgpr_alloc_granule;
+
       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
    } else {
       /* GFX8-9 or GFX10 wave32: bpermute works normally */
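
Sanity check on the new comment, using the allocation granules implied by the
constants removed above: a granule of 4 (GFX10 wave64) gives 2 * 4 = 8 shared
VGPRs and 8 / 2 = 4 VGPRs off the addressable budget; a granule of 8
(GFX10_3 wave64) gives 2 * 8 = 16 and 16 / 2 = 8, matching the previously
hardcoded values.
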
index d520045..dd478c4 100644 (file)
@@ -60,7 +60,6 @@ struct isel_context {
    uint32_t first_temp_id;
    std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec;
    Stage stage;
-   bool has_gfx10_wave64_bpermute = false;
    struct {
       bool has_branch;
       uint16_t loop_nest_depth = 0;
index 56d88e0..06827c4 100644 (file)
@@ -302,6 +302,7 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
 {
    uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1);
+   vgprs -= program->config->num_shared_vgprs / 2;
    return std::min(vgprs, program->vgpr_limit);
 }
 
@@ -342,7 +343,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
       program->max_reg_demand = new_demand;
    } else {
       program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
-      program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / get_vgpr_alloc(program, new_demand.vgpr));
+      uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
+      program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / vgpr_demand);
       program->max_waves = max_waves_per_simd;
 
       /* adjust max_waves for workgroup and LDS limits */
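
A similarly hedged sketch of how update_vgpr_sgpr_demand() above now folds the
reserved shared VGPRs into the per-wave VGPR cost; get_vgpr_alloc() is assumed
here to simply round the demand up to the allocation granule, and all numbers
are illustrative.

  #include <cstdint>
  #include <cstdio>

  /* assumed behaviour: round a VGPR demand up to the allocation granule */
  static uint16_t vgpr_alloc(uint16_t demand, uint16_t granule)
  {
     return (demand + granule - 1) & ~(granule - 1);
  }

  int main()
  {
     const uint16_t physical_vgprs = 512, granule = 8, num_shared_vgprs = 16;
     const uint16_t peak_demand = 53; /* illustrative peak VGPR demand */
     uint16_t per_wave = vgpr_alloc(peak_demand, granule) + num_shared_vgprs / 2;
     /* 56 + 8 = 64 VGPRs per wave -> 512 / 64 = 8 waves */
     std::printf("%u waves\n", (unsigned)(physical_vgprs / per_wave));
     return 0;
  }
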
index ad85259..f84544d 100644 (file)
@@ -892,6 +892,7 @@ void schedule_program(Program *program, live& live_vars)
    RegisterDemand demand;
    for (Block& block : program->blocks)
       demand.update(block.register_demand);
+   demand.vgpr += program->config->num_shared_vgprs / 2;
 
    sched_ctx ctx;
    ctx.mv.depends_on.resize(program->peekAllocationId());