From 6113ee650a272dc737a200e276de474e083a2fdf Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 26 Oct 2022 21:11:31 +0100
Subject: [PATCH] aco/gfx11: fix FS input loads in quad-divergent control flow
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

On GFX11, FS inputs are loaded with lds_param_load, which must be done in
WQM with the result kept valid for helper lanes, so it cannot simply be
emitted inside quad-divergent control flow or after a divergent discard.
In that situation, instruction selection now emits a p_interp_gfx11
pseudo-instruction, whose lowering saves exec, enters WQM around the
lds_param_load, restores exec and then interpolates. This is not ideal
and it would be great to somehow make it better some day.
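
For reference, a smooth-interpolated input inside such control flow is
lowered to roughly the following sequence (wave32; register choices and
exact assembly syntax are illustrative):

   s_mov_b32 s2, exec_lo              ; save exec
   s_wqm_b32 exec_lo, exec_lo         ; enable whole quads for the load
   lds_param_load v13, attr0.x        ; load into a linear VGPR
   s_mov_b32 exec_lo, s2              ; restore exec
   v_interp_p10_f32 v2, v13, v0, v13
   v_interp_p2_f32 v2, v13, v1, v2

The flat/mov variant instead broadcasts lane 0 of each quad with a
v_mov_b32 using dpp_quad_perm(0, 0, 0, 0).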

fossil-db (gfx1100):
Totals from 5208 (3.86% of 135032) affected shaders:
MaxWaves: 127058 -> 126962 (-0.08%); split: +0.01%, -0.09%
Instrs: 3983440 -> 4072736 (+2.24%); split: -0.00%, +2.24%
CodeSize: 21872468 -> 22230852 (+1.64%); split: -0.00%, +1.64%
VGPRs: 206688 -> 206984 (+0.14%); split: -0.05%, +0.20%
Latency: 37447383 -> 37491197 (+0.12%); split: -0.05%, +0.17%
InvThroughput: 6421955 -> 6422348 (+0.01%); split: -0.03%, +0.03%
VClause: 71579 -> 71545 (-0.05%); split: -0.09%, +0.04%
SClause: 148289 -> 147146 (-0.77%); split: -0.84%, +0.07%
Copies: 259011 -> 258084 (-0.36%); split: -0.61%, +0.25%
Branches: 101366 -> 101314 (-0.05%); split: -0.10%, +0.05%
PreSGPRs: 223482 -> 223460 (-0.01%); split: -0.21%, +0.20%
PreVGPRs: 184448 -> 184744 (+0.16%)

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_builder_h.py              |  2 +-
 src/amd/compiler/aco_instruction_selection.cpp | 58 ++++++++++++++++++++---
 src/amd/compiler/aco_instruction_selection.h   |  1 +
 src/amd/compiler/aco_lower_to_hw_instr.cpp     | 48 +++++++++++++++++++
 src/amd/compiler/aco_opcodes.py                |  5 ++
 src/amd/compiler/aco_optimizer.cpp             |  1 +
 src/amd/compiler/aco_reduce_assign.cpp         | 64 +++++++++++++++----------
 src/amd/compiler/aco_register_allocation.cpp   |  4 +-
 src/amd/compiler/aco_validate.cpp              |  1 +
 9 files changed, 149 insertions(+), 35 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index af91669..cff50c8 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -522,7 +522,7 @@ public:
 }
 <%
 import itertools
-formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8)]),
+formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6)]),
            ("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 1), (2, 1), (3, 2)]),
            ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])),
            ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])),
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 352f793..399fbc7 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -72,6 +72,8 @@ struct if_context {
    bool divergent_old;
    bool exec_potentially_empty_discard_old;
    bool exec_potentially_empty_break_old;
+   bool had_divergent_discard_old;
+   bool had_divergent_discard_then;
    uint16_t exec_potentially_empty_break_depth_old;
 
    unsigned BB_if_idx;
@@ -5306,6 +5308,13 @@ visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
    }
 }
 
+bool
+in_exec_divergent_or_in_loop(isel_context* ctx)
+{
+   return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
+          ctx->cf_info.had_divergent_discard;
+}
+
 void
 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
                         Temp prim_mask)
@@ -5315,7 +5324,16 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
 
    Builder bld(ctx->program, ctx->block);
 
-   //TODO: this doesn't work in quad-divergent control flow
+   if (in_exec_divergent_or_in_loop(ctx)) {
+      Operand prim_mask_op = bld.m0(prim_mask);
+      prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
+      Operand coord2_op(coord2);
+      coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
+      bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm),
+                 Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component), coord1,
+                 coord2_op, prim_mask_op);
+      return;
+   }
 
    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
 
@@ -5385,13 +5403,22 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
 {
    Builder bld(ctx->program, ctx->block);
    if (ctx->options->gfx_level >= GFX11) {
-      //TODO: this doesn't work in quad-divergent control flow and ignores vertex_id
-      Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
+      // TODO: this ignores vertex_id
       uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0);
-      Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
+      if (in_exec_divergent_or_in_loop(ctx)) {
+         Operand prim_mask_op = bld.m0(prim_mask);
+         prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
+         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm),
+                    Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component),
+                    Operand::c32(dpp_ctrl), prim_mask_op);
+      } else {
+         Temp p =
+            bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
+         Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
 
-      /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
-      emit_wqm(bld, res, dst, true);
+         /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
+         emit_wqm(bld, res, dst, true);
+      }
    } else {
       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
                  bld.m0(prim_mask), idx, component);
@@ -5825,7 +5852,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
          unsigned chan_component = (component + i) % 4;
          unsigned chan_idx = idx + (component + i) / 4;
          vec->operands[i] = Operand(bld.tmp(instr->dest.ssa.bit_size == 16 ? v2b : v1));
-         emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(), prim_mask);
+         emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id,
+                               vec->operands[i].getTemp(), prim_mask);
       }
       vec->definitions[0] = Definition(dst);
       bld.insert(std::move(vec));
@@ -8980,6 +9008,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
 
       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
          ctx->cf_info.exec_potentially_empty_discard = true;
+      ctx->block->kind |= block_kind_uses_discard;
 
       ctx->program->needs_exact = true;
       break;
@@ -8992,6 +9021,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
 
       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
         ctx->cf_info.exec_potentially_empty_discard = true;
+      ctx->block->kind |= block_kind_uses_discard;
 
       ctx->program->needs_exact = true;
       break;
@@ -9007,12 +9037,15 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
          assert(src.regClass() == bld.lm);
         cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,
                          Operand(exec, bld.lm));
+
+         ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
       }
 
       bld.pseudo(aco_opcode::p_discard_if, cond);
 
       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
          ctx->cf_info.exec_potentially_empty_discard = true;
+      ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
       ctx->block->kind |= block_kind_uses_discard;
       ctx->program->needs_exact = true;
       break;
@@ -10554,6 +10587,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
+   ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
    ctx->cf_info.parent_if.is_divergent = true;
 
    /* divergent branches use cbranch_execz */
@@ -10621,6 +10655,9 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic)
    ctx->cf_info.exec_potentially_empty_break = false;
    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
 
+   ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
+   ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
+
    /** emit logical else block */
    ctx->program->next_divergent_if_logical_depth++;
    Block* BB_else_logical = ctx->program->create_and_insert_block();
@@ -10683,6 +10720,7 @@ end_divergent_if(isel_context* ctx, if_context* ic)
       ctx->cf_info.exec_potentially_empty_break = false;
       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
    }
+   ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
 }
 
 static void
@@ -10709,6 +10747,8 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
    ctx->cf_info.has_branch = false;
    ctx->cf_info.parent_loop.has_divergent_branch = false;
 
+   ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
+
    /** emit then block */
    ctx->program->next_uniform_if_depth++;
    Block* BB_then = ctx->program->create_and_insert_block();
@@ -10742,6 +10782,9 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic)
    ctx->cf_info.has_branch = false;
    ctx->cf_info.parent_loop.has_divergent_branch = false;
 
+   ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
+   ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
+
    /** emit else block */
    Block* BB_else = ctx->program->create_and_insert_block();
    add_edge(ic->BB_if_idx, BB_else);
@@ -10770,6 +10813,7 @@ end_uniform_if(isel_context* ctx, if_context* ic)
 
    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
+   ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
 
    /** emit endif merge block */
    ctx->program->next_uniform_if_depth--;
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 4c37178..5370034 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -74,6 +74,7 @@ struct isel_context {
    struct {
      bool is_divergent = false;
    } parent_if;
+   bool had_divergent_discard = false;
    bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 &&
                                                    parent_if.is_divergent==false */
    uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 028876e..9732933 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2378,6 +2378,54 @@ lower_to_hw_instr(Program* program)
             bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
             break;
          }
+         case aco_opcode::p_interp_gfx11: {
+            assert(instr->definitions[0].regClass() == v1 ||
+                   instr->definitions[0].regClass() == v2b);
+            assert(instr->definitions[1].regClass() == bld.lm);
+            assert(instr->operands[0].regClass() == v1.as_linear());
+            assert(instr->operands[1].isConstant());
+            assert(instr->operands[2].isConstant());
+            assert(instr->operands.back().physReg() == m0);
+            Definition dst = instr->definitions[0];
+            PhysReg exec_tmp = instr->definitions[1].physReg();
+            PhysReg lin_vgpr = instr->operands[0].physReg();
+            unsigned attribute = instr->operands[1].constantValue();
+            unsigned component = instr->operands[2].constantValue();
+            uint16_t dpp_ctrl = 0;
+            Operand coord1, coord2;
+            if (instr->operands.size() == 6) {
+               assert(instr->operands[3].regClass() == v1);
+               assert(instr->operands[4].regClass() == v1);
+               coord1 = instr->operands[3];
+               coord2 = instr->operands[4];
+            } else {
+               assert(instr->operands[3].isConstant());
+               dpp_ctrl = instr->operands[3].constantValue();
+            }
+
+            bld.sop1(Builder::s_mov, Definition(exec_tmp, bld.lm), Operand(exec, bld.lm));
+            bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), Operand(exec, bld.lm));
+            bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
+                       attribute, component);
+            bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(exec_tmp, bld.lm));
+
+            Operand p(lin_vgpr, v1);
+            Operand dst_op(dst.physReg(), v1);
+            if (instr->operands.size() == 5) {
+               bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
+            } else if (dst.regClass() == v2b) {
+               bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
+                                 coord1, p);
+               bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
+                                 coord2, dst_op);
+            } else {
+               bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
+                                 p);
+               bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
+                                 dst_op);
+            }
+            break;
+         }
          default: break;
          }
       } else if (instr->isBranch()) {
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 52a52af..2594d4f 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -335,6 +335,11 @@ opcode("p_init_scratch")
 # jumps to a shader epilog
 opcode("p_jump_to_epilog")
 
+# loads and interpolates a fragment shader input with a correct exec mask
+#dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=coord1, src4=coord2, src5=m0
+#dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
+opcode("p_interp_gfx11")
+
 # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
 SOP2 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index a79af92..0c128db 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -673,6 +673,7 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
    case aco_opcode::v_readfirstlane_b32:
    case aco_opcode::p_extract:
    case aco_opcode::p_insert: return operand != 0;
+   case aco_opcode::p_interp_gfx11: return false;
    default: return true;
    }
 }
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 3bc63d8..3c31b46 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -45,11 +45,13 @@ setup_reduce_temp(Program* program)
    std::vector<bool> hasReductions(program->blocks.size());
    for (Block& block : program->blocks) {
       for (aco_ptr<Instruction>& instr : block.instructions) {
-         if (instr->format != Format::PSEUDO_REDUCTION)
-            continue;
-
-         maxSize = MAX2(maxSize, instr->operands[0].size());
-         hasReductions[block.index] = true;
+         if (instr->opcode == aco_opcode::p_interp_gfx11) {
+            maxSize = MAX2(maxSize, 1);
+            hasReductions[block.index] = true;
+         } else if (instr->format == Format::PSEUDO_REDUCTION) {
+            maxSize = MAX2(maxSize, instr->operands[0].size());
+            hasReductions[block.index] = true;
+         }
       }
    }
 
@@ -92,10 +94,10 @@ setup_reduce_temp(Program* program)
      std::vector<aco_ptr<Instruction>>::iterator it;
      for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
         Instruction* instr = (*it).get();
-         if (instr->format != Format::PSEUDO_REDUCTION)
+         if (instr->format != Format::PSEUDO_REDUCTION &&
+             instr->opcode != aco_opcode::p_interp_gfx11)
            continue;
 
-         ReduceOp op = instr->reduction().reduce_op;
         reduceTmp_in_loop |= block.loop_nest_depth > 0;
 
         if ((int)last_top_level_block_idx != inserted_at) {
@@ -122,22 +124,26 @@ setup_reduce_temp(Program* program)
         }
 
         /* same as before, except for the vector temporary instead of the reduce temporary */
-         unsigned cluster_size = instr->reduction().cluster_size;
-         bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
-                          op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
-                          op == imax64 || op == imul64;
-         bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
-                                op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
-                                op == iadd64;
-
-         if (program->gfx_level >= GFX10 && cluster_size == 64)
-            need_vtmp = true;
-         if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
-            need_vtmp = true;
-         if (program->gfx_level <= GFX7)
-            need_vtmp = true;
-
-         need_vtmp |= cluster_size == 32;
+         bool need_vtmp = false;
+         if (instr->isReduction()) {
+            ReduceOp op = instr->reduction().reduce_op;
+            unsigned cluster_size = instr->reduction().cluster_size;
+            need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
+                        op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
+                        op == imax64 || op == imul64;
+            bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
+                                   op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
+                                   op == iadd64;
+
+            if (program->gfx_level >= GFX10 && cluster_size == 64)
+               need_vtmp = true;
+            if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
+               need_vtmp = true;
+            if (program->gfx_level <= GFX7)
+               need_vtmp = true;
+
+            need_vtmp |= cluster_size == 32;
+         }
 
         vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
         if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
@@ -158,9 +164,15 @@ setup_reduce_temp(Program* program)
            }
         }
 
-         instr->operands[1] = Operand(reduceTmp);
-         if (need_vtmp)
-            instr->operands[2] = Operand(vtmp);
+         if (instr->isReduction()) {
+            instr->operands[1] = Operand(reduceTmp);
+            if (need_vtmp)
+               instr->operands[2] = Operand(vtmp);
+         } else {
+            assert(instr->opcode == aco_opcode::p_interp_gfx11);
+            instr->operands[0] = Operand(reduceTmp);
+            instr->operands[0].setLateKill(true);
+         }
      }
    }
 }
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 9cbc208..0a3401e 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -607,7 +607,9 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr
    amd_gfx_level gfx_level = program->gfx_level;
 
    if (instr->isPseudo()) {
-      if (gfx_level >= GFX8)
+      if (instr->opcode == aco_opcode::p_interp_gfx11)
+         return std::make_pair(4u, 4u);
+      else if (gfx_level >= GFX8)
         return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
      else
         return std::make_pair(4, rc.size() * 4u);
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index fef525d..d0367e7 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -262,6 +262,7 @@ validate_ir(Program* program)
                bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
                                    instr->opcode == aco_opcode::p_create_vector ||
                                    instr->opcode == aco_opcode::p_jump_to_epilog ||
+                                   (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
                                    (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
                                    ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
                                    (instr->isScratch() && i == 0);
-- 
2.7.4