From 45f6d38a766875616ffc480f27560389d7d585ef Mon Sep 17 00:00:00 2001
From: =?utf8?q?Daniel=20Sch=C3=BCrmann?=
Date: Sat, 2 Sep 2023 11:14:33 +0200
Subject: [PATCH] aco: insert a single p_end_wqm after the last derivative
 calculation

This new instruction replaces p_wqm.

Totals from 28065 (36.65% of 76572) affected shaders: (GFX11)
MaxWaves: 823922 -> 823952 (+0.00%); split: +0.01%, -0.01%
Instrs: 22221375 -> 22180465 (-0.18%); split: -0.26%, +0.08%
CodeSize: 117310676 -> 117040684 (-0.23%); split: -0.30%, +0.07%
VGPRs: 1183476 -> 1186656 (+0.27%); split: -0.19%, +0.46%
SpillSGPRs: 2305 -> 2302 (-0.13%)
Latency: 176559310 -> 176427793 (-0.07%); split: -0.21%, +0.14%
InvThroughput: 26245204 -> 26195550 (-0.19%); split: -0.26%, +0.07%
VClause: 368058 -> 369460 (+0.38%); split: -0.21%, +0.59%
SClause: 857077 -> 842588 (-1.69%); split: -2.06%, +0.37%
Copies: 1245650 -> 1249434 (+0.30%); split: -0.33%, +0.63%
Branches: 394837 -> 396070 (+0.31%); split: -0.01%, +0.32%
PreSGPRs: 1019139 -> 1019567 (+0.04%); split: -0.02%, +0.06%
PreVGPRs: 925739 -> 931860 (+0.66%); split: -0.00%, +0.66%

Changes are due to scheduling and re-enabling cross-lane optimizations.

Part-of:
---
 src/amd/compiler/aco_insert_exec_mask.cpp      |   4 +-
 src/amd/compiler/aco_instruction_selection.cpp | 122 ++++++++++++++++---------
 src/amd/compiler/aco_instruction_selection.h   |   4 +
 src/amd/compiler/aco_ir.cpp                    |   1 +
 src/amd/compiler/aco_opcodes.py                |   2 +-
 src/amd/compiler/aco_opt_value_numbering.cpp   |   2 +-
 src/amd/compiler/tests/test_d3d11_derivs.cpp   |  10 +-
 7 files changed, 93 insertions(+), 52 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 74458ae..517e27c 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -133,7 +133,7 @@ get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
    for (int i = block->instructions.size() - 1; i >= 0; --i) {
       aco_ptr<Instruction>& instr = block->instructions[i];
 
-      if (instr->opcode == aco_opcode::p_wqm)
+      if (instr->opcode == aco_opcode::p_end_wqm)
         propagate_wqm = true;
 
       bool pred_by_exec = needs_exec_mask(instr.get()) ||
@@ -153,7 +153,7 @@ get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
    /* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
     * <cond> should be computed in WQM */
-   if (info.block_needs & WQM) {
+   if (propagate_wqm) {
       mark_block_wqm(ctx, block->index);
    }
 }
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index c965d71..8dda7eb 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -169,11 +169,12 @@ emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base =
 }
 
 inline void
-emit_wqm(Builder& bld, bool program_needs_wqm = false)
+set_wqm(isel_context* ctx, bool enable_helpers = false)
 {
-   if (bld.program->stage == fragment_fs) {
-      bld.pseudo(aco_opcode::p_wqm);
-      bld.program->needs_wqm |= program_needs_wqm;
+   if (ctx->program->stage == fragment_fs) {
+      ctx->wqm_block_idx = ctx->block->index;
+      ctx->wqm_instruction_idx = ctx->block->instructions.size();
+      ctx->program->needs_wqm |= enable_helpers;
    }
 }
 
@@ -3835,7 +3836,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
          bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
       }
-      emit_wqm(bld, true);
+      set_wqm(ctx, true);
       break;
    }
    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
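The heart of the change is set_wqm() above: instead of emitting a p_wqm pseudo-instruction after every WQM-dependent operation, instruction selection now only remembers how far WQM must at least extend. A self-contained model of that bookkeeping (simplified types and hypothetical names, not the actual ACO declarations):

    #include <cstdint>
    #include <vector>

    /* Simplified stand-ins for ACO's Block and isel_context. */
    struct Block {
       uint32_t index;
       std::vector<int> instructions; /* element type is irrelevant here */
    };

    struct Context {
       Block* block = nullptr;           /* block currently being filled */
       bool needs_wqm = false;           /* helper lanes must stay live */
       uint32_t wqm_block_idx = 0;       /* high-water mark: block ... */
       uint32_t wqm_instruction_idx = 0; /* ... and instruction index */
    };

    /* Each call moves the mark just past the most recently emitted
     * instruction; the single p_end_wqm is only placed after selection. */
    void
    set_wqm(Context* ctx, bool enable_helpers = false)
    {
       ctx->wqm_block_idx = ctx->block->index;
       ctx->wqm_instruction_idx = ctx->block->instructions.size();
       ctx->needs_wqm |= enable_helpers;
    }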
@@ -5314,7 +5315,7 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
       bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
    }
    /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
-   emit_wqm(bld, true);
+   set_wqm(ctx, true);
 }
 
 void
@@ -5387,7 +5388,7 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
          bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
       }
       /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
-      emit_wqm(bld, true);
+      set_wqm(ctx, true);
      }
   } else {
      bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
@@ -5894,7 +5895,7 @@ image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
 
 static MIMG_instruction*
 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
-          bool needs_wqm = false, Operand vdata = Operand(v1))
+          Operand vdata = Operand(v1))
 {
    size_t nsa_size = bld.program->dev.max_nsa_vgprs;
    nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
@@ -5934,7 +5935,6 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
    }
 
    bool has_dst = dst.id() != 0;
-   assert(!needs_wqm || has_dst);
 
    aco_ptr<MIMG_instruction> mimg{
      create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
@@ -5949,8 +5949,6 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
 
    MIMG_instruction* res = mimg.get();
    bld.insert(std::move(mimg));
-   if (needs_wqm)
-      emit_wqm(bld, true);
    return res;
 }
 
@@ -6206,8 +6204,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
    }
 
    Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
-   MIMG_instruction* load =
-      emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, false, vdata);
+   MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
    load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
    load->dlc =
      load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
@@ -6341,7 +6338,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
    }
 
    MIMG_instruction* store =
-      emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, false, Operand(data));
+      emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
    store->glc = glc;
    store->dlc = false;
    store->a16 = instr->src[1].ssa->bit_size == 16;
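Dropping needs_wqm from emit_mimg() makes the order at each call site significant: set_wqm() snapshots the current instruction count, so it has to run after the MIMG instruction is already in the block, otherwise the placement scan (finish_program(), further down) would start at the image instruction itself and treat it as VMEM to end WQM before. The resulting caller-side pattern, as the visit_tex hunk below uses it (excerpt, not standalone code):

    MIMG_instruction* tex =
       emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
    /* ...fill in tex->dim, tex->dmask, ... */
    if (implicit_derivs)
       set_wqm(ctx, true); /* implicit-LOD sampling needs live helper lanes */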
@@ -6498,7 +6495,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
    MIMG_instruction* mimg =
-      emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, false, Operand(data));
+      emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
    mimg->glc = return_previous;
    mimg->dlc = false; /* Not needed for atomics */
    mimg->dmask = (1 << data.size()) - 1;
@@ -7774,7 +7771,7 @@ emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
       Temp thread_count =
          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
-      emit_wqm(bld, nir_intrinsic_include_helpers(instr));
+      set_wqm(ctx, nir_intrinsic_include_helpers(instr));
 
       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
    } else {
@@ -7804,7 +7801,7 @@ emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
      else
         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
-     emit_wqm(bld);
+     set_wqm(ctx);
 
      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
      return true;
@@ -7839,7 +7836,7 @@ emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
                  as_vgpr(ctx, src));
    }
 
-   emit_wqm(bld);
+   set_wqm(ctx);
    return true;
 }
 
@@ -7980,7 +7977,7 @@ emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
    tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
    tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp1, tmp2);
-   emit_wqm(bld, true);
+   set_wqm(ctx, true);
    return;
 }
 
@@ -8306,7 +8303,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
         }
-        emit_wqm(bld);
+        set_wqm(ctx);
         break;
      }
      case nir_intrinsic_shuffle:
@@ -8361,7 +8358,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
            } else {
               isel_err(&instr->instr, "Unimplemented NIR instr bit size");
            }
-           emit_wqm(bld);
+           set_wqm(ctx);
         }
         break;
      }
@@ -8390,7 +8387,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
         } else {
            bld.copy(Definition(dst), src);
         }
-        emit_wqm(bld);
+        set_wqm(ctx);
         break;
      }
      case nir_intrinsic_vote_all: {
@@ -8405,7 +8402,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
                .getTemp();
         Temp cond = bool_to_vector_condition(ctx, tmp);
         bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
-        emit_wqm(bld);
+        set_wqm(ctx);
         break;
      }
      case nir_intrinsic_vote_any: {
@@ -8416,7 +8413,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
         Temp tmp = bool_to_scalar_condition(ctx, src);
         bool_to_vector_condition(ctx, tmp, dst);
-        emit_wqm(bld);
+        set_wqm(ctx);
         break;
      }
      case nir_intrinsic_reduce:
@@ -8490,7 +8487,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
         else
            emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
      }
-     emit_wqm(bld, create_helpers);
+     set_wqm(ctx, create_helpers);
      break;
   }
   case nir_intrinsic_quad_broadcast:
@@ -8502,7 +8499,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
 
      if (!instr->def.divergent) {
         emit_uniform_subgroup(ctx, instr, src);
-        emit_wqm(bld, true);
        break;
     }
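The net effect of all these emit_wqm() -> set_wqm() substitutions is easiest to see at the pseudo-IR level; the listing below is illustrative only (real opcode names, made-up instruction sequence):

    /* Before: every WQM-dependent op got its own marker; each p_wqm pinned
     * the instruction order and bumped exec_id, blocking CSE and scheduling:
     *
     *    v_interp_p2_f32 ...       ; needs WQM
     *    p_wqm
     *    v_readlane_b32 ...        ; needs WQM
     *    p_wqm
     *    buffer_store_dword ...    ; needs Exact
     *
     * After: selection only records the position behind v_readlane_b32, and
     * a single marker is inserted just before the Exact-requiring store:
     *
     *    v_interp_p2_f32 ...
     *    v_readlane_b32 ...
     *    p_end_wqm
     *    buffer_store_dword ...
     */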
@@ -8582,7 +8578,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
      }
 
     /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
-     emit_wqm(bld, true);
+     set_wqm(ctx, true);
      break;
   }
   case nir_intrinsic_masked_swizzle_amd: {
@@ -8621,7 +8617,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
        } else {
           isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
-       emit_wqm(bld);
+       set_wqm(ctx);
        break;
     }
     case nir_intrinsic_write_invocation_amd: {
@@ -8653,7 +8649,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
        /* Fit 64-bit mask for wave32 */
        src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
        emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
-       emit_wqm(bld);
+       set_wqm(ctx);
        break;
     }
     case nir_intrinsic_lane_permute_16_amd: {
@@ -8727,14 +8723,14 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
     case nir_intrinsic_first_invocation: {
        bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
                 Operand(exec, bld.lm));
-       emit_wqm(bld);
+       set_wqm(ctx);
        break;
     }
     case nir_intrinsic_last_invocation: {
        Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
        bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)),
                 bld.def(s1, scc), Operand::c32(ctx->program->wave_size - 1u), flbit);
-       emit_wqm(bld);
+       set_wqm(ctx);
        break;
     }
     case nir_intrinsic_elect: {
@@ -8744,7 +8740,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
         */
        bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
                   Operand(exec, bld.lm));
-       emit_wqm(bld);
+       set_wqm(ctx);
        break;
     }
     case nir_intrinsic_shader_clock: {
@@ -9554,8 +9550,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
                      ? aco_opcode::image_load
                      : aco_opcode::image_load_mip;
     Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
-    MIMG_instruction* tex =
-       emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, false, vdata);
+    MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
     if (instr->op == nir_texop_fragment_mask_fetch_amd)
        tex->dim = da ? ac_image_2darray : ac_image_2d;
     else
@@ -9734,14 +9729,15 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
 
    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
-   MIMG_instruction* tex =
-      emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, implicit_derivs, vdata);
+   MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
    tex->dim = dim;
    tex->dmask = dmask & 0xf;
    tex->da = da;
    tex->tfe = instr->is_sparse;
    tex->d16 = d16;
    tex->a16 = a16;
+   if (implicit_derivs)
+      set_wqm(ctx, true);
 
    if (tg4_integer_cube_workaround) {
       assert(tmp_dst.id() != dst.id());
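One subtlety the new pass below has to handle: the recorded position may sit inside divergent control flow, but p_end_wqm is only inserted at a top-level block. A self-contained model of that fix-up (simplified Block and kind flag, hypothetical function name; it mirrors the first while loop of finish_program()):

    #include <cstdint>
    #include <vector>

    constexpr uint32_t block_kind_top_level = 1u << 0; /* simplified flag */

    struct Block {
       uint32_t kind;
    };

    /* Advance the recorded WQM position to the next top-level block; every
     * skipped block restarts the scan at its first instruction. */
    void
    advance_to_top_level(const std::vector<Block>& blocks, uint32_t& block_idx,
                         uint32_t& instr_idx)
    {
       while (!(blocks[block_idx].kind & block_kind_top_level)) {
          block_idx++;
          instr_idx = 0;
       }
    }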
@@ -11270,6 +11266,48 @@ cleanup_cfg(Program* program)
    }
 }
 
+void
+finish_program(isel_context* ctx)
+{
+   cleanup_cfg(ctx->program);
+
+   /* Insert a single p_end_wqm instruction after the last derivative calculation */
+   if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm &&
+       ctx->program->needs_exact) {
+      /* Find the next BB at top-level CFG */
+      while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
+         ctx->wqm_block_idx++;
+         ctx->wqm_instruction_idx = 0;
+      }
+
+      std::vector<aco_ptr<Instruction>>* instrs =
+         &ctx->program->blocks[ctx->wqm_block_idx].instructions;
+      auto it = instrs->begin() + ctx->wqm_instruction_idx;
+
+      /* Delay transition to Exact to help optimizations and scheduling */
+      while (it != instrs->end()) {
+         aco_ptr<Instruction>& instr = *it;
+         /* End WQM before: */
+         if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
+             instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
+             instr->opcode == aco_opcode::p_logical_start)
+            break;
+
+         ++it;
+
+         /* End WQM after: */
+         if (instr->opcode == aco_opcode::p_logical_end ||
+             instr->opcode == aco_opcode::p_discard_if ||
+             instr->opcode == aco_opcode::p_demote_to_helper ||
+             instr->opcode == aco_opcode::p_end_with_regs)
+            break;
+      }
+
+      Builder bld(ctx->program);
+      bld.reset(instrs, it);
+      bld.pseudo(aco_opcode::p_end_wqm);
+   }
+}
+
 Temp
 lanecount_to_mask(isel_context* ctx, Temp count)
 {
@@ -11360,7 +11398,7 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
    }
 
    ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
-   cleanup_cfg(ctx.program);
+   finish_program(&ctx);
 }
 
 void
@@ -11804,7 +11842,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
       bld.sopp(aco_opcode::s_endpgm);
    }
 
-   cleanup_cfg(program);
+   finish_program(&ctx);
 }
 
 void
@@ -11864,7 +11902,7 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
    ctx.block->kind |= block_kind_uniform;
    bld.sopp(aco_opcode::s_endpgm);
 
-   cleanup_cfg(program);
+   finish_program(&ctx);
 }
 
 Operand
@@ -12428,7 +12466,7 @@ select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
    bld.reset(ctx.block);
    bld.sopp(aco_opcode::s_endpgm);
 
-   cleanup_cfg(program);
+   finish_program(&ctx);
 }
 
 void
@@ -12623,7 +12661,7 @@ select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config,
    bld.reset(ctx.block);
    bld.sopp(aco_opcode::s_endpgm);
 
-   cleanup_cfg(program);
+   finish_program(&ctx);
 }
 
 void
@@ -12673,7 +12711,7 @@ select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config,
 
    build_end_with_regs(&ctx, regs);
 
-   cleanup_cfg(program);
+   finish_program(&ctx);
 }
 
 } // namespace aco
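The instruction-selection side is now complete, and it also explains the commit message's "re-enabling cross-lane optimizations": value numbering keys cross-lane instructions on an exec_id that was previously bumped by every p_wqm and now only changes at the single p_end_wqm (see the aco_opt_value_numbering.cpp hunk and the relaxed test expectations below). A toy model of that gate (hypothetical code, not ACO's implementation):

    #include <cstdint>
    #include <map>
    #include <string>
    #include <utility>

    int main()
    {
       /* CSE keyed on (expression, exec_id): a cross-lane result may only
        * be reused while the exec mask is provably unchanged. */
       std::map<std::pair<std::string, uint32_t>, int> seen;
       uint32_t exec_id = 0;
       auto value_number = [&](const std::string& expr) {
          return seen.try_emplace({expr, exec_id}, (int)seen.size()).first->second;
       };

       int a = value_number("v_mov_b32 %x quad_perm:[0,0,0,0]");
       int b = value_number("v_mov_b32 %x quad_perm:[0,0,0,0]"); /* hit: b == a */
       exec_id++; /* p_end_wqm, p_discard_if or p_demote_to_helper */
       int c = value_number("v_mov_b32 %x quad_perm:[0,0,0,0]"); /* miss: c != a */
       return (a == b) && (c != a) ? 0 : 1;
    }

In the d3d11_derivs.get_lod expectations at the end of the patch this is exactly why the second v_mov_b32 quad_perm:[0,0,0,0] of each pair disappears: without a marker in between, both v_sub_f32 can share %x0 (and %y0).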
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 9ce9de9..4c5115bb 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -102,6 +102,10 @@ struct isel_context {
    /* I/O information */
    shader_io_state inputs;
    shader_io_state outputs;
+
+   /* WQM information */
+   uint32_t wqm_block_idx;
+   uint32_t wqm_instruction_idx;
 };
 
 inline Temp
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 9bc20b7..ecfb2ab 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -861,6 +861,7 @@ needs_exec_mask(const Instruction* instr)
       case aco_opcode::p_logical_start:
       case aco_opcode::p_logical_end:
       case aco_opcode::p_startpgm:
+      case aco_opcode::p_end_wqm:
       case aco_opcode::p_init_scratch: return instr->reads_exec();
       case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
       default: break;
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 137a922..dc55919 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -328,7 +328,7 @@ opcode("p_reload")
 opcode("p_start_linear_vgpr")
 opcode("p_end_linear_vgpr")
 
-opcode("p_wqm")
+opcode("p_end_wqm")
 opcode("p_discard_if")
 opcode("p_demote_to_helper")
 opcode("p_is_helper")
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index d73adc4..6e9d768 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -382,7 +382,7 @@ process_block(vn_ctx& ctx, Block& block)
       }
 
       if (instr->opcode == aco_opcode::p_discard_if ||
-          instr->opcode == aco_opcode::p_demote_to_helper || instr->opcode == aco_opcode::p_wqm)
+          instr->opcode == aco_opcode::p_demote_to_helper || instr->opcode == aco_opcode::p_end_wqm)
          ctx.exec_id++;
 
       if (!can_eliminate(instr)) {
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index c0742e0..ee0299e 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -599,13 +599,11 @@ BEGIN_TEST(d3d11_derivs.get_lod)
    //>> v2: %vec = p_create_vector %x, %y
    //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
    //>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %x1_m_x0 = v_sub_f32 %x, (kill)%x0 quad_perm:[1,1,1,1] bound_ctrl:1
-   //>> v1: %x1 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x1 quad_perm:[2,2,2,2] bound_ctrl:1
+   //>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1
+   //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1
    //>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %y1_m_y0 = v_sub_f32 %y, (kill)%y0 quad_perm:[1,1,1,1] bound_ctrl:1
-   //>> v1: %y1 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y1 quad_perm:[2,2,2,2] bound_ctrl:1
+   //>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1
+   //>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1
    //>> BB1
    //>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, %wqm 2d
    //>> BB2
-- 
2.7.4