From be21d54aca09e4466781c2cc3d83022ef480c3f6 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 3 Aug 2022 20:54:52 -0700 Subject: [PATCH] intel/compiler: Use an existing URB write to end TCS threads when viable VS, TCS, TES, and GS threads must end with a URB write message with the EOT (end of thread) bit set. For VS and TES, we shadow output variables with temporaries and perform all stores at the end of the shader, giving us an existing message to do the EOT. In tessellation control shaders, we don't defer output stores until the end of the thread like we do for vertex or evaluation shaders. We just process store_output and store_per_vertex_output intrinsics where they occur, which may be in control flow. So we can't guarantee that there's a URB write at the end of the shader. Traditionally, we've just emitted a separate URB write to finish TCS threads, doing a writemasked write to a single patch header DWord. On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is a convenient spot to do so. But on other platforms, there's no such field, and this write is purely wasteful. Instead of emitting a separate write, we can just look for an existing URB write at the end of the program and tag that with EOT, if possible. We already had code to do this for geometry shaders, so just lift it into a helper function and reuse it. No changes in shader-db. 
Reviewed-by: Lionel Landwerlin Reviewed-by: Caio Oliveira Part-of: --- src/intel/compiler/brw_fs.cpp | 82 +++++++++++++++++++++++++++++++------------ src/intel/compiler/brw_fs.h | 2 ++ 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 1a0140c..b6a0109 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1511,6 +1511,34 @@ fs_visitor::resolve_source_modifiers(const fs_reg &src) return temp; } +/** + * Walk backwards from the end of the program looking for a URB write that + * isn't in control flow, and mark it with EOT. + * + * Return true if successful or false if a separate EOT write is needed. + */ +bool +fs_visitor::mark_last_urb_write_with_eot() +{ + foreach_in_list_reverse(fs_inst, prev, &this->instructions) { + if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) { + prev->eot = true; + + /* Delete now dead instructions. */ + foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { + if (dead == prev) + break; + dead->remove(); + } + return true; + } else if (prev->is_control_flow() || prev->has_side_effects()) { + break; + } + } + + return false; +} + void fs_visitor::emit_gs_thread_end() { @@ -1526,21 +1554,12 @@ fs_visitor::emit_gs_thread_end() fs_inst *inst; if (gs_prog_data->static_vertex_count != -1) { - foreach_in_list_reverse(fs_inst, prev, &this->instructions) { - if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) { - prev->eot = true; + /* Try and tag the last URB write with EOT instead of emitting a whole + * separate write just to finish the thread. + */ + if (mark_last_urb_write_with_eot()) + return; - /* Delete now dead instructions. 
*/ - foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { - if (dead == prev) - break; - dead->remove(); - } - return; - } else if (prev->is_control_flow() || prev->has_side_effects()) { - break; - } - } fs_reg srcs[URB_LOGICAL_NUM_SRCS]; srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles; inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, @@ -6555,6 +6574,31 @@ fs_visitor::set_tcs_invocation_id() } } +void +fs_visitor::emit_tcs_thread_end() +{ + /* Try and tag the last URB write with EOT instead of emitting a whole + * separate write just to finish the thread. There isn't guaranteed to + * be one, so this may not succeed. + */ + if (devinfo->ver != 8 && mark_last_urb_write_with_eot()) + return; + + /* Emit a URB write to end the thread. On Broadwell, we use this to write + * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy + * algorithm to set it optimally). On other platforms, we simply write + * zero to a reserved/MBZ patch header DWord which has no consequence. 
+ */ + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16); + srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0); + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->mlen = 3; + inst->eot = true; +} + bool fs_visitor::run_tcs() { @@ -6587,15 +6631,7 @@ fs_visitor::run_tcs() bld.emit(BRW_OPCODE_ENDIF); } - /* Emit EOT write; set TR DS Cache bit */ - fs_reg srcs[URB_LOGICAL_NUM_SRCS]; - srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output; - srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16); - srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0); - fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, - reg_undef, srcs, ARRAY_SIZE(srcs)); - inst->mlen = 3; - inst->eot = true; + emit_tcs_thread_end(); if (failed) return false; diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index bbe4845..0bdebed 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -412,6 +412,8 @@ public: void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src, unsigned base_offset, const nir_src &offset_src, unsigned num_components, unsigned first_component); + bool mark_last_urb_write_with_eot(); + void emit_tcs_thread_end(); void emit_urb_fence(); void emit_cs_terminate(); fs_reg emit_work_group_id_setup(); -- 2.7.4