From 15afb8dcc6cf6ca9d704ca4ba3d5690660da5570 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marcin=20=C5=9Alusarz?= Date: Wed, 1 Feb 2023 17:23:25 +0100 Subject: [PATCH] intel/compiler/mesh: apply URB payload mask once per program Reviewed-by: Caio Oliveira Part-of: --- src/intel/compiler/brw_fs.h | 3 +-- src/intel/compiler/brw_fs_thread_payload.cpp | 17 ++++++++++++-- src/intel/compiler/brw_mesh.cpp | 34 +++++++++++----------------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 9bf4c3a..454b587 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -412,8 +412,7 @@ public: const fs_reg &urb_handle); void emit_task_mesh_load(const brw::fs_builder &bld, nir_intrinsic_instr *instr, - const fs_reg &urb_handle, - bool mask); + const fs_reg &urb_handle); void emit_barrier(); void emit_tcs_barrier(); diff --git a/src/intel/compiler/brw_fs_thread_payload.cpp b/src/intel/compiler/brw_fs_thread_payload.cpp index dff0b33..b28f743 100644 --- a/src/intel/compiler/brw_fs_thread_payload.cpp +++ b/src/intel/compiler/brw_fs_thread_payload.cpp @@ -420,10 +420,23 @@ task_mesh_thread_payload::task_mesh_thread_payload(const fs_visitor &v) unsigned r = 0; assert(subgroup_id_.file != BAD_FILE); extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD); - urb_output = brw_ud1_grf(0, 6); - if (v.stage == MESA_SHADER_MESH) + urb_output = v.bld.vgrf(BRW_REGISTER_TYPE_UD); + /* In both mesh and task shader payload, lower 16 bits of g0.6 is + * an offset within Slice's Local URB, which says where shader is + * supposed to output its data. + */ + v.bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF)); + + if (v.stage == MESA_SHADER_MESH) { + /* g0.7 is Task Shader URB Entry Offset, which contains both an offset + * within Slice's Local URB (bits 0:15) and a slice selector + * (bits 16:24). 
Slice selector can be non-zero when mesh shader + * is spawned on slice other than the one where task shader was run. + * Bit 24 says that Slice ID is present and bits 16:23 are the Slice ID. + */ task_urb_input = brw_ud1_grf(0, 7); + } r++; local_index = brw_uw8_grf(1, 0); diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp index a854c94..0f52a31 100644 --- a/src/intel/compiler/brw_mesh.cpp +++ b/src/intel/compiler/brw_mesh.cpp @@ -1113,7 +1113,10 @@ adjust_handle_and_offset(const fs_builder &bld, if (adjustment) { fs_builder ubld8 = bld.group(8, 0).exec_all(); - ubld8.ADD(urb_handle, urb_handle, brw_imm_ud(adjustment)); + /* Allocate new register to not overwrite the shared URB handle. */ + fs_reg new_handle = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.ADD(new_handle, urb_handle, brw_imm_ud(adjustment)); + urb_handle = new_handle; urb_global_offset -= adjustment; } } @@ -1420,13 +1423,8 @@ fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *ins fs_reg src = get_nir_src(instr->src[0]); nir_src *offset_nir_src = nir_get_io_offset_src(instr); - fs_builder ubld8 = bld.group(8, 0).exec_all(); - fs_reg h = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1); - ubld8.MOV(h, urb_handle); - ubld8.AND(h, h, brw_imm_ud(0xFFFF)); - if (nir_src_is_const(*offset_nir_src)) { - emit_urb_direct_writes(bld, instr, src, h); + emit_urb_direct_writes(bld, instr, src, urb_handle); } else { bool use_mod = false; unsigned mod; @@ -1443,35 +1441,29 @@ fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *ins } if (use_mod) { - emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(*offset_nir_src), h, mod); + emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(*offset_nir_src), urb_handle, mod); } else { - emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), h); + emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), urb_handle); } } } void fs_visitor::emit_task_mesh_load(const 
fs_builder &bld, nir_intrinsic_instr *instr, - const fs_reg &urb_handle, bool mask) + const fs_reg &urb_handle) { fs_reg dest = get_nir_dest(instr->dest); nir_src *offset_nir_src = nir_get_io_offset_src(instr); - fs_builder ubld8 = bld.group(8, 0).exec_all(); - fs_reg h = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1); - ubld8.MOV(h, urb_handle); - if (mask) - ubld8.AND(h, h, brw_imm_ud(0xFFFF)); - /* TODO(mesh): for per_vertex and per_primitive, if we could keep around * the non-array-index offset, we could use to decide if we can perform * a single large aligned read instead one per component. */ if (nir_src_is_const(*offset_nir_src)) - emit_urb_direct_reads(bld, instr, dest, h); + emit_urb_direct_reads(bld, instr, dest, urb_handle); else - emit_urb_indirect_reads(bld, instr, dest, get_nir_src(*offset_nir_src), h); + emit_urb_indirect_reads(bld, instr, dest, get_nir_src(*offset_nir_src), urb_handle); } void @@ -1489,7 +1481,7 @@ fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld, case nir_intrinsic_load_output: case nir_intrinsic_load_task_payload: - emit_task_mesh_load(bld, instr, payload.urb_output, true); + emit_task_mesh_load(bld, instr, payload.urb_output); break; default: @@ -1515,11 +1507,11 @@ fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld, case nir_intrinsic_load_per_vertex_output: case nir_intrinsic_load_per_primitive_output: case nir_intrinsic_load_output: - emit_task_mesh_load(bld, instr, payload.urb_output, true); + emit_task_mesh_load(bld, instr, payload.urb_output); break; case nir_intrinsic_load_task_payload: - emit_task_mesh_load(bld, instr, payload.task_urb_input, false); + emit_task_mesh_load(bld, instr, payload.task_urb_input); break; default: -- 2.7.4