From ffefa386fda5aec8f66b4499d93b41a846a0b86c Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marcin=20=C5=9Alusarz?=
Date: Wed, 30 Nov 2022 13:47:19 +0100
Subject: [PATCH] nir/lower_task_shader: fix task payload corruption when shared memory workaround is enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

We were not taking into account that when all invocations within a
workgroup are active, we'll copy more data than needed, corrupting the
task payload of other workgroups.

Fixes: 8aff8d3dd42 ("nir: Add common task shader lowering to make the backend's job easier.")
Reviewed-by: Caio Oliveira
Reviewed-by: Timur Kristóf
Part-of:
---
 src/compiler/nir/nir_lower_task_shader.c | 94 +++++++++++++++++++++++++-------
 1 file changed, 75 insertions(+), 19 deletions(-)

diff --git a/src/compiler/nir/nir_lower_task_shader.c b/src/compiler/nir/nir_lower_task_shader.c
index 07dec31..c4013a0 100644
--- a/src/compiler/nir/nir_lower_task_shader.c
+++ b/src/compiler/nir/nir_lower_task_shader.c
@@ -192,21 +192,49 @@ lower_task_payload_to_shared(nir_builder *b,
 }
 
 static void
+copy_shared_to_payload(nir_builder *b,
+                       unsigned num_components,
+                       nir_ssa_def *addr,
+                       unsigned shared_base,
+                       unsigned off)
+{
+   /* Read from shared memory. */
+   nir_ssa_def *copy = nir_load_shared(b, num_components, 32, addr,
+                                       .align_mul = 16,
+                                       .base = shared_base + off);
+
+   /* Write to task payload memory. */
+   nir_store_task_payload(b, copy, addr, .base = off);
+}
+
+static void
 emit_shared_to_payload_copy(nir_builder *b,
                             uint32_t payload_addr,
                             uint32_t payload_size,
                             lower_task_state *s)
 {
+   /* Copy from shared memory to task payload using as much parallelism
+    * as possible. This is achieved by splitting the work into max 3 phases:
+    * 1) copy maximum number of vec4s using all invocations within workgroup
+    * 2) copy maximum number of vec4s using some invocations
+    * 3) copy remaining dwords (< 4) using only the first invocation
+    */
    const unsigned invocations = b->shader->info.workgroup_size[0] *
-                                b->shader->info.workgroup_size[1] *
-                                b->shader->info.workgroup_size[2];
-   const unsigned bytes_per_copy = 16;
-   const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);
-   const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
+                                b->shader->info.workgroup_size[1] *
+                                b->shader->info.workgroup_size[2];
+   const unsigned vec4size = 16;
+   const unsigned whole_wg_vec4_copies = payload_size / vec4size;
+   const unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations;
+   const unsigned remaining_vec4_copies = whole_wg_vec4_copies % invocations;
+   const unsigned remaining_dwords =
+         DIV_ROUND_UP(payload_size
+                      - vec4size * vec4_copies_per_invocation * invocations
+                      - vec4size * remaining_vec4_copies,
+                      4);
    const unsigned base_shared_addr = s->payload_shared_addr + payload_addr;
 
    nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
-   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy);
+   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, vec4size);
 
    /* Wait for all previous shared stores to finish.
     * This is necessary because we placed the payload in shared memory.
@@ -216,22 +244,50 @@ emit_shared_to_payload_copy(nir_builder *b,
                       .memory_semantics = NIR_MEMORY_ACQ_REL,
                       .memory_modes = nir_var_mem_shared);
 
-   for (unsigned i = 0; i < copies_per_invocation; ++i) {
-      /* Payload_size is a size of user-accessible payload, but on some
-       * hardware (e.g. Intel) payload has a private header, which we have
-       * to offset (payload_offset_in_bytes).
-       */
-      unsigned const_off =
-         bytes_per_copy * invocations * i + s->payload_offset_in_bytes;
+   /* Payload_size is a size of user-accessible payload, but on some
+    * hardware (e.g. Intel) payload has a private header, which we have
+    * to offset (payload_offset_in_bytes).
+    */
+   unsigned off = s->payload_offset_in_bytes;
 
-      /* Read from shared memory. */
-      nir_ssa_def *copy =
-         nir_load_shared(b, 4, 32, addr, .align_mul = 16,
-                         .base = base_shared_addr + const_off);
+   /* Technically dword-alignment is not necessary for correctness
+    * of the code below, but even if backend implements unaligned
+    * load/stores, they will very likely be slow(er).
+    */
+   assert(off % 4 == 0);
 
-      /* Write to task payload memory. */
-      nir_store_task_payload(b, copy, addr, .base = const_off);
+   /* Copy full vec4s using all invocations in workgroup. */
+   for (unsigned i = 0; i < vec4_copies_per_invocation; ++i) {
+      copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+      off += vec4size * invocations;
    }
+
+   /* Copy full vec4s using only the invocations needed to not overflow. */
+   if (remaining_vec4_copies > 0) {
+      assert(remaining_vec4_copies < invocations);
+
+      nir_ssa_def *cmp = nir_ilt(b, invocation_index, nir_imm_int(b, remaining_vec4_copies));
+      nir_if *if_stmt = nir_push_if(b, cmp);
+      {
+         copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+      }
+      nir_pop_if(b, if_stmt);
+      off += vec4size * remaining_vec4_copies;
+   }
+
+   /* Copy the last few dwords not forming full vec4. */
+   if (remaining_dwords > 0) {
+      assert(remaining_dwords < 4);
+      nir_ssa_def *cmp = nir_ieq(b, invocation_index, nir_imm_int(b, 0));
+      nir_if *if_stmt = nir_push_if(b, cmp);
+      {
+         copy_shared_to_payload(b, remaining_dwords, addr, base_shared_addr, off);
+      }
+      nir_pop_if(b, if_stmt);
+      off += remaining_dwords * 4;
+   }
+
+   assert(s->payload_offset_in_bytes + ALIGN(payload_size, 4) == off);
 }
 
 static bool
-- 
2.7.4
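
For illustration, a minimal standalone C sketch of the copy-partitioning arithmetic introduced by the patch. It is not part of the patch: DIV_ROUND_UP and ALIGN are redefined locally to mirror Mesa's util macros, and payload_size = 36, invocations = 32 are arbitrary example inputs chosen to show how the old DIV_ROUND_UP-based scheme over-copies while the new three-phase scheme stops at the dword-aligned payload size.

#include <assert.h>
#include <stdio.h>

/* Local stand-ins for Mesa's util macros. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define ALIGN(v, a)        (DIV_ROUND_UP(v, a) * (a))

int main(void)
{
   /* Arbitrary example inputs (hypothetical, not taken from the patch). */
   const unsigned invocations  = 32;  /* workgroup size                */
   const unsigned payload_size = 36;  /* user-visible payload in bytes */
   const unsigned vec4size     = 16;

   /* Old scheme: every invocation copies a whole number of vec4s. */
   const unsigned copies_needed         = DIV_ROUND_UP(payload_size, vec4size);
   const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
   const unsigned old_bytes_written     = copies_per_invocation * invocations * vec4size;
   printf("old scheme writes %u bytes for a %u-byte payload\n",
          old_bytes_written, payload_size);   /* 512 vs. 36 -> overflow */

   /* New scheme: three phases, as in emit_shared_to_payload_copy(). */
   const unsigned whole_wg_vec4_copies       = payload_size / vec4size;
   const unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations;
   const unsigned remaining_vec4_copies      = whole_wg_vec4_copies % invocations;
   const unsigned remaining_dwords =
      DIV_ROUND_UP(payload_size
                   - vec4size * vec4_copies_per_invocation * invocations
                   - vec4size * remaining_vec4_copies,
                   4);

   const unsigned new_bytes_written =
      vec4size * vec4_copies_per_invocation * invocations +  /* phase 1 */
      vec4size * remaining_vec4_copies +                     /* phase 2 */
      4 * remaining_dwords;                                  /* phase 3 */

   /* Mirrors the final assert in the patch, with payload_offset_in_bytes
    * left out of this standalone sketch. */
   assert(new_bytes_written == ALIGN(payload_size, 4));
   printf("new scheme writes %u bytes (payload aligned to dwords: %u)\n",
          new_bytes_written, ALIGN(payload_size, 4));
   return 0;
}

With these example numbers the old scheme writes 512 bytes into a 36-byte payload slot, which is the cross-workgroup corruption the commit message describes, while the new scheme writes exactly ALIGN(payload_size, 4) = 36 bytes.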