From 0fa7ec14734a640858e7f4047ffab78f71272ece Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Mon, 15 Feb 2021 13:14:56 +0200 Subject: [PATCH] turnip,freedreno/a6xx: tell hw the size of shared mem used by CS Before, we only used 2k of shared memory. It was found that the lower 5 bits of SP_CS_UNKNOWN_A9B1 control the available size of shared memory for compute shaders, with AVAILABLE_SIZE = (SP_CS_UNKNOWN_A9B1_SHARED_SIZE + 1) * 1k, up to 32k. Setting SP_CS_UNKNOWN_A9B1_SHARED_SIZE to zero enables all 32k of shared memory. Fixes tests: dEQP-VK.rasterization.line_continuity.line-strip dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_nonlocal.workgroup.guard_local.buffer.comp dEQP-VK.memory_model.write_after_read.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp Signed-off-by: Danylo Piliaiev Part-of: --- ci-expects/freedreno/deqp-freedreno-a630-fails.txt | 4 ---- src/freedreno/.gitlab-ci/reference/crash.log | 4 ++-- src/freedreno/ir3/ir3_compiler_nir.c | 1 + src/freedreno/ir3/ir3_shader.h | 3 +++ src/freedreno/registers/adreno/a6xx.xml | 25 +++++++++++++--------- src/freedreno/vulkan/tu_pipeline.c | 4 +++- src/gallium/drivers/freedreno/a6xx/fd6_compute.c | 4 +++- 7 files changed, 27 insertions(+), 18 deletions(-) diff --git a/ci-expects/freedreno/deqp-freedreno-a630-fails.txt b/ci-expects/freedreno/deqp-freedreno-a630-fails.txt index eb6f905..f4156ff 100644 --- a/ci-expects/freedreno/deqp-freedreno-a630-fails.txt +++ b/ci-expects/freedreno/deqp-freedreno-a630-fails.txt @@ -49,9 +49,6 @@ dEQP-VK.image.subresource_layout.3d.all_levels.a8b8g8r8_snorm_pack32,Fail dEQP-VK.image.subresource_layout.3d.all_levels.r16g16b16a16_snorm,Fail 
dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp,Fail dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.image.comp,Fail -dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp,Fail -dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_nonlocal.workgroup.guard_local.buffer.comp,Fail -dEQP-VK.memory_model.write_after_read.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp,Fail dEQP-VK.memory.requirements.dedicated_allocation.buffer.regular,Fail dEQP-VK.memory.requirements.dedicated_allocation.image.transient_tiling_optimal,Fail dEQP-VK.pipeline.extended_dynamic_state.after_pipelines.depth_compare_greater_equal_greater,Fail @@ -66,7 +63,6 @@ dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_combined_image_sampl dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_sampled_image,Crash dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_sampler,Crash dEQP-VK.pipeline.push_descriptor.compute.binding3_numcalls2_storage_image,Crash -dEQP-VK.rasterization.line_continuity.line-strip,Fail dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail dEQP-VK.spirv_assembly.instruction.compute.opquantize.infinities,Fail dEQP-VK.spirv_assembly.instruction.graphics.opquantize.carry_bit_geom,Fail diff --git a/src/freedreno/.gitlab-ci/reference/crash.log b/src/freedreno/.gitlab-ci/reference/crash.log index 3ca8922..8389724 100644 --- a/src/freedreno/.gitlab-ci/reference/crash.log +++ b/src/freedreno/.gitlab-ci/reference/crash.log @@ -7412,7 +7412,7 @@ clusters: 00000080 SP_FS_TEX_COUNT: 128 0000f000 SP_UNKNOWN_A9A8: 0xf000 00421800 SP_CS_CTRL_REG0: { THREADMODE = MULTI | HALFREGFOOTPRINT = 0 | 
FULLREGFOOTPRINT = 48 | BRANCHSTACK = 8 | THREADSIZE = THREAD64 | VARYING } - 0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE_2K | UNK1 = 0xf } + 0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE = 31 } 00000000 SP_CS_BRANCH_COND: 0 00000000 SP_CS_OBJ_FIRST_EXEC_OFFSET: 0 8c415420 SP_CS_OBJ_START: 0x8c415420 @@ -7494,7 +7494,7 @@ clusters: 00000080 SP_FS_TEX_COUNT: 128 0000f000 SP_UNKNOWN_A9A8: 0xf000 00421800 SP_CS_CTRL_REG0: { THREADMODE = MULTI | HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 48 | BRANCHSTACK = 8 | THREADSIZE = THREAD64 | VARYING } - 0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE_2K | UNK1 = 0xf } + 0000001f SP_CS_UNKNOWN_A9B1: { SHARED_SIZE = 31 } 00000000 SP_CS_BRANCH_COND: 0 00000000 SP_CS_OBJ_FIRST_EXEC_OFFSET: 0 8c415420 SP_CS_OBJ_START: 0x8c415420 diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 0694bd8..7c06bf3 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3413,6 +3413,7 @@ emit_instructions(struct ir3_context *ctx) ctx->s->info.clip_distance_array_size; ctx->so->pvtmem_size = ctx->s->scratch_size; + ctx->so->shared_size = ctx->s->shared_size; /* NOTE: need to do something more clever when we support >1 fxn */ nir_foreach_register (reg, &fxn->registers) { diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index d28567b..5d2aaab 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -559,6 +559,9 @@ struct ir3_shader_variant { /* Whether we should use the new per-wave layout rather than per-fiber. 
*/ bool pvtmem_per_wave; + /* Size in bytes of required shared memory */ + unsigned shared_size; + /* About Linkage: * + Let the frag shader determine the position/compmask for the * varyings, since it is the place where we know if the varying diff --git a/src/freedreno/registers/adreno/a6xx.xml b/src/freedreno/registers/adreno/a6xx.xml index 6a50378..f1bac71 100644 --- a/src/freedreno/registers/adreno/a6xx.xml +++ b/src/freedreno/registers/adreno/a6xx.xml @@ -3059,17 +3059,22 @@ to upconvert to 32b float internally? - + + - - bit 0 seems to toggle between 2k and 32k of shared storage - the ldl/stl offset seems to be rewritten to 0 when it is beyond - this limit. This is different from ldlw/stlw, which wraps at - 64k (and has 36k of storage on A640 - reads between 36k-64k - always return 0) - - - + + + If 0 - all 32k of shared storage is enabled, otherwise + (SHARED_SIZE + 1) * 1k is enabled. + The ldl/stl offset seems to be rewritten to 0 when it is beyond + this limit. This is different from ldlw/stlw, which wraps at + 64k (and has 36k of storage on A640 - reads between 36k-64k + always return 0) + + + + + diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 3135311..1389f8a 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -532,8 +532,10 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader, tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova); + uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); - tu_cs_emit(cs, 0x41); + tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | + A6XX_SP_CS_UNKNOWN_A9B1_UNK6); uint32_t local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c index 909b678..e6b42a8 100644 --- 
a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c @@ -77,8 +77,10 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) | COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE)); + uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); - OUT_RING(ring, 0x41); + OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | + A6XX_SP_CS_UNKNOWN_A9B1_UNK6); uint32_t local_invocation_id, work_group_id; local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); -- 2.7.4