From e5499ca2bfe8b8d9520e2ef72ebc0da24425992c Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Mon, 14 Dec 2020 18:42:59 +0200 Subject: [PATCH] freedreno/a6xx: Fix SP_HS_UNKNOWN_A831 value and document it It appears that storage for varyings in a wave has an upper limit of wavesize * max_a831, where max_a831 is 64. Exceeding the limit seems to force the GPU to reduce the number of primitives processed per wave; at least the calculations make sense with this interpretation. With the blob, SP_HS_UNKNOWN_A831 never exceeds 64, and setting it to 65 in freedreno leads to a hang. On A630, tests (patch_size=3 + gl_Position + array of vec4) have shown the following relation: | Num of vec4 | A831 | PC_HS_INPUT_SIZE | |-------------|------|------------------| | 1 | 0x10 | 0xc | | 2 | 0x14 | 0xf | | 3 | 0x18 | 0x12 | | 4 | 0x1c | 0x15 | | 5 | 0x20 | 0x18 | | 6 | 0x24 | 0x1b | | 7 | 0x28 | 0x1e | | 8 | 0x2c | 0x21 | | 9 | 0x30 | 0x24 | | 10 | 0x34 | 0x27 | | 11 | 0x38 | 0x2a | | 12 | 0x3c | 0x2d | | 13 | 0x3f | 0x30 | | 14 | 0x40 | 0x33 | | 15 | 0x3d | 0x36 | | 16 | 0x3d | 0x39 | | 17 | 0x40 | 0x3c | | 18 | 0x3f | 0x3f | | 19 | 0x3e | 0x42 | | 20 | 0x3d | 0x45 | | 21 | 0x3f | 0x48 | | 22 | 0x3d | 0x4b | | 23 | 0x40 | 0x4e | | 24 | 0x3d | 0x51 | | 25 | 0x3f | 0x54 | | 26 | 0x3c | 0x57 | | 27 | 0x3e | 0x5a | | 28 | 0x40 | 0x5d | | 29 | 0x3c | 0x60 | | 30 | 0x3e | 0x63 | | 31 | 0x40 | 0x66 | |-------------|------|------------------| Brief tests with high patch sizes also confirm that the formula matches the blob's behaviour. A831 is not a limit on the storage available for one thread, so naming it SP_HS_WAVE_INPUT_SIZE makes more sense. 
Fixes: 47e2c195 "freedreno/a6xx: Program state for tessellation stages" Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/.gitlab-ci/reference/crash.log | 4 ++-- ...w.indexed.indirect_draw_count.triangle_list.log | 6 +++--- src/freedreno/.gitlab-ci/reference/fd-clouds.log | 12 ++++++------ src/freedreno/registers/adreno/a6xx.xml | 8 +++++++- src/freedreno/vulkan/tu_pipeline.c | 4 ++-- src/gallium/drivers/freedreno/a6xx/fd6_program.c | 22 +++++++++++++++++++--- 6 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/freedreno/.gitlab-ci/reference/crash.log b/src/freedreno/.gitlab-ci/reference/crash.log index 76ee876..db42f99 100644 --- a/src/freedreno/.gitlab-ci/reference/crash.log +++ b/src/freedreno/.gitlab-ci/reference/crash.log @@ -7973,7 +7973,7 @@ clusters: 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 } 00000000 SP_VS_INSTRLEN: 0 00000000 SP_HS_CTRL_REG0: { HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 0 | BRANCHSTACK = 0 | THREADSIZE = TWO_QUADS } - 00000000 SP_HS_UNKNOWN_A831: 0 + 00000000 SP_HS_WAVE_INPUT_SIZE: 0 00000000 0xa832: 00000000 00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0 780a8ca5 SP_HS_OBJ_START_LO: 0x780a8ca5 @@ -8119,7 +8119,7 @@ clusters: 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 } 00000000 SP_VS_INSTRLEN: 0 00000000 SP_HS_CTRL_REG0: { HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 0 | BRANCHSTACK = 0 | THREADSIZE = TWO_QUADS } - 00000000 SP_HS_UNKNOWN_A831: 0 + 00000000 SP_HS_WAVE_INPUT_SIZE: 0 00000000 0xa832: 00000000 00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0 780a8ca5 SP_HS_OBJ_START_LO: 0x780a8ca5 diff --git a/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log b/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log index 3f56d8a..8900e61 100644 --- a/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log +++ 
b/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log @@ -948,8 +948,8 @@ t4 write SP_CS_CONFIG (a9bb) t4 write HLSQ_CS_CNTL (b987) HLSQ_CS_CNTL: { CONSTLEN = 0 } 0000000001054250: 0000: 40b98701 00000000 -t4 write SP_HS_UNKNOWN_A831 (a831) - SP_HS_UNKNOWN_A831: 0 +t4 write SP_HS_WAVE_INPUT_SIZE (a831) + SP_HS_WAVE_INPUT_SIZE: 0 0000000001054258: 0000: 48a83101 00000000 t4 write VFD_CONTROL_1 (a001) VFD_CONTROL_1: { REGID4VTX = r2.y | REGID4INST = r63.x | REGID4PRIMID = r63.x | REGID4VIEWID = r63.x } @@ -1511,7 +1511,7 @@ t7 opcode: CP_DRAW_INDIRECT_MULTI (2a) (12 dwords) - shaderdb: 0 (ss), 0 (sy) !+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 } !+ 00000001 SP_VS_INSTRLEN: 1 - + 00000000 SP_HS_UNKNOWN_A831: 0 + + 00000000 SP_HS_WAVE_INPUT_SIZE: 0 + 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 } + 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 } + 00000000 SP_GS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 } diff --git a/src/freedreno/.gitlab-ci/reference/fd-clouds.log b/src/freedreno/.gitlab-ci/reference/fd-clouds.log index 24fdc66..77bf190 100644 --- a/src/freedreno/.gitlab-ci/reference/fd-clouds.log +++ b/src/freedreno/.gitlab-ci/reference/fd-clouds.log @@ -681,8 +681,8 @@ t4 write SP_VS_OUT[0].REG (a803) t4 write SP_VS_VPC_DST[0].REG (a813) SP_VS_VPC_DST[0].REG: { OUTLOC0 = 0 | OUTLOC1 = 0 | OUTLOC2 = 0 | OUTLOC3 = 0 } 0000000001121070: 0000: 48a81301 00000000 -t4 write SP_HS_UNKNOWN_A831 (a831) - SP_HS_UNKNOWN_A831: 0 +t4 write SP_HS_WAVE_INPUT_SIZE (a831) + SP_HS_WAVE_INPUT_SIZE: 0 0000000001121078: 0000: 48a83101 00000000 t4 write SP_VS_PRIMITIVE_CNTL (a802) SP_VS_PRIMITIVE_CNTL: { OUT = 1 } @@ -1118,7 +1118,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords) - shaderdb: 0 (ss), 0 (sy) !+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 } !+ 00000001 SP_VS_INSTRLEN: 1 - + 00000000 SP_HS_UNKNOWN_A831: 0 + + 00000000 SP_HS_WAVE_INPUT_SIZE: 0 + 
00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0 + 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 } + 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 } @@ -1996,8 +1996,8 @@ t4 write SP_VS_OUT[0].REG (a803) t4 write SP_VS_VPC_DST[0].REG (a813) SP_VS_VPC_DST[0].REG: { OUTLOC0 = 0 | OUTLOC1 = 0 | OUTLOC2 = 0 | OUTLOC3 = 0 } 0000000001120070: 0000: 48a81301 00000000 -t4 write SP_HS_UNKNOWN_A831 (a831) - SP_HS_UNKNOWN_A831: 0 +t4 write SP_HS_WAVE_INPUT_SIZE (a831) + SP_HS_WAVE_INPUT_SIZE: 0 0000000001120078: 0000: 48a83101 00000000 t4 write SP_VS_PRIMITIVE_CNTL (a802) SP_VS_PRIMITIVE_CNTL: { OUT = 1 } @@ -5343,7 +5343,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords) - shaderdb: 0 (ss), 0 (sy) + 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 } + 00000001 SP_VS_INSTRLEN: 1 - + 00000000 SP_HS_UNKNOWN_A831: 0 + + 00000000 SP_HS_WAVE_INPUT_SIZE: 0 + 00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0 + 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 } + 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 } diff --git a/src/freedreno/registers/adreno/a6xx.xml b/src/freedreno/registers/adreno/a6xx.xml index 082fd72..61a73cb 100644 --- a/src/freedreno/registers/adreno/a6xx.xml +++ b/src/freedreno/registers/adreno/a6xx.xml @@ -3220,7 +3220,13 @@ to upconvert to 32b float internally? 
- + + diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 883d549..bd7072d 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -1058,7 +1058,7 @@ tu6_emit_vpc(struct tu_cs *cs, unknown_a831 = DIV_ROUND_UP(total_size, wavesize); } - tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1); + tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); tu_cs_emit(cs, unknown_a831); /* In SPIR-V generated from GLSL, the tessellation primitive params are @@ -1563,7 +1563,7 @@ tu6_emit_program(struct tu_cs *cs, tu_cs_emit(cs, builder->multiview_mask); } - tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1); + tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); tu_cs_emit(cs, 0); tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch, diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index c872c94..251479a 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -654,8 +654,24 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4); - OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1); - OUT_RING(ring, vs->output_size); + const uint32_t wavesize = 64; + const uint32_t max_wave_input_size = 64; + const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; + + /* note: if HS is really just the VS extended, then this + * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) + * however that doesn't match the blob, and fails some dEQP tests. 
+ */ + uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; + uint32_t max_prims_per_wave = + max_wave_input_size * wavesize / (vs->output_size * patch_control_points); + prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); + + uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave; + uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); + + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); shader_info *ds_info = &ds->shader->nir->info; OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); @@ -706,7 +722,7 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); } else { - OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1); + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); OUT_RING(ring, 0); } -- 2.7.4