It appears that storage for varyings in a wave has an upper
limit of wavesize * max_a831, where max_a831 is 64.
Exceeding the limit seems to force the GPU to reduce the number of
primitives processed per wave; at least, the calculations make sense
with such an interpretation.
With the blob, SP_HS_UNKNOWN_A831 never exceeds 64, and setting
it to 65 in freedreno leads to a hang.
On A630, tests (patch_size=3 + gl_Position + array of vec4)
have shown the following relation:
| Num of vec4 | A831 | PC_HS_INPUT_SIZE |
|-------------|------|------------------|
| 1 | 0x10 | 0xc |
| 2 | 0x14 | 0xf |
| 3 | 0x18 | 0x12 |
| 4 | 0x1c | 0x15 |
| 5 | 0x20 | 0x18 |
| 6 | 0x24 | 0x1b |
| 7 | 0x28 | 0x1e |
| 8 | 0x2c | 0x21 |
| 9 | 0x30 | 0x24 |
| 10 | 0x34 | 0x27 |
| 11 | 0x38 | 0x2a |
| 12 | 0x3c | 0x2d |
| 13 | 0x3f | 0x30 |
| 14 | 0x40 | 0x33 |
| 15 | 0x3d | 0x36 |
| 16 | 0x3d | 0x39 |
| 17 | 0x40 | 0x3c |
| 18 | 0x3f | 0x3f |
| 19 | 0x3e | 0x42 |
| 20 | 0x3d | 0x45 |
| 21 | 0x3f | 0x48 |
| 22 | 0x3d | 0x4b |
| 23 | 0x40 | 0x4e |
| 24 | 0x3d | 0x51 |
| 25 | 0x3f | 0x54 |
| 26 | 0x3c | 0x57 |
| 27 | 0x3e | 0x5a |
| 28 | 0x40 | 0x5d |
| 29 | 0x3c | 0x60 |
| 30 | 0x3e | 0x63 |
| 31 | 0x40 | 0x66 |
|-------------|------|------------------|
Brief tests with high patch sizes also confirm that the formula
matches the blob's behaviour.
A831 is not a limit on the storage available to a single thread, so
naming it SP_HS_WAVE_INPUT_SIZE makes more sense.
Fixes:
47e2c195 "freedreno/a6xx: Program state for tessellation stages"
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7917>
00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
00000000 SP_VS_INSTRLEN: 0
00000000 SP_HS_CTRL_REG0: { HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 0 | BRANCHSTACK = 0 | THREADSIZE = TWO_QUADS }
- 00000000 SP_HS_UNKNOWN_A831: 0
+ 00000000 SP_HS_WAVE_INPUT_SIZE: 0
00000000 0xa832: 00000000
00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
780a8ca5 SP_HS_OBJ_START_LO: 0x780a8ca5
00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
00000000 SP_VS_INSTRLEN: 0
00000000 SP_HS_CTRL_REG0: { HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 0 | BRANCHSTACK = 0 | THREADSIZE = TWO_QUADS }
- 00000000 SP_HS_UNKNOWN_A831: 0
+ 00000000 SP_HS_WAVE_INPUT_SIZE: 0
00000000 0xa832: 00000000
00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
780a8ca5 SP_HS_OBJ_START_LO: 0x780a8ca5
t4 write HLSQ_CS_CNTL (b987)
HLSQ_CS_CNTL: { CONSTLEN = 0 }
0000000001054250: 0000: 40b98701 00000000
-t4 write SP_HS_UNKNOWN_A831 (a831)
- SP_HS_UNKNOWN_A831: 0
+t4 write SP_HS_WAVE_INPUT_SIZE (a831)
+ SP_HS_WAVE_INPUT_SIZE: 0
0000000001054258: 0000: 48a83101 00000000
t4 write VFD_CONTROL_1 (a001)
VFD_CONTROL_1: { REGID4VTX = r2.y | REGID4INST = r63.x | REGID4PRIMID = r63.x | REGID4VIEWID = r63.x }
- shaderdb: 0 (ss), 0 (sy)
!+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
!+ 00000001 SP_VS_INSTRLEN: 1
- + 00000000 SP_HS_UNKNOWN_A831: 0
+ + 00000000 SP_HS_WAVE_INPUT_SIZE: 0
+ 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_GS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
t4 write SP_VS_VPC_DST[0].REG (a813)
SP_VS_VPC_DST[0].REG: { OUTLOC0 = 0 | OUTLOC1 = 0 | OUTLOC2 = 0 | OUTLOC3 = 0 }
0000000001121070: 0000: 48a81301 00000000
-t4 write SP_HS_UNKNOWN_A831 (a831)
- SP_HS_UNKNOWN_A831: 0
+t4 write SP_HS_WAVE_INPUT_SIZE (a831)
+ SP_HS_WAVE_INPUT_SIZE: 0
0000000001121078: 0000: 48a83101 00000000
t4 write SP_VS_PRIMITIVE_CNTL (a802)
SP_VS_PRIMITIVE_CNTL: { OUT = 1 }
- shaderdb: 0 (ss), 0 (sy)
!+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
!+ 00000001 SP_VS_INSTRLEN: 1
- + 00000000 SP_HS_UNKNOWN_A831: 0
+ + 00000000 SP_HS_WAVE_INPUT_SIZE: 0
+ 00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
+ 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
t4 write SP_VS_VPC_DST[0].REG (a813)
SP_VS_VPC_DST[0].REG: { OUTLOC0 = 0 | OUTLOC1 = 0 | OUTLOC2 = 0 | OUTLOC3 = 0 }
0000000001120070: 0000: 48a81301 00000000
-t4 write SP_HS_UNKNOWN_A831 (a831)
- SP_HS_UNKNOWN_A831: 0
+t4 write SP_HS_WAVE_INPUT_SIZE (a831)
+ SP_HS_WAVE_INPUT_SIZE: 0
0000000001120078: 0000: 48a83101 00000000
t4 write SP_VS_PRIMITIVE_CNTL (a802)
SP_VS_PRIMITIVE_CNTL: { OUT = 1 }
- shaderdb: 0 (ss), 0 (sy)
+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000001 SP_VS_INSTRLEN: 1
- + 00000000 SP_HS_UNKNOWN_A831: 0
+ + 00000000 SP_HS_WAVE_INPUT_SIZE: 0
+ 00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
+ 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
<reg32 offset="0xa825" name="SP_VS_PVT_MEM_HW_STACK_OFFSET" low="0" high="18" shr="11"/>
<reg32 offset="0xa830" name="SP_HS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0"/>
- <reg32 offset="0xa831" name="SP_HS_UNKNOWN_A831"/>
+ <!--
+ Total size of local storage in dwords divided by the wave size.
+ The maximum value is 64. With the wave size being always 64 for HS,
+ the maximum size of local storage should be:
+ 64 (wavesize) * 64 (SP_HS_WAVE_INPUT_SIZE) * 4 = 16k
+ -->
+ <reg32 offset="0xa831" name="SP_HS_WAVE_INPUT_SIZE" low="0" high="7" type="uint"/>
<reg32 offset="0xa833" name="SP_HS_OBJ_FIRST_EXEC_OFFSET" type="uint"/>
<reg32 offset="0xa834" name="SP_HS_OBJ_START_LO"/>
unknown_a831 = DIV_ROUND_UP(total_size, wavesize);
}
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
tu_cs_emit(cs, unknown_a831);
/* In SPIR-V generated from GLSL, the tessellation primitive params are
tu_cs_emit(cs, builder->multiview_mask);
}
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
tu_cs_emit(cs, 0);
tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch,
OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4);
- OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
- OUT_RING(ring, vs->output_size);
+ const uint32_t wavesize = 64;
+ const uint32_t max_wave_input_size = 64;
+ const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out;
+
+ /* note: if HS is really just the VS extended, then this
+ * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
+ * however that doesn't match the blob, and fails some dEQP tests.
+ */
+ uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out;
+ uint32_t max_prims_per_wave =
+ max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
+ prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);
+
+ uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
+ uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);
+
+ OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
+ OUT_RING(ring, wave_input_size);
shader_info *ds_info = &ds->shader->nir->info;
OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1);
A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
} else {
- OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
+ OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
OUT_RING(ring, 0);
}