break;
}
case nir_intrinsic_load_hs_out_patch_data_offset_amd:
- replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 11, 21);
+ replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 16, 16);
break;
case nir_intrinsic_load_ring_tess_offchip_offset_amd:
replacement = ac_nir_load_arg(b, &args->ac, args->ac.tess_offchip_offset);
* [0:5] = the number of patches per threadgroup - 1, max = 63
* # 5 bits
* [6:10] = the number of output vertices per patch - 1, max = 31
- * # 21 bits
- * [11:31] = the offset of per patch attributes in the buffer in bytes.
- * max = NUM_PATCHES*32*32*16 = 1M
+ * # 16 bits
+ * [16:31] = the offset of per patch attributes in the buffer in bytes.
+ * 64 outputs are implied by SI_UNIQUE_SLOT_* values.
+ * max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) = 2M,
+ * clamped to 32K(LDS limit) = 32K
*/
struct ac_arg tcs_offchip_layout;
/* API TCS */
/* Offsets where TCS outputs and TCS patch outputs live in LDS (<= 32K):
- * [16:31] = TCS output patch0 offset for per-patch / 4, max = 16K / 4 = 4K
+ * [16:31] = TCS output patch0 offset for per-patch / 4,
+ * 64 outputs are implied by SI_UNIQUE_SLOT_* values.
+ * max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4
+ * = 1M, clamped to 32K(LDS limit) / 4 = 8K
*/
struct ac_arg tcs_out_lds_offsets;
/* Layout of TCS outputs / TES inputs:
base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
if (!vertex_index) {
- LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->args->tcs_offchip_layout, 11, 21);
+ LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->args->tcs_offchip_layout, 16, 16);
base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, "");
}
assert(num_tcs_input_cp <= 32);
assert(num_tcs_output_cp <= 32);
assert(num_patches <= 64);
- assert(((pervertex_output_patch_size * num_patches) & ~0x1fffff) == 0);
+ assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0);
uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ?
si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
sctx->tcs_out_offsets = ((perpatch_output_offset / 4) << 16);
sctx->tcs_offchip_layout =
(num_patches - 1) | ((num_tcs_output_cp - 1) << 6) |
- ((pervertex_output_patch_size * num_patches) << 11);
+ ((pervertex_output_patch_size * num_patches) << 16);
/* Compute the LDS size. */
unsigned lds_size = lds_per_patch * num_patches;