if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
/* We need to store the primitive IDs in LDS */
unsigned lds_size = ctx->program->info->ngg_info.esgs_ring_size;
- ctx->program->config->lds_size = (lds_size + ctx->program->lds_alloc_granule - 1) /
- ctx->program->lds_alloc_granule;
+ ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->lds_encoding_granule);
}
}
unsigned total_lds_bytes = esgs_ring_bytes + ngg_emit_bytes + ngg_gs_scratch_bytes;
assert(total_lds_bytes >= ctx->ngg_gs_emit_addr);
assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr);
- ctx->program->config->lds_size = (total_lds_bytes + ctx->program->lds_alloc_granule - 1) / ctx->program->lds_alloc_granule;
+ ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->lds_encoding_granule);
/* Make sure we have enough room for emitted GS vertices */
if (nir->info.gs.vertices_out)
ctx->args->shader_info->tcs.num_patches = ctx->tcs_num_patches;
ctx->args->shader_info->tcs.num_lds_blocks = lds_size;
- ctx->program->config->lds_size = (lds_size + ctx->program->lds_alloc_granule - 1) /
- ctx->program->lds_alloc_granule;
+ ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->lds_encoding_granule);
}
void
break;
}
case MESA_SHADER_COMPUTE: {
- ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) /
- ctx->program->lds_alloc_granule;
+ ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.cs.shared_size, ctx->program->lds_encoding_granule);
break;
}
case MESA_SHADER_VERTEX: {
}
/* Make sure we fit the available LDS space. */
- assert((ctx->program->config->lds_size * ctx->program->lds_alloc_granule) <= ctx->program->lds_limit);
+ assert((ctx->program->config->lds_size * ctx->program->lds_encoding_granule) <= ctx->program->lds_limit);
}
void
program->wave_size = info->wave_size;
program->lane_mask = program->wave_size == 32 ? s1 : s2;
- program->lds_alloc_granule = chip_class >= GFX7 ? 512 : 256;
+ program->lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
+ program->lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->lds_encoding_granule;
program->lds_limit = chip_class >= GFX7 ? 65536 : 32768;
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
program->has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
if (program->config->lds_size) {
- unsigned lds = program->config->lds_size * program->lds_alloc_granule;
+ unsigned lds = program->config->lds_size * program->lds_encoding_granule;
+ lds = align(lds, program->lds_alloc_granule);
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
}
if (waves_per_workgroup > 1 && program->chip_class < GFX10)