return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
} else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
- /* GFX10 wave64 mode: emulate full-wave bpermute */
- if (!ctx->has_gfx10_wave64_bpermute) {
- ctx->has_gfx10_wave64_bpermute = true;
- /* Shared VGPRs are allocated in groups of 8/16 */
- ctx->program->config->num_shared_vgprs = ctx->program->chip_class >= GFX10_3 ? 16 : 8;
- ctx->program->vgpr_limit -= ctx->program->chip_class >= GFX10_3 ? 8 : 4;
- }
+ /* GFX10 wave64 mode: emulate full-wave bpermute */
Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
input_data.setLateKill(true);
same_half.setLateKill(true);
+ /* We need one pair of shared VGPRs:
+ * Note, that these have twice the allocation granularity of normal VGPRs */
+ ctx->program->config->num_shared_vgprs = 2 * ctx->program->vgpr_alloc_granule;
+
return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
} else {
/* GFX8-9 or GFX10 wave32: bpermute works normally */
/* Returns the number of VGPRs a single wave may use when `waves` waves share
 * the register file described by `program`.
 *
 * The result is physical_vgprs / waves, rounded down to a multiple of
 * vgpr_alloc_granule, reduced by half of config->num_shared_vgprs and finally
 * capped at vgpr_limit.
 *
 * NOTE(review): `waves` is used as a divisor without a check here; callers
 * presumably never pass 0 - confirm. */
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
{
/* `/` binds tighter than `&`: divide first, then clear the low bits.
 * The mask only rounds down correctly if vgpr_alloc_granule is a power of
 * two - assumed, TODO confirm. */
uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1);
/* Reserve room for the shared VGPRs. They are charged at half weight,
 * presumably because a shared VGPR occupies half the space of a regular one
 * - verify against the hardware docs.
 * NOTE(review): this is unsigned 16-bit arithmetic, so it would wrap if the
 * subtrahend ever exceeded `vgprs`. */
+ vgprs -= program->config->num_shared_vgprs / 2;
return std::min(vgprs, program->vgpr_limit);
}
program->max_reg_demand = new_demand;
} else {
program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
- program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / get_vgpr_alloc(program, new_demand.vgpr));
+ uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
+ program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / vgpr_demand);
program->max_waves = max_waves_per_simd;
/* adjust max_waves for workgroup and LDS limits */