input_data.setLateKill(true);
same_half.setLateKill(true);
- /* We need one pair of shared VGPRs:
- * Note, that these have twice the allocation granularity of normal VGPRs */
- ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
+ if (ctx->options->gfx_level <= GFX10_3) {
+ /* We need one pair of shared VGPRs.
+ * Note that these have twice the allocation granularity of normal VGPRs.
+ */
+ ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
- return bld.pseudo(aco_opcode::p_bpermute_gfx10w64, bld.def(v1), bld.def(s2), bld.def(s1, scc),
- index_x4, input_data, same_half);
+ return bld.pseudo(aco_opcode::p_bpermute_gfx10w64, bld.def(v1), bld.def(s2),
+ bld.def(s1, scc), index_x4, input_data, same_half);
+ } else {
+ unreachable("emit_bpermute does not yet support GFX11+");
+ }
} else {
/* GFX8-9 or GFX10 wave32: bpermute works normally */
Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
* manually swap the data between the two halves using two shared VGPRs.
*/
- assert(program->gfx_level >= GFX10);
+ assert(program->gfx_level >= GFX10 && program->gfx_level <= GFX10_3);
assert(program->wave_size == 64);
unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
/* Restore saved EXEC */
bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
- /* RA assumes that the result is always in the low part of the register, so we have to shift, if
- * it's not there already */
+ /* RA assumes that the result is always in the low part of the register, so we have to shift
+ * if it's not there already.
+ */
if (input_data.physReg().byte()) {
unsigned right_shift = input_data.physReg().byte() * 8;
bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),