Also modify all existing uses to pass a zero to this new src.
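
For example, a typical load call site changes like this (a sketch condensed
from the hunks below; zero is just nir_imm_int(b, 0)):

    nir_ssa_def *zero = nir_imm_int(b, 0);
    comps[i] = nir_load_buffer_amd(b, 1, 32, desc, v_off, s_off, zero,
                                   .base = component_stride * i);

Callers that want real index addressing can pass a non-zero value instead;
the backends only emit idxen when the source isn't a constant zero.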
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@collabora.com> (nir)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17551>
full_dwords++;
}
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+
for (unsigned i = 0; i < full_dwords; ++i)
- comps[i] = nir_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
+ comps[i] = nir_load_buffer_amd(b, 1, 32, desc, v_off, s_off, zero,
.base = component_stride * i, .memory_modes = nir_var_shader_in,
.access = ACCESS_COHERENT);
if (remaining_bytes)
- comps[full_dwords] = nir_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
+ comps[full_dwords] = nir_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off, zero,
.base = component_stride * full_dwords,
.memory_modes = nir_var_shader_in,
.access = ACCESS_COHERENT);
unsigned component_stride, unsigned num_components, unsigned bit_size,
unsigned writemask, bool swizzled, bool slc)
{
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+
while (writemask) {
int start, count;
u_bit_scan_consecutive_range(&writemask, &start, &count);
store_bytes = MIN2(store_bytes, 2);
nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
- nir_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
+ nir_store_buffer_amd(b, store_val, desc, v_off, s_off, zero, .is_swizzled = swizzled, .slc_amd = slc,
.base = start_byte, .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
start_byte += store_bytes;
nir_ssa_def *out_data =
nir_load_shared(b, count, 32, vtx_lds_addr, .base = offset);
+ nir_ssa_def *zero = nir_imm_int(b, 0);
nir_store_buffer_amd(b, out_data, so_buffer[out->buffer],
vtx_buffer_offsets[out->buffer],
- nir_imm_int(b, 0),
+ zero, zero,
.base = out->offset,
.slc_amd = true);
}
} else if (out_mode == ms_out_mode_vram) {
nir_ssa_def *ring = nir_load_ring_mesh_scratch_amd(b);
nir_ssa_def *off = nir_load_ring_mesh_scratch_offset_amd(b);
- nir_store_buffer_amd(b, store_val, ring, addr, off,
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ nir_store_buffer_amd(b, store_val, ring, addr, off, zero,
.base = const_off,
.write_mask = write_mask,
.memory_modes = nir_var_shader_out,
} else if (out_mode == ms_out_mode_vram) {
nir_ssa_def *ring = nir_load_ring_mesh_scratch_amd(b);
nir_ssa_def *off = nir_load_ring_mesh_scratch_offset_amd(b);
- return nir_load_buffer_amd(b, num_components, load_bit_size, ring, addr, off,
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ return nir_load_buffer_amd(b, num_components, load_bit_size, ring, addr, off, zero,
.base = const_off,
.memory_modes = nir_var_shader_out,
.access = ACCESS_COHERENT);
nir_ssa_def *ring = nir_load_ring_task_draw_amd(b);
nir_ssa_def *scalar_off = nir_imul_imm(b, ptr, s->draw_entry_bytes);
nir_ssa_def *vector_off = nir_imm_int(b, 0);
+ nir_ssa_def *zero = nir_imm_int(b, 0);
- nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off,
+ nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off, zero,
.base = const_off, .memory_modes = nir_var_shader_out,
.access = ACCESS_COHERENT);
}
nir_ssa_def *ring = nir_load_ring_task_payload_amd(b);
nir_ssa_def *ptr = task_ring_entry_index(b, s);
nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes);
+ nir_ssa_def *zero = nir_imm_int(b, 0);
- nir_store_buffer_amd(b, store_val, ring, addr, ring_off, .base = base,
+ nir_store_buffer_amd(b, store_val, ring, addr, ring_off, zero, .base = base,
.write_mask = write_mask,
.memory_modes = nir_var_mem_task_payload,
.access = ACCESS_COHERENT);
nir_ssa_def *addr = intrin->src[0].ssa;
nir_ssa_def *ring = nir_load_ring_task_payload_amd(b);
nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes);
+ nir_ssa_def *zero = nir_imm_int(b, 0);
- return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, .base = base,
+ return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, zero, .base = base,
.memory_modes = nir_var_mem_task_payload,
.access = ACCESS_COHERENT);
}
nir_ssa_def *hs_ring_tess_offchip = nir_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
- nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset,
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset, zero,
.write_mask = write_mask, .memory_modes = nir_var_shader_out,
.access = ACCESS_COHERENT);
}
.align_mul = 16u, .align_offset = st->tcs_tess_lvl_in_loc % 16u)
: NULL;
+ nir_ssa_def *zero = nir_imm_int(b, 0);
nir_ssa_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
nir_ssa_def *tess_factors_base = nir_load_ring_tess_factors_offset_amd(b);
nir_ssa_def *tess_factors_offset = nir_imul_imm(b, rel_patch_id, (inner_comps + outer_comps) * 4u);
/* Store the dynamic HS control word. */
nir_if *rel_patch_id_zero = nir_push_if(b, nir_ieq_imm(b, rel_patch_id, 0));
nir_ssa_def *ctrlw = nir_imm_int(b, 0x80000000u);
- nir_store_buffer_amd(b, ctrlw, tessfactor_ring, nir_imm_zero(b, 1, 32), tess_factors_base,
+ nir_store_buffer_amd(b, ctrlw, tessfactor_ring, zero, tess_factors_base, zero,
.access = ACCESS_COHERENT);
tess_factors_const_offset += 4;
nir_pop_if(b, rel_patch_id_zero);
if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
/* LINES reversal */
nir_ssa_def *t = nir_vec2(b, nir_channel(b, tessfactors_outer, 1), nir_channel(b, tessfactors_outer, 0));
- nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
.base = tess_factors_const_offset, .access = ACCESS_COHERENT);
} else if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
nir_ssa_def *t = nir_vec4(b, nir_channel(b, tessfactors_outer, 0), nir_channel(b, tessfactors_outer, 1),
nir_channel(b, tessfactors_outer, 2), nir_channel(b, tessfactors_inner, 0));
- nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
.base = tess_factors_const_offset, .access = ACCESS_COHERENT);
} else {
- nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
.base = tess_factors_const_offset, .access = ACCESS_COHERENT);
- nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
.base = tess_factors_const_offset + 4u * outer_comps, .access = ACCESS_COHERENT);
}
nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
nir_ssa_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_out_loc);
- nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset,
+ nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset, zero,
.memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
if (inner_comps) {
nir_ssa_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_in_loc);
- nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset,
+ nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset, zero,
.memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
}
}
? hs_per_vertex_output_vmem_offset(b, st, intrin)
: hs_per_patch_output_vmem_offset(b, st, intrin, 0);
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+
return nir_load_buffer_amd(b, intrin->dest.ssa.num_components,
- intrin->dest.ssa.bit_size, offchip_ring, off, offchip_offset,
+ intrin->dest.ssa.bit_size, offchip_ring,
+ off, offchip_offset, zero,
.access = ACCESS_COHERENT);
}
unsigned num_components;
unsigned component_size;
Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
+ Temp idx = Temp(0, v1); /* buffer index */
unsigned component_stride = 0;
unsigned const_offset = 0;
unsigned align_mul = 0;
soffset = Operand(info.soffset);
}
+ bool offen = !vaddr.isUndefined();
+ bool idxen = info.idx.id();
+
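+ /* With both idxen and offen, vaddr takes two VGPRs: index first, then byte offset. */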
+ if (offen && idxen)
+ vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
+ else if (idxen)
+ vaddr = Operand(info.idx);
+
unsigned bytes_size = 0;
aco_opcode op;
if (bytes_needed == 1 || align_ % 2) {
mubuf->operands[0] = Operand(info.resource);
mubuf->operands[1] = vaddr;
mubuf->operands[2] = soffset;
- mubuf->offen = (offset.type() == RegType::vgpr);
+ mubuf->offen = offen;
+ mubuf->idxen = idxen;
mubuf->glc = info.glc;
mubuf->dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
}
void
-emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
+emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx, Temp vdata,
unsigned const_offset, memory_sync_info sync, bool glc, bool slc,
bool swizzled)
{
aco_opcode op = get_buffer_store_op(vdata.bytes());
const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
- Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
+ bool offen = voffset.id();
+ bool idxen = idx.id();
+
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
glc &= ctx->program->gfx_level < GFX11;
+
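+ /* Select the vaddr operand: an (index, offset) pair, a single VGPR, or undefined. */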
+ Operand vaddr_op(v1);
+ if (offen && idxen)
+ vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, voffset);
+ else if (offen)
+ vaddr_op = Operand(voffset);
+ else if (idxen)
+ vaddr_op = Operand(idx);
+
Builder::Result r =
- bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
- /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
- /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false,
- /* glc */ glc, /* dlc*/ false, /* slc */ slc);
+ bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset,
+ offen, swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc,
+ /* dlc */ false, slc);
r.instr->mubuf().sync = sync;
}
void
-store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
+store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
bool swizzled, memory_sync_info sync, bool glc, bool slc)
{
for (unsigned i = 0; i < write_count; i++) {
unsigned const_offset = offsets[i] + base_const_offset;
- emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset, sync,
glc, slc, swizzled);
}
}
void
-load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
+load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
unsigned swizzle_element_size, bool glc, bool slc, memory_sync_info sync)
{
Builder bld(ctx->program, ctx->block);
LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
+ info.idx = idx;
info.component_stride = swizzle_element_size;
info.glc = glc;
info.slc = slc;
{
Builder bld(ctx->program, ctx->block);
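+ /* Use index addressing (idxen) unless the index source is a constant zero. */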
+ bool idxen = !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
+
Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
+ Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
bool swizzled = nir_intrinsic_is_swizzled(intrin);
bool slc = nir_intrinsic_slc_amd(intrin);
nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
- load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
+ load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, idx, const_offset, elem_size_bytes,
num_components, swizzle_element_size, glc, slc, sync);
}
{
Builder bld(ctx->program, ctx->block);
+ bool idxen = !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
+
Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
+ Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
bool swizzled = nir_intrinsic_is_swizzled(intrin);
bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
- store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
+ store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset, elem_size_bytes,
write_mask, swizzled, sync, glc, slc);
}
Temp val = bld.tmp(v1);
unsigned const_offset = offset * program->info.gs.vertices_out * 16 * 4;
- load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
+ load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), Temp(), const_offset, 4, 1, 0, true,
true, memory_sync_info());
ctx.outputs.mask[i] |= 1 << j;
/* Currently ignored. */
break;
case nir_intrinsic_load_buffer_amd: {
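+ /* Use the index source only when it can be non-zero. */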
+ bool idxen = !nir_src_is_const(instr->src[3]) || nir_src_as_uint(instr->src[3]);
+
LLVMValueRef descriptor = get_src(ctx, instr->src[0]);
LLVMValueRef addr_voffset = get_src(ctx, instr->src[1]);
LLVMValueRef addr_soffset = get_src(ctx, instr->src[2]);
+ LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[3]) : NULL;
unsigned num_components = instr->dest.ssa.num_components;
unsigned const_offset = nir_intrinsic_base(instr);
bool swizzled = nir_intrinsic_is_swizzled(instr);
LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, addr_voffset,
LLVMConstInt(ctx->ac.i32, const_offset, 0), "");
- result = ac_build_buffer_load(&ctx->ac, descriptor, num_components, NULL, voffset,
+ result = ac_build_buffer_load(&ctx->ac, descriptor, num_components, vidx, voffset,
addr_soffset, channel_type, cache_policy, reorder, false);
+
result = ac_to_integer(&ctx->ac, ac_trim_vector(&ctx->ac, result, num_components));
break;
}
case nir_intrinsic_store_buffer_amd: {
+ bool idxen = !nir_src_is_const(instr->src[4]) || nir_src_as_uint(instr->src[4]);
+
LLVMValueRef store_data = get_src(ctx, instr->src[0]);
LLVMValueRef descriptor = get_src(ctx, instr->src[1]);
LLVMValueRef addr_voffset = get_src(ctx, instr->src[2]);
LLVMValueRef addr_soffset = get_src(ctx, instr->src[3]);
+ LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[4]) : NULL;
unsigned const_offset = nir_intrinsic_base(instr);
bool swizzled = nir_intrinsic_is_swizzled(instr);
bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
LLVMConstInt(ctx->ac.i32, const_offset + start * 4, 0), "");
LLVMValueRef data = extract_vector_range(&ctx->ac, store_data, start, count);
- ac_build_buffer_store_dword(&ctx->ac, descriptor, data, NULL, voffset, addr_soffset,
+ ac_build_buffer_store_dword(&ctx->ac, descriptor, data, vidx, voffset, addr_soffset,
cache_policy);
}
break;
# AMD GCN/RDNA specific intrinsics
-# src[] = { descriptor, base address, scalar offset }
-intrinsic("load_buffer_amd", src_comp=[4, 1, 1], dest_comp=0, indices=[BASE, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS], flags=[CAN_ELIMINATE])
-# src[] = { store value, descriptor, base address, scalar offset }
-intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1], indices=[BASE, WRITE_MASK, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS])
+# src[] = { descriptor, vector byte offset, scalar byte offset, index offset }
+# The index offset is multiplied by the buffer stride from the descriptor; the vector and scalar
+# offsets are in bytes.
+intrinsic("load_buffer_amd", src_comp=[4, 1, 1, 1], dest_comp=0, indices=[BASE, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS], flags=[CAN_ELIMINATE])
+# src[] = { store value, descriptor, vector byte offset, scalar byte offset, index offset }
+intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1, 1], indices=[BASE, WRITE_MASK, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS])
# src[] = { address, unsigned 32-bit offset }.
load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
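
For reference, a simplified model of how these sources combine into the final
address of a raw (non-swizzled) buffer access. This is an illustrative sketch
of the MUBUF addressing rules, not code from this patch; real hardware also
applies range checking against the descriptor's num_records:

    #include <stdbool.h>
    #include <stdint.h>

    /* Simplified effective address of a raw buffer access. */
    static uint64_t
    buffer_address(uint64_t desc_base, uint32_t stride,
                   uint32_t soffset,     /* scalar byte offset */
                   uint32_t inst_offset, /* constant BASE from the intrinsic */
                   bool idxen, uint32_t index,
                   bool offen, uint32_t voffset /* vector byte offset */)
    {
       uint64_t addr = desc_base + soffset + inst_offset;
       if (idxen)
          addr += (uint64_t)index * stride; /* index is scaled by the stride */
       if (offen)
          addr += voffset; /* vector offset is already in bytes */
       return addr;
    }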