}
for (unsigned i = 0; i < full_dwords; ++i)
- comps[i] = nir_build_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
- .base = component_stride * i, .memory_modes = nir_var_shader_in);
+ comps[i] = nir_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
+ .base = component_stride * i, .memory_modes = nir_var_shader_in,
+ .access = ACCESS_COHERENT);
if (remaining_bytes)
- comps[full_dwords] = nir_build_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
- .base = component_stride * full_dwords, .memory_modes = nir_var_shader_in);
+ comps[full_dwords] = nir_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
+ .base = component_stride * full_dwords,
+ .memory_modes = nir_var_shader_in,
+ .access = ACCESS_COHERENT);
return nir_extract_bits(b, comps, full_dwords + !!remaining_bytes, 0, num_components, bit_size);
}
store_bytes = MIN2(store_bytes, 2);
nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
- nir_build_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
- .base = start_byte, .memory_modes = nir_var_shader_out);
+ nir_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
+ .base = start_byte, .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
start_byte += store_bytes;
bytes -= store_bytes;
nir_store_buffer_amd(b, store_val, ring, addr, off,
.base = const_off,
.write_mask = write_mask,
- .memory_modes = nir_var_shader_out);
+ .memory_modes = nir_var_shader_out,
+ .access = ACCESS_COHERENT);
} else if (out_mode == ms_out_mode_var) {
if (store_val->bit_size > 32) {
/* Split 64-bit store values to 32-bit components. */
nir_ssa_def *off = nir_load_ring_mesh_scratch_offset_amd(b);
return nir_load_buffer_amd(b, num_components, load_bit_size, ring, addr, off,
.base = const_off,
- .memory_modes = nir_var_shader_out);
+ .memory_modes = nir_var_shader_out,
+ .access = ACCESS_COHERENT);
} else if (out_mode == ms_out_mode_var) {
nir_ssa_def *arr[8] = {0};
unsigned num_32bit_components = num_components * load_bit_size / 32;
nir_ssa_def *vector_off = nir_imm_int(b, 0);
nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off,
- .base = const_off, .memory_modes = nir_var_shader_out);
+ .base = const_off, .memory_modes = nir_var_shader_out,
+ .access = ACCESS_COHERENT);
}
static bool
nir_store_buffer_amd(b, store_val, ring, addr, ring_off, .base = base,
.write_mask = write_mask,
- .memory_modes = nir_var_mem_task_payload);
+ .memory_modes = nir_var_mem_task_payload,
+ .access = ACCESS_COHERENT);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes);
return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, .base = base,
- .memory_modes = nir_var_mem_task_payload);
+ .memory_modes = nir_var_mem_task_payload,
+ .access = ACCESS_COHERENT);
}
static nir_ssa_def *
nir_ssa_def *hs_ring_tess_offchip = nir_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
- nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset, .write_mask = write_mask, .memory_modes = nir_var_shader_out);
+ nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset,
+ .write_mask = write_mask, .memory_modes = nir_var_shader_out,
+ .access = ACCESS_COHERENT);
}
if (write_to_lds) {
/* Store the dynamic HS control word. */
nir_if *rel_patch_id_zero = nir_push_if(b, nir_ieq_imm(b, rel_patch_id, 0));
nir_ssa_def *ctrlw = nir_imm_int(b, 0x80000000u);
- nir_store_buffer_amd(b, ctrlw, tessfactor_ring, nir_imm_zero(b, 1, 32), tess_factors_base);
+ nir_store_buffer_amd(b, ctrlw, tessfactor_ring, nir_imm_zero(b, 1, 32), tess_factors_base,
+ .access = ACCESS_COHERENT);
tess_factors_const_offset += 4;
nir_pop_if(b, rel_patch_id_zero);
}
if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
/* LINES reversal */
nir_ssa_def *t = nir_vec2(b, nir_channel(b, tessfactors_outer, 1), nir_channel(b, tessfactors_outer, 0));
- nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
+ nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
} else if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
nir_ssa_def *t = nir_vec4(b, nir_channel(b, tessfactors_outer, 0), nir_channel(b, tessfactors_outer, 1),
nir_channel(b, tessfactors_outer, 2), nir_channel(b, tessfactors_inner, 0));
- nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
+ nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
} else {
- nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
- nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset + 4u * outer_comps);
+ nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
+ nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base,
+ .base = tess_factors_const_offset + 4u * outer_comps, .access = ACCESS_COHERENT);
}
if (st->tes_reads_tessfactors) {
nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
nir_ssa_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_out_loc);
- nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset, .memory_modes = nir_var_shader_out);
+ nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset,
+ .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
if (inner_comps) {
nir_ssa_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_in_loc);
- nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset, .memory_modes = nir_var_shader_out);
+ nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset,
+ .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
}
}
? hs_per_vertex_output_vmem_offset(b, st, intrin)
: hs_per_patch_output_vmem_offset(b, st, intrin, 0);
- return nir_load_buffer_amd(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, offchip_ring, off, offchip_offset);
+ return nir_load_buffer_amd(b, intrin->dest.ssa.num_components,
+ intrin->dest.ssa.bit_size, offchip_ring, off, offchip_offset,
+ .access = ACCESS_COHERENT);
}
static bool
void
emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
- unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
- bool slc = false, bool swizzled = false)
+ unsigned const_offset, memory_sync_info sync, bool glc, bool slc,
+ bool swizzled)
{
assert(vdata.id());
assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6);
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
- bool glc = ctx->program->gfx_level < GFX11;
+ glc &= ctx->program->gfx_level < GFX11;
Builder::Result r =
bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
/* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
void
store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
- bool allow_combining = true, memory_sync_info sync = memory_sync_info(),
- bool slc = false)
+ bool allow_combining, memory_sync_info sync, bool glc, bool slc)
{
Builder bld(ctx->program, ctx->block);
assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
for (unsigned i = 0; i < write_count; i++) {
unsigned const_offset = offsets[i] + base_const_offset;
emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
- slc, !allow_combining);
+ glc, slc, !allow_combining);
}
}
load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
- bool slc = false, memory_sync_info sync = memory_sync_info())
+ bool glc = false, bool slc = false, memory_sync_info sync = memory_sync_info())
{
assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
assert((num_components * elem_size_bytes) == dst.bytes());
LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
info.component_stride = allow_combining ? 0 : stride;
- info.glc = true;
+ info.glc = glc;
info.slc = slc;
info.swizzle_component_size = allow_combining ? 0 : 4;
info.align_mul = MIN2(elem_size_bytes, 4);
bool swizzled = nir_intrinsic_is_swizzled(intrin);
bool reorder = nir_intrinsic_can_reorder(intrin);
bool slc = nir_intrinsic_slc_amd(intrin);
+ bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
unsigned const_offset = nir_intrinsic_base(intrin);
unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
- num_components, swizzle_element_size, !swizzled, reorder, slc, sync);
+ num_components, swizzle_element_size, !swizzled, reorder, glc, slc, sync);
}
void
Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);
bool swizzled = nir_intrinsic_is_swizzled(intrin);
+ bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
bool slc = nir_intrinsic_slc_amd(intrin);
unsigned const_offset = nir_intrinsic_base(intrin);
memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
- write_mask, !swizzled, sync, slc);
+ write_mask, !swizzled, sync, glc, slc);
}
void
Temp val = bld.tmp(v1);
unsigned const_offset = offset * program->info.gs.vertices_out * 16 * 4;
load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
- true, true);
+ true, true, true);
ctx.outputs.mask[i] |= 1 << j;
ctx.outputs.temps[i * 4u + j] = val;
unsigned const_offset = nir_intrinsic_base(instr);
bool swizzled = nir_intrinsic_is_swizzled(instr);
bool reorder = nir_intrinsic_can_reorder(instr);
+ bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
bool slc = nir_intrinsic_slc_amd(instr);
- enum ac_image_cache_policy cache_policy = ac_glc;
+ enum ac_image_cache_policy cache_policy = 0;
if (swizzled)
cache_policy |= ac_swizzled;
if (slc)
cache_policy |= ac_slc;
+ if (coherent)
+ cache_policy |= ac_glc;
LLVMTypeRef channel_type;
if (instr->dest.ssa.bit_size == 8)
LLVMValueRef addr_soffset = get_src(ctx, instr->src[3]);
unsigned const_offset = nir_intrinsic_base(instr);
bool swizzled = nir_intrinsic_is_swizzled(instr);
+ bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
bool slc = nir_intrinsic_slc_amd(instr);
- enum ac_image_cache_policy cache_policy = ac_glc;
+ enum ac_image_cache_policy cache_policy = 0;
if (swizzled)
cache_policy |= ac_swizzled;
+ if (coherent && ctx->ac.gfx_level < GFX11)
+ cache_policy |= ac_glc;
if (slc)
cache_policy |= ac_slc;
# AMD GCN/RDNA specific intrinsics
# src[] = { descriptor, base address, scalar offset }
-intrinsic("load_buffer_amd", src_comp=[4, 1, 1], dest_comp=0, indices=[BASE, IS_SWIZZLED, SLC_AMD, MEMORY_MODES], flags=[CAN_ELIMINATE])
+intrinsic("load_buffer_amd", src_comp=[4, 1, 1], dest_comp=0, indices=[BASE, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS], flags=[CAN_ELIMINATE])
# src[] = { store value, descriptor, base address, scalar offset }
-intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1], indices=[BASE, WRITE_MASK, IS_SWIZZLED, SLC_AMD, MEMORY_MODES])
+intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1], indices=[BASE, WRITE_MASK, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS])
# src[] = { address, unsigned 32-bit offset }.
load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])