This fixes coherency of the L1 cache (which is per shader array on GFX10) by setting the DLC bit on load instructions.
Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}
+static LLVMValueRef get_cache_policy(struct ac_llvm_context *ctx,
+ bool load, bool glc, bool slc)
+{
+ return LLVMConstInt(ctx->i32,
+ (glc ? ac_glc : 0) +
+ (slc ? ac_slc : 0) +
+ (ctx->chip_class >= GFX10 && glc && load ? ac_dlc : 0), 0);
+}
+
static void
ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ args[idx++] = get_cache_policy(ctx, false, glc, slc);
unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ args[idx++] = get_cache_policy(ctx, true, glc, slc);
unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
: "llvm.SI.load.const.v4i32";
unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
+ /* TODO: set glc+dlc on GFX10 (LLVM support is missing) */
+ assert(!glc || ctx->chip_class < GFX10);
LLVMValueRef args[3] = {
rsrc,
offset,
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ args[idx++] = get_cache_policy(ctx, true, glc, slc);
unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ args[idx++] = get_cache_policy(ctx, false, glc, slc);
unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
/* These cache policy bits match the definitions used by the LLVM intrinsics. */
enum ac_image_cache_policy {
- ac_glc = 1 << 0,
- ac_slc = 1 << 1,
+ ac_glc = 1 << 0, /* per-CU cache control */
+ ac_slc = 1 << 1, /* global L2 cache control */
+ ac_dlc = 1 << 2, /* per-shader-array cache control */
};
struct ac_image_args {
enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */
enum ac_image_dim dim : 3;
unsigned dmask : 4;
- unsigned cache_policy : 2;
+ unsigned cache_policy : 3;
bool unorm : 1;
bool level_zero : 1;
unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */
static unsigned get_cache_policy(struct ac_nir_context *ctx,
enum gl_access_qualifier access,
+ bool load,
bool may_store_unaligned,
bool writeonly_memory)
{
*/
writeonly_memory ||
access & (ACCESS_COHERENT | ACCESS_VOLATILE))) {
- cache_policy |= ac_glc;
+ cache_policy |= ac_glc |
+ (ctx->ac.chip_class >= GFX10 && load ? ac_dlc : 0);
}
return cache_policy;
unsigned writemask = nir_intrinsic_write_mask(instr);
enum gl_access_qualifier access = nir_intrinsic_access(instr);
bool writeonly_memory = access & ACCESS_NON_READABLE;
- unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
+ unsigned cache_policy = get_cache_policy(ctx, access, false, false, writeonly_memory);
LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
get_src(ctx, instr->src[1]), true);
int elem_size_bytes = instr->dest.ssa.bit_size / 8;
int num_components = instr->num_components;
enum gl_access_qualifier access = nir_intrinsic_access(instr);
- unsigned cache_policy = get_cache_policy(ctx, access, false, false);
+ unsigned cache_policy = get_cache_policy(ctx, access, true, false, false);
LLVMValueRef offset = get_src(ctx, instr->src[1]);
LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
struct ac_image_args args = {};
- args.cache_policy = get_cache_policy(ctx, access, false, false);
+ args.cache_policy = get_cache_policy(ctx, access, true, false, false);
if (dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
bool writeonly_memory = access & ACCESS_NON_READABLE;
struct ac_image_args args = {};
- args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);
+ args.cache_policy = get_cache_policy(ctx, access, false, true, writeonly_memory);
if (dim == GLSL_SAMPLER_DIM_BUF) {
LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false);
static unsigned get_cache_policy(struct si_shader_context *ctx,
const struct tgsi_full_instruction *inst,
- bool atomic, bool may_store_unaligned,
+ bool load, bool atomic, bool may_store_unaligned,
bool writeonly_memory)
{
unsigned cache_policy = 0;
* evicting L1 cache lines that may be needed by other
* instructions. */
writeonly_memory ||
- inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE)))
- cache_policy |= ac_glc;
+ inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))) {
+ cache_policy |= ac_glc |
+ (ctx->screen->info.chip_class >= GFX10 && load ? ac_dlc : 0);
+ }
if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY)
cache_policy |= ac_slc;
info->uses_bindless_buffer_atomic,
info->uses_bindless_image_store |
info->uses_bindless_image_atomic);
- args.cache_policy = get_cache_policy(ctx, inst, false, false, false);
+ args.cache_policy = get_cache_policy(ctx, inst, true, false, false, false);
if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
/* Don't use SMEM for shader buffer loads, because LLVM doesn't
bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER;
args.cache_policy = get_cache_policy(ctx, inst,
+ false, /* load */
false, /* atomic */
is_image, /* may_store_unaligned */
writeonly_memory);
args.data[num_data++] =
ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
- args.cache_policy = get_cache_policy(ctx, inst, true, false, false);
+ args.cache_policy = get_cache_policy(ctx, inst, false, true, false, false);
if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);