radeonsi/gfx10: set DLC for loads when GLC is set

author Marek Olšák <marek.olsak@amd.com>

Fri, 24 May 2019 22:48:39 +0000 (18:48 -0400)

committer Marek Olšák <marek.olsak@amd.com>

Wed, 3 Jul 2019 19:51:13 +0000 (15:51 -0400)
author Marek Olšák <marek.olsak@amd.com>
Fri, 24 May 2019 22:48:39 +0000 (18:48 -0400)
committer Marek Olšák <marek.olsak@amd.com>
Wed, 3 Jul 2019 19:51:13 +0000 (15:51 -0400)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c

index ecb72395867644c22f8dc8cd4c274aba37ee0c5b..5089463e2db4be13c47041ed57f0026735d5c948 100644 (file)
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1107,6 +1107,15 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
         return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
  }
  
+/* Build the i32 cache-policy constant; DLC is set only for GLC loads on GFX10+. */
+static LLVMValueRef get_cache_policy(struct ac_llvm_context *ctx,
+                                    bool load, bool glc, bool slc)
+{
+       bool dlc = ctx->chip_class >= GFX10 && glc && load;
+       unsigned bits = (glc ? ac_glc : 0) | (slc ? ac_slc : 0) | (dlc ? ac_dlc : 0);
+       return LLVMConstInt(ctx->i32, bits, 0);
+}
+
  static void
  ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
                                    LLVMValueRef rsrc,
@@ -1165,7 +1174,7 @@ ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
                 args[idx++] = vindex ? vindex : ctx->i32_0;
         args[idx++] = voffset ? voffset : ctx->i32_0;
         args[idx++] = soffset ? soffset : ctx->i32_0;
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       args[idx++] = get_cache_policy(ctx, false, glc, slc);
         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
         const char *indexing_kind = structurized ? "struct" : "raw";
         char name[256], type_name[8];
@@ -1350,7 +1359,7 @@ ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
                 args[idx++] = vindex ? vindex : ctx->i32_0;
         args[idx++] = voffset ? voffset : ctx->i32_0;
         args[idx++] = soffset ? soffset : ctx->i32_0;
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       args[idx++] = get_cache_policy(ctx, true, glc, slc);
         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
         const char *indexing_kind = structurized ? "struct" : "raw";
         char name[256], type_name[8];
@@ -1404,6 +1413,8 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
                                 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
                                                     : "llvm.SI.load.const.v4i32";
                         unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
+                       /* TODO: set glc+dlc on GFX10 (LLVM support is missing) */
+                       assert(!glc || ctx->chip_class < GFX10);
                         LLVMValueRef args[3] = {
                                 rsrc,
                                 offset,
@@ -1551,7 +1562,7 @@ ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
         args[idx++] = voffset ? voffset : ctx->i32_0;
         args[idx++] = soffset ? soffset : ctx->i32_0;
         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       args[idx++] = get_cache_policy(ctx, true, glc, slc);
         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
         const char *indexing_kind = structurized ? "struct" : "raw";
         char name[256], type_name[8];
@@ -2049,7 +2060,7 @@ ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
         args[idx++] = voffset ? voffset : ctx->i32_0;
         args[idx++] = soffset ? soffset : ctx->i32_0;
         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       args[idx++] = get_cache_policy(ctx, false, glc, slc);
         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
         const char *indexing_kind = structurized ? "struct" : "raw";
         char name[256], type_name[8];
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h

index 17e701b21f8b8fdcf79b2a30b71b1104e14ed14b..4917315cc500a4c4d00b0d04888b8d805be6b8b9 100644 (file)
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -527,8 +527,9 @@ enum ac_image_dim {
  
  /* These cache policy bits match the definitions used by the LLVM intrinsics. */
  enum ac_image_cache_policy {
-       ac_glc = 1 << 0,
-       ac_slc = 1 << 1,
+       ac_glc = 1 << 0, /* per-CU cache control */
+       ac_slc = 1 << 1, /* global L2 cache control */
+       ac_dlc = 1 << 2, /* per-shader-array cache control */
  };
  
  struct ac_image_args {
@@ -536,7 +537,7 @@ struct ac_image_args {
         enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */
         enum ac_image_dim dim : 3;
         unsigned dmask : 4;
-       unsigned cache_policy : 2;
+       unsigned cache_policy : 3;
         bool unorm : 1;
         bool level_zero : 1;
         unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c

index 636fd4035c8274a7ba7dec5579a46611a4e8a517..73941ba6f45e58ed70d4925bac35e61f6013da3a 100644 (file)
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1519,6 +1519,7 @@ static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueR
  
  static unsigned get_cache_policy(struct ac_nir_context *ctx,
                                  enum gl_access_qualifier access,
+                                bool load,
                                  bool may_store_unaligned,
                                  bool writeonly_memory)
  {
@@ -1535,7 +1536,8 @@ static unsigned get_cache_policy(struct ac_nir_context *ctx,
               */
              writeonly_memory ||
              access & (ACCESS_COHERENT | ACCESS_VOLATILE))) {
-               cache_policy |= ac_glc;
+               cache_policy |= ac_glc |
+                               (ctx->ac.chip_class >= GFX10 && load ? ac_dlc : 0);
         }
  
         return cache_policy;
@@ -1549,7 +1551,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
         unsigned writemask = nir_intrinsic_write_mask(instr);
         enum gl_access_qualifier access = nir_intrinsic_access(instr);
         bool writeonly_memory = access & ACCESS_NON_READABLE;
-       unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
+       unsigned cache_policy = get_cache_policy(ctx, access, false, false, writeonly_memory);
  
         LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
                                         get_src(ctx, instr->src[1]), true);
@@ -1713,7 +1715,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
         int elem_size_bytes = instr->dest.ssa.bit_size / 8;
         int num_components = instr->num_components;
         enum gl_access_qualifier access = nir_intrinsic_access(instr);
-       unsigned cache_policy = get_cache_policy(ctx, access, false, false);
+       unsigned cache_policy = get_cache_policy(ctx, access, true, false, false);
  
         LLVMValueRef offset = get_src(ctx, instr->src[1]);
         LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
@@ -2452,7 +2454,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
  
         struct ac_image_args args = {};
  
-       args.cache_policy = get_cache_policy(ctx, access, false, false);
+       args.cache_policy = get_cache_policy(ctx, access, true, false, false);
  
         if (dim == GLSL_SAMPLER_DIM_BUF) {
                 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
@@ -2510,7 +2512,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
         bool writeonly_memory = access & ACCESS_NON_READABLE;
         struct ac_image_args args = {};
  
-       args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);
+       args.cache_policy = get_cache_policy(ctx, access, false, true, writeonly_memory);
  
         if (dim == GLSL_SAMPLER_DIM_BUF) {
                 LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false);
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c

index 8d6a7dc8d670fbc018d21579b0d42099f60bf957..455af80e206e7fa33acde38b4a70447dcf538405 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -315,7 +315,7 @@ static void image_fetch_coords(
  
  static unsigned get_cache_policy(struct si_shader_context *ctx,
                                  const struct tgsi_full_instruction *inst,
-                                bool atomic, bool may_store_unaligned,
+                                bool load, bool atomic, bool may_store_unaligned,
                                  bool writeonly_memory)
  {
         unsigned cache_policy = 0;
@@ -330,8 +330,10 @@ static unsigned get_cache_policy(struct si_shader_context *ctx,
               * evicting L1 cache lines that may be needed by other
               * instructions. */
              writeonly_memory ||
-            inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE)))
-               cache_policy |= ac_glc;
+            inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))) {
+               cache_policy |= ac_glc |
+                               (ctx->screen->info.chip_class >= GFX10 && load ? ac_dlc : 0);
+       }
  
         if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY)
                 cache_policy |= ac_slc;
@@ -530,7 +532,7 @@ static void load_emit(
                                                 info->uses_bindless_buffer_atomic,
                                                 info->uses_bindless_image_store |
                                                 info->uses_bindless_image_atomic);
-       args.cache_policy = get_cache_policy(ctx, inst, false, false, false);
+       args.cache_policy = get_cache_policy(ctx, inst, true, false, false, false);
  
         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
                 /* Don't use SMEM for shader buffer loads, because LLVM doesn't
@@ -711,6 +713,7 @@ static void store_emit(
  
         bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER;
         args.cache_policy = get_cache_policy(ctx, inst,
+                                            false, /* load */
                                              false, /* atomic */
                                              is_image, /* may_store_unaligned */
                                              writeonly_memory);
@@ -833,7 +836,7 @@ static void atomic_emit(
  
         args.data[num_data++] =
                 ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
-       args.cache_policy = get_cache_policy(ctx, inst, true, false, false);
+       args.cache_policy = get_cache_policy(ctx, inst, false, true, false, false);
  
         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
                 args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);
author	Marek Olšák <marek.olsak@amd.com>
	Fri, 24 May 2019 22:48:39 +0000 (18:48 -0400)
committer	Marek Olšák <marek.olsak@amd.com>
	Wed, 3 Jul 2019 19:51:13 +0000 (15:51 -0400)
src/amd/common/ac_llvm_build.c		patch \| blob \| history
src/amd/common/ac_llvm_build.h		patch \| blob \| history
src/amd/common/ac_nir_to_llvm.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c		patch \| blob \| history