radv: Use structured intrinsics instead of indexing workaround for GFX9.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Mon, 12 Nov 2018 21:42:36 +0000 (22:42 +0100)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Mon, 19 Nov 2018 22:36:00 +0000 (23:36 +0100)
These force the index to be used in the instruction so we don't need the
workaround.

Totals:
SGPRS: 1321642 -> 1321802 (0.01 %)
VGPRS: 943664 -> 943788 (0.01 %)
Spilled SGPRs: 28468 -> 28480 (0.04 %)
Spilled VGPRs: 88 -> 89 (1.14 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 80 -> 80 (0.00 %) dwords per thread
Code Size: 52415292 -> 52338932 (-0.15 %) bytes
LDS: 400 -> 400 (0.00 %) blocks
Max Waves: 233903 -> 233803 (-0.04 %)
Wait states: 0 -> 0 (0.00 %)

Totals from affected shaders:
SGPRS: 238344 -> 238504 (0.07 %)
VGPRS: 232732 -> 232856 (0.05 %)
Spilled SGPRs: 13125 -> 13137 (0.09 %)
Spilled VGPRs: 88 -> 89 (1.14 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 80 -> 80 (0.00 %) dwords per thread
Code Size: 15752712 -> 15676352 (-0.48 %) bytes
LDS: 139 -> 139 (0.00 %) blocks
Max Waves: 31680 -> 31580 (-0.32 %)
Wait states: 0 -> 0 (0.00 %)

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
src/amd/common/ac_llvm_build.c
src/amd/common/ac_nir_to_llvm.c
src/amd/vulkan/radv_nir_to_llvm.c

index 1392ec0..22245aa 100644 (file)
@@ -1161,6 +1161,47 @@ ac_build_buffer_load_common(struct ac_llvm_context *ctx,
                                  ac_get_load_intr_attribs(can_speculate));
 }
 
+static LLVMValueRef /* Builds a call to llvm.amdgcn.{raw,struct}.buffer.load[.format].<type> and returns its result. */
+ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
+                                 LLVMValueRef rsrc,
+                                 LLVMValueRef vindex, /* may be NULL; only passed on when structurized */
+                                 LLVMValueRef voffset, /* may be NULL */
+                                 LLVMValueRef soffset, /* may be NULL */
+                                 unsigned num_channels,
+                                 bool glc,
+                                 bool slc,
+                                 bool can_speculate,
+                                 bool use_format, /* picks the ".load.format" name over plain ".load" */
+                                 bool structurized) /* picks the "struct" name (with vindex) over "raw" */
+{
+       LLVMValueRef args[5]; /* rsrc, [vindex], voffset, soffset, glc/slc bits */
+       int idx = 0; /* number of args filled so far; 4 for raw, 5 for struct */
+       args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+       if (structurized)
+               args[idx++] = vindex ? vindex : ctx->i32_0; /* NULL vindex falls back to ctx->i32_0 */
+       args[idx++] = voffset ? voffset : ctx->i32_0; /* NULL voffset falls back to ctx->i32_0 */
+       args[idx++] = soffset ? soffset : ctx->i32_0; /* NULL soffset falls back to ctx->i32_0 */
+       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0); /* bit 0 = glc, bit 1 = slc */
+       unsigned func = CLAMP(num_channels, 1, 3) - 1; /* index into the tables below; assuming CLAMP bounds to [1, 3], 3 or more channels all pick the last (v4f32) entry */
+
+       LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32}; /* return type per func */
+       const char *type_names[] = {"f32", "v2f32", "v4f32"}; /* name suffix per func; parallel to types[] */
+       const char *indexing_kind = structurized ? "struct" : "raw";
+       char name[256];
+
+       if (use_format) {
+               snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
+                        indexing_kind, type_names[func]);
+       } else {
+               snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
+                        indexing_kind, type_names[func]);
+       }
+
+       return ac_build_intrinsic(ctx, name, types[func], args,
+                                 idx, /* pass only the args actually filled in */
+                                 ac_get_load_intr_attribs(can_speculate));
+}
+
 LLVMValueRef
 ac_build_buffer_load(struct ac_llvm_context *ctx,
                     LLVMValueRef rsrc,
@@ -1218,6 +1259,11 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
                                         bool glc,
                                         bool can_speculate)
 {
+       if (HAVE_LLVM >= 0x800) {
+               return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+                                                        num_channels, glc, false,
+                                                        can_speculate, true, true);
+       }
        return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
                                           num_channels, glc, false,
                                           can_speculate, true);
@@ -1231,6 +1277,12 @@ LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
                                                   bool glc,
                                                   bool can_speculate)
 {
+       if (HAVE_LLVM >= 0x800) {
+               return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+                                                        num_channels, glc, false,
+                                                        can_speculate, true, true);
+       }
+
        LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
        LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
        stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
index c950b81..a19e66f 100644 (file)
@@ -2387,10 +2387,17 @@ static void visit_image_store(struct ac_nir_context *ctx,
                params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                    ctx->ac.i32_0, ""); /* vindex */
                params[3] = ctx->ac.i32_0; /* voffset */
-               params[4] = glc;  /* glc */
-               params[5] = ctx->ac.i1false;  /* slc */
-               ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", ctx->ac.voidt,
-                                  params, 6, 0);
+               if (HAVE_LLVM >= 0x800) {
+                       params[4] = ctx->ac.i32_0; /* soffset */
+                       params[5] = glc ? ctx->ac.i32_1 : ctx->ac.i32_0;
+                       ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.struct.buffer.store.format.v4f32", ctx->ac.voidt,
+                                          params, 6, 0);
+               } else {
+                       params[4] = glc;  /* glc */
+                       params[5] = ctx->ac.i1false;  /* slc */
+                       ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", ctx->ac.voidt,
+                                          params, 6, 0);
+               }
        } else {
                struct ac_image_args args = {};
                args.opcode = ac_image_store;
@@ -2470,10 +2477,18 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
                params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                                ctx->ac.i32_0, ""); /* vindex */
                params[param_count++] = ctx->ac.i32_0; /* voffset */
-               params[param_count++] = ctx->ac.i1false;  /* slc */
+               if (HAVE_LLVM >= 0x800) {
+                       params[param_count++] = ctx->ac.i32_0; /* soffset */
+                       params[param_count++] = ctx->ac.i32_0;  /* slc */
 
-               length = snprintf(intrinsic_name, sizeof(intrinsic_name),
-                                 "llvm.amdgcn.buffer.atomic.%s", atomic_name);
+                       length = snprintf(intrinsic_name, sizeof(intrinsic_name),
+                                         "llvm.amdgcn.struct.buffer.atomic.%s.i32", atomic_name);
+               } else {
+                       params[param_count++] = ctx->ac.i1false;  /* slc */
+
+                       length = snprintf(intrinsic_name, sizeof(intrinsic_name),
+                                         "llvm.amdgcn.buffer.atomic.%s", atomic_name);
+               }
 
                assert(length < sizeof(intrinsic_name));
                return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32,
index f56eb01..2e6f88a 100644 (file)
@@ -3500,7 +3500,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
        ctx.abi.load_sampler_desc = radv_get_sampler_desc;
        ctx.abi.load_resource = radv_load_resource;
        ctx.abi.clamp_shadow_reference = false;
-       ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9;
+       ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800;
 
        if (shader_count >= 2)
                ac_init_exec_full_mask(&ctx.ac);