radeon/ac: use ds_swizzle for derivs on si/cik.
author: Dave Airlie <airlied@redhat.com>
Tue, 1 Aug 2017 04:10:49 +0000 (05:10 +0100)
committer: Dave Airlie <airlied@redhat.com>
Tue, 1 Aug 2017 23:12:01 +0000 (00:12 +0100)
ds_swizzle looks like it has been supported since at least LLVM 3.9,
so switch radeonsi and radv over to using it; the -pro driver also
uses this. We can now drop creating LDS for these operations,
as the ds_swizzle operation doesn't actually write to LDS at all.

Acked-by: Marek Olšák <marek.olsak@amd.com>
(stable requested due to fixing radv CIK conformance tests)
Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Dave Airlie <airlied@redhat.com>
src/amd/common/ac_llvm_build.c
src/amd/common/ac_llvm_build.h
src/amd/common/ac_nir_to_llvm.c
src/gallium/drivers/radeonsi/si_shader.c

index 9b939c1..a38aad6 100644 (file)
@@ -796,21 +796,21 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
              bool has_ds_bpermute,
              uint32_t mask,
              int idx,
-             LLVMValueRef lds,
              LLVMValueRef val)
 {
-       LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
+       LLVMValueRef tl, trbl, args[2];
        LLVMValueRef result;
 
-       thread_id = ac_get_thread_id(ctx);
+       if (has_ds_bpermute) {
+               LLVMValueRef thread_id, tl_tid, trbl_tid;
+               thread_id = ac_get_thread_id(ctx);
 
-       tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-                             LLVMConstInt(ctx->i32, mask, false), "");
+               tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+                                     LLVMConstInt(ctx->i32, mask, false), "");
 
-       trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-                               LLVMConstInt(ctx->i32, idx, false), "");
+               trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+                                       LLVMConstInt(ctx->i32, idx, false), "");
 
-       if (has_ds_bpermute) {
                args[0] = LLVMBuildMul(ctx->builder, tl_tid,
                                       LLVMConstInt(ctx->i32, 4, false), "");
                args[1] = val;
@@ -828,15 +828,42 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
                                          AC_FUNC_ATTR_READNONE |
                                          AC_FUNC_ATTR_CONVERGENT);
        } else {
-               LLVMValueRef store_ptr, load_ptr0, load_ptr1;
+               uint32_t masks[2];
+
+               switch (mask) {
+               case AC_TID_MASK_TOP_LEFT:
+                       masks[0] = 0x8000;
+                       if (idx == 1)
+                               masks[1] = 0x8055;
+                       else
+                               masks[1] = 0x80aa;
+
+                       break;
+               case AC_TID_MASK_TOP:
+                       masks[0] = 0x8044;
+                       masks[1] = 0x80ee;
+                       break;
+               case AC_TID_MASK_LEFT:
+                       masks[0] = 0x80a0;
+                       masks[1] = 0x80f5;
+                       break;
+               }
 
-               store_ptr = ac_build_gep0(ctx, lds, thread_id);
-               load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
-               load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
+               args[0] = val;
+               args[1] = LLVMConstInt(ctx->i32, masks[0], false);
 
-               LLVMBuildStore(ctx->builder, val, store_ptr);
-               tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
-               trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+               tl = ac_build_intrinsic(ctx,
+                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
+                                       args, 2,
+                                       AC_FUNC_ATTR_READNONE |
+                                       AC_FUNC_ATTR_CONVERGENT);
+
+               args[1] = LLVMConstInt(ctx->i32, masks[1], false);
+               trbl = ac_build_intrinsic(ctx,
+                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
+                                       args, 2,
+                                       AC_FUNC_ATTR_READNONE |
+                                       AC_FUNC_ATTR_CONVERGENT);
        }
 
        tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
index 09fd585..ee27d3c 100644 (file)
@@ -174,7 +174,6 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
              bool has_ds_bpermute,
              uint32_t mask,
              int idx,
-             LLVMValueRef lds,
              LLVMValueRef val);
 
 #define AC_SENDMSG_GS 2
index a05fd0e..3a62523 100644 (file)
@@ -68,8 +68,6 @@ struct ac_nir_context {
        int num_locals;
        LLVMValueRef *locals;
 
-       LLVMValueRef ddxy_lds;
-
        struct nir_to_llvm_context *nctx; /* TODO get rid of this */
 };
 
@@ -1463,11 +1461,6 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
        LLVMValueRef result;
        bool has_ds_bpermute = ctx->abi->chip_class >= VI;
 
-       if (!ctx->ddxy_lds && !has_ds_bpermute)
-               ctx->ddxy_lds = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-                                                      LLVMArrayType(ctx->ac.i32, 64),
-                                                      "ddxy_lds", LOCAL_ADDR_SPACE);
-
        if (op == nir_op_fddx_fine || op == nir_op_fddx)
                mask = AC_TID_MASK_LEFT;
        else if (op == nir_op_fddy_fine || op == nir_op_fddy)
@@ -1484,7 +1477,7 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
                idx = 2;
 
        result = ac_build_ddxy(&ctx->ac, has_ds_bpermute,
-                             mask, idx, ctx->ddxy_lds,
+                             mask, idx,
                              src0);
        return result;
 }
index 628e6f8..09053c3 100644 (file)
@@ -3591,7 +3591,7 @@ static void si_llvm_emit_ddxy(
 
        val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
        val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
-                           mask, idx, ctx->lds, val);
+                           mask, idx, val);
        emit_data->output[emit_data->chan] = val;
 }
 
@@ -4635,20 +4635,6 @@ static void create_function(struct si_shader_context *ctx)
        assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
        shader->info.num_input_vgprs -= num_prolog_vgprs;
 
-       if (!ctx->screen->has_ds_bpermute &&
-           bld_base->info &&
-           (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
-            bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
-            bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
-            bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
-            bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
-            bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
-               ctx->lds =
-                       LLVMAddGlobalInAddressSpace(gallivm->module,
-                                                   LLVMArrayType(ctx->i32, 64),
-                                                   "ddxy_lds",
-                                                   LOCAL_ADDR_SPACE);
-
        if (shader->key.as_ls ||
            ctx->type == PIPE_SHADER_TESS_CTRL ||
            /* GFX9 has the ESGS ring buffer in LDS. */