radeonsi: support ARB_compute_variable_group_size
author Nicolai Hähnle <nicolai.haehnle@amd.com>
Fri, 9 Sep 2016 08:08:11 +0000 (10:08 +0200)
committer Nicolai Hähnle <nicolai.haehnle@amd.com>
Mon, 10 Oct 2016 08:36:42 +0000 (10:36 +0200)
Not sure if it's possible to avoid programming the block size twice (once for
the userdata and once for the dispatch).

Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
docs/features.txt
docs/relnotes/12.1.0.html
src/gallium/drivers/radeon/r600_pipe_common.c
src/gallium/drivers/radeon/r600_pipe_common.h
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h

index e91ef6c..533971f 100644 (file)
@@ -279,7 +279,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
 
   GL_ARB_bindless_texture                               started (airlied)
   GL_ARB_cl_event                                       not started
-  GL_ARB_compute_variable_group_size                    DONE (nvc0)
+  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
   GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
   GL_ARB_fragment_shader_interlock                      not started
   GL_ARB_gl_spirv                                       not started
index 9ddd99c..2e4b669 100644 (file)
@@ -49,7 +49,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_ES3_1_compatibility on i965</li>
 <li>GL_ARB_ES3_2_compatibility on i965/gen8+</li>
 <li>GL_ARB_clear_texture on r600, radeonsi</li>
-<li>GL_ARB_compute_variable_group_size on nvc0</li>
+<li>GL_ARB_compute_variable_group_size on nvc0, radeonsi</li>
 <li>GL_ARB_cull_distance on radeonsi</li>
 <li>GL_ARB_enhanced_layouts on i965</li>
 <li>GL_ARB_indirect_parameters on radeonsi</li>
index 44863ee..3dbcbc6 100644 (file)
@@ -1037,7 +1037,15 @@ static int r600_get_compute_param(struct pipe_screen *screen,
                }
                return sizeof(uint32_t);
        case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
-               return 0;
+               if (ret) {
+                       uint64_t *max_variable_threads_per_block = ret;
+                       if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+                           ir_type == PIPE_SHADER_IR_TGSI)
+                               *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+                       else
+                               *max_variable_threads_per_block = 0;
+               }
+               return sizeof(uint64_t);
        }
 
         fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
index 54991e8..290b228 100644 (file)
 #define R600_MAP_BUFFER_ALIGNMENT 64
 #define R600_MAX_VIEWPORTS        16
 
+#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
+
 enum r600_coherency {
        R600_COHERENCY_NONE, /* no cache flushes needed */
        R600_COHERENCY_SHADER,
index 1d1df2f..e59bafe 100644 (file)
@@ -601,11 +601,19 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
                        radeon_emit(cs, 0);
                }
        } else {
+               struct si_compute *program = sctx->cs_shader_state.program;
+               bool variable_group_size =
+                       program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
 
-               radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+               radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3);
                radeon_emit(cs, info->grid[0]);
                radeon_emit(cs, info->grid[1]);
                radeon_emit(cs, info->grid[2]);
+               if (variable_group_size) {
+                       radeon_emit(cs, info->block[0]);
+                       radeon_emit(cs, info->block[1]);
+                       radeon_emit(cs, info->block[2]);
+               }
        }
 }
 
index ff51c8b..49d4121 100644 (file)
@@ -1770,16 +1770,21 @@ static void declare_system_value(
                LLVMValueRef values[3];
                unsigned i;
                unsigned *properties = ctx->shader->selector->info.properties;
-               unsigned sizes[3] = {
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
-               };
 
-               for (i = 0; i < 3; ++i)
-                       values[i] = lp_build_const_int32(gallivm, sizes[i]);
+               if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
+                       unsigned sizes[3] = {
+                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
+                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
+                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
+                       };
+
+                       for (i = 0; i < 3; ++i)
+                               values[i] = lp_build_const_int32(gallivm, sizes[i]);
 
-               value = lp_build_gather_values(gallivm, values, 3);
+                       value = lp_build_gather_values(gallivm, values, 3);
+               } else {
+                       value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
+               }
                break;
        }
 
@@ -5680,6 +5685,7 @@ static void create_function(struct si_shader_context *ctx)
 
        case PIPE_SHADER_COMPUTE:
                params[SI_PARAM_GRID_SIZE] = v3i32;
+               params[SI_PARAM_BLOCK_SIZE] = v3i32;
                params[SI_PARAM_BLOCK_ID] = v3i32;
                last_sgpr = SI_PARAM_BLOCK_ID;
 
@@ -5716,7 +5722,12 @@ static void create_function(struct si_shader_context *ctx)
                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
 
-               assert(max_work_group_size);
+               if (!max_work_group_size) {
+                       /* This is a variable group size compute shader,
+                        * compile it for the maximum possible group size.
+                        */
+                       max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+               }
 
                radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
                                          "amdgpu-max-work-group-size",
@@ -6653,11 +6664,16 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                unsigned max_vgprs = 256;
                unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
                unsigned max_sgprs_per_wave = 128;
-               unsigned min_waves_per_cu =
-                       DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
-                                    props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
-                                    props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
-                                    wave_size);
+               unsigned max_block_threads;
+
+               if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH])
+                       max_block_threads = props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
+                                           props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
+                                           props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
+               else
+                       max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+
+               unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
                unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
 
                max_vgprs = max_vgprs / min_waves_per_simd;
index 67cb67d..f2618ac 100644 (file)
@@ -129,7 +129,8 @@ enum {
 
        /* CS only */
        SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS,
-       SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3
+       SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3,
+       SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3
 };
 
 /* LLVM function parameter indices */
@@ -219,6 +220,7 @@ enum {
 
        /* CS only parameters */
        SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS,
+       SI_PARAM_BLOCK_SIZE,
        SI_PARAM_BLOCK_ID,
        SI_PARAM_THREAD_ID,