From 705395344d2541d038326a6f64fbff838b52a71e Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 4 Sep 2020 12:40:06 -0500 Subject: [PATCH] intel/fs: Add support for compiling bindless shaders with resume shaders Instead of depending on the driver to compile each resume shader separately, we compile them all in one go in the back-end and build an SBT as part of the shader program. Shader relocs are used to make the entries in the SBT point to the correct resume shader. Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/brw_compiler.h | 12 +++- src/intel/compiler/brw_fs.cpp | 119 +++++++++++++++++++++++++++----- src/intel/compiler/brw_fs.h | 1 + src/intel/compiler/brw_fs_generator.cpp | 18 +++++ 4 files changed, 130 insertions(+), 20 deletions(-) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index b60d09b..f24db8c 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -678,6 +678,7 @@ enum brw_param_builtin { enum brw_shader_reloc_id { BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, + BRW_SHADER_RELOC_SHADER_START_OFFSET, }; enum brw_shader_reloc_type { @@ -1062,8 +1063,15 @@ brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data, struct brw_bs_prog_data { struct brw_stage_prog_data base; + + /** SIMD size of the root shader */ uint8_t simd_size; - uint32_t stack_size; + + /** Maximum stack size of all shaders */ + uint32_t max_stack_size; + + /** Offset into the shader where the resume SBT is located */ + uint32_t resume_sbt_offset; }; struct brw_ff_gs_prog_data { @@ -1675,6 +1683,8 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, const struct brw_bs_prog_key *key, struct brw_bs_prog_data *prog_data, struct nir_shader *shader, + unsigned num_resume_shaders, + struct nir_shader **resume_shaders, struct brw_compile_stats *stats, char **error_str); diff --git a/src/intel/compiler/brw_fs.cpp 
b/src/intel/compiler/brw_fs.cpp index d83224b..f8bce4e 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -9875,19 +9875,22 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, return info; } -const unsigned * -brw_compile_bs(const struct brw_compiler *compiler, void *log_data, - void *mem_ctx, - const struct brw_bs_prog_key *key, - struct brw_bs_prog_data *prog_data, - nir_shader *shader, - struct brw_compile_stats *stats, - char **error_str) +static uint8_t +compile_single_bs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_bs_prog_key *key, + struct brw_bs_prog_data *prog_data, + nir_shader *shader, + fs_generator *g, + struct brw_compile_stats *stats, + int *prog_offset, + char **error_str) { const bool debug_enabled = INTEL_DEBUG & DEBUG_RT; prog_data->base.stage = shader->info.stage; - prog_data->stack_size = shader->scratch_size; + prog_data->max_stack_size = MAX2(prog_data->max_stack_size, + shader->scratch_size); const unsigned max_dispatch_width = 16; brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width, true); @@ -9897,6 +9900,7 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL; bool has_spilled = false; + uint8_t simd_size = 0; if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, shader, @@ -9906,10 +9910,10 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, if (error_str) *error_str = ralloc_strdup(mem_ctx, v8->fail_msg); delete v8; - return NULL; + return 0; } else { v = v8; - prog_data->simd_size = 8; + simd_size = 8; if (v8->spilled_any_registers) has_spilled = true; } @@ -9932,11 +9936,11 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, v16->fail_msg); } delete v16; - return NULL; + return 0; } } else { v = v16; - prog_data->simd_size = 16; + simd_size = 16; if 
(v16->spilled_any_registers) has_spilled = true; } @@ -9948,13 +9952,55 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, *error_str = ralloc_strdup(mem_ctx, "Cannot satisfy INTEL_DEBUG flags SIMD restrictions"); } - return NULL; + return false; } assert(v); + int offset = g->generate_code(v->cfg, simd_size, v->shader_stats, + v->performance_analysis.require(), stats); + if (prog_offset) + *prog_offset = offset; + else + assert(offset == 0); + + delete v8; + delete v16; + + return simd_size; +} + +uint64_t +brw_bsr(const struct intel_device_info *devinfo, + uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset) +{ + assert(offset % 64 == 0); + assert(simd_size == 8 || simd_size == 16); + assert(local_arg_offset % 8 == 0); + + return offset | + SET_BITS(simd_size > 8, 4, 4) | + SET_BITS(local_arg_offset / 8, 2, 0); +} + +const unsigned * +brw_compile_bs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_bs_prog_key *key, + struct brw_bs_prog_data *prog_data, + nir_shader *shader, + unsigned num_resume_shaders, + struct nir_shader **resume_shaders, + struct brw_compile_stats *stats, + char **error_str) +{ + const bool debug_enabled = INTEL_DEBUG & DEBUG_RT; + + prog_data->base.stage = shader->info.stage; + prog_data->max_stack_size = 0; + fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, - v->runtime_check_aads_emit, shader->info.stage); + false, shader->info.stage); if (unlikely(debug_enabled)) { char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s", shader->info.label ? 
@@ -9964,13 +10010,48 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data, g.enable_debug(name); } - g.generate_code(v->cfg, prog_data->simd_size, v->shader_stats, - v->performance_analysis.require(), stats); + prog_data->simd_size = + compile_single_bs(compiler, log_data, mem_ctx, key, prog_data, + shader, &g, stats, NULL, error_str); + if (prog_data->simd_size == 0) + return NULL; - delete v8; - delete v16; + uint64_t *resume_sbt = ralloc_array(mem_ctx, uint64_t, num_resume_shaders); + for (unsigned i = 0; i < num_resume_shaders; i++) { + if (INTEL_DEBUG & DEBUG_RT) { + char *name = ralloc_asprintf(mem_ctx, "%s %s resume(%u) shader %s", + shader->info.label ? + shader->info.label : "unnamed", + gl_shader_stage_name(shader->info.stage), + i, shader->info.name); + g.enable_debug(name); + } + + /* TODO: Figure out shader stats etc. for resume shaders */ + int offset = 0; + uint8_t simd_size = + compile_single_bs(compiler, log_data, mem_ctx, key, prog_data, + resume_shaders[i], &g, NULL, &offset, error_str); + if (simd_size == 0) + return NULL; + + assert(offset > 0); + resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0); + } + + /* We only have one constant data so we want to make sure they're all the + * same. 
+ */ + for (unsigned i = 0; i < num_resume_shaders; i++) { + assert(resume_shaders[i]->constant_data_size == + shader->constant_data_size); + assert(memcmp(resume_shaders[i]->constant_data, + shader->constant_data, + shader->constant_data_size) == 0); + } g.add_const_data(shader->constant_data, shader->constant_data_size); + g.add_resume_sbt(num_resume_shaders, resume_sbt); return g.get_assembly(); } diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 57193a1..22cd226 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -478,6 +478,7 @@ public: const brw::performance &perf, struct brw_compile_stats *stats); void add_const_data(void *data, unsigned size); + void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt); const unsigned *get_assembly(); private: diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index c6e68ad..16240b1 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2805,6 +2805,24 @@ fs_generator::add_const_data(void *data, unsigned size) } } +void +fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt) +{ + assert(brw_shader_stage_is_bindless(stage)); + struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data); + if (num_resume_shaders > 0) { + bs_prog_data->resume_sbt_offset = + brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32); + for (unsigned i = 0; i < num_resume_shaders; i++) { + size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt); + assert(offset <= UINT32_MAX); + brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET, + BRW_SHADER_RELOC_TYPE_U32, + (uint32_t)offset, (uint32_t)sbt[i]); + } + } +} + const unsigned * fs_generator::get_assembly() { -- 2.7.4