intel/compiler: Add code for compiling CL-style SPIR-V kernels
author: Jason Ekstrand <jason.ekstrand@intel.com>
Thu, 18 Feb 2021 22:09:31 +0000 (16:09 -0600)
committer: Marge Bot <emma+marge@anholt.net>
Mon, 21 Mar 2022 11:26:44 +0000 (11:26 +0000)
v2: simplify INTEL_DEBUG expressions (Marcin)

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13171>

src/intel/compiler/brw_compiler.h
src/intel/compiler/brw_kernel.c [new file with mode: 0644]
src/intel/compiler/brw_kernel.h [new file with mode: 0644]
src/intel/compiler/meson.build

index fec4d6f..172d372 100644 (file)
@@ -25,6 +25,7 @@
 #define BRW_COMPILER_H
 
 #include <stdio.h>
+#include "c11/threads.h"
 #include "dev/intel_device_info.h"
 #include "main/config.h"
 #include "util/ralloc.h"
@@ -45,6 +46,11 @@ typedef struct nir_shader nir_shader;
 struct brw_compiler {
    const struct intel_device_info *devinfo;
 
+   /* This lock must be taken if the compiler is to be modified in any way,
+    * including adding something to the ralloc child list.
+    */
+   mtx_t mutex;
+
    struct {
       struct ra_regs *regs;
 
@@ -109,6 +115,8 @@ struct brw_compiler {
     * constant or data cache, UBOs must use VK_FORMAT_RAW.
     */
    bool indirect_ubos_use_sampler;
+
+   struct nir_shader *clc_shader;
 };
 
 #define brw_shader_debug_log(compiler, data, fmt, ... ) do {    \
diff --git a/src/intel/compiler/brw_kernel.c b/src/intel/compiler/brw_kernel.c
new file mode 100644 (file)
index 0000000..246343c
--- /dev/null
@@ -0,0 +1,362 @@
+/*
+ * Copyright © 2020 Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_kernel.h"
+#include "brw_nir.h"
+
+#include "compiler/nir/nir_builder.h"
+#include "compiler/spirv/nir_spirv.h"
+#include "dev/intel_debug.h"
+#include "util/u_atomic.h"
+
+/*
+ * Return the libclc NIR shader cached in compiler->clc_shader, loading and
+ * publishing it on first use.
+ *
+ * Several callers may race to populate the cache.  Each one loads its own
+ * copy and tries to install it with a compare-and-swap against NULL; the
+ * losers free their copy and return the shader that was installed first.
+ *
+ * Returns NULL if nir_load_libclc_shader() fails.
+ */
+static const nir_shader *
+load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache,
+                const nir_shader_compiler_options *nir_options,
+                const struct spirv_to_nir_options *spirv_options)
+{
+   /* Already published: nothing to do */
+   if (compiler->clc_shader != NULL)
+      return compiler->clc_shader;
+
+   nir_shader *fresh = nir_load_libclc_shader(64, disk_cache,
+                                              spirv_options, nir_options);
+   if (fresh == NULL)
+      return NULL;
+
+   const nir_shader *installed =
+      p_atomic_cmpxchg(&compiler->clc_shader, NULL, fresh);
+   if (installed != NULL) {
+      /* Someone else published a shader first; discard ours and use theirs */
+      ralloc_free(fresh);
+      return installed;
+   }
+
+   /* The slot was still NULL, so our copy is now the cached one */
+   return fresh;
+}
+
+/*
+ * Create a new implementation for func and initialize b on it, with the
+ * cursor placed before the implementation's body CF list.
+ */
+static void
+builder_init_new_impl(nir_builder *b, nir_function *func)
+{
+   nir_function_impl *new_impl = nir_function_impl_create(func);
+
+   nir_builder_init(b, new_impl);
+   b->cursor = nir_before_cf_list(&new_impl->body);
+}
+
+/*
+ * Rewrite the kernel-specific intrinsics found in the shader entrypoint:
+ *
+ *  - load_kernel_input      becomes a load_uniform based right after the
+ *                           brw_kernel_sysvals block
+ *  - load_constant_base_ptr becomes a 64-bit value packed from the two
+ *                           CONST_DATA_ADDR relocation constants
+ *  - load_num_workgroups    becomes a load_uniform of
+ *                           brw_kernel_sysvals::num_work_groups
+ *
+ * As a side effect, nir->num_uniforms grows by
+ * sizeof(struct brw_kernel_sysvals) on every call, whether or not anything
+ * was lowered.
+ *
+ * Returns true if at least one intrinsic was replaced.
+ */
+static bool
+lower_kernel_intrinsics(nir_shader *nir)
+{
+   nir_function_impl *entry = nir_shader_get_entrypoint(nir);
+
+   /* Uniform layout: [ struct brw_kernel_sysvals | kernel arguments ] */
+   const unsigned sysvals_base = 0;
+   const unsigned args_base = sizeof(struct brw_kernel_sysvals);
+   nir->num_uniforms += args_base;
+
+   nir_builder builder;
+   nir_builder_init(&builder, entry);
+
+   bool changed = false;
+
+   nir_foreach_block(blk, entry) {
+      nir_foreach_instr_safe(cur, blk) {
+         if (cur->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *old = nir_instr_as_intrinsic(cur);
+
+         switch (old->intrinsic) {
+         case nir_intrinsic_load_kernel_input: {
+            /* Drop the old instruction and build its replacement in place */
+            builder.cursor = nir_instr_remove(&old->instr);
+
+            /* Convert the argument offset to 32 bits for the uniform load */
+            nir_ssa_def *arg_offset = nir_u2u32(&builder, old->src[0].ssa);
+
+            nir_intrinsic_instr *uload =
+               nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
+            uload->num_components = old->num_components;
+            uload->src[0] = nir_src_for_ssa(arg_offset);
+            nir_intrinsic_set_base(uload, args_base);
+            nir_intrinsic_set_range(uload, nir->num_uniforms);
+            nir_ssa_dest_init(&uload->instr, &uload->dest,
+                              old->dest.ssa.num_components,
+                              old->dest.ssa.bit_size, NULL);
+            nir_builder_instr_insert(&builder, &uload->instr);
+
+            nir_ssa_def_rewrite_uses(&old->dest.ssa, &uload->dest.ssa);
+            changed = true;
+            break;
+         }
+
+         case nir_intrinsic_load_constant_base_ptr: {
+            builder.cursor = nir_instr_remove(&old->instr);
+
+            /* Low and high halves come from relocation constants */
+            nir_ssa_def *base_addr = nir_pack_64_2x32_split(&builder,
+               nir_load_reloc_const_intel(&builder, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
+               nir_load_reloc_const_intel(&builder, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
+
+            nir_ssa_def_rewrite_uses(&old->dest.ssa, base_addr);
+            changed = true;
+            break;
+         }
+
+         case nir_intrinsic_load_num_workgroups: {
+            builder.cursor = nir_instr_remove(&old->instr);
+
+            /* The sysval sits at a fixed location, so the offset is zero */
+            nir_ssa_def *zero_offset = nir_imm_int(&builder, 0);
+
+            nir_intrinsic_instr *uload =
+               nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
+            uload->num_components = 3;
+            uload->src[0] = nir_src_for_ssa(zero_offset);
+            nir_intrinsic_set_base(uload, sysvals_base +
+               offsetof(struct brw_kernel_sysvals, num_work_groups));
+            /* Three 32-bit components */
+            nir_intrinsic_set_range(uload, 3 * 4);
+            nir_ssa_dest_init(&uload->instr, &uload->dest, 3, 32, NULL);
+            nir_builder_instr_insert(&builder, &uload->instr);
+
+            /* The original destination may not be 32-bit; cast to match */
+            nir_ssa_def *group_count =
+               nir_u2u(&builder, &uload->dest.ssa, old->dest.ssa.bit_size);
+
+            nir_ssa_def_rewrite_uses(&old->dest.ssa, group_count);
+            changed = true;
+            break;
+         }
+
+         default:
+            break;
+         }
+      }
+   }
+
+   if (!changed) {
+      nir_metadata_preserve(entry, nir_metadata_all);
+      return false;
+   }
+
+   nir_metadata_preserve(entry, nir_metadata_block_index |
+                                nir_metadata_dominance);
+   return true;
+}
+
+/**
+ * Compile one entrypoint of an OpenCL-style SPIR-V module into a compute
+ * shader.
+ *
+ * The module is translated to NIR with the OpenCL environment, linked
+ * against the libclc shader, inlined down to the single requested
+ * entrypoint, lowered to explicit I/O and finally handed to
+ * brw_compile_cs().
+ *
+ * \param compiler        Compiler to use; also caches the libclc shader.
+ * \param disk_cache      Passed through when loading the libclc shader.
+ * \param kernel          Output.  On return args_size, arg_count, args,
+ *                        prog_data, stats and code are filled in.  args_size
+ *                        is nir->num_uniforms as it stands before
+ *                        lower_kernel_intrinsics() adds the sysvals block.
+ * \param log_data        Forwarded to brw_compile_cs() via its params.
+ * \param mem_ctx         ralloc context that owns the NIR shader and the
+ *                        argument descriptor array, and is passed to
+ *                        brw_compile_cs().
+ * \param spirv           SPIR-V words.
+ * \param spirv_size      Size of \p spirv in bytes; must be a multiple of 4.
+ * \param entrypoint_name Name of the kernel entrypoint to compile.
+ * \param error_str       Optional.  If non-NULL, receives the error string
+ *                        from brw_compile_cs(), or a message if the SPIR-V
+ *                        could not be translated.
+ *
+ * \return true if code was produced (kernel->code != NULL), false otherwise.
+ */
+bool
+brw_kernel_from_spirv(struct brw_compiler *compiler,
+                      struct disk_cache *disk_cache,
+                      struct brw_kernel *kernel,
+                      void *log_data, void *mem_ctx,
+                      const uint32_t *spirv, size_t spirv_size,
+                      const char *entrypoint_name,
+                      char **error_str)
+{
+   const struct intel_device_info *devinfo = compiler->devinfo;
+   const nir_shader_compiler_options *nir_options =
+      compiler->nir_options[MESA_SHADER_KERNEL];
+
+   struct spirv_to_nir_options spirv_options = {
+      .environment = NIR_SPIRV_OPENCL,
+      .caps = {
+         .address = true,
+         .float16 = devinfo->ver >= 8,
+         .float64 = devinfo->ver >= 8,
+         .image_write_without_format = true,
+         .int8 = devinfo->ver >= 8,
+         .int16 = devinfo->ver >= 8,
+         .int64 = devinfo->ver >= 8,
+         .int64_atomics = devinfo->ver >= 9,
+         .kernel = true,
+         .float_controls = devinfo->ver >= 8,
+         .generic_pointers = true,
+         .storage_8bit = devinfo->ver >= 8,
+         .storage_16bit = devinfo->ver >= 8,
+         .subgroup_arithmetic = true,
+         .subgroup_basic = true,
+         .subgroup_ballot = true,
+         .subgroup_dispatch = true,
+         .subgroup_quad = true,
+         .subgroup_shuffle = true,
+         .subgroup_vote = true,
+
+         .intel_subgroup_shuffle = true,
+         .intel_subgroup_buffer_block_io = true,
+      },
+      .shared_addr_format = nir_address_format_62bit_generic,
+      .global_addr_format = nir_address_format_62bit_generic,
+      .temp_addr_format = nir_address_format_62bit_generic,
+      .constant_addr_format = nir_address_format_64bit_global,
+   };
+
+   spirv_options.clc_shader = load_clc_shader(compiler, disk_cache,
+                                              nir_options, &spirv_options);
+
+   assert(spirv_size % 4 == 0);
+   nir_shader *nir =
+      spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
+                   entrypoint_name, &spirv_options, nir_options);
+   if (nir == NULL) {
+      /* spirv_to_nir() returns NULL when translation fails (for example
+       * when the requested entrypoint does not exist).  Bail out instead of
+       * dereferencing it below.
+       */
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx,
+                                    "Failed to translate SPIR-V to NIR");
+      return false;
+   }
+   nir_validate_shader(nir, "after spirv_to_nir");
+   nir_validate_ssa_dominance(nir, "after spirv_to_nir");
+   ralloc_steal(mem_ctx, nir);
+   nir->info.name = ralloc_strdup(nir, entrypoint_name);
+
+   if (INTEL_DEBUG(DEBUG_CS)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_function(function, nir) {
+         if (function->impl)
+            nir_index_ssa_defs(function->impl);
+      }
+
+      fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
+      nir_print_shader(nir, stderr);
+   }
+
+   NIR_PASS_V(nir, nir_lower_libclc, spirv_options.clc_shader);
+
+   /* We have to lower away local constant initializers right before we
+    * inline functions.  That way they get properly initialized at the top
+    * of the function and not at the top of its caller.
+    */
+   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS_V(nir, nir_lower_returns);
+   NIR_PASS_V(nir, nir_inline_functions);
+   NIR_PASS_V(nir, nir_copy_prop);
+   NIR_PASS_V(nir, nir_opt_deref);
+
+   /* Pick off the single entrypoint that we want */
+   foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         exec_node_remove(&func->node);
+   }
+   assert(exec_list_length(&nir->functions) == 1);
+
+   /* Now that we've deleted all but the main function, we can go ahead and
+    * lower the rest of the constant initializers.  We do this here so that
+    * nir_remove_dead_variables and split_per_member_structs below see the
+    * corresponding stores.
+    */
+   NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
+
+   /* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B
+    * aligned and so it can just read/write them as vec4s.  This results in a
+    * LOT of vec4->vec3 casts on loads and stores.  One solution to this
+    * problem is to get rid of all vec3 variables.
+    */
+   NIR_PASS_V(nir, nir_lower_vec3_to_vec4,
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global |
+              nir_var_mem_constant);
+
+   /* We assign explicit types early so that the optimizer can take advantage
+    * of that information and hopefully get rid of some of our memcpys.
+    */
+   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
+              nir_var_uniform |
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global,
+              glsl_get_cl_type_size_align);
+
+   brw_preprocess_nir(compiler, nir, NULL);
+
+   /* Kernel arguments are the uniform variables; the argument index is
+    * var->data.location.  Find the highest one to size the descriptor array.
+    */
+   int max_arg_idx = -1;
+   nir_foreach_uniform_variable(var, nir) {
+      assert(var->data.location < 256);
+      max_arg_idx = MAX2(max_arg_idx, var->data.location);
+   }
+
+   kernel->args_size = nir->num_uniforms;
+   kernel->arg_count = max_arg_idx + 1;
+
+   /* No bindings */
+   struct brw_kernel_arg_desc *args =
+      rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count);
+   /* Publish the array to the caller; it is filled in just below. */
+   kernel->args = args;
+
+   nir_foreach_uniform_variable(var, nir) {
+      struct brw_kernel_arg_desc arg_desc = {
+         .offset = var->data.driver_location,
+         .size = glsl_get_explicit_size(var->type, false),
+      };
+      assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms);
+
+      assert(var->data.location >= 0);
+      args[var->data.location] = arg_desc;
+   }
+
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
+
+   /* Lower again, this time after dead-variables to get more compact variable
+    * layouts.
+    */
+   nir->scratch_size = 0;
+   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
+              glsl_get_cl_type_size_align);
+   if (nir->constant_data_size > 0) {
+      assert(nir->constant_data == NULL);
+      nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
+      nir_gather_explicit_io_initializers(nir, nir->constant_data,
+                                          nir->constant_data_size,
+                                          nir_var_mem_constant);
+   }
+
+   if (INTEL_DEBUG(DEBUG_CS)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_function(function, nir) {
+         if (function->impl)
+            nir_index_ssa_defs(function->impl);
+      }
+
+      fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
+      nir_print_shader(nir, stderr);
+   }
+
+   NIR_PASS_V(nir, nir_lower_memcpy);
+
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
+              nir_address_format_64bit_global);
+
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
+              nir_address_format_32bit_offset_as_64bit);
+
+   NIR_PASS_V(nir, nir_lower_explicit_io,
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global,
+              nir_address_format_62bit_generic);
+
+   NIR_PASS_V(nir, nir_lower_frexp);
+   NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
+
+   NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
+   NIR_PASS_V(nir, lower_kernel_intrinsics);
+
+   struct brw_cs_prog_key key = {
+      .base.subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING,
+   };
+
+   memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
+   /* num_uniforms is in bytes here; params are counted in dwords */
+   kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);
+
+   struct brw_compile_cs_params params = {
+      .nir = nir,
+      .key = &key,
+      .prog_data = &kernel->prog_data,
+      .stats = &kernel->stats,
+      .log_data = log_data,
+   };
+
+   kernel->code = brw_compile_cs(compiler, mem_ctx, &params);
+
+   if (error_str)
+      *error_str = params.error_str;
+
+   return kernel->code != NULL;
+}
diff --git a/src/intel/compiler/brw_kernel.h b/src/intel/compiler/brw_kernel.h
new file mode 100644 (file)
index 0000000..837dc57
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2020 Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_KERNEL_H
+#define BRW_KERNEL_H
+
+#include "brw_compiler.h"
+
+struct disk_cache;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Software interface for system values in kernels
+ *
+ * These are intended to go at the start of the kernel argument buffer.
+ *
+ * lower_kernel_intrinsics() in brw_kernel.c relies on this layout: it reads
+ * num_work_groups with a uniform load at offsetof(num_work_groups) and
+ * bases kernel argument loads at sizeof(struct brw_kernel_sysvals).
+ */
+struct brw_kernel_sysvals {
+   uint32_t num_work_groups[3];
+   uint32_t pad[5]; /* rounds the struct up to 8 dwords */
+};
+
+struct brw_kernel_arg_desc { /* placement of one kernel argument */
+   uint16_t offset; /* taken from the argument variable's driver_location */
+   uint16_t size;   /* glsl_get_explicit_size() of the argument's type */
+};
+
+struct brw_kernel { /* output of brw_kernel_from_spirv() for one entrypoint */
+   struct brw_cs_prog_data prog_data; /* zeroed, then filled via brw_compile_cs() */
+
+   struct brw_compile_stats stats; /* handed to brw_compile_cs() as its stats output */
+
+   uint16_t args_size; /* nir->num_uniforms before the sysvals block is added */
+   uint16_t arg_count; /* highest argument location + 1 */
+   const struct brw_kernel_arg_desc *args; /* meant to hold arg_count descriptors, indexed by argument location */
+
+   const void *code; /* result of brw_compile_cs(); NULL if compilation failed */
+};
+
+bool
+brw_kernel_from_spirv(struct brw_compiler *compiler,
+                      struct disk_cache *disk_cache,
+                      struct brw_kernel *kernel,
+                      void *log_data, void *mem_ctx,
+                      const uint32_t *spirv, size_t spirv_size,
+                      const char *entrypoint_name,
+                      char **error_str);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* BRW_KERNEL_H */
index 3a04973..c389dfd 100644 (file)
@@ -76,6 +76,7 @@ libintel_compiler_files = files(
   'brw_ir_performance.h',
   'brw_ir_performance.cpp',
   'brw_ir_vec4.h',
+  'brw_kernel.c',
   'brw_mesh.cpp',
   'brw_nir.h',
   'brw_nir.c',