microsoft: Add CLC frontend and kernel/compute support to DXIL converter
author     Jesse Natalie <jenatali@microsoft.com>
Fri, 6 Nov 2020 16:09:30 +0000 (17:09 +0100)
committer  Marge Bot <eric+marge@anholt.net>
Wed, 18 Nov 2020 04:05:37 +0000 (04:05 +0000)
This adds a standalone library which can convert through the pipeline of
OpenCL C -> SPIR -> SPIR-V -> NIR -> DXIL. It can link in the libclc
implementations of various library functions during the NIR phase, and
it also massages the NIR to shift it closer to graphics-style compute.

This is leveraged by the out-of-tree OpenCLOn12 runtime
(https://github.com/microsoft/OpenCLOn12).
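
For reference, a minimal sketch of driving that pipeline through the new
API (entry points as defined in clc_compiler.c below; the logger and the
clc_compile_args/clc_linker_args contents are assumed to be filled in by
the caller, with the linker args referencing the compiled objects, and
"my_kernel" is a placeholder entrypoint; error handling elided):

    struct clc_context *ctx = clc_context_new(&logger, NULL);

    /* OpenCL C -> SPIR-V, one object per translation unit */
    struct clc_object *obj = clc_compile(ctx, &compile_args, &logger);

    /* Link SPIR-V objects and extract per-kernel metadata */
    struct clc_object *linked = clc_link(ctx, &link_args, &logger);

    /* SPIR-V -> NIR -> DXIL for a single kernel entrypoint */
    struct clc_dxil_object *dxil =
       clc_to_dxil(ctx, linked, "my_kernel", NULL, &logger);

    /* dxil->binary.data / dxil->binary.size now hold the DXIL blob */
    clc_free_dxil_object(dxil);
    clc_free_object(linked);
    clc_free_object(obj);
    clc_free_context(ctx);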

This is the combination of a lot of commits from our development branch,
containing code by several authors.

Co-authored-by: Boris Brezillon <boris.brezillon@collabora.com>
Co-authored-by: Daniel Stone <daniels@collabora.com>
Co-authored-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Acked-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7565>

21 files changed:
meson.build
meson_options.txt
src/compiler/nir/nir_intrinsics.py
src/meson.build
src/microsoft/clc/clc_compiler.c [new file with mode: 0644]
src/microsoft/clc/clc_compiler.h [new file with mode: 0644]
src/microsoft/clc/clc_compiler_test.cpp [new file with mode: 0644]
src/microsoft/clc/clc_helpers.cpp [new file with mode: 0644]
src/microsoft/clc/clc_helpers.h [new file with mode: 0644]
src/microsoft/clc/clc_nir.c [new file with mode: 0644]
src/microsoft/clc/clc_nir.h [new file with mode: 0644]
src/microsoft/clc/clglon12compiler.def [new file with mode: 0644]
src/microsoft/clc/compute_test.cpp [new file with mode: 0644]
src/microsoft/clc/compute_test.h [new file with mode: 0644]
src/microsoft/clc/meson.build [new file with mode: 0644]
src/microsoft/compiler/dxcapi.h [new file with mode: 0644]
src/microsoft/compiler/dxil_nir.c
src/microsoft/compiler/dxil_nir.h
src/microsoft/compiler/nir_to_dxil.c
src/microsoft/compiler/nir_to_dxil.h
src/microsoft/meson.build

index 614c987..e25f9fc 100644 (file)
@@ -299,6 +299,26 @@ if with_aco_tests and not with_amd_vk
   error('ACO tests require Radv')
 endif
 
+_microsoft_clc = get_option('microsoft-clc')
+if _microsoft_clc == 'auto'
+  with_microsoft_clc = false
+else
+  with_microsoft_clc = _microsoft_clc == 'true'
+endif
+
+if with_microsoft_clc
+  with_clc = true
+  dep_clang = dependency(
+    'clang',
+    method: 'cmake',
+    static: true,
+    modules: [
+      'clangBasic', 'clangCodeGen', 'clangDriver', 'clangFrontend', 'clangFrontendTool',
+      'clangHandleCXX', 'clangHandleLLVM',
+    ],
+  )
+endif
+
 if host_machine.system() == 'darwin'
   with_dri_platform = 'apple'
   pre_args += '-DBUILDING_MESA'
@@ -1470,8 +1490,13 @@ if with_gallium_opencl
     'lto', 'option', 'objcarcopts', 'profiledata',
   ]
 endif
+if with_microsoft_clc
+  llvm_modules += ['target', 'linker', 'irreader', 'option', 'libdriver']
+endif
 
-if with_amd_vk or with_gallium_radeonsi or with_gallium_opencl
+if with_microsoft_clc
+  _llvm_version = '>= 10.0.0'
+elif with_amd_vk or with_gallium_radeonsi or with_gallium_opencl
   _llvm_version = '>= 8.0.0'
 elif with_gallium_swr
   _llvm_version = '>= 6.0.0'
@@ -1521,7 +1546,7 @@ if _llvm != 'disabled'
     optional_modules : llvm_optional_modules,
     required : (
       with_amd_vk or with_gallium_radeonsi or with_gallium_swr or
-      with_gallium_opencl or _llvm == 'enabled'
+      with_gallium_opencl or with_microsoft_clc or _llvm == 'enabled'
     ),
     static : not _shared_llvm,
     method : _llvm_method,
@@ -1564,9 +1589,11 @@ elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr
   error('The following drivers require LLVM: Radv, RadeonSI, SWR. One of these is enabled, but LLVM is disabled.')
 elif with_gallium_opencl
   error('The OpenCL "Clover" state tracker requires LLVM, but LLVM is disabled.')
+elif with_microsoft_clc
+  error('The Microsoft CLC compiler requires LLVM, but LLVM is disabled.')
 endif
 
-with_opencl_spirv = _opencl != 'disabled' and get_option('opencl-spirv')
+with_opencl_spirv = (_opencl != 'disabled' and get_option('opencl-spirv')) or with_microsoft_clc
 if with_opencl_spirv
   chosen_llvm_version_array = dep_llvm.version().split('.')
   chosen_llvm_version_major = chosen_llvm_version_array[0].to_int()
index 7db6907..7637c4c 100644 (file)
@@ -262,6 +262,13 @@ option(
   description : 'Enable GLVND support.'
 )
 option(
+  'microsoft-clc',
+  type : 'combo',
+  value : 'auto',
+  choices : ['auto', 'true', 'false'],
+  description : 'Build support for the Microsoft CLC to DXIL compiler.'
+)
+option(
    'glx-read-only-text',
    type : 'boolean',
    value : false,
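
For reference, enabling the new library is then a standard Meson setup
option, e.g. "meson setup build -Dmicrosoft-clc=true" (per the changes
above, this requires LLVM >= 10, with the Clang CMake packages
discoverable).
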
index d21c13e..aee01cd 100644 (file)
@@ -946,9 +946,45 @@ load("global_ir3", [2, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN
 intrinsic("bindless_resource_ir3", [1], dest_comp=1, indices=[DESC_SET], flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # DXIL specific intrinsics
+# src[] = { value, mask, index, offset }.
+intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1])
+# src[] = { value, index }.
+intrinsic("store_shared_dxil", [1, 1])
+# src[] = { value, mask, index }.
+intrinsic("store_shared_masked_dxil", [1, 1, 1])
+# src[] = { value, index }.
+intrinsic("store_scratch_dxil", [1, 1])
+# src[] = { index }.
+load("shared_dxil", [1], [], [CAN_ELIMINATE])
+# src[] = { index }.
+load("scratch_dxil", [1], [], [CAN_ELIMINATE])
+# src[] = { deref_var, offset }
+load("ptr_dxil", [1, 1], [], [])
 # src[] = { index, 16-byte-based-offset }
 load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE])
 
+# DXIL Shared atomic intrinsics
+#
+# All of the shared variable atomic memory operations read a value from
+# memory, compute a new value using one of the operations below, write the
+# new value to memory, and return the original value read.
+#
+# All operations take 2 sources, except comp_swap, which takes 3:
+#
+# 0: The index in the i32 array for the shared memory region
+# 1: The data parameter to the atomic function (i.e. the value to add
+#    in shared_atomic_add, etc).
+# 2: For comp_swap only: the second data parameter.
+intrinsic("shared_atomic_add_dxil",  src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_imin_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_umin_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_imax_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_umax_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_and_dxil",  src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_or_dxil",   src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_xor_dxil",  src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_exchange_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_comp_swap_dxil", src_comp=[1, 1, 1], dest_comp=1)
+
 # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
 # within a blend shader to read/write the raw value from the tile buffer,
 # without applying any format conversion in the process. If the shader needs
index b5f4933..7deb202 100644 (file)
@@ -91,7 +91,7 @@ endif
 if with_any_intel
   subdir('intel')
 endif
-if with_gallium_d3d12
+if with_microsoft_clc or with_gallium_d3d12
   subdir('microsoft')
 endif
 subdir('mesa')
diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c
new file mode 100644 (file)
index 0000000..dc84186
--- /dev/null
@@ -0,0 +1,1447 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_serialize.h"
+#include "glsl_types.h"
+#include "nir_types.h"
+#include "clc_compiler.h"
+#include "clc_helpers.h"
+#include "clc_nir.h"
+#include "../compiler/dxil_nir.h"
+#include "../compiler/dxil_nir_lower_int_samplers.h"
+#include "../compiler/nir_to_dxil.h"
+
+#include "util/u_debug.h"
+#include <util/u_math.h>
+#include "spirv/nir_spirv.h"
+#include "nir_builder.h"
+#include "nir_builtin_builder.h"
+
+#include "git_sha1.h"
+
+enum clc_debug_flags {
+   CLC_DEBUG_DUMP_SPIRV = 1 << 0,
+   CLC_DEBUG_VERBOSE = 1 << 1,
+};
+
+static const struct debug_named_value debug_options[] = {
+   { "dump_spirv",  CLC_DEBUG_DUMP_SPIRV, "Dump spirv blobs" },
+   { "verbose",  CLC_DEBUG_VERBOSE, NULL },
+   DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(debug_clc, "CLC_DEBUG", debug_options, 0)
+
+static void
+clc_print_kernels_info(const struct clc_object *obj)
+{
+   fprintf(stdout, "Kernels:\n");
+   for (unsigned i = 0; i < obj->num_kernels; i++) {
+      const struct clc_kernel_arg *args = obj->kernels[i].args;
+      bool first = true;
+
+      fprintf(stdout, "\tvoid %s(", obj->kernels[i].name);
+      for (unsigned j = 0; j < obj->kernels[i].num_args; j++) {
+         if (!first)
+            fprintf(stdout, ", ");
+         else
+            first = false;
+
+         switch (args[j].address_qualifier) {
+         case CLC_KERNEL_ARG_ADDRESS_GLOBAL:
+            fprintf(stdout, "__global ");
+            break;
+         case CLC_KERNEL_ARG_ADDRESS_LOCAL:
+            fprintf(stdout, "__local ");
+            break;
+         case CLC_KERNEL_ARG_ADDRESS_CONSTANT:
+            fprintf(stdout, "__constant ");
+            break;
+         default:
+            break;
+         }
+
+         if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_VOLATILE)
+            fprintf(stdout, "volatile ");
+         if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_CONST)
+            fprintf(stdout, "const ");
+         if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_RESTRICT)
+            fprintf(stdout, "restrict ");
+
+         fprintf(stdout, "%s %s", args[j].type_name, args[j].name);
+      }
+      fprintf(stdout, ");\n");
+   }
+}
+
+struct clc_image_lower_context
+{
+   struct clc_dxil_metadata *metadata;
+   unsigned *num_srvs;
+   unsigned *num_uavs;
+   nir_deref_instr *deref;
+   unsigned num_buf_ids;
+   int metadata_index;
+};
+
+static int
+lower_image_deref_impl(nir_builder *b, struct clc_image_lower_context *context,
+                       const struct glsl_type *new_var_type,
+                       unsigned *num_bindings)
+{
+   nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+   nir_variable *uniform = nir_variable_create(b->shader, nir_var_uniform, new_var_type, NULL);
+   uniform->data.access = in_var->data.access;
+   uniform->data.binding = in_var->data.binding;
+   if (context->num_buf_ids > 0) {
+      // Need to assign a new binding
+      context->metadata->args[context->metadata_index].
+         image.buf_ids[context->num_buf_ids] = uniform->data.binding = (*num_bindings)++;
+   }
+   context->num_buf_ids++;
+   return uniform->data.binding;
+}
+
+static int
+lower_read_only_image_deref(nir_builder *b, struct clc_image_lower_context *context,
+                            nir_alu_type image_type)
+{
+   nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+
+   // Non-writeable images should be converted to samplers,
+   // since they may have texture operations done on them
+   const struct glsl_type *new_var_type =
+      glsl_sampler_type(glsl_get_sampler_dim(in_var->type),
+            false, glsl_sampler_type_is_array(in_var->type),
+            nir_get_glsl_base_type_for_nir_type(image_type | 32));
+   return lower_image_deref_impl(b, context, new_var_type, context->num_srvs);
+}
+
+static int
+lower_read_write_image_deref(nir_builder *b, struct clc_image_lower_context *context,
+                             nir_alu_type image_type)
+{
+   nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+   const struct glsl_type *new_var_type =
+      glsl_image_type(glsl_get_sampler_dim(in_var->type),
+         glsl_sampler_type_is_array(in_var->type),
+         nir_get_glsl_base_type_for_nir_type(image_type | 32));
+   return lower_image_deref_impl(b, context, new_var_type, context->num_uavs);
+}
+
+static void
+clc_lower_input_image_deref(nir_builder *b, struct clc_image_lower_context *context)
+{
+   // The input variable here isn't actually an image, it's just the
+   // image format data.
+   //
+   // For every use of an image in a different way, we'll add an
+   // appropriate uniform to match it. That can result in up to
+   // 3 uniforms (float4, int4, uint4) for each image. Only one of these
+   // formats will actually produce correct data, but a single kernel
+   // could use runtime conditionals to potentially access any of them.
+   //
+   // If the image is used in a query that doesn't have a corresponding
+   // DXIL intrinsic (CL image channel order or channel format), then
+   // we'll add a kernel input for that data that'll be lowered by the
+   // explicit IO pass later on.
+   //
+   // After all that, we can remove the image input variable and deref.
+
+   enum image_uniform_type {
+      FLOAT4,
+      INT4,
+      UINT4,
+      IMAGE_UNIFORM_TYPE_COUNT
+   };
+
+   int image_bindings[IMAGE_UNIFORM_TYPE_COUNT] = {-1, -1, -1};
+   nir_ssa_def *format_deref_dest = NULL, *order_deref_dest = NULL;
+
+   nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+   enum gl_access_qualifier access = in_var->data.access;
+
+   context->metadata_index = 0;
+   while (context->metadata->args[context->metadata_index].image.buf_ids[0] != in_var->data.binding)
+      context->metadata_index++;
+
+   context->num_buf_ids = 0;
+
+   /* Do this in 2 passes:
+    * 1. When encountering a strongly-typed access (load/store), replace the deref
+    *    with one that references an appropriately typed variable. When encountering
+    *    an untyped access (size query), if we have a strongly-typed variable already,
+    *    replace the deref to point to it.
+    * 2. If there's any references left, they should all be untyped. If we found
+    *    a strongly-typed access later in the 1st pass, then just replace the reference.
+    *    If we didn't, e.g. the resource is only used for a size query, then pick an
+    *    arbitrary type for it.
+    */
+   for (int pass = 0; pass < 2; ++pass) {
+      nir_foreach_use_safe(src, &context->deref->dest.ssa) {
+         enum image_uniform_type type;
+
+         if (src->parent_instr->type == nir_instr_type_intrinsic) {
+            nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(src->parent_instr);
+            enum nir_alu_type dest_type;
+
+            b->cursor = nir_before_instr(&intrinsic->instr);
+
+            switch (intrinsic->intrinsic) {
+            case nir_intrinsic_image_deref_load:
+            case nir_intrinsic_image_deref_store: {
+               dest_type = intrinsic->intrinsic == nir_intrinsic_image_deref_load ?
+                  nir_intrinsic_dest_type(intrinsic) : nir_intrinsic_src_type(intrinsic);
+
+               switch (nir_alu_type_get_base_type(dest_type)) {
+               case nir_type_float: type = FLOAT4; break;
+               case nir_type_int: type = INT4; break;
+               case nir_type_uint: type = UINT4; break;
+               default: unreachable("Unsupported image type for load.");
+               }
+
+               int image_binding = image_bindings[type];
+               if (image_binding < 0) {
+                  image_binding = image_bindings[type] =
+                     lower_read_write_image_deref(b, context, dest_type);
+               }
+
+               assert((in_var->data.access & ACCESS_NON_WRITEABLE) == 0);
+               nir_rewrite_image_intrinsic(intrinsic, nir_imm_int(b, image_binding), false);
+               break;
+            }
+
+            case nir_intrinsic_image_deref_size: {
+               int image_binding = -1;
+               for (unsigned i = 0; i < IMAGE_UNIFORM_TYPE_COUNT; ++i) {
+                  if (image_bindings[i] >= 0) {
+                     image_binding = image_bindings[i];
+                     break;
+                  }
+               }
+               if (image_binding < 0) {
+                  // Skip for now and come back to it
+                  if (pass == 0)
+                     break;
+
+                  type = FLOAT4;
+                  image_binding = image_bindings[type] =
+                     lower_read_write_image_deref(b, context, nir_type_float32);
+               }
+
+               assert((in_var->data.access & ACCESS_NON_WRITEABLE) == 0);
+               nir_rewrite_image_intrinsic(intrinsic, nir_imm_int(b, image_binding), false);
+               break;
+            }
+
+            case nir_intrinsic_image_deref_format:
+            case nir_intrinsic_image_deref_order: {
+               nir_ssa_def **cached_deref = intrinsic->intrinsic == nir_intrinsic_image_deref_format ?
+                  &format_deref_dest : &order_deref_dest;
+               if (!*cached_deref) {
+                  nir_variable *new_input = nir_variable_create(b->shader, nir_var_uniform, glsl_uint_type(), NULL);
+                  new_input->data.driver_location = in_var->data.driver_location;
+                  if (intrinsic->intrinsic == nir_intrinsic_image_deref_format) {
+                     /* Match cl_image_format { image_channel_order, image_channel_data_type }; */
+                     new_input->data.driver_location += glsl_get_cl_size(new_input->type);
+                  }
+
+                  b->cursor = nir_after_instr(&context->deref->instr);
+                  *cached_deref = nir_load_var(b, new_input);
+               }
+
+               /* No actual intrinsic needed here, just reference the loaded variable */
+               nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, nir_src_for_ssa(*cached_deref));
+               nir_instr_remove(&intrinsic->instr);
+               break;
+            }
+
+            default:
+               unreachable("Unsupported image intrinsic");
+            }
+         } else if (src->parent_instr->type == nir_instr_type_tex) {
+            assert(in_var->data.access & ACCESS_NON_WRITEABLE);
+            nir_tex_instr *tex = nir_instr_as_tex(src->parent_instr);
+
+            switch (nir_alu_type_get_base_type(tex->dest_type)) {
+            case nir_type_float: type = FLOAT4; break;
+            case nir_type_int: type = INT4; break;
+            case nir_type_uint: type = UINT4; break;
+            default: unreachable("Unsupported image format for sample.");
+            }
+
+            int image_binding = image_bindings[type];
+            if (image_binding < 0) {
+               image_binding = image_bindings[type] =
+                  lower_read_only_image_deref(b, context, tex->dest_type);
+            }
+
+            nir_tex_instr_remove_src(tex, nir_tex_instr_src_index(tex, nir_tex_src_texture_deref));
+            tex->texture_index = image_binding;
+         }
+      }
+   }
+
+   context->metadata->args[context->metadata_index].image.num_buf_ids = context->num_buf_ids;
+
+   nir_instr_remove(&context->deref->instr);
+   exec_node_remove(&in_var->node);
+}
+
+static void
+clc_lower_images(nir_shader *nir, struct clc_image_lower_context *context)
+{
+   nir_foreach_function(func, nir) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type == nir_instr_type_deref) {
+               context->deref = nir_instr_as_deref(instr);
+
+               if (glsl_type_is_image(context->deref->type)) {
+                  assert(context->deref->deref_type == nir_deref_type_var);
+                  clc_lower_input_image_deref(&b, context);
+               }
+            }
+         }
+      }
+   }
+}
+
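+/* CL system values such as the global invocation ID are 64-bit (size_t),
+ * while the DXIL equivalents are 32-bit. Shrink the destinations of these
+ * intrinsics to 32 bits and zero-extend back to 64 bits for existing uses.
+ */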
+static void
+clc_lower_64bit_semantics(nir_shader *nir)
+{
+   nir_foreach_function(func, nir) {
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type == nir_instr_type_intrinsic) {
+               nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+               switch (intrinsic->intrinsic) {
+               case nir_intrinsic_load_global_invocation_id:
+               case nir_intrinsic_load_global_invocation_id_zero_base:
+               case nir_intrinsic_load_base_global_invocation_id:
+               case nir_intrinsic_load_local_invocation_id:
+               case nir_intrinsic_load_work_group_id:
+               case nir_intrinsic_load_work_group_id_zero_base:
+               case nir_intrinsic_load_base_work_group_id:
+               case nir_intrinsic_load_num_work_groups:
+                  break;
+               default:
+                  continue;
+               }
+
+               if (nir_instr_ssa_def(instr)->bit_size != 64)
+                  continue;
+
+               intrinsic->dest.ssa.bit_size = 32;
+               b.cursor = nir_after_instr(instr);
+
+               nir_ssa_def *i64 = nir_u2u64(&b, &intrinsic->dest.ssa);
+               nir_ssa_def_rewrite_uses_after(
+                  &intrinsic->dest.ssa,
+                  nir_src_for_ssa(i64),
+                  i64->parent_instr);
+            }
+         }
+      }
+   }
+}
+
+static void
+clc_lower_nonnormalized_samplers(nir_shader *nir,
+                                 const dxil_wrap_sampler_state *states)
+{
+   nir_foreach_function(func, nir) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_tex)
+               continue;
+            nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+            int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
+            if (sampler_src_idx == -1)
+               continue;
+
+            nir_src *sampler_src = &tex->src[sampler_src_idx].src;
+            assert(sampler_src->is_ssa && sampler_src->ssa->parent_instr->type == nir_instr_type_deref);
+            nir_variable *sampler = nir_deref_instr_get_variable(
+               nir_instr_as_deref(sampler_src->ssa->parent_instr));
+
+            // If the sampler returns ints, we'll handle this in the int lowering pass
+            if (nir_alu_type_get_base_type(tex->dest_type) != nir_type_float)
+               continue;
+
+            // If sampler uses normalized coords, nothing to do
+            if (!states[sampler->data.binding].is_nonnormalized_coords)
+               continue;
+
+            b.cursor = nir_before_instr(&tex->instr);
+
+            int coords_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
+            assert(coords_idx != -1);
+            nir_ssa_def *coords =
+               nir_ssa_for_src(&b, tex->src[coords_idx].src, tex->coord_components);
+
+            nir_ssa_def *txs = nir_i2f32(&b, nir_get_texture_size(&b, tex));
+
+            // Normalize coords for tex
+            nir_ssa_def *scale = nir_frcp(&b, txs);
+            nir_ssa_def *comps[4];
+            for (unsigned i = 0; i < coords->num_components; ++i) {
+               comps[i] = nir_channel(&b, coords, i);
+               if (tex->is_array && i == coords->num_components - 1) {
+                  // Don't scale the array index, but do clamp it
+                  comps[i] = nir_fround_even(&b, comps[i]);
+                  comps[i] = nir_fmax(&b, comps[i], nir_imm_float(&b, 0.0f));
+                  comps[i] = nir_fmin(&b, comps[i], nir_fsub(&b, nir_channel(&b, txs, i), nir_imm_float(&b, 1.0f)));
+                  break;
+               }
+
+               // The CTS is pretty clear that this value has to be floored for nearest sampling
+               // but must not be for linear sampling.
+               if (!states[sampler->data.binding].is_linear_filtering)
+                  comps[i] = nir_fadd_imm(&b, nir_ffloor(&b, comps[i]), 0.5f);
+               comps[i] = nir_fmul(&b, comps[i], nir_channel(&b, scale, i));
+            }
+            nir_ssa_def *normalized_coords = nir_vec(&b, comps, coords->num_components);
+            nir_instr_rewrite_src(&tex->instr,
+                                  &tex->src[coords_idx].src,
+                                  nir_src_for_ssa(normalized_coords));
+         }
+      }
+   }
+}
+
+
+static void
+clc_context_optimize(nir_shader *s)
+{
+   bool progress;
+   do {
+      progress = false;
+      NIR_PASS(progress, s, nir_split_var_copies);
+      NIR_PASS(progress, s, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, s, nir_lower_var_copies);
+      NIR_PASS(progress, s, nir_lower_vars_to_ssa);
+      NIR_PASS(progress, s, nir_copy_prop);
+      NIR_PASS(progress, s, nir_opt_remove_phis);
+      NIR_PASS(progress, s, nir_opt_dce);
+      NIR_PASS(progress, s, nir_opt_if, true);
+      NIR_PASS(progress, s, nir_opt_dead_cf);
+      NIR_PASS(progress, s, nir_opt_cse);
+      NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+      NIR_PASS(progress, s, nir_opt_algebraic);
+      NIR_PASS(progress, s, nir_opt_constant_folding);
+      NIR_PASS(progress, s, nir_opt_undef);
+      NIR_PASS(progress, s, nir_lower_undef_to_zero);
+      NIR_PASS(progress, s, nir_opt_deref);
+   } while (progress);
+}
+
+struct clc_context *
+clc_context_new(const struct clc_logger *logger, const struct clc_context_options *options)
+{
+   struct clc_context *ctx = rzalloc(NULL, struct clc_context);
+   if (!ctx) {
+      clc_error(logger, "D3D12: failed to allocate a clc_context");
+      return NULL;
+   }
+
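+   /* Global and constant pointers are 64-bit values packing a 32-bit
+    * buffer index and a 32-bit offset; shared and temp pointers are plain
+    * 32-bit offsets carried in 64-bit values.
+    */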
+   const struct spirv_to_nir_options libclc_spirv_options = {
+      .environment = NIR_SPIRV_OPENCL,
+      .create_library = true,
+      .constant_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .global_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .shared_addr_format = nir_address_format_32bit_offset_as_64bit,
+      .temp_addr_format = nir_address_format_32bit_offset_as_64bit,
+      .float_controls_execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32,
+      .caps = {
+         .address = true,
+         .float64 = true,
+         .int8 = true,
+         .int16 = true,
+         .int64 = true,
+         .kernel = true,
+      },
+   };
+   const struct nir_shader_compiler_options *libclc_nir_options =
+      dxil_get_nir_compiler_options();
+
+   glsl_type_singleton_init_or_ref();
+   nir_shader *s = nir_load_libclc_shader(64, NULL, &libclc_spirv_options, libclc_nir_options);
+   if (!s) {
+      clc_error(logger, "D3D12: spirv_to_nir failed on libclc blob");
+      ralloc_free(ctx);
+      return NULL;
+   }
+
+   if (options && options->optimize)
+      clc_context_optimize(s);
+
+   ctx->libclc_nir = s;
+   ralloc_steal(ctx, ctx->libclc_nir);
+
+   return ctx;
+}
+
+void
+clc_free_context(struct clc_context *ctx)
+{
+   ralloc_free(ctx);
+   glsl_type_singleton_decref();
+}
+
+void clc_context_serialize(struct clc_context *context,
+                           void **serialized,
+                           size_t *serialized_size)
+{
+   struct blob tmp;
+   blob_init(&tmp);
+   nir_serialize(&tmp, context->libclc_nir, true);
+
+   blob_finish_get_buffer(&tmp, serialized, serialized_size);
+}
+
+void clc_context_free_serialized(void *serialized)
+{
+   free(serialized);
+}
+
+struct clc_context *
+clc_context_deserialize(const void *serialized, size_t serialized_size)
+{
+   struct clc_context *ctx = rzalloc(NULL, struct clc_context);
+   if (!ctx) {
+      return NULL;
+   }
+   const struct nir_shader_compiler_options *libclc_nir_options =
+      dxil_get_nir_compiler_options();
+
+   glsl_type_singleton_init_or_ref();
+
+   struct blob_reader tmp;
+   blob_reader_init(&tmp, serialized, serialized_size);
+
+   ctx->libclc_nir = nir_deserialize(NULL, libclc_nir_options, &tmp);
+   if (!ctx->libclc_nir) {
+      glsl_type_singleton_decref();
+      ralloc_free(ctx);
+      return NULL;
+   }
+
+   ralloc_steal(ctx, ctx->libclc_nir);
+
+   return ctx;
+}
+
+struct clc_object *
+clc_compile(struct clc_context *ctx,
+            const struct clc_compile_args *args,
+            const struct clc_logger *logger)
+{
+   struct clc_object *obj;
+   int ret;
+
+   obj = calloc(1, sizeof(*obj));
+   if (!obj) {
+      clc_error(logger, "D3D12: failed to allocate a clc_object");
+      return NULL;
+   }
+
+   ret = clc_to_spirv(args, &obj->spvbin, logger);
+   if (ret < 0) {
+      free(obj);
+      return NULL;
+   }
+
+   if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV)
+      clc_dump_spirv(&obj->spvbin, stdout);
+
+   return obj;
+}
+
+struct clc_object *
+clc_link(struct clc_context *ctx,
+         const struct clc_linker_args *args,
+         const struct clc_logger *logger)
+{
+   struct clc_object *out_obj;
+   int ret;
+
+   out_obj = malloc(sizeof(*out_obj));
+   if (!out_obj) {
+      clc_error(logger, "failed to allocate a clc_object");
+      return NULL;
+   }
+
+   ret = clc_link_spirv_binaries(args, &out_obj->spvbin, logger);
+   if (ret < 0) {
+      free(out_obj);
+      return NULL;
+   }
+
+   if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV)
+      clc_dump_spirv(&out_obj->spvbin, stdout);
+
+   out_obj->kernels = clc_spirv_get_kernels_info(&out_obj->spvbin,
+                                                 &out_obj->num_kernels);
+
+   if (debug_get_option_debug_clc() & CLC_DEBUG_VERBOSE)
+      clc_print_kernels_info(out_obj);
+
+   return out_obj;
+}
+
+void clc_free_object(struct clc_object *obj)
+{
+   clc_free_kernels_info(obj->kernels, obj->num_kernels);
+   clc_free_spirv_binary(&obj->spvbin);
+   free(obj);
+}
+
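+/* Gather all kernel arguments into a single hidden UBO of uints, sized to
+ * cover the highest driver_location plus the CL size of that argument's
+ * type.
+ */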
+static nir_variable *
+add_kernel_inputs_var(struct clc_dxil_object *dxil, nir_shader *nir,
+                      unsigned *cbv_id)
+{
+   if (!dxil->kernel->num_args)
+      return NULL;
+
+   struct clc_dxil_metadata *metadata = &dxil->metadata;
+   unsigned size = 0;
+
+   nir_foreach_variable_with_modes(var, nir, nir_var_uniform)
+      size = MAX2(size,
+                  var->data.driver_location +
+                  glsl_get_cl_size(var->type));
+
+   size = align(size, 4);
+
+   nir_variable *var =
+      nir_variable_create(nir, nir_var_mem_ubo,
+                          glsl_array_type(glsl_uint_type(),
+                                          size / 4, 0),
+                          "kernel_inputs");
+   var->data.binding = (*cbv_id)++;
+   var->data.how_declared = nir_var_hidden;
+   return var;
+}
+
+static nir_variable *
+add_work_properties_var(struct clc_dxil_object *dxil,
+                           struct nir_shader *nir, unsigned *cbv_id)
+{
+   struct clc_dxil_metadata *metadata = &dxil->metadata;
+   nir_variable *var =
+      nir_variable_create(nir, nir_var_mem_ubo,
+                          glsl_array_type(glsl_uint_type(),
+                                          sizeof(struct clc_work_properties_data) / sizeof(unsigned),
+                                          0),
+                          "kernel_work_properties");
+   var->data.binding = (*cbv_id)++;
+   var->data.how_declared = nir_var_hidden;
+   return var;
+}
+
+static void
+clc_lower_constant_to_ssbo(nir_shader *nir,
+                      const struct clc_kernel_info *kerninfo, unsigned *uav_id)
+{
+   /* Turn constant variables into SSBOs and assign them a binding. */
+   nir_foreach_variable_with_modes(var, nir, nir_var_mem_constant) {
+      var->data.mode = nir_var_mem_ssbo;
+      var->data.binding = (*uav_id)++;
+   }
+
+   /* And finally patch all the derefs referencing the constant
+    * variables/pointers.
+    */
+   nir_foreach_function(func, nir) {
+      if (!func->is_entrypoint)
+         continue;
+
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+            if (deref->modes != nir_var_mem_constant)
+               continue;
+
+            deref->modes = nir_var_mem_ssbo;
+         }
+      }
+   }
+}
+
+static void
+clc_lower_global_to_ssbo(nir_shader *nir)
+{
+   nir_foreach_function(func, nir) {
+      if (!func->is_entrypoint)
+         continue;
+
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+            if (deref->modes != nir_var_mem_global)
+               continue;
+
+            deref->modes = nir_var_mem_ssbo;
+         }
+      }
+   }
+}
+
+static void
+copy_const_initializer(const nir_constant *constant, const struct glsl_type *type,
+                       uint8_t *data)
+{
+   unsigned size = glsl_get_cl_size(type);
+
+   if (glsl_type_is_array(type)) {
+      const struct glsl_type *elm_type = glsl_get_array_element(type);
+      unsigned step_size = glsl_get_explicit_stride(type);
+
+      for (unsigned i = 0; i < constant->num_elements; i++) {
+         copy_const_initializer(constant->elements[i], elm_type,
+                                data + (i * step_size));
+      }
+   } else if (glsl_type_is_struct(type)) {
+      for (unsigned i = 0; i < constant->num_elements; i++) {
+         const struct glsl_type *elm_type = glsl_get_struct_field(type, i);
+         int offset = glsl_get_struct_field_offset(type, i);
+         copy_const_initializer(constant->elements[i], elm_type, data + offset);
+      }
+   } else {
+      assert(glsl_type_is_vector_or_scalar(type));
+
+      for (unsigned i = 0; i < glsl_get_components(type); i++) {
+         switch (glsl_get_bit_size(type)) {
+         case 64:
+            *((uint64_t *)data) = constant->values[i].u64;
+            break;
+         case 32:
+            *((uint32_t *)data) = constant->values[i].u32;
+            break;
+         case 16:
+            *((uint16_t *)data) = constant->values[i].u16;
+            break;
+         case 8:
+            *((uint8_t *)data) = constant->values[i].u8;
+            break;
+         default:
+            unreachable("Invalid base type");
+         }
+
+         data += glsl_get_bit_size(type) / 8;
+      }
+   }
+}
+
+static const struct glsl_type *
+get_cast_type(unsigned bit_size)
+{
+   switch (bit_size) {
+   case 64:
+      return glsl_int64_t_type();
+   case 32:
+      return glsl_int_type();
+   case 16:
+      return glsl_int16_t_type();
+   case 8:
+      return glsl_int8_t_type();
+   }
+   unreachable("Invalid bit_size");
+}
+
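+/* Split an unaligned vector load into aligned scalar loads through a deref
+ * cast whose pointee type matches the known alignment, then reassemble the
+ * original value with nir_extract_bits().
+ */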
+static void
+split_unaligned_load(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment)
+{
+   enum gl_access_qualifier access = nir_intrinsic_access(intrin);
+   nir_ssa_def *srcs[NIR_MAX_VEC_COMPONENTS * NIR_MAX_VEC_COMPONENTS * sizeof(int64_t) / 8];
+   unsigned comp_size = intrin->dest.ssa.bit_size / 8;
+   unsigned num_comps = intrin->dest.ssa.num_components;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]);
+
+   const struct glsl_type *cast_type = get_cast_type(alignment * 8);
+   nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type, alignment);
+
+   unsigned num_loads = DIV_ROUND_UP(comp_size * num_comps, alignment);
+   for (unsigned i = 0; i < num_loads; ++i) {
+      nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->dest.ssa.bit_size));
+      srcs[i] = nir_load_deref_with_access(b, elem, access);
+   }
+
+   nir_ssa_def *new_dest = nir_extract_bits(b, srcs, num_loads, 0, num_comps, intrin->dest.ssa.bit_size);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(new_dest));
+   nir_instr_remove(&intrin->instr);
+}
+
+static void
+split_unaligned_store(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment)
+{
+   enum gl_access_qualifier access = nir_intrinsic_access(intrin);
+
+   assert(intrin->src[1].is_ssa);
+   nir_ssa_def *value = intrin->src[1].ssa;
+   unsigned comp_size = value->bit_size / 8;
+   unsigned num_comps = value->num_components;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]);
+
+   const struct glsl_type *cast_type = get_cast_type(alignment * 8);
+   nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type, alignment);
+
+   unsigned num_stores = DIV_ROUND_UP(comp_size * num_comps, alignment);
+   for (unsigned i = 0; i < num_stores; ++i) {
+      nir_ssa_def *substore_val = nir_extract_bits(b, &value, 1, i * alignment * 8, 1, alignment * 8);
+      nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->dest.ssa.bit_size));
+      nir_store_deref_with_access(b, elem, substore_val, ~0, access);
+   }
+
+   nir_instr_remove(&intrin->instr);
+}
+
+static bool
+split_unaligned_loads_stores(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, function->impl);
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_deref &&
+                intrin->intrinsic != nir_intrinsic_store_deref)
+               continue;
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+            unsigned align_mul = 0, align_offset = 0;
+            nir_get_explicit_deref_align(deref, true, &align_mul, &align_offset);
+
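+            /* The guaranteed alignment is the largest power of two dividing
+             * align_offset, or align_mul itself when the offset is zero.
+             */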
+            unsigned alignment = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
+
+            /* We can load anything at 4-byte alignment, except for
+             * UBOs (AKA constant buffers, where the granularity is 16 bytes).
+             */
+            if (alignment >= (deref->modes == nir_var_mem_ubo ? 16 : 4))
+               continue;
+
+            nir_ssa_def *val;
+            if (intrin->intrinsic == nir_intrinsic_load_deref) {
+               assert(intrin->dest.is_ssa);
+               val = &intrin->dest.ssa;
+            } else {
+               assert(intrin->src[1].is_ssa);
+               val = intrin->src[1].ssa;
+            }
+
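+            /* A vec3 is laid out (and thus naturally aligned) like a vec4. */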
+            unsigned natural_alignment =
+               val->bit_size / 8 *
+               (val->num_components == 3 ? 4 : val->num_components);
+
+            if (alignment >= natural_alignment)
+               continue;
+
+            if (intrin->intrinsic == nir_intrinsic_load_deref)
+               split_unaligned_load(&b, intrin, alignment);
+            else
+               split_unaligned_store(&b, intrin, alignment);
+            progress = true;
+         }
+      }
+   }
+
+   return progress;
+}
+
+static enum pipe_tex_wrap
+wrap_from_cl_addressing(unsigned addressing_mode)
+{
+   switch (addressing_mode)
+   {
+   default:
+   case SAMPLER_ADDRESSING_MODE_NONE:
+   case SAMPLER_ADDRESSING_MODE_CLAMP:
+      // Since OpenCL's only border color is zero and D3D defines out-of-bounds loads to return zero, don't apply any wrap mode
+      return (enum pipe_tex_wrap)-1;
+   case SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: return PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   case SAMPLER_ADDRESSING_MODE_REPEAT: return PIPE_TEX_WRAP_REPEAT;
+   case SAMPLER_ADDRESSING_MODE_REPEAT_MIRRORED: return PIPE_TEX_WRAP_MIRROR_REPEAT;
+   }
+}
+
+static bool shader_has_double(nir_shader *nir)
+{
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_alu)
+               continue;
+
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            const nir_op_info *info = &nir_op_infos[alu->op];
+
+            if (info->output_type & nir_type_float &&
+                nir_dest_bit_size(alu->dest.dest) == 64)
+               return true;
+         }
+      }
+   }
+
+   return false;
+}
+
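+/* When |b| in a/b is very large (> 2^126) or very small (< 2^-126), scale
+ * both operands by the same power of two before the division: the quotient
+ * is unchanged, but intermediates such as 1/b stay comfortably
+ * representable. The integer immediates below are the float bit patterns
+ * for those two bounds.
+ */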
+static bool
+scale_fdiv(nir_shader *nir)
+{
+   bool progress = false;
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_alu)
+               continue;
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            if (alu->op != nir_op_fdiv)
+               continue;
+
+            b.cursor = nir_before_instr(instr);
+            nir_ssa_def *fabs = nir_fabs(&b, alu->src[1].src.ssa);
+            nir_ssa_def *big = nir_flt(&b, nir_imm_int(&b, 0x7e800000), fabs);
+            nir_ssa_def *small = nir_flt(&b, fabs, nir_imm_int(&b, 0x00800000));
+
+            nir_ssa_def *scaled_down_a = nir_fmul_imm(&b, alu->src[0].src.ssa, 0.25);
+            nir_ssa_def *scaled_down_b = nir_fmul_imm(&b, alu->src[1].src.ssa, 0.25);
+            nir_ssa_def *scaled_up_a = nir_fmul_imm(&b, alu->src[0].src.ssa, 16777216.0);
+            nir_ssa_def *scaled_up_b = nir_fmul_imm(&b, alu->src[1].src.ssa, 16777216.0);
+
+            nir_ssa_def *final_a =
+               nir_bcsel(&b, big, scaled_down_a,
+              (nir_bcsel(&b, small, scaled_up_a, alu->src[0].src.ssa)));
+            nir_ssa_def *final_b =
+               nir_bcsel(&b, big, scaled_down_b,
+              (nir_bcsel(&b, small, scaled_up_b, alu->src[1].src.ssa)));
+
+            nir_instr_rewrite_src(instr, &alu->src[0].src, nir_src_for_ssa(final_a));
+            nir_instr_rewrite_src(instr, &alu->src[1].src, nir_src_for_ssa(final_b));
+            progress = true;
+         }
+      }
+   }
+   return progress;
+}
+
+struct clc_dxil_object *
+clc_to_dxil(struct clc_context *ctx,
+            const struct clc_object *obj,
+            const char *entrypoint,
+            const struct clc_runtime_kernel_conf *conf,
+            const struct clc_logger *logger)
+{
+   struct clc_dxil_object *dxil;
+   struct nir_shader *nir;
+   char *err_log;
+   int ret;
+
+   dxil = calloc(1, sizeof(*dxil));
+   if (!dxil) {
+      clc_error(logger, "failed to allocate the dxil object");
+      return NULL;
+   }
+
+   for (unsigned i = 0; i < obj->num_kernels; i++) {
+      if (!strcmp(obj->kernels[i].name, entrypoint)) {
+         dxil->kernel = &obj->kernels[i];
+         break;
+      }
+   }
+
+   if (!dxil->kernel) {
+      clc_error(logger, "no '%s' kernel found", entrypoint);
+      goto err_free_dxil;
+   }
+
+   const struct spirv_to_nir_options spirv_options = {
+      .environment = NIR_SPIRV_OPENCL,
+      .clc_shader = ctx->libclc_nir,
+      .constant_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .global_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .shared_addr_format = nir_address_format_32bit_offset_as_64bit,
+      .temp_addr_format = nir_address_format_32bit_offset_as_64bit,
+      .float_controls_execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32,
+      .caps = {
+         .address = true,
+         .float64 = true,
+         .int8 = true,
+         .int16 = true,
+         .int64 = true,
+         .kernel = true,
+         .kernel_image = true,
+         .literal_sampler = true,
+      },
+   };
+   nir_shader_compiler_options nir_options =
+      *dxil_get_nir_compiler_options();
+
+   if (conf && conf->lower_bit_size & 64) {
+      nir_options.lower_pack_64_2x32_split = false;
+      nir_options.lower_unpack_64_2x32_split = false;
+      nir_options.lower_int64_options = ~0;
+   }
+
+   if (conf && conf->lower_bit_size & 16)
+      nir_options.support_16bit_alu = true;
+
+   glsl_type_singleton_init_or_ref();
+
+   nir = spirv_to_nir(obj->spvbin.data, obj->spvbin.size / 4,
+                      NULL, 0,
+                      MESA_SHADER_KERNEL, entrypoint,
+                      &spirv_options,
+                      &nir_options);
+   if (!nir) {
+      clc_error(logger, "spirv_to_nir() failed");
+      goto err_free_dxil;
+   }
+   nir->info.cs.local_size_variable = true;
+
+   NIR_PASS_V(nir, nir_lower_goto_ifs);
+   NIR_PASS_V(nir, nir_opt_dead_cf);
+
+   struct clc_dxil_metadata *metadata = &dxil->metadata;
+
+   metadata->args = calloc(dxil->kernel->num_args,
+                           sizeof(*metadata->args));
+   if (!metadata->args) {
+      clc_error(logger, "failed to allocate arg positions");
+      goto err_free_dxil;
+   }
+
+   // Calculate input offsets/metadata.
+   unsigned uav_id = 0, sampler_id = 0, offset = 0;
+   dxil_wrap_sampler_state int_sampler_states[PIPE_MAX_SHADER_SAMPLER_VIEWS] = {{{0}}};
+   nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
+      int i = var->data.location;
+      if (i < 0)
+         continue;
+
+      unsigned size = glsl_get_cl_size(var->type);
+      offset = align(offset, glsl_get_cl_alignment(var->type));
+      var->data.driver_location = offset;
+
+      metadata->args[i].offset = offset;
+      metadata->args[i].size = size;
+      metadata->kernel_inputs_buf_size = MAX2(metadata->kernel_inputs_buf_size,
+                                              offset + size);
+      if ((dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL ||
+           dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_CONSTANT) &&
+          // Ignore images during this pass - global memory buffers need to have contiguous bindings
+          !glsl_type_is_image(var->type)) {
+         metadata->args[i].globconstptr.buf_id = uav_id++;
+      } else if (glsl_type_is_sampler(var->type)) {
+         unsigned address_mode = conf ? conf->args[i].sampler.addressing_mode : 0u;
+         int_sampler_states[sampler_id].wrap[0] =
+            int_sampler_states[sampler_id].wrap[1] =
+            int_sampler_states[sampler_id].wrap[2] = wrap_from_cl_addressing(address_mode);
+         int_sampler_states[sampler_id].is_nonnormalized_coords =
+            conf ? !conf->args[i].sampler.normalized_coords : 0;
+         int_sampler_states[sampler_id].is_linear_filtering =
+            conf ? conf->args[i].sampler.linear_filtering : 0;
+         metadata->args[i].sampler.sampler_id = var->data.binding = sampler_id++;
+      }
+      offset += size;
+   }
+
+   unsigned num_global_inputs = uav_id;
+
+   // Second pass over inputs to calculate image bindings
+   unsigned srv_id = 0;
+   nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
+      int i = var->data.location;
+      if (i < 0)
+         continue;
+
+      if (glsl_type_is_image(var->type)) {
+         if (var->data.access == ACCESS_NON_WRITEABLE) {
+            metadata->args[i].image.buf_ids[0] = srv_id++;
+         } else {
+            // Write or read-write are UAVs
+            metadata->args[i].image.buf_ids[0] = uav_id++;
+         }
+
+         metadata->args[i].image.num_buf_ids = 1;
+         var->data.binding = metadata->args[i].image.buf_ids[0];
+      }
+   }
+
+   {
+      bool progress;
+      do
+      {
+         progress = false;
+         NIR_PASS(progress, nir, nir_copy_prop);
+         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+         NIR_PASS(progress, nir, nir_opt_deref);
+         NIR_PASS(progress, nir, nir_opt_dce);
+         NIR_PASS(progress, nir, nir_opt_undef);
+         NIR_PASS(progress, nir, nir_opt_constant_folding);
+         NIR_PASS(progress, nir, nir_opt_cse);
+         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+         NIR_PASS(progress, nir, nir_opt_algebraic);
+      } while (progress);
+   }
+
+   // Inline all functions first, per the comment on nir_inline_functions.
+   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS_V(nir, nir_lower_returns);
+   NIR_PASS_V(nir, nir_lower_libclc, ctx->libclc_nir);
+   NIR_PASS_V(nir, nir_inline_functions);
+
+   // Pick off the single entrypoint that we want.
+   foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         exec_node_remove(&func->node);
+   }
+   assert(exec_list_length(&nir->functions) == 1);
+
+   {
+      bool progress;
+      do
+      {
+         progress = false;
+         NIR_PASS(progress, nir, nir_copy_prop);
+         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+         NIR_PASS(progress, nir, nir_opt_deref);
+         NIR_PASS(progress, nir, nir_opt_dce);
+         NIR_PASS(progress, nir, nir_opt_undef);
+         NIR_PASS(progress, nir, nir_opt_constant_folding);
+         NIR_PASS(progress, nir, nir_opt_cse);
+         NIR_PASS(progress, nir, nir_split_var_copies);
+         NIR_PASS(progress, nir, nir_lower_var_copies);
+         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+         NIR_PASS(progress, nir, nir_opt_algebraic);
+         NIR_PASS(progress, nir, nir_opt_if, true);
+         NIR_PASS(progress, nir, nir_opt_dead_cf);
+         NIR_PASS(progress, nir, nir_opt_remove_phis);
+         NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
+         NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
+      } while (progress);
+   }
+
+   // Before removing dead uniforms, dedupe constant samplers to make more dead uniforms
+   NIR_PASS_V(nir, clc_nir_dedupe_const_samplers);
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | nir_var_mem_constant | nir_var_function_temp, NULL);
+
+   NIR_PASS_V(nir, scale_fdiv);
+
+   // Assign bindings for constant samplers
+   nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
+      if (glsl_type_is_sampler(var->type) && var->data.sampler.is_inline_sampler) {
+         int_sampler_states[sampler_id].wrap[0] =
+            int_sampler_states[sampler_id].wrap[1] =
+            int_sampler_states[sampler_id].wrap[2] =
+            wrap_from_cl_addressing(var->data.sampler.addressing_mode);
+         int_sampler_states[sampler_id].is_nonnormalized_coords =
+            !var->data.sampler.normalized_coordinates;
+         int_sampler_states[sampler_id].is_linear_filtering =
+            var->data.sampler.filter_mode == SAMPLER_FILTER_MODE_LINEAR;
+         var->data.binding = sampler_id++;
+
+         assert(metadata->num_const_samplers < CLC_MAX_SAMPLERS);
+         metadata->const_samplers[metadata->num_const_samplers].sampler_id = var->data.binding;
+         metadata->const_samplers[metadata->num_const_samplers].addressing_mode = var->data.sampler.addressing_mode;
+         metadata->const_samplers[metadata->num_const_samplers].normalized_coords = var->data.sampler.normalized_coordinates;
+         metadata->const_samplers[metadata->num_const_samplers].filter_mode = var->data.sampler.filter_mode;
+         metadata->num_const_samplers++;
+      }
+   }
+
+   NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_function_temp | nir_var_shader_temp));
+
+   // Lower memcpy
+   NIR_PASS_V(nir, dxil_nir_lower_memcpy_deref);
+
+   bool has_printf = false;
+   //NIR_PASS(has_printf, nir, clc_nir_lower_printf, uav_id);
+   metadata->printf_uav_id = has_printf ? uav_id++ : -1;
+
+   // copy propagate to prepare for lower_explicit_io
+   NIR_PASS_V(nir, nir_split_var_copies);
+   NIR_PASS_V(nir, nir_opt_copy_prop_vars);
+   NIR_PASS_V(nir, nir_lower_var_copies);
+   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+   NIR_PASS_V(nir, nir_lower_alu);
+   NIR_PASS_V(nir, nir_opt_dce);
+   NIR_PASS_V(nir, nir_opt_deref);
+
+   // Needs to come before lower_explicit_io
+   NIR_PASS_V(nir, nir_lower_cl_images_to_tex);
+   struct clc_image_lower_context image_lower_context = { metadata, &srv_id, &uav_id };
+   NIR_PASS_V(nir, clc_lower_images, &image_lower_context);
+   NIR_PASS_V(nir, clc_lower_nonnormalized_samplers, int_sampler_states);
+   NIR_PASS_V(nir, nir_lower_samplers);
+   NIR_PASS_V(nir, dxil_lower_sample_to_txf_for_integer_tex,
+              int_sampler_states, NULL, 14.0f);
+
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
+   assert(nir->scratch_size == 0);
+
+   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
+              nir_var_mem_shared | nir_var_function_temp | nir_var_uniform | nir_var_mem_global | nir_var_mem_constant,
+              glsl_get_cl_type_size_align);
+
+   NIR_PASS_V(nir, dxil_nir_lower_ubo_to_temp);
+   NIR_PASS_V(nir, clc_lower_constant_to_ssbo, dxil->kernel, &uav_id);
+   NIR_PASS_V(nir, clc_lower_global_to_ssbo);
+   NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo);
+
+   NIR_PASS_V(nir, split_unaligned_loads_stores);
+
+   assert(nir->info.cs.ptr_size == 64);
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
+              nir_address_format_32bit_index_offset_pack64);
+   NIR_PASS_V(nir, nir_lower_explicit_io,
+              nir_var_mem_shared | nir_var_function_temp | nir_var_uniform,
+              nir_address_format_32bit_offset_as_64bit);
+
+   NIR_PASS_V(nir, nir_lower_system_values);
+
+   nir_lower_compute_system_values_options compute_options = {
+      .has_base_global_invocation_id = (conf && conf->support_global_work_id_offsets),
+      .has_base_work_group_id = (conf && conf->support_work_group_id_offsets),
+   };
+   NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_options);
+
+   NIR_PASS_V(nir, clc_lower_64bit_semantics);
+
+   NIR_PASS_V(nir, nir_opt_deref);
+   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+
+   unsigned cbv_id = 0;
+
+   nir_variable *inputs_var =
+      add_kernel_inputs_var(dxil, nir, &cbv_id);
+   nir_variable *work_properties_var =
+      add_work_properties_var(dxil, nir, &cbv_id);
+
+   // Patch the localsize before calling clc_nir_lower_system_values().
+   if (conf) {
+      for (unsigned i = 0; i < ARRAY_SIZE(nir->info.cs.local_size); i++) {
+         if (!conf->local_size[i] ||
+             conf->local_size[i] == nir->info.cs.local_size[i])
+            continue;
+
+         if (nir->info.cs.local_size[i] &&
+             nir->info.cs.local_size[i] != conf->local_size[i]) {
+            debug_printf("D3D12: runtime local size does not match reqd_work_group_size() values\n");
+            goto err_free_dxil;
+         }
+
+         nir->info.cs.local_size[i] = conf->local_size[i];
+      }
+   }
+
+   NIR_PASS_V(nir, clc_nir_lower_kernel_input_loads, inputs_var);
+   NIR_PASS_V(nir, split_unaligned_loads_stores);
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
+              nir_address_format_32bit_index_offset);
+   NIR_PASS_V(nir, clc_nir_lower_system_values, work_properties_var);
+   NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil);
+   NIR_PASS_V(nir, dxil_nir_opt_alu_deref_srcs);
+   NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil);
+   NIR_PASS_V(nir, dxil_nir_lower_fp16_casts);
+   NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
+
+   // Convert pack to pack_split
+   NIR_PASS_V(nir, nir_lower_pack);
+   // Lower pack_split to bit math
+   NIR_PASS_V(nir, nir_opt_algebraic);
+
+   NIR_PASS_V(nir, nir_opt_dce);
+
+   nir_validate_shader(nir, "Validate before feeding NIR to the DXIL compiler");
+   struct nir_to_dxil_options opts = {
+      .interpolate_at_vertex = false,
+      .lower_int16 = (conf && (conf->lower_bit_size & 16) != 0),
+      .ubo_binding_offset = 0,
+      .disable_math_refactoring = true,
+      .num_kernel_globals = num_global_inputs,
+   };
+
+   for (unsigned i = 0; i < dxil->kernel->num_args; i++) {
+      if (dxil->kernel->args[i].address_qualifier != CLC_KERNEL_ARG_ADDRESS_LOCAL)
+         continue;
+
+      /* If we don't have the runtime conf yet, we just use a dummy size.
+       * This will be adjusted when clc_to_dxil() is called again with a
+       * conf argument.
+       */
+      unsigned size = 4;
+      if (conf && conf->args)
+         size = conf->args[i].localptr.size;
+
+      /* The alignment required for the pointee type is not easy to get from
+       * here, so let's base our logic on the size itself. Anything bigger than
+       * the maximum alignment constraint (which is 128 bytes, since ulong16 and
+       * double16 are the biggest base types) gets aligned on this maximum
+       * alignment constraint. For smaller sizes, we use the largest power of
+       * two that divides the size as the alignment.
+       */
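+      /* For example: size == 24 gives an alignment of 8 (the largest power
+       * of two dividing 24), while size == 256 is capped at 128. */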
+      unsigned alignment = size < 128 ? (1 << (ffs(size) - 1)) : 128;
+
+      nir->info.cs.shared_size = align(nir->info.cs.shared_size, alignment);
+      metadata->args[i].localptr.sharedmem_offset = nir->info.cs.shared_size;
+      nir->info.cs.shared_size += size;
+   }
+
+   metadata->local_mem_size = nir->info.cs.shared_size;
+   metadata->priv_mem_size = nir->scratch_size;
+
+   /* DXIL double math is too limited compared to what NIR expects. Let's refuse
+    * to compile a shader when it contains double operations until we have
+    * double lowering hooked up.
+    */
+   if (shader_has_double(nir)) {
+      clc_error(logger, "NIR shader contains doubles, which we don't support yet");
+      goto err_free_dxil;
+   }
+
+   struct blob tmp;
+   if (!nir_to_dxil(nir, &opts, &tmp)) {
+      debug_printf("D3D12: nir_to_dxil failed\n");
+      goto err_free_dxil;
+   }
+
+   memcpy(metadata->local_size, nir->info.cs.local_size,
+          sizeof(metadata->local_size));
+   memcpy(metadata->local_size_hint, nir->info.cs.local_size_hint,
+          sizeof(metadata->local_size_hint));
+
+   nir_foreach_variable_with_modes(var, nir, nir_var_mem_ssbo) {
+      if (var->constant_initializer) {
+         if (glsl_type_is_array(var->type)) {
+            int size = align(glsl_get_cl_size(var->type), 4);
+            uint8_t *data = malloc(size);
+            if (!data)
+               goto err_free_dxil;
+
+            copy_const_initializer(var->constant_initializer, var->type, data);
+            metadata->consts[metadata->num_consts].data = data;
+            metadata->consts[metadata->num_consts].size = size;
+            metadata->consts[metadata->num_consts].uav_id = var->data.binding;
+            metadata->num_consts++;
+         } else
+            unreachable("unexpected constant initializer");
+      }
+   }
+
+   metadata->kernel_inputs_cbv_id = inputs_var ? inputs_var->data.binding : 0;
+   metadata->work_properties_cbv_id = work_properties_var->data.binding;
+   metadata->num_uavs = uav_id;
+   metadata->num_srvs = srv_id;
+   metadata->num_samplers = sampler_id;
+
+   ralloc_free(nir);
+   glsl_type_singleton_decref();
+
+   blob_finish_get_buffer(&tmp, &dxil->binary.data,
+                          &dxil->binary.size);
+   return dxil;
+
+err_free_dxil:
+   clc_free_dxil_object(dxil);
+   return NULL;
+}
+
+void clc_free_dxil_object(struct clc_dxil_object *dxil)
+{
+   for (unsigned i = 0; i < dxil->metadata.num_consts; i++)
+      free(dxil->metadata.consts[i].data);
+
+   free(dxil->binary.data);
+   free(dxil);
+}
+
+uint64_t clc_compiler_get_version(void)
+{
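+   /* MESA_GIT_SHA1 is expected to look like " (git-<sha1>)"; use the hex
+    * digits after the dash as a version stamp, and fall back to 0 for
+    * builds without an embedded sha1. */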
+   const char sha1[] = MESA_GIT_SHA1;
+   const char* dash = strchr(sha1, '-');
+   if (dash) {
+      return strtoull(dash + 1, NULL, 16);
+   }
+   return 0;
+}
diff --git a/src/microsoft/clc/clc_compiler.h b/src/microsoft/clc/clc_compiler.h
new file mode 100644 (file)
index 0000000..8b73d9e
--- /dev/null
@@ -0,0 +1,266 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CLC_COMPILER_H
+#define CLC_COMPILER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+struct clc_named_value {
+   const char *name;
+   const char *value;
+};
+
+struct clc_compile_args {
+   const struct clc_named_value *headers;
+   unsigned num_headers;
+   struct clc_named_value source;
+   const char * const *args;
+   unsigned num_args;
+};
+
+struct clc_linker_args {
+   const struct clc_object * const *in_objs;
+   unsigned num_in_objs;
+   unsigned create_library;
+};
+
+typedef void (*clc_msg_callback)(void *priv, const char *msg);
+
+struct clc_logger {
+   void *priv;
+   clc_msg_callback error;
+   clc_msg_callback warning;
+};
+
+struct spirv_binary {
+   uint32_t *data;
+   size_t size;
+};
+
+enum clc_kernel_arg_type_qualifier {
+   CLC_KERNEL_ARG_TYPE_CONST = 1 << 0,
+   CLC_KERNEL_ARG_TYPE_RESTRICT = 1 << 1,
+   CLC_KERNEL_ARG_TYPE_VOLATILE = 1 << 2,
+};
+
+enum clc_kernel_arg_access_qualifier {
+   CLC_KERNEL_ARG_ACCESS_READ = 1 << 0,
+   CLC_KERNEL_ARG_ACCESS_WRITE = 1 << 1,
+};
+
+enum clc_kernel_arg_address_qualifier {
+   CLC_KERNEL_ARG_ADDRESS_PRIVATE,
+   CLC_KERNEL_ARG_ADDRESS_CONSTANT,
+   CLC_KERNEL_ARG_ADDRESS_LOCAL,
+   CLC_KERNEL_ARG_ADDRESS_GLOBAL,
+};
+
+struct clc_kernel_arg {
+   const char *name;
+   const char *type_name;
+   unsigned type_qualifier;
+   unsigned access_qualifier;
+   enum clc_kernel_arg_address_qualifier address_qualifier;
+};
+
+enum clc_vec_hint_type {
+   CLC_VEC_HINT_TYPE_CHAR = 0,
+   CLC_VEC_HINT_TYPE_SHORT = 1,
+   CLC_VEC_HINT_TYPE_INT = 2,
+   CLC_VEC_HINT_TYPE_LONG = 3,
+   CLC_VEC_HINT_TYPE_HALF = 4,
+   CLC_VEC_HINT_TYPE_FLOAT = 5,
+   CLC_VEC_HINT_TYPE_DOUBLE = 6
+};
+
+struct clc_kernel_info {
+   const char *name;
+   size_t num_args;
+   const struct clc_kernel_arg *args;
+
+   unsigned vec_hint_size;
+   enum clc_vec_hint_type vec_hint_type;
+};
+
+struct clc_object {
+   struct spirv_binary spvbin;
+   const struct clc_kernel_info *kernels;
+   unsigned num_kernels;
+};
+
+#define CLC_MAX_CONSTS 32
+#define CLC_MAX_BINDINGS_PER_ARG 3
+#define CLC_MAX_SAMPLERS 16
+
+struct clc_dxil_metadata {
+   struct {
+      unsigned offset;
+      unsigned size;
+      union {
+         struct {
+            unsigned buf_ids[CLC_MAX_BINDINGS_PER_ARG];
+            unsigned num_buf_ids;
+         } image;
+         struct {
+            unsigned sampler_id;
+         } sampler;
+         struct {
+            unsigned buf_id;
+         } globconstptr;
+         struct {
+            unsigned sharedmem_offset;
+         } localptr;
+      };
+   } *args;
+   unsigned kernel_inputs_cbv_id;
+   unsigned kernel_inputs_buf_size;
+   unsigned work_properties_cbv_id;
+   size_t num_uavs;
+   size_t num_srvs;
+   size_t num_samplers;
+
+   struct {
+      void *data;
+      size_t size;
+      unsigned uav_id;
+   } consts[CLC_MAX_CONSTS];
+   size_t num_consts;
+
+   struct {
+      unsigned sampler_id;
+      unsigned addressing_mode;
+      unsigned normalized_coords;
+      unsigned filter_mode;
+   } const_samplers[CLC_MAX_SAMPLERS];
+   size_t num_const_samplers;
+   size_t local_mem_size;
+   size_t priv_mem_size;
+
+   uint16_t local_size[3];
+   uint16_t local_size_hint[3];
+
+   int printf_uav_id;
+};
+
+struct clc_dxil_object {
+   const struct clc_kernel_info *kernel;
+   struct clc_dxil_metadata metadata;
+   struct {
+      void *data;
+      size_t size;
+   } binary;
+};
+
+struct clc_context {
+   const void *libclc_nir;
+};
+
+struct clc_context_options {
+   unsigned optimize;
+};
+
+struct clc_context *clc_context_new(const struct clc_logger *logger, const struct clc_context_options *options);
+
+void clc_free_context(struct clc_context *ctx);
+
+void clc_context_serialize(struct clc_context *ctx, void **serialized, size_t *size);
+void clc_context_free_serialized(void *serialized);
+struct clc_context *clc_context_deserialize(void *serialized, size_t size);
+
+struct clc_object *
+clc_compile(struct clc_context *ctx,
+            const struct clc_compile_args *args,
+            const struct clc_logger *logger);
+
+struct clc_object *
+clc_link(struct clc_context *ctx,
+         const struct clc_linker_args *args,
+         const struct clc_logger *logger);
+
+void clc_free_object(struct clc_object *obj);
+
+struct clc_runtime_arg_info {
+   union {
+      struct {
+         unsigned size;
+      } localptr;
+      struct {
+         unsigned normalized_coords;
+         unsigned addressing_mode; /* See SPIR-V spec for value meanings */
+         unsigned linear_filtering;
+      } sampler;
+   };
+};
+
+struct clc_runtime_kernel_conf {
+   uint16_t local_size[3];
+   struct clc_runtime_arg_info *args;
+   unsigned lower_bit_size;
+   unsigned support_global_work_id_offsets;
+   unsigned support_work_group_id_offsets;
+};
+
+struct clc_dxil_object *
+clc_to_dxil(struct clc_context *ctx,
+            const struct clc_object *obj,
+            const char *entrypoint,
+            const struct clc_runtime_kernel_conf *conf,
+            const struct clc_logger *logger);
+
+void clc_free_dxil_object(struct clc_dxil_object *dxil);
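+
+/* Typical usage of the API above (a sketch only; error handling is omitted
+ * and the source/kernel names are hypothetical):
+ *
+ *    struct clc_context *ctx = clc_context_new(&logger, NULL);
+ *    struct clc_compile_args args = { 0 };
+ *    args.source.name = "example.cl";
+ *    args.source.value = source_text;
+ *    struct clc_object *obj = clc_compile(ctx, &args, &logger);
+ *    struct clc_dxil_object *dxil =
+ *       clc_to_dxil(ctx, obj, "main_test", NULL, &logger);
+ *    ... hand dxil->binary and dxil->metadata to the runtime ...
+ *    clc_free_dxil_object(dxil);
+ *    clc_free_object(obj);
+ *    clc_free_context(ctx);
+ */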
+
+/* This struct describes the layout of data expected in the CB bound at work_properties_cbv_id */
+struct clc_work_properties_data {
+   /* Returned from get_global_offset(), and added into get_global_id() */
+   unsigned global_offset_x;
+   unsigned global_offset_y;
+   unsigned global_offset_z;
+   /* Returned from get_work_dim() */
+   unsigned work_dim;
+   /* The number of work groups being launched (i.e. the parameters to Dispatch).
+    * If the requested global size doesn't fit in a single Dispatch, these values should
+    * indicate the total number of groups that *should* have been launched. */
+   unsigned group_count_total_x;
+   unsigned group_count_total_y;
+   unsigned group_count_total_z;
+   unsigned padding;
+   /* If the requested global size doesn't fit in a single Dispatch, subsequent dispatches
+    * should fill out these offsets to indicate how many groups have already been launched */
+   unsigned group_id_offset_x;
+   unsigned group_id_offset_y;
+   unsigned group_id_offset_z;
+};
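+
+/* Sketch of how a runtime combines these fields, assuming the usual OpenCL
+ * definitions: for dimension n,
+ *    get_group_id(n)  = dispatched group id + group_id_offset_n
+ *    get_global_id(n) = get_group_id(n) * local_size_n + local id
+ *                       + global_offset_n
+ */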
+
+uint64_t clc_compiler_get_version(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/microsoft/clc/clc_compiler_test.cpp b/src/microsoft/clc/clc_compiler_test.cpp
new file mode 100644 (file)
index 0000000..eb7509d
--- /dev/null
@@ -0,0 +1,2187 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdexcept>
+#include <vector>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <gtest/gtest.h>
+#include <wrl.h>
+
+#include "compute_test.h"
+
+using std::vector;
+
+TEST_F(ComputeTest, runtime_memcpy)
+{
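+   /* With a one-dimensional dispatch, get_global_id(1) is always 0, so
+    * id2 == id; the indirection just keeps the compiler from folding away
+    * the copy through the function-local struct array, which is presumably
+    * what exercises the memcpy lowering this test is named after. */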
+   struct shift { uint8_t val; uint8_t shift; uint16_t ret; };
+   const char *kernel_source =
+   "struct shift { uchar val; uchar shift; ushort ret; };\n\
+   __kernel void main_test(__global struct shift *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      uint id2 = id + get_global_id(1);\n\
+      struct shift lc[4] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }};\n\
+      lc[id] = inout[id];\n\
+      inout[id2].ret = (ushort) lc[id2].val << (ushort) lc[id2].shift;\n\
+   }\n";
+
+   auto inout = ShaderArg<struct shift>({
+         { 0x10, 1, 0xffff },
+         { 0x20, 2, 0xffff },
+         { 0x30, 3, 0xffff },
+         { 0x40, 4, 0xffff },
+      },
+      SHADER_ARG_INOUT);
+   const uint16_t expected[] = { 0x20, 0x80, 0x180, 0x400 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i].ret, expected[i]);
+}
+
+TEST_F(ComputeTest, two_global_arrays)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *g1, __global uint *g2)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       g1[idx] -= g2[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto g2 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected[] = {
+      9, 18, 27, 36
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, g1, g2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected[i]);
+}
+
+TEST_F(ComputeTest, i64tof32)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global long *out, __constant long *in)\n\
+   {\n\
+       __local float tmp[12];\n\
+       uint idx = get_global_id(0);\n\
+       tmp[idx] = in[idx];\n\
+       barrier(CLK_LOCAL_MEM_FENCE);\n\
+       out[idx] = tmp[idx + get_global_id(1)];\n\
+   }\n";
+   auto in = ShaderArg<int64_t>({ 0x100000000LL,
+                                  -0x100000000LL,
+                                  0x7fffffffffffffffLL,
+                                  0x4000004000000000LL,
+                                  0x4000003fffffffffLL,
+                                  0x4000004000000001LL,
+                                  -1,
+                                  -0x4000004000000000LL,
+                                  -0x4000003fffffffffLL,
+                                  -0x4000004000000001LL,
+                                  0,
+                                  INT64_MIN },
+                                SHADER_ARG_INPUT);
+   auto out = ShaderArg<int64_t>(std::vector<int64_t>(12, 0xdeadbeed), SHADER_ARG_OUTPUT);
+   const int64_t expected[] = {
+      0x100000000LL,
+      -0x100000000LL,
+      0x7fffffffffffffffLL,
+      0x4000000000000000LL,
+      0x4000000000000000LL,
+      0x4000008000000000LL,
+      -1,
+      -0x4000000000000000LL,
+      -0x4000000000000000LL,
+      -0x4000008000000000LL,
+      0,
+      INT64_MIN,
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ((int64_t)out[i], expected[i]);
+   }
+}
+
+TEST_F(ComputeTest, two_constant_arrays)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__constant uint *c1, __global uint *g1, __constant uint *c2)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       g1[idx] -= c1[idx] + c2[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto c1 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   auto c2 = ShaderArg<uint32_t>(std::vector<uint32_t>(16384, 5), SHADER_ARG_INPUT);
+   const uint32_t expected[] = {
+      4, 13, 22, 31
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, c1, g1, c2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected[i]);
+}
+
+TEST_F(ComputeTest, null_constant_ptr)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *g1, __constant uint *c1)\n\
+   {\n\
+       __constant uint fallback[] = {2, 3, 4, 5};\n\
+       __constant uint *c = c1 ? c1 : fallback;\n\
+       uint idx = get_global_id(0);\n\
+       g1[idx] -= c[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto c1 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected1[] = {
+      9, 18, 27, 36
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, g1, c1);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected1[i]);
+
+   const uint32_t expected2[] = {
+      8, 17, 26, 35
+   };
+
+   g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto c2 = NullShaderArg();
+   run_shader(kernel_source, g1.size(), 1, 1, g1, c2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected2[i]);
+}
+
+/* This test seems to fail on older versions of WARP. */
+TEST_F(ComputeTest, DISABLED_null_global_ptr)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *g1, __global uint *g2)\n\
+   {\n\
+       __constant uint fallback[] = {2, 3, 4, 5};\n\
+       uint idx = get_global_id(0);\n\
+       g1[idx] -= g2 ? g2[idx] : fallback[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto g2 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected1[] = {
+      9, 18, 27, 36
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, g1, g2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected1[i]);
+
+   const uint32_t expected2[] = {
+      8, 17, 26, 35
+   };
+
+   g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto g2null = NullShaderArg();
+   run_shader(kernel_source, g1.size(), 1, 1, g1, g2null);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected2[i]);
+}
+
+TEST_F(ComputeTest, ret_constant_ptr)
+{
+   struct s { uint64_t ptr; uint32_t val; };
+   const char *kernel_source =
+   "struct s { __constant uint *ptr; uint val; };\n\
+   __kernel void main_test(__global struct s *out, __constant uint *in)\n\
+   {\n\
+       __constant uint foo[] = { 1, 2 };\n\
+       uint idx = get_global_id(0);\n\
+       if (idx == 0)\n\
+          out[idx].ptr = foo;\n\
+       else\n\
+          out[idx].ptr = in;\n\
+       out[idx].val = out[idx].ptr[idx];\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT);
+   auto in = ShaderArg<uint32_t>({ 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected_val[] = {
+      1, 4
+   };
+   const uint64_t expected_ptr[] = {
+      2ull << 32, 1ull << 32
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].val, expected_val[i]);
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+   }
+}
+
+TEST_F(ComputeTest, ret_global_ptr)
+{
+   struct s { uint64_t ptr; uint32_t val; };
+   const char *kernel_source =
+   "struct s { __global uint *ptr; uint val; };\n\
+   __kernel void main_test(__global struct s *out, __global uint *in1, __global uint *in2)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       out[idx].ptr = idx ? in2 : in1;\n\
+       out[idx].val = out[idx].ptr[idx];\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT);
+   auto in1 = ShaderArg<uint32_t>({ 1, 2 }, SHADER_ARG_INPUT);
+   auto in2 = ShaderArg<uint32_t>({ 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected_val[] = {
+      1, 4
+   };
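+   /* With the 32bit_index_offset_pack64 address format, a global pointer
+    * carries the SSBO binding index in its upper 32 bits and a byte offset
+    * in its lower 32 bits; in1 and in2 presumably land at bindings 1 and 2
+    * here. */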
+   const uint64_t expected_ptr[] = {
+      1ull << 32, 2ull << 32
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in1, in2);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].val, expected_val[i]);
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+   }
+}
+
+TEST_F(ComputeTest, ret_local_ptr)
+{
+   struct s { uint64_t ptr; };
+   const char *kernel_source =
+   "struct s { __local uint *ptr; };\n\
+   __kernel void main_test(__global struct s *out)\n\
+   {\n\
+       __local uint tmp[2];\n\
+       uint idx = get_global_id(0);\n\
+       tmp[idx] = idx;\n\
+       out[idx].ptr = &tmp[idx];\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT);
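+   /* Local pointers are plain byte offsets into shared memory
+    * (32bit_offset_as_64bit), so &tmp[0] is 0 and &tmp[1] is 4. */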
+   const uint64_t expected_ptr[] = {
+      0, 4,
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+   }
+}
+
+TEST_F(ComputeTest, ret_private_ptr)
+{
+   struct s { uint64_t ptr; uint32_t value; };
+   const char *kernel_source =
+   "struct s { __private uint *ptr; uint value; };\n\
+   __kernel void main_test(__global struct s *out)\n\
+   {\n\
+       uint tmp[2] = {1, 2};\n\
+       uint idx = get_global_id(0);\n\
+       out[idx].ptr = &tmp[idx];\n\
+       out[idx].value = *out[idx].ptr;\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT);
+   const uint64_t expected_ptr[] = {
+      0, 4,
+   };
+   const uint32_t expected_value[] = {
+      1, 2
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+   }
+}
+
+TEST_F(ComputeTest, globals_8bit)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global unsigned char *inout)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       inout[idx] = inout[idx] + 1;\n\
+   }\n";
+   auto inout = ShaderArg<uint8_t> ({ 100, 110, 120, 130 }, SHADER_ARG_INOUT);
+   const uint8_t expected[] = {
+      101, 111, 121, 131
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, globals_16bit)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global unsigned short *inout)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       inout[idx] = inout[idx] + 1;\n\
+   }\n";
+   auto inout = ShaderArg<uint16_t> ({ 10000, 10010, 10020, 10030 }, SHADER_ARG_INOUT);
+   const uint16_t expected[] = {
+      10001, 10011, 10021, 10031
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, DISABLED_globals_64bit)
+{
+   /* Test disabled, because we need a fixed version of WARP that hasn't
+    * been officially shipped yet */
+
+   const char *kernel_source =
+   "__kernel void main_test(__global unsigned long *inout)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       inout[idx] = inout[idx] + 1;\n\
+   }\n";
+   uint64_t base = 1ull << 50;
+   auto inout = ShaderArg<uint64_t>({ base, base + 10, base + 20, base + 30 },
+                                    SHADER_ARG_INOUT);
+   const uint64_t expected[] = {
+      base + 1, base + 11, base + 21, base + 31
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, built_ins_global_id)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       output[get_global_id(0)] = get_global_id(0);\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      0, 1, 2, 3
+   };
+
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, built_ins_global_id_rmw)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       uint id = get_global_id(0);\n\
+       output[id] = output[id] * (id + 1);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({0x00000001, 0x10000001, 0x00020002, 0x04010203},
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0x00000001, 0x20000002, 0x00060006, 0x1004080c
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, types_float_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       output[get_global_id(0)] = (uint)((float)get_global_id(0) + 1.5f);\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      1, 2, 3, 4
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, DISABLED_types_double_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       output[get_global_id(0)] = (uint)((double)get_global_id(0) + 1.5);\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      1, 2, 3, 4
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, types_short_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       output[get_global_id(0)] = (uint)((short)get_global_id(0) + (short)1);\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      1, 2, 3, 4
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, types_char_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       output[get_global_id(0)] = (uint)((char)get_global_id(0) + (char)1);\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      1, 2, 3, 4
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, types_if_statement)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       int idx = get_global_id(0);\n\
+       if (idx > 0)\n\
+           output[idx] = ~idx;\n\
+       else\n\
+           output[0] = 0xff;\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      0xff, ~1u, ~2u, ~3u
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, types_do_while_loop)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       int value = 1;\n\
+       int i = 1, n = get_global_id(0);\n\
+       do {\n\
+          value *= i++;\n\
+       } while (i <= n);\n\
+       output[n] = value;\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(5, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      1, 1, 1*2, 1*2*3, 1*2*3*4
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, types_for_loop)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       int value = 1;\n\
+       int n = get_global_id(0);\n\
+       for (int i = 1; i <= n; ++i)\n\
+          value *= i;\n\
+       output[n] = value;\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(5, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      1, 1, 1*2, 1*2*3, 1*2*3*4
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, DISABLED_complex_types_local_array_long)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global ulong *inout)\n\
+   {\n\
+      ulong tmp[] = {\n\
+         get_global_id(1) + 0x00000000,\n\
+         get_global_id(1) + 0x10000001,\n\
+         get_global_id(1) + 0x20000020,\n\
+         get_global_id(1) + 0x30000300,\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = tmp[idx];\n\
+   }\n";
+   auto inout = ShaderArg<uint64_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint64_t expected[] = {
+      0x00000000, 0x10000001, 0x20000020, 0x30000300,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, complex_types_local_array_short)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global ushort *inout)\n\
+   {\n\
+      ushort tmp[] = {\n\
+         get_global_id(1) + 0x00,\n\
+         get_global_id(1) + 0x10,\n\
+         get_global_id(1) + 0x20,\n\
+         get_global_id(1) + 0x30,\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = tmp[idx];\n\
+   }\n";
+   auto inout = ShaderArg<uint16_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint16_t expected[] = {
+      0x00, 0x10, 0x20, 0x30,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, complex_types_local_array_struct_vec_float_misaligned)
+{
+   const char *kernel_source =
+   "struct has_vecs { uchar c; ushort s; float2 f; };\n\
+   __kernel void main_test(__global uint *inout)\n\
+   {\n\
+      struct has_vecs tmp[] = {\n\
+         { 10 + get_global_id(0), get_global_id(1), { 10.0f, 1.0f } },\n\
+         { 19 + get_global_id(0), get_global_id(1), { 20.0f, 4.0f } },\n\
+         { 28 + get_global_id(0), get_global_id(1), { 30.0f, 9.0f } },\n\
+         { 37 + get_global_id(0), get_global_id(1), { 40.0f, 16.0f } },\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      uint mul = (tmp[idx].c + tmp[idx].s) * trunc(tmp[idx].f[0]);\n\
+      inout[idx] = mul + trunc(tmp[idx].f[1]);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint16_t expected[] = { 101, 404, 909, 1616 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, complex_types_local_array)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      uint tmp[] = {\n\
+         get_global_id(1) + 0x00,\n\
+         get_global_id(1) + 0x10,\n\
+         get_global_id(1) + 0x20,\n\
+         get_global_id(1) + 0x30,\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = tmp[idx];\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0x00, 0x10, 0x20, 0x30,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, complex_types_global_struct_array)
+{
+   struct two_vals { uint32_t add; uint32_t mul; };
+   const char *kernel_source =
+   "struct two_vals { uint add; uint mul; };\n\
+   __kernel void main_test(__global struct two_vals *in_out)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      in_out[id].add = in_out[id].add + id;\n\
+      in_out[id].mul = in_out[id].mul * id;\n\
+   }\n";
+   auto inout = ShaderArg<struct two_vals>({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } },
+                                           SHADER_ARG_INOUT);
+   const struct two_vals expected[] = {
+      { 8 + 0, 8 * 0 },
+      { 16 + 1, 16 * 1 },
+      { 64 + 2, 64 * 2 },
+      { 65536 + 3, 65536 * 3 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].add, expected[i].add);
+      EXPECT_EQ(inout[i].mul, expected[i].mul);
+   }
+}
+
+TEST_F(ComputeTest, complex_types_global_uint2)
+{
+   struct uint2 { uint32_t x; uint32_t y; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint2 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].x = inout[id].x + id;\n\
+      inout[id].y = inout[id].y * id;\n\
+   }\n";
+   auto inout = ShaderArg<struct uint2>({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } },
+                                        SHADER_ARG_INOUT);
+   const struct uint2 expected[] = {
+      { 8 + 0, 8 * 0 },
+      { 16 + 1, 16 * 1 },
+      { 64 + 2, 64 * 2 },
+      { 65536 + 3, 65536 * 3 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].x, expected[i].x);
+      EXPECT_EQ(inout[i].y, expected[i].y);
+   }
+}
+
+TEST_F(ComputeTest, complex_types_global_ushort2)
+{
+   struct ushort2 { uint16_t x; uint16_t y; };
+   const char *kernel_source =
+   "__kernel void main_test(__global ushort2 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].x = inout[id].x + id;\n\
+      inout[id].y = inout[id].y * id;\n\
+   }\n";
+   auto inout = ShaderArg<struct ushort2>({ { 8, 8 }, { 16, 16 }, { 64, 64 },
+                                            { (uint16_t)65536, (uint16_t)65536 } },
+                                          SHADER_ARG_INOUT);
+   const struct ushort2 expected[] = {
+      { 8 + 0, 8 * 0 },
+      { 16 + 1, 16 * 1 },
+      { 64 + 2, 64 * 2 },
+      { (uint16_t)(65536 + 3), (uint16_t)(65536 * 3) }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].x, expected[i].x);
+      EXPECT_EQ(inout[i].y, expected[i].y);
+   }
+}
+
+TEST_F(ComputeTest, complex_types_global_uchar3)
+{
+   struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uchar3 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].x = inout[id].x + id;\n\
+      inout[id].y = inout[id].y * id;\n\
+      inout[id].z = inout[id].y + inout[id].x;\n\
+   }\n";
+   auto inout = ShaderArg<struct uchar3>({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } },
+                                         SHADER_ARG_INOUT);
+   const struct uchar3 expected[] = {
+      { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) },
+      { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) },
+      { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) },
+      { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].x, expected[i].x);
+      EXPECT_EQ(inout[i].y, expected[i].y);
+      EXPECT_EQ(inout[i].z, expected[i].z);
+   }
+}
+
+TEST_F(ComputeTest, complex_types_constant_uchar3)
+{
+   struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uchar3 *out, __constant uchar3 *in)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      out[id].x = in[id].x + id;\n\
+      out[id].y = in[id].y * id;\n\
+      out[id].z = out[id].y + out[id].x;\n\
+   }\n";
+   auto in = ShaderArg<struct uchar3>({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } },
+                                      SHADER_ARG_INPUT);
+   auto out = ShaderArg<struct uchar3>(std::vector<struct uchar3>(4, { 0xff, 0xff, 0xff }),
+                                      SHADER_ARG_OUTPUT);
+   const struct uchar3 expected[] = {
+      { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) },
+      { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) },
+      { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) },
+      { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) }
+   };
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].x, expected[i].x);
+      EXPECT_EQ(out[i].y, expected[i].y);
+      EXPECT_EQ(out[i].z, expected[i].z);
+   }
+}
+
+TEST_F(ComputeTest, complex_types_global_uint8)
+{
+   struct uint8 {
+      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
+      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
+   };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint8 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].s01234567 = inout[id].s01234567 * 2;\n\
+   }\n";
+   auto inout = ShaderArg<struct uint8>({ { 1, 2, 3, 4, 5, 6, 7, 8 } },
+                                        SHADER_ARG_INOUT);
+   const struct uint8 expected[] = {
+      { 2, 4, 6, 8, 10, 12, 14, 16 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].s0, expected[i].s0);
+      EXPECT_EQ(inout[i].s1, expected[i].s1);
+      EXPECT_EQ(inout[i].s2, expected[i].s2);
+      EXPECT_EQ(inout[i].s3, expected[i].s3);
+      EXPECT_EQ(inout[i].s4, expected[i].s4);
+      EXPECT_EQ(inout[i].s5, expected[i].s5);
+      EXPECT_EQ(inout[i].s6, expected[i].s6);
+      EXPECT_EQ(inout[i].s7, expected[i].s7);
+   }
+}
+
+TEST_F(ComputeTest, complex_types_local_ulong16)
+{
+   struct ulong16 {
+      uint64_t values[16];
+   };
+   const char *kernel_source =
+   R"(__kernel void main_test(__global ulong16 *inout)
+   {
+      __local ulong16 local_array[2];
+      uint id = get_global_id(0);
+      local_array[id] = inout[id];
+      barrier(CLK_LOCAL_MEM_FENCE);
+      inout[id] = local_array[0] * 2;
+   })";
+   auto inout = ShaderArg<struct ulong16>({ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 } },
+                                        SHADER_ARG_INOUT);
+   const struct ulong16 expected[] = {
+      { 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      for (int j = 0; j < 16; ++j) {
+         EXPECT_EQ(inout[i].values[j], expected[i].values[j]);
+      }
+   }
+}
+
+TEST_F(ComputeTest, complex_types_constant_uint8)
+{
+   struct uint8 {
+      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
+      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
+   };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint8 *out, __constant uint8 *in)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      out[id].s01234567 = in[id].s01234567 * 2;\n\
+   }\n";
+   auto in = ShaderArg<struct uint8>({ { 1, 2, 3, 4, 5, 6, 7, 8 } },
+                                     SHADER_ARG_INPUT);
+   auto out = ShaderArg<struct uint8>({ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } },
+                                      SHADER_ARG_INOUT);
+   const struct uint8 expected[] = {
+      { 2, 4, 6, 8, 10, 12, 14, 16 }
+   };
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].s0, expected[i].s0);
+      EXPECT_EQ(out[i].s1, expected[i].s1);
+      EXPECT_EQ(out[i].s2, expected[i].s2);
+      EXPECT_EQ(out[i].s3, expected[i].s3);
+      EXPECT_EQ(out[i].s4, expected[i].s4);
+      EXPECT_EQ(out[i].s5, expected[i].s5);
+      EXPECT_EQ(out[i].s6, expected[i].s6);
+      EXPECT_EQ(out[i].s7, expected[i].s7);
+   }
+}
+
+TEST_F(ComputeTest, DISABLED_complex_types_const_array)
+{
+   /* DISABLED because current release versions of WARP either return
+    * rubbish from reads or crash: they are not prepared to handle
+    * non-float global constants */
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       const uint foo[] = { 100, 101, 102, 103 };\n\
+       output[get_global_id(0)] = foo[get_global_id(0) % 4];\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      100, 101, 102, 103
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, mem_access_load_store_ordering)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+       uint foo[4];\n\
+       foo[0] = 0x11111111;\n\
+       foo[1] = 0x22222222;\n\
+       foo[2] = 0x44444444;\n\
+       foo[3] = 0x88888888;\n\
+       foo[get_global_id(1)] -= 0x11111111; // foo[0] = 0 \n\
+       foo[0] += get_global_id(0); // foo[0] = tid\n\
+       foo[foo[get_global_id(1)]] = get_global_id(0); // foo[tid] = tid\n\
+       output[get_global_id(0)] = foo[get_global_id(0)]; // output[tid] = tid\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint16_t expected[] = {
+      0, 1, 2, 3
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, DISABLED_two_const_arrays)
+{
+   /* DISABLED because current release versions of WARP either return
+    * rubbish from reads or crash: they are not prepared to handle
+    * non-float global constants */
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      uint foo[4] = {100, 101, 102, 103};\n\
+      uint bar[4] = {1, 2, 3, 4};\n\
+      output[id] = foo[id] * bar[id];\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      100, 202, 306, 412
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, imod_pos)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global int *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = inout[get_global_id(0)] % 3;\n\
+   }\n";
+   auto inout = ShaderArg<int32_t>({ -4, -3, -2, -1, 0, 1, 2, 3, 4 },
+                                   SHADER_ARG_INOUT);
+   const int32_t expected[] = {
+      -1, 0, -2, -1,  0, 1, 2, 0, 1
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, imod_neg)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global int *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = inout[get_global_id(0)] % -3;\n\
+   }\n";
+   auto inout = ShaderArg<int32_t>({ -4, -3, -2, -1, 0, 1, 2, 3, 4 },
+                                   SHADER_ARG_INOUT);
+   const int32_t expected[] = {
+      -1, 0, -2, -1,  0, 1, 2, 0, 1
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, umod)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = inout[get_global_id(0)] % 0xfffffffc;\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0xfffffffa, 0xfffffffb, 0xfffffffc, 0xfffffffd, 0xfffffffe },
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0xfffffffa, 0xfffffffb, 0, 1, 2
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, rotate)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = rotate(inout[get_global_id(0)], (uint)get_global_id(0) * 4);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0xdeadbeef, 0xeadbeefd, 0xadbeefde, 0xdbeefdea
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, popcount)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = popcount(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 0x1, 0x3, 0x101, 0x110011, ~0u },
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0, 1, 2, 2, 4, 32
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, hadd)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = hadd(inout[get_global_id(0)], 1u << 31);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff },
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      (1u << 31) >> 1,
+      ((1u << 31) + 1) >> 1,
+      ((1u << 31) + 2) >> 1,
+      ((1u << 31) + 3) >> 1,
+      ((1ull << 31) + 0xfffffffc) >> 1,
+      ((1ull << 31) + 0xfffffffd) >> 1,
+      ((1ull << 31) + 0xfffffffe) >> 1,
+      ((1ull << 31) + 0xffffffff) >> 1,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, rhadd)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = rhadd(inout[get_global_id(0)], 1u << 31);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff },
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      ((1u << 31) + 1) >> 1,
+      ((1u << 31) + 2) >> 1,
+      ((1u << 31) + 3) >> 1,
+      ((1u << 31) + 4) >> 1,
+      ((1ull << 31) + 0xfffffffd) >> 1,
+      ((1ull << 31) + 0xfffffffe) >> 1,
+      ((1ull << 31) + 0xffffffff) >> 1,
+      ((1ull << 31) + (1ull << 32)) >> 1,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, add_sat)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = add_sat(inout[get_global_id(0)], 2u);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0xffffffff - 3, 0xffffffff - 2, 0xffffffff - 1, 0xffffffff },
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0xffffffff - 1, 0xffffffff, 0xffffffff, 0xffffffff
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, sub_sat)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = sub_sat(inout[get_global_id(0)], 2u);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3 }, SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0, 0, 0, 1
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, mul_hi)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = mul_hi(inout[get_global_id(0)], 1u << 31);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, (1u << 31) }, SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0, 0, 1, 1, (1u << 30)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, ldexp_x)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], 5);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 2.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      ldexp(0.0f, 5), ldexp(0.5f, 5), ldexp(1.0f, 5), ldexp(2.0f, 5)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, ldexp_y)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], get_global_id(0));\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.25f, 0.5f, 0.75f, 1.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      ldexp(0.25f, 0), ldexp(0.5f, 1), ldexp(0.75f, 2), ldexp(1.0f, 3)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, frexp_ret)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       int exp;\n\
+       inout[get_global_id(0)] = frexp(inout[get_global_id(0)], &exp);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      0.0f, 0.5f, 0.5f, 0.75f
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, frexp_exp)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       int exp;\n\
+       frexp(inout[get_global_id(0)], &exp);\n\
+       inout[get_global_id(0)] = (float)exp;\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      0.0f, 0.0f, 1.0f, 2.0f
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, clz)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = clz(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 0xffff,  (1u << 30), (1u << 31) }, SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      32, 31, 16, 1, 0
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, sin)
+{
+   struct sin_vals { float in; float clc; float native; };
+   const char *kernel_source =
+   "struct sin_vals { float in; float clc; float native; };\n\
+   __kernel void main_test(__global struct sin_vals *inout)\n\
+   {\n\
+       inout[get_global_id(0)].clc = sin(inout[get_global_id(0)].in);\n\
+       inout[get_global_id(0)].native = native_sin(inout[get_global_id(0)].in);\n\
+   }\n";
+   const vector<sin_vals> input = {
+      { 0.0f, 0.0f, 0.0f },
+      { 1.0f, 0.0f, 0.0f },
+      { 2.0f, 0.0f, 0.0f },
+      { 3.0f, 0.0f, 0.0f },
+   };
+   auto inout = ShaderArg<sin_vals>(input, SHADER_ARG_INOUT);
+   const struct sin_vals expected[] = {
+      { 0.0f, 0.0f,       0.0f       },
+      { 1.0f, sin(1.0f), sin(1.0f) },
+      { 2.0f, sin(2.0f), sin(2.0f) },
+      { 3.0f, sin(3.0f), sin(3.0f) },
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_FLOAT_EQ(inout[i].in, expected[i].in);
+      EXPECT_FLOAT_EQ(inout[i].clc, expected[i].clc);
+      EXPECT_NEAR(inout[i].clc, inout[i].native, 0.008f); // range from DXIL spec
+   }
+}
+
+TEST_F(ComputeTest, DISABLED_cosh)
+{
+   /* Disabled because of WARP failures, where we fetch incorrect results when
+    * sourcing from non-float ICBs */
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = cosh(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      cosh(0.0f), cosh(1.0f), cosh(2.0f), cosh(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, exp)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = native_exp(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      exp(0.0f), exp(1.0f), exp(2.0f), exp(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, exp10)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = native_exp10(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      pow(10.0f, 0.0f), pow(10.0f, 1.0f), pow(10.0f, 2.0f), pow(10.0f, 3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, exp2)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = native_exp2(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      pow(2.0f, 0.0f), pow(2.0f, 1.0f), pow(2.0f, 2.0f), pow(2.0f, 3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = native_log(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      log(0.0f), log(1.0f), log(2.0f), log(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log10)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = native_log10(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      log10(0.0f), log10(1.0f), log10(2.0f), log10(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log2)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = native_log2(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      log(0.0f) / log(2), log(1.0f) / log(2), log(2.0f) / log(2), log(3.0f) / log(2)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, rint)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = rint(inout[get_global_id(0)]);\n\
+   }\n";
+
+   auto inout = ShaderArg<float>({ 0.5f, 1.5f, -0.5f, -1.5f, 1.4f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      0.0f, 2.0f, 0.0f, -2.0f, 1.0f,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, round)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = round(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
+                                 SHADER_ARG_INOUT);
+   const float expected[] = {
+      0.0f, 0.0f, -0.0f, 1.0f, -1.0f, 1.0f, -1.0f
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, arg_by_val)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout, float mul)\n\
+   {\n\
+       inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
+                                 SHADER_ARG_INOUT);
+   auto mul = ShaderArg<float>(10.0f, SHADER_ARG_INPUT);
+   const float expected[] = {
+      0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout, mul);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, uint8_by_val)
+{
+   struct uint8 {
+      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
+      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
+   };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *out, uint8 val)\n\
+   {\n\
+       out[get_global_id(0)] = val.s0 + val.s1 + val.s2 + val.s3 +\n\
+                               val.s4 + val.s5 + val.s6 + val.s7;\n\
+   }\n";
+   auto out = ShaderArg<uint32_t>({ 0 }, SHADER_ARG_OUTPUT);
+   auto val = ShaderArg<struct uint8>({ {0, 1, 2, 3, 4, 5, 6, 7 }}, SHADER_ARG_INPUT);
+   const uint32_t expected[] = { 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 };
+   run_shader(kernel_source, out.size(), 1, 1, out, val);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected[i]);
+}
+
+TEST_F(ComputeTest, link)
+{
+   const char *foo_src =
+   "float foo(float in)\n\
+   {\n\
+       return in * in;\n\
+   }\n";
+   const char *kernel_source =
+   "float foo(float in);\n\
+   __kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\
+   }\n";
+   std::vector<const char *> srcs = { foo_src, kernel_source };
+   auto inout = ShaderArg<float>({ 2.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      4.0f,
+   };
+   run_shader(srcs, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, link_library)
+{
+   const char *bar_src =
+   "float bar(float in)\n\
+   {\n\
+      return in * 5;\n\
+   }\n";
+   const char *foo_src =
+   "float bar(float in);\n\
+   float foo(float in)\n\
+   {\n\
+       return in * bar(in);\n\
+   }\n";
+   const char *kernel_source =
+   "float foo(float in);\n\
+   __kernel void main_test(__global float *inout)\n\
+   {\n\
+       inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\
+   }\n";
+   std::vector<Shader> libraries = {
+      compile({ bar_src, kernel_source }, {}, true),
+      compile({ foo_src }, {}, true)
+   };
+   Shader exe = link(libraries);
+   auto inout = ShaderArg<float>({ 2.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      20.0f,
+   };
+   run_shader(exe, { (unsigned)inout.size(), 1, 1 }, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, localvar)
+{
+   const char *kernel_source =
+   "__kernel __attribute__((reqd_work_group_size(2, 1, 1)))\n\
+   void main_test(__global float *inout)\n\
+   {\n\
+      __local float2 tmp[2];\n\
+      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
+   }\n";
+
+   auto inout = ShaderArg<float>({ 2.0f, 4.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      9.0f, 5.0f
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, localvar_uchar2)
+{
+   const char *kernel_source =
+   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+   __kernel void main_test(__global uchar *inout)\n\
+   {\n\
+      __local uchar2 tmp[2];\n\
+      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
+   }\n";
+
+   auto inout = ShaderArg<uint8_t>({ 2, 4 }, SHADER_ARG_INOUT);
+   const uint8_t expected[] = { 9, 5 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, work_group_size_hint)
+{
+   const char *kernel_source =
+   "__attribute__((work_group_size_hint(2, 1, 1)))\n\
+   __kernel void main_test(__global uint *output)\n\
+   {\n\
+       output[get_global_id(0)] = get_local_id(0);\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      0, 1, 2, 3
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, reqd_work_group_size)
+{
+   const char *kernel_source =
+   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+   __kernel void main_test(__global uint *output)\n\
+   {\n\
+       output[get_global_id(0)] = get_local_id(0);\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      0, 1, 0, 1
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, image)
+{
+   const char* kernel_source =
+   "__kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
+   {\n\
+      int2 coords = (int2)(get_global_id(0), get_global_id(1));\n\
+      write_imagef(output, coords, read_imagef(input, coords));\n\
+   }\n";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, image_two_reads)
+{
+   const char* kernel_source =
+   "__kernel void main_test(image2d_t image, int is_float, __global float* output)\n\
+   {\n\
+      if (is_float)\n\
+         output[get_global_id(0)] = read_imagef(image, (int2)(0, 0)).x;\n\
+      else \n\
+         output[get_global_id(0)] = (float)read_imagei(image, (int2)(0, 0)).x;\n\
+   }\n";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, sampler)
+{
+   const char* kernel_source =
+   "__kernel void main_test(image2d_t image, sampler_t sampler, __global float* output)\n\
+   {\n\
+      output[get_global_id(0)] = read_imagef(image, sampler, (int2)(0, 0)).x;\n\
+   }\n";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, image_dims)
+{
+   const char* kernel_source =
+   "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\
+   {\n\
+      output[get_global_id(0)] = get_image_width(roimage);\n\
+      output[get_global_id(0) + 1] = get_image_width(woimage);\n\
+   }\n";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, image_format)
+{
+   const char* kernel_source =
+   "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\
+   {\n\
+      output[get_global_id(0)] = get_image_channel_data_type(roimage);\n\
+      output[get_global_id(0) + 1] = get_image_channel_order(woimage);\n\
+   }\n";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, image1d_buffer_t)
+{
+   const char* kernel_source =
+   "__kernel void main_test(read_only image1d_buffer_t input, write_only image1d_buffer_t output)\n\
+   {\n\
+      write_imageui(output, get_global_id(0), read_imageui(input, get_global_id(0)));\n\
+   }\n";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, local_ptr)
+{
+   struct uint2 { uint32_t x, y; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout, __local uint2 *tmp)\n\
+   {\n\
+      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 2, 4 }, SHADER_ARG_INOUT);
+   auto tmp = ShaderArg<struct uint2>(std::vector<struct uint2>(4096), SHADER_ARG_INPUT);
+   const uint32_t expected[] = { 9, 5 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout, tmp);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, two_local_ptrs)
+{
+   struct uint2 { uint32_t x, y; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout, __local uint2 *tmp, __local uint *tmp2)\n\
+   {\n\
+      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+      tmp2[get_local_id(0)] = get_global_id(0);\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y + tmp2[get_local_id(0) % 2];\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 2, 4 }, SHADER_ARG_INOUT);
+   auto tmp = ShaderArg<struct uint2>(std::vector<struct uint2>(1024), SHADER_ARG_INPUT);
+   auto tmp2 = ShaderArg<uint32_t>(std::vector<uint32_t>(1024), SHADER_ARG_INPUT);
+   const uint32_t expected[] = { 9, 6 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout, tmp, tmp2);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, int8_to_float)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global char* in, __global float* out)\n\
+   {\n\
+      uint pos = get_global_id(0);\n\
+      out[pos] = in[pos] / 100.0f;\n\
+   }";
+   auto in = ShaderArg<char>({ 10, 20, 30, 40 }, SHADER_ARG_INPUT);
+   auto out = ShaderArg<float>(std::vector<float>(4, std::numeric_limits<float>::infinity()), SHADER_ARG_OUTPUT);
+   const float expected[] = { 0.1f, 0.2f, 0.3f, 0.4f };
+   run_shader(kernel_source, in.size(), 1, 1, in, out);
+   for (int i = 0; i < in.size(); ++i)
+      EXPECT_FLOAT_EQ(out[i], expected[i]);
+}
+
+TEST_F(ComputeTest, vec_hint_float4)
+{
+   const char *kernel_source =
+   "__kernel __attribute__((vec_type_hint(float4))) void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
+   }";
+   Shader shader = compile({ kernel_source });
+   EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 4);
+   EXPECT_EQ(shader.obj->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_FLOAT);
+}
+
+TEST_F(ComputeTest, vec_hint_uchar2)
+{
+   const char *kernel_source =
+   "__kernel __attribute__((vec_type_hint(uchar2))) void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
+   }";
+   Shader shader = compile({ kernel_source });
+   EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 2);
+   EXPECT_EQ(shader.obj->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_CHAR);
+}
+
+TEST_F(ComputeTest, vec_hint_none)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
+   }";
+   Shader shader = compile({ kernel_source });
+   EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 0);
+}
+
+TEST_F(ComputeTest, DISABLED_debug_layer_failure)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout, float mul)\n\
+   {\n\
+       inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
+                                 SHADER_ARG_INOUT);
+   auto mul = ShaderArg<float>(10.0f, SHADER_ARG_INPUT);
+   const float expected[] = {
+      0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f
+   };
+   ComPtr<ID3D12InfoQueue> info_queue;
+   dev->QueryInterface(info_queue.ReleaseAndGetAddressOf());
+   if (!info_queue) {
+      GTEST_SKIP() << "No info queue";
+      return;
+   }
+
+   info_queue->AddApplicationMessage(D3D12_MESSAGE_SEVERITY_ERROR, "This should cause the test to fail");
+   run_shader(kernel_source, inout.size(), 1, 1, inout, mul);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, compiler_defines)
+{
+   const char *kernel_source =
+      "__kernel void main_test(__global int* out)\n\
+   {\n\
+      out[0] = OUT_VAL0;\n\
+      out[1] = __OPENCL_C_VERSION__;\n\
+   }";
+   auto out = ShaderArg<int>(std::vector<int>(2, 0), SHADER_ARG_OUTPUT);
+   CompileArgs compile_args = { 1, 1, 1 };
+   compile_args.compiler_command_line = { "-DOUT_VAL0=5", "-cl-std=cl" };
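+   // "-cl-std=cl" selects OpenCL C 1.0, so __OPENCL_C_VERSION__ expands to 100.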
+   run_shader({ kernel_source }, compile_args, out);
+   EXPECT_EQ(out[0], 5);
+   EXPECT_EQ(out[1], 100);
+}
+
+/* There's a bug in WARP that turns atomic_add(ptr, x) into
+ * atomic_add(ptr, x * 4). It works fine on Intel hardware.
+ */
+TEST_F(ComputeTest, DISABLED_global_atomic_add)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global int *inout, __global int *old)\n\
+   {\n\
+      old[get_global_id(0)] = atomic_add(inout + get_global_id(0), 3);\n\
+   }\n";
+   auto inout = ShaderArg<int32_t>({ 2, 4 }, SHADER_ARG_INOUT);
+   auto old = ShaderArg<int32_t>(std::vector<int32_t>(2, 0xdeadbeef), SHADER_ARG_OUTPUT);
+   const int32_t expected_inout[] = { 5, 7 };
+   const int32_t expected_old[] = { 2, 4 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout, old);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i], expected_inout[i]);
+      EXPECT_EQ(old[i], expected_old[i]);
+   }
+}
+
+TEST_F(ComputeTest, global_atomic_imin)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global int *inout, __global int *old)\n\
+   {\n\
+      old[get_global_id(0)] = atomic_min(inout + get_global_id(0), 1);\n\
+   }\n";
+   auto inout = ShaderArg<int32_t>({ 0, 2, -1 }, SHADER_ARG_INOUT);
+   auto old = ShaderArg<int32_t>(std::vector<int32_t>(3, 0xdeadbeef), SHADER_ARG_OUTPUT);
+   const int32_t expected_inout[] = { 0, 1, -1 };
+   const int32_t expected_old[] = { 0, 2, -1 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout, old);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i], expected_inout[i]);
+      EXPECT_EQ(old[i], expected_old[i]);
+   }
+}
+
+TEST_F(ComputeTest, global_atomic_and_or)
+{
+   const char *kernel_source =
+   "__attribute__((reqd_work_group_size(3, 1, 1)))\n\
+   __kernel void main_test(__global int *inout)\n\
+   {\n\
+      atomic_and(inout, ~(1 << get_global_id(0)));\n\
+      atomic_or(inout, (1 << (get_global_id(0) + 4)));\n\
+   }\n";
+   auto inout = ShaderArg<int32_t>(0xf, SHADER_ARG_INOUT);
+   const int32_t expected[] = { 0x78 };
+   run_shader(kernel_source, 3, 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, global_atomic_cmpxchg)
+{
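+   // Each work item spins until the counter equals its global id, so the two
+   // increments happen in id order and the final value is 2.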
+   const char *kernel_source =
+   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+   __kernel void main_test(__global int *inout)\n\
+   {\n\
+      while (atomic_cmpxchg(inout, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\
+         ;\n\
+   }\n";
+   auto inout = ShaderArg<int32_t>(0, SHADER_ARG_INOUT);
+   const int32_t expected_inout[] = { 2 };
+   run_shader(kernel_source, 2, 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected_inout[i]);
+}
+
+TEST_F(ComputeTest, local_atomic_and_or)
+{
+   const char *kernel_source =
+   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+   __kernel void main_test(__global ushort *inout)\n\
+   {\n\
+      __local ushort tmp;\n\
+      atomic_and(&tmp, ~(0xff << (get_global_id(0) * 8)));\n\
+      atomic_or(&tmp, inout[get_global_id(0)] << (get_global_id(0) * 8));\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      inout[get_global_id(0)] = tmp;\n\
+   }\n";
+   auto inout = ShaderArg<uint16_t>({ 2, 4 }, SHADER_ARG_INOUT);
+   const uint16_t expected[] = { 0x402, 0x402 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, local_atomic_cmpxchg)
+{
+   const char *kernel_source =
+   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+   __kernel void main_test(__global int *out)\n\
+   {\n\
+      __local uint tmp;\n\
+      tmp = 0;\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      while (atomic_cmpxchg(&tmp, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\
+         ;\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      out[0] = tmp;\n\
+   }\n";
+
+   auto out = ShaderArg<uint32_t>(0xdeadbeef, SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = { 2 };
+   run_shader(kernel_source, 2, 1, 1, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected[i]);
+}
+
+TEST_F(ComputeTest, constant_sampler)
+{
+   const char* kernel_source =
+   "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_LINEAR;\n\
+   __kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
+   {\n\
+      int2 coordsi = (int2)(get_global_id(0), get_global_id(1));\n\
+      float2 coordsf = (float2)((float)coordsi.x / get_image_width(input), (float)coordsi.y / get_image_height(input));\n\
+      write_imagef(output, coordsi, \n\
+         read_imagef(input, sampler, coordsf) + \n\
+         read_imagef(input, sampler, coordsf + (float2)(0.1, 0.1)));\n\
+   }\n";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+   EXPECT_EQ(shader.dxil->metadata.num_const_samplers, 1);
+}
+
+TEST_F(ComputeTest, hi)
+{
+   const char *kernel_source = R"(
+   __kernel void main_test(__global char3 *srcA, __global char2 *dst)
+   {
+       int  tid = get_global_id(0);
+
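+       // On a 3-component vector, .hi acts as if there were a 4th, undefined
+       // component: srcA[tid].hi yields (s2, undefined).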
+       char2 tmp = srcA[tid].hi;
+       dst[tid] = tmp;
+   })";
+   Shader shader = compile(std::vector<const char*>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, system_values)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint* outputs)\n\
+   {\n\
+      outputs[0] = get_work_dim();\n\
+      outputs[1] = get_global_size(0);\n\
+      outputs[2] = get_local_size(0);\n\
+      outputs[3] = get_num_groups(0);\n\
+      outputs[4] = get_group_id(0);\n\
+      outputs[5] = get_global_offset(0);\n\
+      outputs[6] = get_global_id(0);\n\
+   }\n";
+   auto out = ShaderArg<uint32_t>(std::vector<uint32_t>(7, 0xdeadbeef), SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = { 3, 1, 1, 1, 0, 0, 0 };
+   CompileArgs args = { 1, 1, 1 };
+   Shader shader = compile({ kernel_source });
+   run_shader(shader, args, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected[i]);
+
+   args.work_props.work_dim = 2;
+   args.work_props.global_offset_x = 100;
+   args.work_props.group_id_offset_x = 2;
+   args.work_props.group_count_total_x = 5;
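+   // With 5 groups of local size 1, a group id offset of 2 and a global
+   // offset of 100: global_size = 5, group_id = 2, global_id = 100 + 2 = 102.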
+   const uint32_t expected_withoffsets[] = { 2, 5, 1, 5, 2, 100, 102 };
+   run_shader(shader, args, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected_withoffsets[i]);
+}
+
+TEST_F(ComputeTest, convert_round_sat)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *f, __global uchar *u)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       u[idx] = convert_uchar_sat_rtp(f[idx]);\n\
+   }\n";
+   auto f = ShaderArg<float>({ -1.0f, 1.1f, 20.0f, 255.5f }, SHADER_ARG_INPUT);
+   auto u = ShaderArg<uint8_t>({ 255, 0, 0, 0 }, SHADER_ARG_OUTPUT);
+   const uint8_t expected[] = {
+      0, 2, 20, 255
+   };
+
+   run_shader(kernel_source, f.size(), 1, 1, f, u);
+   for (int i = 0; i < u.size(); ++i)
+      EXPECT_EQ(u[i], expected[i]);
+}
+
+TEST_F(ComputeTest, convert_round_sat_vec)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float16 *f, __global uchar16 *u)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       u[idx] = convert_uchar16_sat_rtp(f[idx]);\n\
+   }\n";
+   auto f = ShaderArg<float>({
+      -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+      -0.5f, 1.9f, 20.0f, 254.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+       0.0f, 1.3f, 20.0f, 255.1f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+      -0.0f, 1.5555f, 20.0f, 254.9f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+   }, SHADER_ARG_INPUT);
+   auto u = ShaderArg<uint8_t>({
+      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+   }, SHADER_ARG_OUTPUT);
+   const uint8_t expected[] = {
+      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+   };
+
+   run_shader(kernel_source, 4, 1, 1, f, u);
+   for (int i = 0; i < u.size(); ++i)
+      EXPECT_EQ(u[i], expected[i]);
+}
+
+TEST_F(ComputeTest, convert_char2_uchar2)
+{
+   const char *kernel_source =
+   "__kernel void main_test( __global char2 *src, __global uchar2 *dest )\n\
+   {\n\
+      size_t i = get_global_id(0);\n\
+      dest[i] = convert_uchar2_sat( src[i] );\n\
+   }\n";
+
+   auto c = ShaderArg<int8_t>({ -127, -4, 0, 4, 126, 127, 16, 32 }, SHADER_ARG_INPUT);
+   auto u = ShaderArg<uint8_t>({ 99, 99, 99, 99, 99, 99, 99, 99 }, SHADER_ARG_OUTPUT);
+   const uint8_t expected[] = { 0, 0, 0, 4, 126, 127, 16, 32 };
+   run_shader(kernel_source, 4, 1, 1, c, u);
+   for (int i = 0; i < u.size(); i++)
+      EXPECT_EQ(u[i], expected[i]);
+}
+
+TEST_F(ComputeTest, async_copy)
+{
+   const char *kernel_source = R"(
+   __kernel void main_test( const __global char *src, __global char *dst, __local char *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )
+   {
+      int i;
+      for (i = 0; i < copiesPerWorkItem; i++)
+         localBuffer[get_local_id(0) * copiesPerWorkItem + i] = (char)0;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      event_t event;
+      event = async_work_group_copy((__local char *)localBuffer,
+                                    (__global const char *)(src + copiesPerWorkgroup * get_group_id(0)),
+                                    (size_t)copiesPerWorkgroup, 0);
+      wait_group_events(1, &event);
+      for (i = 0; i < copiesPerWorkItem; i++)
+         dst[get_global_id(0) * copiesPerWorkItem + i] = localBuffer[get_local_id(0) * copiesPerWorkItem + i];
+   })";
+   Shader shader = compile({ kernel_source });
+   validate(shader);
+}
+
+TEST_F(ComputeTest, packed_struct_global)
+{
+#pragma pack(push, 1)
+   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+   const char *kernel_source =
+   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+   __kernel void main_test(__global struct s *inout, global uint *size)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       inout[idx].uc = idx + 1;\n\
+       inout[idx].ul = ((ulong)(idx + 1 + 0xfbfcfdfe) << 32) | 0x12345678;\n\
+       inout[idx].us = ((ulong)(idx + 1 + 0xa0) << 8) | 0x12;\n\
+       *size = sizeof(struct s);\n\
+   }\n";
+   auto inout = ShaderArg<struct s>({0, 0, 0}, SHADER_ARG_OUTPUT);
+   auto size = ShaderArg<uint32_t>(0, SHADER_ARG_OUTPUT);
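+   // With the packed attribute the struct is 1 + 8 + 2 = 11 bytes on both the
+   // host and the device side, which is what *size should report.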
+   const struct s expected[] = {
+      { 1, 0xfbfcfdff12345678, 0xa112 }
+   };
+
+   run_shader(kernel_source, inout.size(), 1, 1, inout, size);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].uc, expected[i].uc);
+      EXPECT_EQ(inout[i].ul, expected[i].ul);
+      EXPECT_EQ(inout[i].us, expected[i].us);
+   }
+   EXPECT_EQ(size, sizeof(struct s));
+}
+
+TEST_F(ComputeTest, packed_struct_arg)
+{
+#pragma pack(push, 1)
+   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+   const char *kernel_source =
+   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+   __kernel void main_test(__global struct s *out, struct s in)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       out[idx].uc = in.uc + 0x12;\n\
+       out[idx].ul = in.ul + 0x123456789abcdef;\n\
+       out[idx].us = in.us + 0x1234;\n\
+   }\n";
+   auto out = ShaderArg<struct s>({0, 0, 0}, SHADER_ARG_OUTPUT);
+   auto in = ShaderArg<struct s>({1, 2, 3}, SHADER_ARG_INPUT);
+   const struct s expected[] = {
+      { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 }
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].uc, expected[i].uc);
+      EXPECT_EQ(out[i].ul, expected[i].ul);
+      EXPECT_EQ(out[i].us, expected[i].us);
+   }
+}
+
+TEST_F(ComputeTest, packed_struct_local)
+{
+#pragma pack(push, 1)
+   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+   const char *kernel_source =
+   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+   __kernel void main_test(__global struct s *out, __constant struct s *in)\n\
+   {\n\
+       uint idx = get_global_id(0);\n\
+       __local struct s tmp[2];\n\
+       tmp[get_local_id(0)] = in[idx];\n\
+       barrier(CLK_LOCAL_MEM_FENCE);\n\
+       out[idx] = tmp[(get_local_id(0) + 1) % 2];\n\
+   }\n";
+   auto out = ShaderArg<struct s>({{0, 0, 0}, {0, 0, 0}}, SHADER_ARG_OUTPUT);
+   auto in = ShaderArg<struct s>({{1, 2, 3}, {0x12, 0x123456789abcdef, 0x1234} }, SHADER_ARG_INPUT);
+   const struct s expected[] = {
+      { 0x12, 0x123456789abcdef, 0x1234 },
+      { 1, 2, 3 },
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].uc, expected[i].uc);
+      EXPECT_EQ(out[i].ul, expected[i].ul);
+      EXPECT_EQ(out[i].us, expected[i].us);
+   }
+}
+
+/* DISABLED because current release versions of WARP either return
+ * rubbish from reads or crash: they are not prepared to handle
+ * non-float global constants */
+TEST_F(ComputeTest, DISABLED_packed_struct_const)
+{
+#pragma pack(push, 1)
+   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+   const char *kernel_source =
+   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+   __kernel void main_test(__global struct s *out, struct s in)\n\
+   {\n\
+       __constant struct s base[] = {\n\
+          {0x12, 0x123456789abcdef, 0x1234},\n\
+          {0x11, 0x123456789abcdee, 0x1233},\n\
+       };\n\
+       uint idx = get_global_id(0);\n\
+       out[idx].uc = base[idx % 2].uc + in.uc;\n\
+       out[idx].ul = base[idx % 2].ul + in.ul;\n\
+       out[idx].us = base[idx % 2].us + in.us;\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0, 0, 0}), SHADER_ARG_OUTPUT);
+   auto in = ShaderArg<struct s>({1, 2, 3}, SHADER_ARG_INPUT);
+   const struct s expected[] = {
+      { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 },
+      { 0x11 + 1, 0x123456789abcdee + 2, 0x1233 + 3 },
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].uc, expected[i].uc);
+      EXPECT_EQ(out[i].ul, expected[i].ul);
+      EXPECT_EQ(out[i].us, expected[i].us);
+   }
+}
+
+TEST_F(ComputeTest, DISABLED_printf)
+{
+   const char *kernel_source = R"(
+   __kernel void main_test(__global float *src, __global uint *dest)
+   {
+      __constant char *format_str = "%s: %f";
+      __constant char *str_val = "Test";
+      *dest = printf(format_str, str_val, src[0]);
+   })";
+
+   auto src = ShaderArg<float>({ 1.0f }, SHADER_ARG_INPUT);
+   auto dest = ShaderArg<uint32_t>({ 0xdeadbeef }, SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, 1, 1, 1, src, dest);
+   EXPECT_EQ(dest[0], 0);
+}
+
+TEST_F(ComputeTest, vload_half)
+{
+   const char *kernel_source = R"(
+   __kernel void main_test(__global half *src, __global float4 *dest)
+   {
+      int offset = get_global_id(0);
+      dest[offset] = vload_half4(offset, src);
+   })";
+   auto src = ShaderArg<uint16_t>({ 0x3c00, 0x4000, 0x4200, 0x4400,
+                                    0x4500, 0x4600, 0x4700, 0x4800 }, SHADER_ARG_INPUT);
+   auto dest = ShaderArg<float>({ FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
+                                  FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }, SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, 2, 1, 1, src, dest);
+   for (unsigned i = 0; i < 8; ++i)
+      EXPECT_FLOAT_EQ(dest[i], (float)(i + 1));
+}
+
+TEST_F(ComputeTest, vstore_half)
+{
+   const char *kernel_source = R"(
+   __kernel void main_test(__global half *dst, __global float4 *src)
+   {
+      int offset = get_global_id(0);
+      vstore_half4(src[offset], offset, dst);
+   })";
+   auto dest = ShaderArg<uint16_t>({0xdead, 0xdead, 0xdead, 0xdead,
+                                   0xdead, 0xdead, 0xdead, 0xdead}, SHADER_ARG_OUTPUT);
+   auto src = ShaderArg<float>({ 1.0, 2.0, 3.0, 4.0,
+                                  5.0, 6.0, 7.0, 8.0 }, SHADER_ARG_INPUT);
+   run_shader(kernel_source, 2, 1, 1, dest, src);
+   const uint16_t expected[] = { 0x3c00, 0x4000, 0x4200, 0x4400,
+                                 0x4500, 0x4600, 0x4700, 0x4800 };
+   for (unsigned i = 0; i < 8; ++i)
+      EXPECT_EQ(dest[i], expected[i]);
+}
diff --git a/src/microsoft/clc/clc_helpers.cpp b/src/microsoft/clc/clc_helpers.cpp
new file mode 100644 (file)
index 0000000..38642b8
--- /dev/null
@@ -0,0 +1,811 @@
+//
+// Copyright 2012-2016 Francisco Jerez
+// Copyright 2012-2016 Advanced Micro Devices, Inc.
+// Copyright 2014-2016 Jan Vesely
+// Copyright 2014-2015 Serge Martin
+// Copyright 2015 Zoltan Gilian
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <sstream>
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/IR/DiagnosticPrinter.h>
+#include <llvm/IR/DiagnosticInfo.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Type.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm-c/Core.h>
+#include <llvm-c/Target.h>
+#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
+
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Lex/PreprocessorOptions.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/TextDiagnosticBuffer.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Basic/TargetInfo.h>
+
+#include <spirv-tools/libspirv.hpp>
+#include <spirv-tools/linker.hpp>
+
+#include "util/macros.h"
+#include "glsl_types.h"
+#include "nir.h"
+#include "nir_types.h"
+
+#include "clc_helpers.h"
+#include "spirv.h"
+
+#include "opencl-c.h.h"
+#include "opencl-c-base.h.h"
+
+using ::llvm::Function;
+using ::llvm::LLVMContext;
+using ::llvm::Module;
+using ::llvm::raw_string_ostream;
+
+static void
+llvm_log_handler(const ::llvm::DiagnosticInfo &di, void *data) {
+   raw_string_ostream os { *reinterpret_cast<std::string *>(data) };
+   ::llvm::DiagnosticPrinterRawOStream printer { os };
+   di.print(printer);
+}
+
+class SPIRVKernelArg {
+public:
+   SPIRVKernelArg(uint32_t id, uint32_t typeId) : id(id), typeId(typeId),
+                                                  addrQualifier(CLC_KERNEL_ARG_ADDRESS_PRIVATE),
+                                                  accessQualifier(0),
+                                                  typeQualifier(0) { }
+   ~SPIRVKernelArg() { }
+
+   uint32_t id;
+   uint32_t typeId;
+   std::string name;
+   std::string typeName;
+   enum clc_kernel_arg_address_qualifier addrQualifier;
+   unsigned accessQualifier;
+   unsigned typeQualifier;
+};
+
+class SPIRVKernelInfo {
+public:
+   SPIRVKernelInfo(uint32_t fid, const char *nm) : funcId(fid), name(nm), vecHint(0) { }
+   ~SPIRVKernelInfo() { }
+
+   uint32_t funcId;
+   std::string name;
+   std::vector<SPIRVKernelArg> args;
+   unsigned vecHint;
+};
+
+class SPIRVKernelParser {
+public:
+   SPIRVKernelParser() : curKernel(NULL)
+   {
+      ctx = spvContextCreate(SPV_ENV_UNIVERSAL_1_0);
+   }
+
+   ~SPIRVKernelParser()
+   {
+      spvContextDestroy(ctx);
+   }
+
+   void parseEntryPoint(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands >= 3);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
+      for (auto &iter : kernels) {
+         if (funcId == iter.funcId)
+            return;
+      }
+
+      op = &ins->operands[2];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      const char *name = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      kernels.push_back(SPIRVKernelInfo(funcId, name));
+   }
+
+   void parseFunction(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands == 4);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
+      for (auto &kernel : kernels) {
+         if (funcId == kernel.funcId && !kernel.args.size()) {
+            curKernel = &kernel;
+            return;
+         }
+      }
+   }
+
+   void parseFunctionParam(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t id, typeId;
+
+      if (!curKernel)
+         return;
+
+      assert(ins->num_operands == 2);
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_TYPE_ID);
+      typeId = ins->words[op->offset];
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      id = ins->words[op->offset];
+      curKernel->args.push_back(SPIRVKernelArg(id, typeId));
+   }
+
+   void parseName(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      const char *name;
+      uint32_t id;
+
+      assert(ins->num_operands == 2);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      id = ins->words[op->offset];
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      name = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.id == id && arg.name.empty()) {
+               arg.name = name;
+               break;
+            }
+         }
+      }
+   }
+
+   void parseTypePointer(const spv_parsed_instruction_t *ins)
+   {
+      enum clc_kernel_arg_address_qualifier addrQualifier;
+      uint32_t typeId, targetTypeId, storageClass;
+      const spv_parsed_operand_t *op;
+      const char *typeName;
+
+      assert(ins->num_operands == 3);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      typeId = ins->words[op->offset];
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_STORAGE_CLASS);
+      storageClass = ins->words[op->offset];
+      switch (storageClass) {
+      case SpvStorageClassCrossWorkgroup:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL;
+         break;
+      case SpvStorageClassWorkgroup:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_LOCAL;
+         break;
+      case SpvStorageClassUniformConstant:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_CONSTANT;
+         break;
+      default:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_PRIVATE;
+         break;
+      }
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.typeId == typeId)
+               arg.addrQualifier = addrQualifier;
+         }
+      }
+   }
+
+   void parseOpString(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      std::string str;
+
+      assert(ins->num_operands == 2);
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      str = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      if (str.find("kernel_arg_type.") != 0)
+         return;
+
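+      /* The string has the form "kernel_arg_type.<kernel name>.<type 1>,<type 2>,...". */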
+      size_t start = sizeof("kernel_arg_type.") - 1;
+
+      for (auto &kernel : kernels) {
+         size_t pos;
+
+         pos = str.find(kernel.name, start);
+         if (pos == std::string::npos ||
+             pos != start || str[start + kernel.name.size()] != '.')
+            continue;
+
+         /* Skip past the kernel name and the '.' separator; the check above
+          * already guaranteed the separator is present. */
+         pos = start + kernel.name.size() + 1;
+
+         for (auto &arg : kernel.args) {
+            if (arg.name.empty())
+               break;
+
+            size_t typeEnd = str.find(',', pos);
+            if (typeEnd == std::string::npos)
+               break;
+
+            arg.typeName = str.substr(pos, typeEnd - pos);
+            pos = typeEnd + 1;
+         }
+      }
+   }
+
+   void applyDecoration(uint32_t id, const spv_parsed_instruction_t *ins)
+   {
+      auto iter = decorationGroups.find(id);
+      if (iter != decorationGroups.end()) {
+         for (uint32_t entry : iter->second)
+            applyDecoration(entry, ins);
+         return;
+      }
+
+      const spv_parsed_operand_t *op;
+      uint32_t decoration;
+
+      assert(ins->num_operands >= 2);
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_DECORATION);
+      decoration = ins->words[op->offset];
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.id == id) {
+               switch (decoration) {
+               case SpvDecorationVolatile:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_VOLATILE;
+                  break;
+               case SpvDecorationConstant:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST;
+                  break;
+               case SpvDecorationRestrict:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT;
+                  break;
+               case SpvDecorationFuncParamAttr:
+                  op = &ins->operands[2];
+                  assert(op->type == SPV_OPERAND_TYPE_FUNCTION_PARAMETER_ATTRIBUTE);
+                  switch (ins->words[op->offset]) {
+                  case SpvFunctionParameterAttributeNoAlias:
+                     arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT;
+                     break;
+                  case SpvFunctionParameterAttributeNoWrite:
+                     arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST;
+                     break;
+                  }
+                  break;
+               }
+            }
+
+         }
+      }
+   }
+
+   void parseOpDecorate(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t id;
+
+      assert(ins->num_operands >= 2);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      id = ins->words[op->offset];
+
+      applyDecoration(id, ins);
+   }
+
+   void parseOpGroupDecorate(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands >= 2);
+
+      const spv_parsed_operand_t *op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      uint32_t groupId = ins->words[op->offset];
+
+      auto lowerBound = decorationGroups.lower_bound(groupId);
+      if (lowerBound != decorationGroups.end() &&
+          lowerBound->first == groupId)
+         // Group already filled out
+         return;
+
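+      // Reuse the lower_bound as an insertion hint so the map insert doesn't
+      // repeat the lookup.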
+      auto iter = decorationGroups.emplace_hint(lowerBound, groupId, std::vector<uint32_t>{});
+      auto& vec = iter->second;
+      vec.reserve(ins->num_operands - 1);
+      for (uint32_t i = 1; i < ins->num_operands; ++i) {
+         op = &ins->operands[i];
+         assert(op->type == SPV_OPERAND_TYPE_ID);
+         vec.push_back(ins->words[op->offset]);
+      }
+   }
+
+   void parseOpTypeImage(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t typeId;
+      unsigned accessQualifier = CLC_KERNEL_ARG_ACCESS_READ;
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      typeId = ins->words[op->offset];
+
+      if (ins->num_operands >= 9) {
+         op = &ins->operands[8];
+         assert(op->type == SPV_OPERAND_TYPE_ACCESS_QUALIFIER);
+         switch (ins->words[op->offset]) {
+         case SpvAccessQualifierReadOnly:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_READ;
+            break;
+         case SpvAccessQualifierWriteOnly:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE;
+            break;
+         case SpvAccessQualifierReadWrite:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE |
+               CLC_KERNEL_ARG_ACCESS_READ;
+            break;
+         }
+      }
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.typeId == typeId) {
+               arg.accessQualifier = accessQualifier;
+               arg.addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL;
+            }
+         }
+      }
+   }
+
+   void parseExecutionMode(const spv_parsed_instruction_t *ins)
+   {
+      uint32_t executionMode = ins->words[ins->operands[1].offset];
+      if (executionMode != SpvExecutionModeVecTypeHint)
+         return;
+
+      uint32_t funcId = ins->words[ins->operands[0].offset];
+      uint32_t vecHint = ins->words[ins->operands[2].offset];
+      for (auto& kernel : kernels) {
+         if (kernel.funcId == funcId)
+            kernel.vecHint = vecHint;
+      }
+   }
+
+   static spv_result_t
+   parseInstruction(void *data, const spv_parsed_instruction_t *ins)
+   {
+      SPIRVKernelParser *parser = reinterpret_cast<SPIRVKernelParser *>(data);
+
+      switch (ins->opcode) {
+      case SpvOpName:
+         parser->parseName(ins);
+         break;
+      case SpvOpEntryPoint:
+         parser->parseEntryPoint(ins);
+         break;
+      case SpvOpFunction:
+         parser->parseFunction(ins);
+         break;
+      case SpvOpFunctionParameter:
+         parser->parseFunctionParam(ins);
+         break;
+      case SpvOpFunctionEnd:
+      case SpvOpLabel:
+         parser->curKernel = NULL;
+         break;
+      case SpvOpTypePointer:
+         parser->parseTypePointer(ins);
+         break;
+      case SpvOpTypeImage:
+         parser->parseOpTypeImage(ins);
+         break;
+      case SpvOpString:
+         parser->parseOpString(ins);
+         break;
+      case SpvOpDecorate:
+         parser->parseOpDecorate(ins);
+         break;
+      case SpvOpGroupDecorate:
+         parser->parseOpGroupDecorate(ins);
+         break;
+      case SpvOpExecutionMode:
+         parser->parseExecutionMode(ins);
+         break;
+      default:
+         break;
+      }
+
+      return SPV_SUCCESS;
+   }
+
+   bool parsingComplete()
+   {
+      for (auto &kernel : kernels) {
+         if (kernel.name.empty())
+            return false;
+
+         for (auto &arg : kernel.args) {
+            if (arg.name.empty() || arg.typeName.empty())
+               return false;
+         }
+      }
+
+      return true;
+   }
+
+   void parseBinary(const struct spirv_binary &spvbin)
+   {
+      /* Three passes should be enough to retrieve all kernel information:
+       * 1st pass: all entry point names and argument counts
+       * 2nd pass: argument names and type names
+       * 3rd pass: pointer type names
+       * Multiple passes are needed because SPIR-V places debug info, types
+       * and decorations before the function definitions that reference them.
+       */
+      for (unsigned pass = 0; pass < 3; pass++) {
+         spvBinaryParse(ctx, reinterpret_cast<void *>(this),
+                        spvbin.data, spvbin.size / 4,
+                        NULL, parseInstruction, NULL);
+
+         if (parsingComplete())
+            return;
+      }
+
+      assert(0);
+   }
+
+   std::vector<SPIRVKernelInfo> kernels;
+   std::map<uint32_t, std::vector<uint32_t>> decorationGroups;
+   SPIRVKernelInfo *curKernel;
+   spv_context ctx;
+};
+
+const struct clc_kernel_info *
+clc_spirv_get_kernels_info(const struct spirv_binary *spvbin,
+                           unsigned *num_kernels)
+{
+   struct clc_kernel_info *kernels;
+
+   SPIRVKernelParser parser;
+
+   parser.parseBinary(*spvbin);
+   *num_kernels = parser.kernels.size();
+   if (!*num_kernels)
+      return NULL;
+
+   kernels = reinterpret_cast<struct clc_kernel_info *>(calloc(*num_kernels,
+                                                               sizeof(*kernels)));
+   assert(kernels);
+   for (unsigned i = 0; i < parser.kernels.size(); i++) {
+      kernels[i].name = strdup(parser.kernels[i].name.c_str());
+      kernels[i].num_args = parser.kernels[i].args.size();
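+      /* SPIR-V's VecTypeHint execution mode packs the component count into
+       * the high 16 bits and the data type into the low 16 bits. */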
+      kernels[i].vec_hint_size = parser.kernels[i].vecHint >> 16;
+      kernels[i].vec_hint_type = (enum clc_vec_hint_type)(parser.kernels[i].vecHint & 0xFFFF);
+      if (!kernels[i].num_args)
+         continue;
+
+      struct clc_kernel_arg *args;
+
+      args = reinterpret_cast<struct clc_kernel_arg *>(calloc(kernels[i].num_args,
+                                                       sizeof(*kernels->args)));
+      kernels[i].args = args;
+      assert(args);
+      for (unsigned j = 0; j < kernels[i].num_args; j++) {
+         if (!parser.kernels[i].args[j].name.empty())
+            args[j].name = strdup(parser.kernels[i].args[j].name.c_str());
+         args[j].type_name = strdup(parser.kernels[i].args[j].typeName.c_str());
+         args[j].address_qualifier = parser.kernels[i].args[j].addrQualifier;
+         args[j].type_qualifier = parser.kernels[i].args[j].typeQualifier;
+         args[j].access_qualifier = parser.kernels[i].args[j].accessQualifier;
+      }
+   }
+
+   return kernels;
+}
+
+void
+clc_free_kernels_info(const struct clc_kernel_info *kernels,
+                      unsigned num_kernels)
+{
+   if (!kernels)
+      return;
+
+   for (unsigned i = 0; i < num_kernels; i++) {
+      if (kernels[i].args) {
+         for (unsigned j = 0; j < kernels[i].num_args; j++) {
+            free((void *)kernels[i].args[j].name);
+            free((void *)kernels[i].args[j].type_name);
+         }
+      }
+      free((void *)kernels[i].name);
+   }
+
+   free((void *)kernels);
+}
+
+int
+clc_to_spirv(const struct clc_compile_args *args,
+             struct spirv_binary *spvbin,
+             const struct clc_logger *logger)
+{
+   LLVMInitializeAllTargets();
+   LLVMInitializeAllTargetInfos();
+   LLVMInitializeAllTargetMCs();
+   LLVMInitializeAllAsmPrinters();
+
+   std::string log;
+   std::unique_ptr<LLVMContext> llvm_ctx { new LLVMContext };
+   llvm_ctx->setDiagnosticHandlerCallBack(llvm_log_handler, &log);
+
+   std::unique_ptr<clang::CompilerInstance> c { new clang::CompilerInstance };
+   clang::DiagnosticsEngine diag { new clang::DiagnosticIDs,
+         new clang::DiagnosticOptions,
+         new clang::TextDiagnosticPrinter(*new raw_string_ostream(log),
+                                          &c->getDiagnosticOpts(), true)};
+
+   std::vector<const char *> clang_opts = {
+      args->source.name,
+      "-triple", "spir64-unknown-unknown",
+      // By default, clang prefers to use modules to pull in the default headers,
+      // which doesn't work with our technique of embedding the headers in our binary
+      "-finclude-default-header",
+      // Add a default CL compiler version. Clang will pick the last one specified
+      // on the command line, so the app can override this one.
+      "-cl-std=cl1.2",
+      // The LLVM-SPIRV-Translator doesn't support memset with variable size
+      "-fno-builtin-memset",
+      // LLVM's optimizations can produce code that the translator can't translate
+      "-O0",
+   };
+   // We assume there's appropriate defines for __OPENCL_VERSION__ and __IMAGE_SUPPORT__
+   // being provided by the caller here.
+   clang_opts.insert(clang_opts.end(), args->args, args->args + args->num_args);
+
+   if (!clang::CompilerInvocation::CreateFromArgs(c->getInvocation(),
+#if LLVM_VERSION_MAJOR >= 10
+                                                  clang_opts,
+#else
+                                                  clang_opts.data(),
+                                                  clang_opts.data() + clang_opts.size(),
+#endif
+                                                  diag)) {
+      log += "Couldn't create Clang invocation.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   if (diag.hasErrorOccurred()) {
+      log += "Errors occurred during Clang invocation.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   // This is a workaround for a Clang bug which causes the number
+   // of warnings and errors to be printed to stderr.
+   // http://www.llvm.org/bugs/show_bug.cgi?id=19735
+   c->getDiagnosticOpts().ShowCarets = false;
+
+   c->createDiagnostics(new clang::TextDiagnosticPrinter(
+                           *new raw_string_ostream(log),
+                           &c->getDiagnosticOpts(), true));
+
+   c->setTarget(clang::TargetInfo::CreateTargetInfo(
+                   c->getDiagnostics(), c->getInvocation().TargetOpts));
+
+   c->getFrontendOpts().ProgramAction = clang::frontend::EmitLLVMOnly;
+   c->getHeaderSearchOpts().UseBuiltinIncludes = false;
+   c->getHeaderSearchOpts().UseStandardSystemIncludes = false;
+
+   // Add the opencl-c generic search path. The opencl-c headers are embedded
+   // in this binary and remapped onto paths under the system temp directory,
+   // so clang can resolve the includes without anything being written to disk.
+   {
+      ::llvm::SmallString<128> system_header_path;
+      ::llvm::sys::path::system_temp_directory(true, system_header_path);
+      ::llvm::sys::path::append(system_header_path, "openclon12");
+      c->getHeaderSearchOpts().AddPath(system_header_path.str(),
+                                       clang::frontend::Angled,
+                                       false, false);
+
+      ::llvm::sys::path::append(system_header_path, "opencl-c.h");
+      c->getPreprocessorOpts().addRemappedFile(system_header_path.str(),
+         ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_source, _countof(opencl_c_source) - 1)).release());
+
+      ::llvm::sys::path::remove_filename(system_header_path);
+      ::llvm::sys::path::append(system_header_path, "opencl-c-base.h");
+      c->getPreprocessorOpts().addRemappedFile(system_header_path.str(),
+         ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_base_source, _countof(opencl_c_base_source) - 1)).release());
+   }
+
+   if (args->num_headers) {
+      ::llvm::SmallString<128> tmp_header_path;
+      ::llvm::sys::path::system_temp_directory(true, tmp_header_path);
+      ::llvm::sys::path::append(tmp_header_path, "openclon12");
+
+      c->getHeaderSearchOpts().AddPath(tmp_header_path.str(),
+                                       clang::frontend::Quoted,
+                                       false, false);
+
+      for (size_t i = 0; i < args->num_headers; i++) {
+         auto path_copy = tmp_header_path;
+         ::llvm::sys::path::append(path_copy, ::llvm::sys::path::convert_to_slash(args->headers[i].name));
+         c->getPreprocessorOpts().addRemappedFile(path_copy.str(),
+            ::llvm::MemoryBuffer::getMemBufferCopy(args->headers[i].value).release());
+      }
+   }
+
+   c->getPreprocessorOpts().addRemappedFile(
+           args->source.name,
+           ::llvm::MemoryBuffer::getMemBufferCopy(std::string(args->source.value)).release());
+
+   // Compile the code
+   clang::EmitLLVMOnlyAction act(llvm_ctx.get());
+   if (!c->ExecuteAction(act)) {
+      log += "Error executing LLVM compilation action.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   auto mod = act.takeModule();
+   std::ostringstream spv_stream;
+   if (!::llvm::writeSpirv(mod.get(), spv_stream, log)) {
+      log += "Translation from LLVM IR to SPIR-V failed.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   const std::string spv_out = spv_stream.str();
+   spvbin->size = spv_out.size();
+   spvbin->data = static_cast<uint32_t *>(malloc(spvbin->size));
+   memcpy(spvbin->data, spv_out.data(), spvbin->size);
+
+   return 0;
+}
+
+static const char *
+spv_result_to_str(spv_result_t res)
+{
+   switch (res) {
+   case SPV_SUCCESS: return "success";
+   case SPV_UNSUPPORTED: return "unsupported";
+   case SPV_END_OF_STREAM: return "end of stream";
+   case SPV_WARNING: return "warning";
+   case SPV_FAILED_MATCH: return "failed match";
+   case SPV_REQUESTED_TERMINATION: return "requested termination";
+   case SPV_ERROR_INTERNAL: return "internal error";
+   case SPV_ERROR_OUT_OF_MEMORY: return "out of memory";
+   case SPV_ERROR_INVALID_POINTER: return "invalid pointer";
+   case SPV_ERROR_INVALID_BINARY: return "invalid binary";
+   case SPV_ERROR_INVALID_TEXT: return "invalid text";
+   case SPV_ERROR_INVALID_TABLE: return "invalid table";
+   case SPV_ERROR_INVALID_VALUE: return "invalid value";
+   case SPV_ERROR_INVALID_DIAGNOSTIC: return "invalid diagnostic";
+   case SPV_ERROR_INVALID_LOOKUP: return "invalid lookup";
+   case SPV_ERROR_INVALID_ID: return "invalid id";
+   case SPV_ERROR_INVALID_CFG: return "invalid control-flow graph";
+   case SPV_ERROR_INVALID_LAYOUT: return "invalid layout";
+   case SPV_ERROR_INVALID_CAPABILITY: return "invalid capability";
+   case SPV_ERROR_INVALID_DATA: return "invalid data";
+   case SPV_ERROR_MISSING_EXTENSION: return "missing extension";
+   case SPV_ERROR_WRONG_VERSION: return "wrong version";
+   default: return "unknown error";
+   }
+}
+
+class SPIRVMessageConsumer {
+public:
+   SPIRVMessageConsumer(const struct clc_logger *logger): logger(logger) {}
+
+   void operator()(spv_message_level_t level, const char *src,
+                   const spv_position_t &pos, const char *msg)
+   {
+      switch(level) {
+      case SPV_MSG_FATAL:
+      case SPV_MSG_INTERNAL_ERROR:
+      case SPV_MSG_ERROR:
+         clc_error(logger, "(file=%s,line=%zu,column=%zu,index=%zu): %s",
+                   src, pos.line, pos.column, pos.index, msg);
+         break;
+
+      case SPV_MSG_WARNING:
+         clc_warning(logger, "(file=%s,line=%zu,column=%zu,index=%zu): %s",
+                     src, pos.line, pos.column, pos.index, msg);
+         break;
+
+      default:
+         break;
+      }
+   }
+
+private:
+   const struct clc_logger *logger;
+};
+
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+                        struct spirv_binary *dst_bin,
+                        const struct clc_logger *logger)
+{
+   std::vector<std::vector<uint32_t>> binaries;
+
+   for (unsigned i = 0; i < args->num_in_objs; i++) {
+      std::vector<uint32_t> bin(args->in_objs[i]->spvbin.data,
+                                args->in_objs[i]->spvbin.data +
+                                   (args->in_objs[i]->spvbin.size / 4));
+      binaries.push_back(bin);
+   }
+
+   SPIRVMessageConsumer msgconsumer(logger);
+   spvtools::Context context(SPV_ENV_UNIVERSAL_1_0);
+   context.SetMessageConsumer(msgconsumer);
+   spvtools::LinkerOptions options;
+   options.SetAllowPartialLinkage(args->create_library);
+   options.SetCreateLibrary(args->create_library);
+   std::vector<uint32_t> linkingResult;
+   spv_result_t status = spvtools::Link(context, binaries, &linkingResult, options);
+   if (status != SPV_SUCCESS) {
+      return -1;
+   }
+
+   dst_bin->size = linkingResult.size() * 4;
+   dst_bin->data = static_cast<uint32_t *>(malloc(dst_bin->size));
+   memcpy(dst_bin->data, linkingResult.data(), dst_bin->size);
+
+   return 0;
+}
+
+void
+clc_dump_spirv(const struct spirv_binary *spvbin, FILE *f)
+{
+   spvtools::SpirvTools tools(SPV_ENV_UNIVERSAL_1_0);
+   std::vector<uint32_t> bin(spvbin->data, spvbin->data + (spvbin->size / 4));
+   std::string out;
+   tools.Disassemble(bin, &out,
+                     SPV_BINARY_TO_TEXT_OPTION_INDENT |
+                     SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES);
+   fwrite(out.c_str(), out.size(), 1, f);
+}
+
+void
+clc_free_spirv_binary(struct spirv_binary *spvbin)
+{
+   free(spvbin->data);
+}
diff --git a/src/microsoft/clc/clc_helpers.h b/src/microsoft/clc/clc_helpers.h
new file mode 100644 (file)
index 0000000..48f8c2d
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CLC_HELPERS_H
+#define CLC_HELPERS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "nir_types.h"
+
+#include "clc_compiler.h"
+#include "util/u_string.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+
+const struct clc_kernel_info *
+clc_spirv_get_kernels_info(const struct spirv_binary *spvbin,
+                           unsigned *num_kernels);
+
+void
+clc_free_kernels_info(const struct clc_kernel_info *kernels,
+                      unsigned num_kernels);
+
+int
+clc_to_spirv(const struct clc_compile_args *args,
+             struct spirv_binary *spvbin,
+             const struct clc_logger *logger);
+
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+                        struct spirv_binary *dst_bin,
+                        const struct clc_logger *logger);
+
+void
+clc_dump_spirv(const struct spirv_binary *spvbin, FILE *f);
+
+void
+clc_free_spirv_binary(struct spirv_binary *spvbin);
+
+#define clc_log(logger, level, fmt, ...) do {        \
+      if (!logger || !logger->level) break;          \
+      char *msg = NULL;                              \
+      asprintf(&msg, fmt, __VA_ARGS__);              \
+      assert(msg);                                   \
+      logger->level(logger->priv, msg);              \
+      free(msg);                                     \
+   } while (0)
+
+#define clc_error(logger, fmt, ...) clc_log(logger, error, fmt, __VA_ARGS__)
+#define clc_warning(logger, fmt, ...) clc_log(logger, warning, fmt, __VA_ARGS__)
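+
+/*
+ * Logging sketch (hypothetical callbacks): the level argument names a member
+ * of struct clc_logger, and clc_log() only invokes the callback when both the
+ * logger and that callback are non-NULL.
+ *
+ *    static void on_error(void *priv, const char *msg)
+ *    {
+ *       fprintf(stderr, "CLC: %s", msg);
+ *    }
+ *
+ *    struct clc_logger log = { NULL, on_error, NULL };
+ *    clc_error(&log, "compile failed: %s\n", reason);
+ */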
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/microsoft/clc/clc_nir.c b/src/microsoft/clc/clc_nir.c
new file mode 100644 (file)
index 0000000..2dfeb92
--- /dev/null
@@ -0,0 +1,388 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "u_math.h"
+#include "nir.h"
+#include "glsl_types.h"
+#include "nir_types.h"
+#include "nir_builder.h"
+
+#include "clc_nir.h"
+#include "clc_compiler.h"
+#include "../compiler/dxil_nir.h"
+
+static bool
+lower_load_base_global_invocation_id(nir_builder *b, nir_intrinsic_instr *intr,
+                                    nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *offset =
+      build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding),
+                          nir_imm_int(b,
+                                      offsetof(struct clc_work_properties_data,
+                                               global_offset_x)),
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(offset));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_load_work_dim(nir_builder *b, nir_intrinsic_instr *intr,
+                    nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *dim =
+      build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding),
+                          nir_imm_int(b,
+                                      offsetof(struct clc_work_properties_data,
+                                               work_dim)),
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(dim));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_load_local_group_size(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_const_value v[3] = {
+      nir_const_value_for_int(b->shader->info.cs.local_size[0], 32),
+      nir_const_value_for_int(b->shader->info.cs.local_size[1], 32),
+      nir_const_value_for_int(b->shader->info.cs.local_size[2], 32)
+   };
+   nir_ssa_def *size = nir_build_imm(b, 3, 32, v);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(size));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_load_num_work_groups(nir_builder *b, nir_intrinsic_instr *intr,
+                           nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *count =
+      build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding),
+                         nir_imm_int(b,
+                                     offsetof(struct clc_work_properties_data,
+                                              group_count_total_x)),
+                         nir_dest_num_components(intr->dest),
+                         nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(count));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_load_base_work_group_id(nir_builder *b, nir_intrinsic_instr *intr,
+                             nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *offset =
+      build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding),
+                         nir_imm_int(b,
+                                     offsetof(struct clc_work_properties_data,
+                                              group_id_offset_x)),
+                         nir_dest_num_components(intr->dest),
+                         nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(offset));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
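+/* Replace compute system-value intrinsics either with loads from the
+ * work-properties UBO (struct clc_work_properties_data, bound at
+ * var->data.binding) or, for the local group size, with inline constants. */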
+bool
+clc_nir_lower_system_values(nir_shader *nir, nir_variable *var)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+            case nir_intrinsic_load_base_global_invocation_id:
+               progress |= lower_load_base_global_invocation_id(&b, intr, var);
+               break;
+            case nir_intrinsic_load_work_dim:
+               progress |= lower_load_work_dim(&b, intr, var);
+               break;
+            case nir_intrinsic_load_local_group_size:
+               progress |= lower_load_local_group_size(&b, intr);
+               break;
+            case nir_intrinsic_load_num_work_groups:
+               progress |= lower_load_num_work_groups(&b, intr, var);
+               break;
+            case nir_intrinsic_load_base_work_group_id:
+               progress |= lower_load_base_work_group_id(&b, intr, var);
+               break;
+            default: break;
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_load_kernel_input(nir_builder *b, nir_intrinsic_instr *intr,
+                        nir_variable *var)
+{
+   nir_intrinsic_instr *load;
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   enum glsl_base_type base_type;
+
+   switch (bit_size) {
+   case 64:
+      base_type = GLSL_TYPE_UINT64;
+      break;
+   case 32:
+      base_type = GLSL_TYPE_UINT;
+      break;
+   case 16:
+      base_type = GLSL_TYPE_UINT16;
+      break;
+   case 8:
+      base_type = GLSL_TYPE_UINT8;
+      break;
+   default:
+      unreachable("unexpected kernel input bit size");
+   }
+
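+   /* Kernel inputs live in a UBO: build a (binding, offset) vector pointer
+    * and read it through a cast deref, which later DXIL lowering consumes. */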
+   const struct glsl_type *type =
+      glsl_vector_type(base_type, nir_dest_num_components(intr->dest));
+   nir_ssa_def *ptr = nir_vec2(b, nir_imm_int(b, var->data.binding),
+                                  nir_u2u(b, intr->src[0].ssa, 32));
+   nir_deref_instr *deref = nir_build_deref_cast(b, ptr, nir_var_mem_ubo, type,
+                                                    bit_size / 8);
+   deref->cast.align_mul = nir_intrinsic_align_mul(intr);
+   deref->cast.align_offset = nir_intrinsic_align_offset(intr);
+
+   nir_ssa_def *result =
+      nir_load_deref(b, deref);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+bool
+clc_nir_lower_kernel_input_loads(nir_shader *nir, nir_variable *var)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            if (intr->intrinsic == nir_intrinsic_load_kernel_input)
+               progress |= lower_load_kernel_input(&b, intr, var);
+         }
+      }
+   }
+
+   return progress;
+}
+
+
+static nir_variable *
+add_printf_var(struct nir_shader *nir, unsigned uav_id)
+{
+   /* This size is arbitrary; the OpenCL spec requires at least 1 MB. */
+   const unsigned max_printf_size = 1 * 1024 * 1024;
+   const unsigned printf_array_size = max_printf_size / sizeof(unsigned);
+   nir_variable *var =
+      nir_variable_create(nir, nir_var_mem_ssbo,
+                          glsl_array_type(glsl_uint_type(), printf_array_size, sizeof(unsigned)),
+                          "printf_buffer");
+   var->data.binding = uav_id;
+   return var;
+}
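+
+/* Rough printf-buffer layout written by lower_printf_impl() below (byte
+ * offsets; a sketch of the convention used here, not normative):
+ *
+ *    [0..3]   atomically-incremented allocation counter
+ *    [4.. ]   records: { uint64_t format_string_ptr; args struct (4-aligned) }
+ *
+ * Each record is reserved by atomically advancing the counter.
+ */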
+
+static void
+lower_printf_impl(nir_builder *b, nir_intrinsic_instr *instr, nir_variable *var)
+{
+   /* Atomically add to a buffer-size counter to determine where to write.
+    * If the buffer overflowed, return -1; otherwise store the arguments and
+    * return 0.
+    */
+   b->cursor = nir_before_instr(&instr->instr);
+   nir_deref_instr *ssbo_deref = nir_build_deref_var(b, var);
+   nir_deref_instr *counter_deref = nir_build_deref_array_imm(b, ssbo_deref, 0);
+   nir_deref_instr *struct_deref = nir_instr_as_deref(instr->src[1].ssa->parent_instr);
+   nir_variable *struct_var = nir_deref_instr_get_variable(struct_deref);
+   const struct glsl_type *struct_type = struct_var->type;
+   /* Align the struct size to 4 for natural SSBO alignment */
+   int struct_size = align(glsl_get_cl_size(struct_type), 4);
+
+   /* Hardcode 64-bit pointers to simplify some of the code below. */
+   assert(instr->src[0].ssa->num_components == 1 && instr->src[0].ssa->bit_size == 64);
+
+   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, nir_intrinsic_deref_atomic_add);
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);
+   atomic->src[0] = nir_src_for_ssa(&counter_deref->dest.ssa);
+   atomic->src[1] = nir_src_for_ssa(nir_imm_int(b, struct_size + sizeof(uint64_t)));
+   nir_builder_instr_insert(b, &atomic->instr);
+
+   int max_valid_offset =
+      glsl_get_cl_size(var->type) - /* buffer size */
+      struct_size - /* printf args size */
+      sizeof(uint64_t) - /* format string */
+      sizeof(int); /* the first int in the buffer is for the counter */
+   nir_push_if(b, nir_ilt(b, &atomic->dest.ssa, nir_imm_int(b, max_valid_offset)));
+   nir_ssa_def *printf_succ_val = nir_imm_int(b, 0);
+
+   nir_ssa_def *start_offset = nir_u2u64(b, nir_iadd(b, &atomic->dest.ssa, nir_imm_int(b, sizeof(int))));
+   nir_deref_instr *as_byte_array = nir_build_deref_cast(b, &ssbo_deref->dest.ssa, nir_var_mem_ssbo, glsl_uint8_t_type(), 1);
+   nir_deref_instr *as_offset_byte_array = nir_build_deref_ptr_as_array(b, as_byte_array, start_offset);
+   nir_deref_instr *format_string_write_deref =
+      nir_build_deref_cast(b, &as_offset_byte_array->dest.ssa, nir_var_mem_ssbo, glsl_uint64_t_type(), 8);
+   nir_store_deref(b, format_string_write_deref, instr->src[0].ssa, ~0);
+
+   for (unsigned i = 0; i < glsl_get_length(struct_type); ++i) {
+      nir_ssa_def *field_offset_from_start = nir_imm_int64(b, glsl_get_struct_field_offset(struct_type, i) + sizeof(uint64_t));
+      nir_ssa_def *field_offset = nir_iadd(b, start_offset, field_offset_from_start);
+
+      const struct glsl_type *field_type = glsl_get_struct_field(struct_type, i);
+      nir_deref_instr *field_read_deref = nir_build_deref_struct(b, struct_deref, i);
+      nir_ssa_def *field_value = nir_load_deref(b, field_read_deref);
+
+      /* Clang promotes arguments to their "native" size, which means any
+       * floats have been converted to doubles for the call to printf. Since we
+       * don't support doubles, convert them back here; copy-prop and other
+       * optimizations should remove any trace of doubles.
+       */
+      if (glsl_get_base_type(field_type) == GLSL_TYPE_DOUBLE) {
+         field_value = nir_f2f32(b, field_value);
+         field_type = glsl_float_type();
+      }
+
+      as_offset_byte_array = nir_build_deref_ptr_as_array(b, as_byte_array, field_offset);
+      nir_deref_instr *field_write_deref =
+         nir_build_deref_cast(b, &as_offset_byte_array->dest.ssa, nir_var_mem_ssbo, field_type, glsl_get_cl_size(field_type));
+
+      nir_store_deref(b, field_write_deref, field_value, ~0);
+   }
+
+   nir_push_else(b, NULL);
+   nir_ssa_def *printf_fail_val = nir_imm_int(b, -1);
+   nir_pop_if(b, NULL);
+
+   nir_ssa_def *return_value = nir_if_phi(b, printf_succ_val, printf_fail_val);
+   nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(return_value));
+   nir_instr_remove(&instr->instr);
+}
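+
+/* The header declares clc_nir_lower_printf() but this file provides no
+ * definition; the following is a minimal sketch of the missing wrapper. It
+ * assumes printf calls arrive as a nir_intrinsic_printf whose operands match
+ * what lower_printf_impl() above expects (format-string pointer in src[0],
+ * argument struct deref in src[1]).
+ */
+bool
+clc_nir_lower_printf(nir_shader *nir, unsigned uav_id)
+{
+   bool progress = false;
+   nir_variable *var = NULL;
+
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+            if (intr->intrinsic != nir_intrinsic_printf)
+               continue;
+
+            /* Create the printf SSBO lazily, on first use. */
+            if (!var)
+               var = add_printf_var(nir, uav_id);
+            lower_printf_impl(&b, intr, var);
+            progress = true;
+         }
+      }
+   }
+
+   return progress;
+}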
+
+static nir_variable *
+find_identical_const_sampler(nir_shader *nir, nir_variable *sampler)
+{
+   nir_foreach_variable_with_modes(uniform, nir, nir_var_uniform) {
+      if (!glsl_type_is_sampler(uniform->type) || !uniform->data.sampler.is_inline_sampler)
+         continue;
+      if (uniform->data.sampler.addressing_mode == sampler->data.sampler.addressing_mode &&
+          uniform->data.sampler.normalized_coordinates == sampler->data.sampler.normalized_coordinates &&
+          uniform->data.sampler.filter_mode == sampler->data.sampler.filter_mode)
+         return uniform;
+   }
+   unreachable("Should have at least found the input sampler");
+}
+
+bool
+clc_nir_dedupe_const_samplers(nir_shader *nir)
+{
+   bool progress = false;
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_tex)
+               continue;
+
+            nir_tex_instr *tex = nir_instr_as_tex(instr);
+            int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
+            if (sampler_idx == -1)
+               continue;
+
+            nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_idx].src);
+            nir_variable *sampler = nir_deref_instr_get_variable(deref);
+            if (!sampler)
+               continue;
+
+            assert(sampler->data.mode == nir_var_uniform);
+
+            if (!sampler->data.sampler.is_inline_sampler)
+               continue;
+
+            nir_variable *replacement = find_identical_const_sampler(nir, sampler);
+            if (replacement == sampler)
+               continue;
+
+            b.cursor = nir_before_instr(&tex->instr);
+            nir_deref_instr *replacement_deref = nir_build_deref_var(&b, replacement);
+            nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_idx].src,
+                                  nir_src_for_ssa(&replacement_deref->dest.ssa));
+            nir_deref_instr_remove_if_unused(deref);
+            progress = true;
+         }
+      }
+
+      if (progress) {
+         nir_metadata_preserve(func->impl, nir_metadata_block_index | nir_metadata_dominance);
+      }
+   }
+   return progress;
+}
diff --git a/src/microsoft/clc/clc_nir.h b/src/microsoft/clc/clc_nir.h
new file mode 100644 (file)
index 0000000..a452b7a
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CLC_NIR_H
+#define CLC_NIR_H
+
+#include <stdbool.h>
+#include "nir.h"
+
+bool
+clc_nir_lower_system_values(nir_shader *nir, nir_variable *var);
+
+bool
+clc_nir_lower_kernel_input_loads(nir_shader *nir, nir_variable *var);
+
+bool
+clc_nir_lower_printf(nir_shader *nir, unsigned uav_id);
+
+bool
+clc_nir_dedupe_const_samplers(nir_shader *nir);
+
+#endif
diff --git a/src/microsoft/clc/clglon12compiler.def b/src/microsoft/clc/clglon12compiler.def
new file mode 100644 (file)
index 0000000..924f7aa
--- /dev/null
@@ -0,0 +1,12 @@
+EXPORTS
+    clc_context_new
+    clc_free_context
+    clc_context_serialize
+    clc_context_free_serialized
+    clc_context_deserialize
+    clc_compile
+    clc_link
+    clc_free_object
+    clc_to_dxil
+    clc_free_dxil_object
+    clc_compiler_get_version
diff --git a/src/microsoft/clc/compute_test.cpp b/src/microsoft/clc/compute_test.cpp
new file mode 100644 (file)
index 0000000..46f5d87
--- /dev/null
@@ -0,0 +1,880 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdexcept>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <gtest/gtest.h>
+#include <wrl.h>
+
+#include "util/u_debug.h"
+#include "clc_compiler.h"
+#include "compute_test.h"
+#include "dxcapi.h"
+
+using std::runtime_error;
+using Microsoft::WRL::ComPtr;
+
+enum compute_test_debug_flags {
+   COMPUTE_DEBUG_EXPERIMENTAL_SHADERS = 1 << 0,
+   COMPUTE_DEBUG_USE_HW_D3D           = 1 << 1,
+   COMPUTE_DEBUG_OPTIMIZE_LIBCLC      = 1 << 2,
+   COMPUTE_DEBUG_SERIALIZE_LIBCLC     = 1 << 3,
+};
+
+static const struct debug_named_value debug_options[] = {
+   { "experimental_shaders",  COMPUTE_DEBUG_EXPERIMENTAL_SHADERS, "Enable experimental shaders" },
+   { "use_hw_d3d",            COMPUTE_DEBUG_USE_HW_D3D,           "Use a hardware D3D device"   },
+   { "optimize_libclc",       COMPUTE_DEBUG_OPTIMIZE_LIBCLC,      "Optimize the clc_context before using it" },
+   { "serialize_libclc",      COMPUTE_DEBUG_SERIALIZE_LIBCLC,     "Serialize and deserialize the clc_context" },
+   DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(debug_compute, "COMPUTE_TEST_DEBUG", debug_options, 0)
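+
+/* The flags above are parsed from the COMPUTE_TEST_DEBUG environment variable
+ * as a comma-separated list of the names in debug_options, e.g. (hypothetical
+ * invocation, the test binary name may differ):
+ *
+ *    COMPUTE_TEST_DEBUG=use_hw_d3d,serialize_libclc ./clc_compiler_test
+ */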
+
+static void warning_callback(void *priv, const char *msg)
+{
+   fprintf(stderr, "WARNING: %s\n", msg);
+}
+
+static void error_callback(void *priv, const char *msg)
+{
+   fprintf(stderr, "ERROR: %s\n", msg);
+}
+
+static const struct clc_logger logger = {
+   NULL,
+   error_callback,
+   warning_callback,
+};
+
+void
+ComputeTest::enable_d3d12_debug_layer()
+{
+   HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL");
+   if (!hD3D12Mod) {
+      fprintf(stderr, "D3D12: failed to load D3D12.DLL\n");
+      return;
+   }
+
+   typedef HRESULT(WINAPI * PFN_D3D12_GET_DEBUG_INTERFACE)(REFIID riid,
+                                                           void **ppFactory);
+   PFN_D3D12_GET_DEBUG_INTERFACE D3D12GetDebugInterface = (PFN_D3D12_GET_DEBUG_INTERFACE)GetProcAddress(hD3D12Mod, "D3D12GetDebugInterface");
+   if (!D3D12GetDebugInterface) {
+      fprintf(stderr, "D3D12: failed to load D3D12GetDebugInterface from D3D12.DLL\n");
+      return;
+   }
+
+   ID3D12Debug *debug;
+   if (FAILED(D3D12GetDebugInterface(__uuidof(ID3D12Debug), (void **)& debug))) {
+      fprintf(stderr, "D3D12: D3D12GetDebugInterface failed\n");
+      return;
+   }
+
+   debug->EnableDebugLayer();
+}
+
+IDXGIFactory4 *
+ComputeTest::get_dxgi_factory()
+{
+   static const GUID IID_IDXGIFactory4 = {
+      0x1bc6ea02, 0xef36, 0x464f,
+      { 0xbf, 0x0c, 0x21, 0xca, 0x39, 0xe5, 0x16, 0x8a }
+   };
+
+   typedef HRESULT(WINAPI * PFN_CREATE_DXGI_FACTORY)(REFIID riid,
+                                                     void **ppFactory);
+   PFN_CREATE_DXGI_FACTORY CreateDXGIFactory;
+
+   HMODULE hDXGIMod = LoadLibrary("DXGI.DLL");
+   if (!hDXGIMod)
+      throw runtime_error("Failed to load DXGI.DLL");
+
+   CreateDXGIFactory = (PFN_CREATE_DXGI_FACTORY)GetProcAddress(hDXGIMod, "CreateDXGIFactory");
+   if (!CreateDXGIFactory)
+      throw runtime_error("Failed to load CreateDXGIFactory from DXGI.DLL");
+
+   IDXGIFactory4 *factory = NULL;
+   HRESULT hr = CreateDXGIFactory(IID_IDXGIFactory4, (void **)&factory);
+   if (FAILED(hr))
+      throw runtime_error("CreateDXGIFactory failed");
+
+   return factory;
+}
+
+IDXGIAdapter1 *
+ComputeTest::choose_adapter(IDXGIFactory4 *factory)
+{
+   IDXGIAdapter1 *ret;
+
+   if (debug_get_option_debug_compute() & COMPUTE_DEBUG_USE_HW_D3D) {
+      for (unsigned i = 0; SUCCEEDED(factory->EnumAdapters1(i, &ret)); i++) {
+         DXGI_ADAPTER_DESC1 desc;
+         ret->GetDesc1(&desc);
+         if (!(desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE))
+            return ret;
+      }
+      throw runtime_error("Failed to enum hardware adapter");
+   } else {
+      if (FAILED(factory->EnumWarpAdapter(__uuidof(IDXGIAdapter1),
+         (void **)& ret)))
+         throw runtime_error("Failed to enum warp adapter");
+      return ret;
+   }
+}
+
+ID3D12Device *
+ComputeTest::create_device(IDXGIAdapter1 *adapter)
+{
+   typedef HRESULT(WINAPI *PFN_D3D12CREATEDEVICE)(IUnknown *, D3D_FEATURE_LEVEL, REFIID, void **);
+   PFN_D3D12CREATEDEVICE D3D12CreateDevice;
+
+   HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL");
+   if (!hD3D12Mod)
+      throw runtime_error("failed to load D3D12.DLL");
+
+   if (debug_get_option_debug_compute() & COMPUTE_DEBUG_EXPERIMENTAL_SHADERS) {
+      typedef HRESULT(WINAPI *PFN_D3D12ENABLEEXPERIMENTALFEATURES)(UINT, const IID *, void *, UINT *);
+      PFN_D3D12ENABLEEXPERIMENTALFEATURES D3D12EnableExperimentalFeatures;
+      D3D12EnableExperimentalFeatures = (PFN_D3D12ENABLEEXPERIMENTALFEATURES)
+         GetProcAddress(hD3D12Mod, "D3D12EnableExperimentalFeatures");
+      if (FAILED(D3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModels, NULL, NULL)))
+         throw runtime_error("failed to enable experimental shader models");
+   }
+
+   D3D12CreateDevice = (PFN_D3D12CREATEDEVICE)GetProcAddress(hD3D12Mod, "D3D12CreateDevice");
+   if (!D3D12CreateDevice)
+      throw runtime_error("failed to load D3D12CreateDevice from D3D12.DLL");
+
+   ID3D12Device *dev;
+   if (FAILED(D3D12CreateDevice(adapter, D3D_FEATURE_LEVEL_12_0,
+       __uuidof(ID3D12Device), (void **)& dev)))
+      throw runtime_error("D3D12CreateDevice failed");
+
+   return dev;
+}
+
+ComPtr<ID3D12RootSignature>
+ComputeTest::create_root_signature(const ComputeTest::Resources &resources)
+{
+   D3D12_ROOT_PARAMETER1 root_param;
+   root_param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+   root_param.DescriptorTable.NumDescriptorRanges = resources.ranges.size();
+   root_param.DescriptorTable.pDescriptorRanges = resources.ranges.data();
+   root_param.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+
+   D3D12_ROOT_SIGNATURE_DESC1 root_sig_desc;
+   root_sig_desc.NumParameters = 1;
+   root_sig_desc.pParameters = &root_param;
+   root_sig_desc.NumStaticSamplers = 0;
+   root_sig_desc.pStaticSamplers = NULL;
+   root_sig_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
+
+   D3D12_VERSIONED_ROOT_SIGNATURE_DESC versioned_desc;
+   versioned_desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1;
+   versioned_desc.Desc_1_1 = root_sig_desc;
+
+   ID3DBlob *sig, *error;
+   if (FAILED(D3D12SerializeVersionedRootSignature(&versioned_desc,
+       &sig, &error)))
+      throw runtime_error("D3D12SerializeVersionedRootSignature failed");
+
+   ComPtr<ID3D12RootSignature> ret;
+   if (FAILED(dev->CreateRootSignature(0,
+       sig->GetBufferPointer(),
+       sig->GetBufferSize(),
+       __uuidof(ret),
+       (void **)& ret)))
+      throw runtime_error("CreateRootSignature failed");
+
+   return ret;
+}
+
+ComPtr<ID3D12PipelineState>
+ComputeTest::create_pipeline_state(ComPtr<ID3D12RootSignature> &root_sig,
+                                   const struct clc_dxil_object &dxil)
+{
+   D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc = { root_sig.Get() };
+   pipeline_desc.CS.pShaderBytecode = dxil.binary.data;
+   pipeline_desc.CS.BytecodeLength = dxil.binary.size;
+
+   ComPtr<ID3D12PipelineState> pipeline_state;
+   if (FAILED(dev->CreateComputePipelineState(&pipeline_desc,
+                                              __uuidof(pipeline_state),
+                                              (void **)& pipeline_state)))
+      throw runtime_error("Failed to create pipeline state");
+   return pipeline_state;
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::create_buffer(int size, D3D12_HEAP_TYPE heap_type)
+{
+   D3D12_RESOURCE_DESC desc;
+   desc.Format = DXGI_FORMAT_UNKNOWN;
+   desc.Alignment = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT;
+   desc.Width = size;
+   desc.Height = 1;
+   desc.DepthOrArraySize = 1;
+   desc.MipLevels = 1;
+   desc.SampleDesc.Count = 1;
+   desc.SampleDesc.Quality = 0;
+   desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+   desc.Flags = heap_type == D3D12_HEAP_TYPE_DEFAULT ? D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE;
+   desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+
+   D3D12_HEAP_PROPERTIES heap_props = dev->GetCustomHeapProperties(0, heap_type);
+
+   D3D12_RESOURCE_STATES initial_state = D3D12_RESOURCE_STATE_COMMON;
+   switch (heap_type) {
+   case D3D12_HEAP_TYPE_UPLOAD:
+      initial_state = D3D12_RESOURCE_STATE_GENERIC_READ;
+      break;
+
+   case D3D12_HEAP_TYPE_READBACK:
+      initial_state = D3D12_RESOURCE_STATE_COPY_DEST;
+      break;
+   }
+
+   ComPtr<ID3D12Resource> res;
+   if (FAILED(dev->CreateCommittedResource(&heap_props,
+       D3D12_HEAP_FLAG_NONE, &desc, initial_state,
+       NULL, __uuidof(ID3D12Resource), (void **)&res)))
+      throw runtime_error("CreateCommittedResource failed");
+
+   return res;
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::create_upload_buffer_with_data(const void *data, size_t size)
+{
+   auto upload_res = create_buffer(size, D3D12_HEAP_TYPE_UPLOAD);
+
+   void *ptr = NULL;
+   D3D12_RANGE res_range = { 0, (SIZE_T)size };
+   if (FAILED(upload_res->Map(0, &res_range, (void **)&ptr)))
+      throw runtime_error("Failed to map upload-buffer");
+   assert(ptr);
+   memcpy(ptr, data, size);
+   upload_res->Unmap(0, &res_range);
+   return upload_res;
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::create_sized_buffer_with_data(size_t buffer_size,
+                                           const void *data,
+                                           size_t data_size)
+{
+   auto upload_res = create_upload_buffer_with_data(data, data_size);
+
+   auto res = create_buffer(buffer_size, D3D12_HEAP_TYPE_DEFAULT);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST);
+   cmdlist->CopyBufferRegion(res.Get(), 0, upload_res.Get(), 0, data_size);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COMMON);
+   execute_cmdlist();
+
+   return res;
+}
+
+void
+ComputeTest::get_buffer_data(ComPtr<ID3D12Resource> res,
+                             void *buf, size_t size)
+{
+   auto readback_res = create_buffer(align(size, 4), D3D12_HEAP_TYPE_READBACK);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_SOURCE);
+   cmdlist->CopyResource(readback_res.Get(), res.Get());
+   resource_barrier(res, D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_COMMON);
+   execute_cmdlist();
+
+   void *ptr = NULL;
+   D3D12_RANGE res_range = { 0, size };
+   if (FAILED(readback_res->Map(0, &res_range, &ptr)))
+      throw runtime_error("Failed to map readback-buffer");
+
+   memcpy(buf, ptr, size);
+
+   D3D12_RANGE empty_range = { 0, 0 };
+   readback_res->Unmap(0, &empty_range);
+}
+
+void
+ComputeTest::resource_barrier(ComPtr<ID3D12Resource> &res,
+                              D3D12_RESOURCE_STATES state_before,
+                              D3D12_RESOURCE_STATES state_after)
+{
+   D3D12_RESOURCE_BARRIER barrier;
+   barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+   barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+   barrier.Transition.pResource = res.Get();
+   barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+   barrier.Transition.StateBefore = state_before;
+   barrier.Transition.StateAfter = state_after;
+   cmdlist->ResourceBarrier(1, &barrier);
+}
+
+void
+ComputeTest::execute_cmdlist()
+{
+   if (FAILED(cmdlist->Close()))
+      throw runtime_error("Closing ID3D12GraphicsCommandList failed");
+
+   ID3D12CommandList *cmdlists[] = { cmdlist };
+   cmdqueue->ExecuteCommandLists(1, cmdlists);
+   cmdqueue_fence->SetEventOnCompletion(fence_value, event);
+   cmdqueue->Signal(cmdqueue_fence, fence_value);
+   fence_value++;
+   WaitForSingleObject(event, INFINITE);
+
+   if (FAILED(cmdalloc->Reset()))
+      throw runtime_error("resetting ID3D12CommandAllocator failed");
+
+   if (FAILED(cmdlist->Reset(cmdalloc, NULL)))
+      throw runtime_error("resetting ID3D12GraphicsCommandList failed");
+}
+
+void
+ComputeTest::create_uav_buffer(ComPtr<ID3D12Resource> res,
+                               size_t width, size_t byte_stride,
+                               D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle)
+{
+   D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
+   uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+   uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+   uav_desc.Buffer.FirstElement = 0;
+   uav_desc.Buffer.NumElements = DIV_ROUND_UP(width * byte_stride, 4);
+   uav_desc.Buffer.StructureByteStride = 0;
+   uav_desc.Buffer.CounterOffsetInBytes = 0;
+   uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
+
+   dev->CreateUnorderedAccessView(res.Get(), NULL, &uav_desc, cpu_handle);
+}
+
+void
+ComputeTest::create_cbv(ComPtr<ID3D12Resource> res, size_t size,
+                        D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle)
+{
+   D3D12_CONSTANT_BUFFER_VIEW_DESC cbv_desc;
+   cbv_desc.BufferLocation = res ? res->GetGPUVirtualAddress() : 0;
+   cbv_desc.SizeInBytes = size;
+
+   dev->CreateConstantBufferView(&cbv_desc, cpu_handle);
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::add_uav_resource(ComputeTest::Resources &resources,
+                              unsigned spaceid, unsigned resid,
+                              const void *data, size_t num_elems,
+                              size_t elem_size)
+{
+   size_t size = align(elem_size * num_elems, 4);
+   D3D12_CPU_DESCRIPTOR_HANDLE handle;
+   ComPtr<ID3D12Resource> res;
+   handle = uav_heap->GetCPUDescriptorHandleForHeapStart();
+   handle = offset_cpu_handle(handle, resources.descs.size() * uav_heap_incr);
+
+   if (size) {
+      if (data)
+         res = create_buffer_with_data(data, size);
+      else
+         res = create_buffer(size, D3D12_HEAP_TYPE_DEFAULT);
+
+      resource_barrier(res, D3D12_RESOURCE_STATE_COMMON,
+                       D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+   }
+   create_uav_buffer(res, num_elems, elem_size, handle);
+   resources.add(res, D3D12_DESCRIPTOR_RANGE_TYPE_UAV, spaceid, resid);
+   return res;
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::add_cbv_resource(ComputeTest::Resources &resources,
+                              unsigned spaceid, unsigned resid,
+                              const void *data, size_t size)
+{
+   unsigned aligned_size = align(size, 256);
+   D3D12_CPU_DESCRIPTOR_HANDLE handle;
+   ComPtr<ID3D12Resource> res;
+   handle = uav_heap->GetCPUDescriptorHandleForHeapStart();
+   handle = offset_cpu_handle(handle, resources.descs.size() * uav_heap_incr);
+
+   if (size) {
+     assert(data);
+     res = create_sized_buffer_with_data(aligned_size, data, size);
+   }
+   create_cbv(res, aligned_size, handle);
+   resources.add(res, D3D12_DESCRIPTOR_RANGE_TYPE_CBV, spaceid, resid);
+   return res;
+}
+
+void
+ComputeTest::run_shader_with_raw_args(Shader shader,
+                                      const CompileArgs &compile_args,
+                                      const std::vector<RawShaderArg *> &args)
+{
+   if (args.size() < 1)
+      throw runtime_error("no inputs");
+
+   static HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL");
+   if (!hD3D12Mod)
+      throw runtime_error("Failed to load D3D12.DLL");
+
+   D3D12SerializeVersionedRootSignature = (PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE)GetProcAddress(hD3D12Mod, "D3D12SerializeVersionedRootSignature");
+
+   if (args.size() != shader.dxil->kernel->num_args)
+      throw runtime_error("incorrect number of inputs");
+
+   struct clc_runtime_kernel_conf conf = { 0 };
+
+   // Older WARP and some hardware don't support int64, so for these tests we
+   // unconditionally lower away int64. A more complex runtime could be
+   // smarter about detecting when this lowering is actually needed.
+   conf.lower_bit_size = 64;
+
+   if (!shader.dxil->metadata.local_size[0])
+      conf.local_size[0] = compile_args.x;
+   else
+      conf.local_size[0] = shader.dxil->metadata.local_size[0];
+
+   if (!shader.dxil->metadata.local_size[1])
+      conf.local_size[1] = compile_args.y;
+   else
+      conf.local_size[1] = shader.dxil->metadata.local_size[1];
+
+   if (!shader.dxil->metadata.local_size[2])
+      conf.local_size[2] = compile_args.z;
+   else
+      conf.local_size[2] = shader.dxil->metadata.local_size[2];
+
+   if (compile_args.x % conf.local_size[0] ||
+       compile_args.y % conf.local_size[1] ||
+       compile_args.z % conf.local_size[2])
+      throw runtime_error("invalid global size must be a multiple of local size");
+
+   std::vector<struct clc_runtime_arg_info> argsinfo(args.size());
+
+   conf.args = argsinfo.data();
+   conf.support_global_work_id_offsets =
+      compile_args.work_props.global_offset_x != 0 ||
+      compile_args.work_props.global_offset_y != 0 ||
+      compile_args.work_props.global_offset_z != 0;
+   conf.support_work_group_id_offsets =
+      compile_args.work_props.group_id_offset_x != 0 ||
+      compile_args.work_props.group_id_offset_y != 0 ||
+      compile_args.work_props.group_id_offset_z != 0;
+
+   for (unsigned i = 0; i < shader.dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+      size_t size = arg->get_elem_size() * arg->get_num_elems();
+
+      switch (shader.dxil->kernel->args[i].address_qualifier) {
+      case CLC_KERNEL_ARG_ADDRESS_LOCAL:
+         argsinfo[i].localptr.size = size;
+         break;
+      default:
+         break;
+      }
+   }
+
+   configure(shader, &conf);
+   validate(shader);
+
+   std::shared_ptr<struct clc_dxil_object> &dxil = shader.dxil;
+
+   std::vector<uint8_t> argsbuf(dxil->metadata.kernel_inputs_buf_size);
+   std::vector<ComPtr<ID3D12Resource>> argres(shader.dxil->kernel->num_args);
+   clc_work_properties_data work_props = compile_args.work_props;
+   if (!conf.support_work_group_id_offsets) {
+      work_props.group_count_total_x = compile_args.x / conf.local_size[0];
+      work_props.group_count_total_y = compile_args.y / conf.local_size[1];
+      work_props.group_count_total_z = compile_args.z / conf.local_size[2];
+   }
+   if (work_props.work_dim == 0)
+      work_props.work_dim = 3;
+   Resources resources;
+
+   for (unsigned i = 0; i < dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+      size_t size = arg->get_elem_size() * arg->get_num_elems();
+      void *slot = argsbuf.data() + dxil->metadata.args[i].offset;
+
+      switch (dxil->kernel->args[i].address_qualifier) {
+      case CLC_KERNEL_ARG_ADDRESS_CONSTANT:
+      case CLC_KERNEL_ARG_ADDRESS_GLOBAL: {
+         assert(dxil->metadata.args[i].size == sizeof(uint64_t));
+         uint64_t *ptr_slot = (uint64_t *)slot;
+         if (arg->get_data())
+            *ptr_slot = (uint64_t)dxil->metadata.args[i].globconstptr.buf_id << 32;
+         else
+            *ptr_slot = ~0ull;
+         break;
+      }
+      case CLC_KERNEL_ARG_ADDRESS_LOCAL: {
+         assert(dxil->metadata.args[i].size == sizeof(uint64_t));
+         uint64_t *ptr_slot = (uint64_t *)slot;
+         *ptr_slot = dxil->metadata.args[i].localptr.sharedmem_offset;
+         break;
+      }
+      case CLC_KERNEL_ARG_ADDRESS_PRIVATE: {
+         assert(size == dxil->metadata.args[i].size);
+         memcpy(slot, arg->get_data(), size);
+         break;
+      }
+      default:
+         assert(0);
+      }
+   }
+
+   for (unsigned i = 0; i < dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+
+      if (dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL ||
+          dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_CONSTANT) {
+         argres[i] = add_uav_resource(resources, 0,
+                                      dxil->metadata.args[i].globconstptr.buf_id,
+                                      arg->get_data(), arg->get_num_elems(),
+                                      arg->get_elem_size());
+      }
+   }
+
+   if (dxil->metadata.printf_uav_id > 0)
+      add_uav_resource(resources, 0, dxil->metadata.printf_uav_id, NULL, 1024 * 1024 / 4, 4);
+
+   for (unsigned i = 0; i < dxil->metadata.num_consts; ++i)
+      add_uav_resource(resources, 0, dxil->metadata.consts[i].uav_id,
+                       dxil->metadata.consts[i].data,
+                       dxil->metadata.consts[i].size / 4, 4);
+
+   if (argsbuf.size())
+      add_cbv_resource(resources, 0, dxil->metadata.kernel_inputs_cbv_id,
+                       argsbuf.data(), argsbuf.size());
+
+   add_cbv_resource(resources, 0, dxil->metadata.work_properties_cbv_id,
+                    &work_props, sizeof(work_props));
+
+   auto root_sig = create_root_signature(resources);
+   auto pipeline_state = create_pipeline_state(root_sig, *dxil);
+
+   cmdlist->SetDescriptorHeaps(1, &uav_heap);
+   cmdlist->SetComputeRootSignature(root_sig.Get());
+   cmdlist->SetComputeRootDescriptorTable(0, uav_heap->GetGPUDescriptorHandleForHeapStart());
+   cmdlist->SetPipelineState(pipeline_state.Get());
+
+   cmdlist->Dispatch(compile_args.x / conf.local_size[0],
+                     compile_args.y / conf.local_size[1],
+                     compile_args.z / conf.local_size[2]);
+
+   for (auto &range : resources.ranges) {
+      if (range.RangeType == D3D12_DESCRIPTOR_RANGE_TYPE_UAV) {
+         for (unsigned i = range.OffsetInDescriptorsFromTableStart;
+              i < range.OffsetInDescriptorsFromTableStart + range.NumDescriptors;
+              i++) {
+            if (!resources.descs[i].Get())
+               continue;
+
+            resource_barrier(resources.descs[i],
+                             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+                             D3D12_RESOURCE_STATE_COMMON);
+         }
+      }
+   }
+
+   execute_cmdlist();
+
+   for (unsigned i = 0; i < args.size(); i++) {
+      if (!(args[i]->get_direction() & SHADER_ARG_OUTPUT))
+         continue;
+
+      assert(dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL);
+      get_buffer_data(argres[i], args[i]->get_data(),
+                      args[i]->get_elem_size() * args[i]->get_num_elems());
+   }
+
+   ComPtr<ID3D12InfoQueue> info_queue;
+   dev->QueryInterface(info_queue.ReleaseAndGetAddressOf());
+   if (info_queue)
+   {
+      EXPECT_EQ(0, info_queue->GetNumStoredMessages());
+      for (unsigned i = 0; i < info_queue->GetNumStoredMessages(); ++i) {
+         SIZE_T message_size = 0;
+         info_queue->GetMessageA(i, nullptr, &message_size);
+         D3D12_MESSAGE* message = (D3D12_MESSAGE*)malloc(message_size);
+         info_queue->GetMessageA(i, message, &message_size);
+         ADD_FAILURE() << message->pDescription;
+         free(message);
+      }
+   }
+}
+
+void
+ComputeTest::SetUp()
+{
+   static struct clc_context *compiler_ctx_g = nullptr;
+
+   if (!compiler_ctx_g) {
+      clc_context_options options = { };
+      options.optimize = (debug_get_option_debug_compute() & COMPUTE_DEBUG_OPTIMIZE_LIBCLC) != 0;
+
+      compiler_ctx_g = clc_context_new(&logger, &options);
+      if (!compiler_ctx_g)
+         throw runtime_error("failed to create CLC compiler context");
+
+      if (debug_get_option_debug_compute() & COMPUTE_DEBUG_SERIALIZE_LIBCLC) {
+         void *serialized = nullptr;
+         size_t serialized_size = 0;
+         clc_context_serialize(compiler_ctx_g, &serialized, &serialized_size);
+         if (!serialized)
+            throw runtime_error("failed to serialize CLC compiler context");
+
+         clc_free_context(compiler_ctx_g);
+         compiler_ctx_g = nullptr;
+
+         compiler_ctx_g = clc_context_deserialize(serialized, serialized_size);
+         if (!compiler_ctx_g)
+            throw runtime_error("failed to deserialize CLC compiler context");
+
+         clc_context_free_serialized(serialized);
+      }
+   }
+   compiler_ctx = compiler_ctx_g;
+
+   enable_d3d12_debug_layer();
+
+   factory = get_dxgi_factory();
+   if (!factory)
+      throw runtime_error("failed to create DXGI factory");
+
+   adapter = choose_adapter(factory);
+   if (!adapter)
+      throw runtime_error("failed to choose adapter");
+
+   dev = create_device(adapter);
+   if (!dev)
+      throw runtime_error("failed to create device");
+
+   if (FAILED(dev->CreateFence(0, D3D12_FENCE_FLAG_NONE,
+                               __uuidof(cmdqueue_fence),
+                               (void **)&cmdqueue_fence)))
+      throw runtime_error("failed to create fence\n");
+
+   D3D12_COMMAND_QUEUE_DESC queue_desc;
+   queue_desc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
+   queue_desc.Priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL;
+   queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+   queue_desc.NodeMask = 0;
+   if (FAILED(dev->CreateCommandQueue(&queue_desc,
+                                      __uuidof(cmdqueue),
+                                      (void **)&cmdqueue)))
+      throw runtime_error("failed to create command queue");
+
+   if (FAILED(dev->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE,
+             __uuidof(cmdalloc), (void **)&cmdalloc)))
+      throw runtime_error("failed to create command allocator");
+
+   if (FAILED(dev->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE,
+             cmdalloc, NULL, __uuidof(cmdlist), (void **)&cmdlist)))
+      throw runtime_error("failed to create command list");
+
+   D3D12_DESCRIPTOR_HEAP_DESC heap_desc;
+   heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+   heap_desc.NumDescriptors = 1000;
+   heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+   heap_desc.NodeMask = 0;
+   if (FAILED(dev->CreateDescriptorHeap(&heap_desc,
+       __uuidof(uav_heap), (void **)&uav_heap)))
+      throw runtime_error("failed to create descriptor heap");
+
+   uav_heap_incr = dev->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
+
+   event = CreateEvent(NULL, FALSE, FALSE, NULL);
+   if (!event)
+      throw runtime_error("Failed to create event");
+   fence_value = 1;
+}
+
+void
+ComputeTest::TearDown()
+{
+   CloseHandle(event);
+
+   uav_heap->Release();
+   cmdlist->Release();
+   cmdalloc->Release();
+   cmdqueue->Release();
+   cmdqueue_fence->Release();
+   dev->Release();
+   adapter->Release();
+   factory->Release();
+}
+
+PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE ComputeTest::D3D12SerializeVersionedRootSignature;
+
+bool
+validate_module(const struct clc_dxil_object &dxil)
+{
+   static HMODULE hmod = LoadLibrary("DXIL.DLL");
+   if (!hmod) {
+      /* Enabling experimental shaders allows us to run unsigned shader code,
+       * such as when under the debugger where we can't run the validator. */
+      if (debug_get_option_debug_compute() & COMPUTE_DEBUG_EXPERIMENTAL_SHADERS)
+         return true;
+      else
+         throw runtime_error("failed to load DXIL.DLL");
+   }
+
+   DxcCreateInstanceProc pfnDxcCreateInstance =
+      (DxcCreateInstanceProc)GetProcAddress(hmod, "DxcCreateInstance");
+   if (!pfnDxcCreateInstance)
+      throw runtime_error("failed to load DxcCreateInstance");
+
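+   /* Minimal stack-allocated IDxcBlob wrapper: QueryInterface and the
+    * ref-count methods are stubbed out because the blob only needs to stay
+    * alive for the duration of the Validate() call below. */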
+   struct shader_blob : public IDxcBlob {
+      shader_blob(void *data, size_t size) : data(data), size(size) {}
+      LPVOID STDMETHODCALLTYPE GetBufferPointer() override { return data; }
+      SIZE_T STDMETHODCALLTYPE GetBufferSize() override { return size; }
+      HRESULT STDMETHODCALLTYPE QueryInterface(REFIID, void **) override { return E_NOINTERFACE; }
+      ULONG STDMETHODCALLTYPE AddRef() override { return 1; }
+      ULONG STDMETHODCALLTYPE Release() override { return 0; }
+      void *data;
+      size_t size;
+   } blob(dxil.binary.data, dxil.binary.size);
+
+   IDxcValidator *validator;
+   if (FAILED(pfnDxcCreateInstance(CLSID_DxcValidator, __uuidof(IDxcValidator),
+                                   (void **)&validator)))
+      throw runtime_error("failed to create IDxcValidator");
+
+   IDxcOperationResult *result;
+   if (FAILED(validator->Validate(&blob, DxcValidatorFlags_InPlaceEdit,
+                                  &result)))
+      throw runtime_error("Validate failed");
+
+   HRESULT hr;
+   if (FAILED(result->GetStatus(&hr)) ||
+       FAILED(hr)) {
+      IDxcBlobEncoding *message;
+      result->GetErrorBuffer(&message);
+      fprintf(stderr, "D3D12: validation failed: %*s\n",
+                   (int)message->GetBufferSize(),
+                   (char *)message->GetBufferPointer());
+      message->Release();
+      validator->Release();
+      result->Release();
+      return false;
+   }
+
+   validator->Release();
+   result->Release();
+   return true;
+}
+
+static void
+dump_blob(const char *path, const struct clc_dxil_object &dxil)
+{
+   FILE *fp = fopen(path, "wb");
+   if (fp) {
+      fwrite(dxil.binary.data, 1, dxil.binary.size, fp);
+      fclose(fp);
+      printf("D3D12: wrote '%s'...\n", path);
+   }
+}
+
+ComputeTest::Shader
+ComputeTest::compile(const std::vector<const char *> &sources,
+                     const std::vector<const char *> &compile_args,
+                     bool create_library)
+{
+   struct clc_compile_args args = { 0 };
+   args.args = compile_args.data();
+   args.num_args = (unsigned)compile_args.size();
+
+   std::vector<Shader> shaders;
+
+   args.source.name = "obj.cl";
+
+   for (unsigned i = 0; i < sources.size(); i++) {
+      args.source.value = sources[i];
+
+      auto obj = clc_compile(compiler_ctx, &args, &logger);
+      if (!obj)
+         throw runtime_error("failed to compile object!");
+
+      Shader shader;
+      shader.obj = std::shared_ptr<struct clc_object>(obj, clc_free_object);
+      shaders.push_back(shader);
+   }
+
+   if (shaders.size() == 1 && create_library)
+      return shaders[0];
+
+   return link(shaders, create_library);
+}
+
+ComputeTest::Shader
+ComputeTest::link(const std::vector<Shader> &sources,
+                  bool create_library)
+{
+   std::vector<const clc_object*> objs;
+   for (auto& source : sources)
+      objs.push_back(&*source.obj);
+
+   struct clc_linker_args link_args = {};
+   link_args.in_objs = objs.data();
+   link_args.num_in_objs = (unsigned)objs.size();
+   link_args.create_library = create_library;
+   struct clc_object *obj = clc_link(compiler_ctx,
+                                     &link_args,
+                                     &logger);
+   if (!obj)
+      throw runtime_error("failed to link objects!");
+
+   ComputeTest::Shader shader;
+   shader.obj = std::shared_ptr<struct clc_object>(obj, clc_free_object);
+   if (!link_args.create_library)
+      configure(shader, NULL);
+
+   return shader;
+}
+
+void
+ComputeTest::configure(Shader &shader,
+                       const struct clc_runtime_kernel_conf *conf)
+{
+   struct clc_dxil_object *dxil;
+
+   dxil = clc_to_dxil(compiler_ctx, shader.obj.get(), "main_test", conf, &logger);
+   if (!dxil)
+      throw runtime_error("failed to compile kernel!");
+
+   shader.dxil = std::shared_ptr<struct clc_dxil_object>(dxil, clc_free_dxil_object);
+}
+
+void
+ComputeTest::validate(ComputeTest::Shader &shader)
+{
+   dump_blob("unsigned.cso", *shader.dxil);
+   if (!validate_module(*shader.dxil))
+      throw runtime_error("failed to validate module!");
+
+   dump_blob("signed.cso", *shader.dxil);
+}
diff --git a/src/microsoft/clc/compute_test.h b/src/microsoft/clc/compute_test.h
new file mode 100644 (file)
index 0000000..6fb06e6
--- /dev/null
@@ -0,0 +1,324 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdexcept>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <gtest/gtest.h>
+#include <wrl.h>
+
+#include "clc_compiler.h"
+
+using std::runtime_error;
+using Microsoft::WRL::ComPtr;
+
+inline D3D12_CPU_DESCRIPTOR_HANDLE
+offset_cpu_handle(D3D12_CPU_DESCRIPTOR_HANDLE handle, UINT offset)
+{
+   handle.ptr += offset;
+   return handle;
+}
+
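+/* Round value up to the next multiple of alignment (the alignment does not
+ * have to be a power of two). */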
+inline size_t
+align(size_t value, unsigned alignment)
+{
+   assert(alignment > 0);
+   return ((value + (alignment - 1)) / alignment) * alignment;
+}
+
+class ComputeTest : public ::testing::Test {
+protected:
+   struct Shader {
+      std::shared_ptr<struct clc_object> obj;
+      std::shared_ptr<struct clc_dxil_object> dxil;
+   };
+
+   static void
+   enable_d3d12_debug_layer();
+
+   static IDXGIFactory4 *
+   get_dxgi_factory();
+
+   static IDXGIAdapter1 *
+   choose_adapter(IDXGIFactory4 *factory);
+
+   static ID3D12Device *
+   create_device(IDXGIAdapter1 *adapter);
+
+   struct Resources {
+      void add(ComPtr<ID3D12Resource> res,
+               D3D12_DESCRIPTOR_RANGE_TYPE type,
+               unsigned spaceid,
+               unsigned resid)
+      {
+         descs.push_back(res);
+
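+         /* Extend the previous descriptor range when this binding is
+          * contiguous with it (same range type and register space, next
+          * shader register); otherwise start a new one-descriptor range. */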
+         if (!ranges.empty() &&
+             ranges.back().RangeType == type &&
+             ranges.back().RegisterSpace == spaceid &&
+             ranges.back().BaseShaderRegister + ranges.back().NumDescriptors == resid) {
+            ranges.back().NumDescriptors++;
+            return;
+         }
+
+         D3D12_DESCRIPTOR_RANGE1 range;
+
+         range.RangeType = type;
+         range.NumDescriptors = 1;
+         range.BaseShaderRegister = resid;
+         range.RegisterSpace = spaceid;
+         range.OffsetInDescriptorsFromTableStart = descs.size() - 1;
+         range.Flags = D3D12_DESCRIPTOR_RANGE_FLAG_DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS;
+         ranges.push_back(range);
+      }
+
+      std::vector<D3D12_DESCRIPTOR_RANGE1> ranges;
+      std::vector<ComPtr<ID3D12Resource>> descs;
+   };
+
+   ComPtr<ID3D12RootSignature>
+   create_root_signature(const Resources &resources);
+
+   ComPtr<ID3D12PipelineState>
+   create_pipeline_state(ComPtr<ID3D12RootSignature> &root_sig,
+                         const struct clc_dxil_object &dxil);
+
+   ComPtr<ID3D12Resource>
+   create_buffer(int size, D3D12_HEAP_TYPE heap_type);
+
+   ComPtr<ID3D12Resource>
+   create_upload_buffer_with_data(const void *data, size_t size);
+
+   ComPtr<ID3D12Resource>
+   create_sized_buffer_with_data(size_t buffer_size, const void *data,
+                                 size_t data_size);
+
+   ComPtr<ID3D12Resource>
+   create_buffer_with_data(const void *data, size_t size)
+   {
+      return create_sized_buffer_with_data(size, data, size);
+   }
+
+   void
+   get_buffer_data(ComPtr<ID3D12Resource> res,
+                   void *buf, size_t size);
+
+   void
+   resource_barrier(ComPtr<ID3D12Resource> &res,
+                    D3D12_RESOURCE_STATES state_before,
+                    D3D12_RESOURCE_STATES state_after);
+
+   void
+   execute_cmdlist();
+
+   void
+   create_uav_buffer(ComPtr<ID3D12Resource> res,
+                     size_t width, size_t byte_stride,
+                     D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle);
+
+   void create_cbv(ComPtr<ID3D12Resource> res, size_t size,
+                   D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle);
+
+   ComPtr<ID3D12Resource>
+   add_uav_resource(Resources &resources, unsigned spaceid, unsigned resid,
+                    const void *data = NULL, size_t num_elems = 0,
+                    size_t elem_size = 0);
+
+   ComPtr<ID3D12Resource>
+   add_cbv_resource(Resources &resources, unsigned spaceid, unsigned resid,
+                    const void *data, size_t size);
+
+   void
+   SetUp() override;
+
+   void
+   TearDown() override;
+
+   Shader
+   compile(const std::vector<const char *> &sources,
+           const std::vector<const char *> &compile_args = {},
+           bool create_library = false);
+
+   Shader
+   link(const std::vector<Shader> &sources,
+        bool create_library = false);
+
+   void
+   configure(Shader &shader,
+             const struct clc_runtime_kernel_conf *conf);
+
+   void
+   validate(Shader &shader);
+
+   enum ShaderArgDirection {
+      SHADER_ARG_INPUT = 1,
+      SHADER_ARG_OUTPUT = 2,
+      SHADER_ARG_INOUT = SHADER_ARG_INPUT | SHADER_ARG_OUTPUT,
+   };
+
+   class RawShaderArg {
+   public:
+      RawShaderArg(enum ShaderArgDirection dir) : dir(dir) { }
+      virtual size_t get_elem_size() const = 0;
+      virtual size_t get_num_elems() const = 0;
+      virtual const void *get_data() const = 0;
+      virtual void *get_data() = 0;
+      enum ShaderArgDirection get_direction() { return dir; }
+   private:
+      enum ShaderArgDirection dir;
+   };
+
+   class NullShaderArg : public RawShaderArg {
+   public:
+      NullShaderArg() : RawShaderArg(SHADER_ARG_INPUT) { }
+      size_t get_elem_size() const override { return 0; }
+      size_t get_num_elems() const override { return 0; }
+      const void *get_data() const override { return NULL; }
+      void *get_data() override { return NULL; }
+   };
+
+   template <typename T>
+   class ShaderArg : public std::vector<T>, public RawShaderArg
+   {
+   public:
+      ShaderArg(const T &v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>({ v }), RawShaderArg(dir) { }
+      ShaderArg(const std::vector<T> &v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>(v), RawShaderArg(dir) { }
+      ShaderArg(const std::initializer_list<T> v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>(v), RawShaderArg(dir) { }
+
+      ShaderArg<T>& operator =(const T &v)
+      {
+         this->clear();
+         this->push_back(v);
+         return *this;
+      }
+
+      operator T&() { return this->at(0); }
+      operator const T&() const { return this->at(0); }
+
+      ShaderArg<T>& operator =(const std::vector<T> &v)
+      {
+         /* Assign through the std::vector base; a plain *this = v would
+          * select this operator again and recurse forever. */
+         std::vector<T>::operator=(v);
+         return *this;
+      }
+
+      ShaderArg<T>& operator =(std::initializer_list<T> v)
+      {
+         std::vector<T>::operator=(v);
+         return *this;
+      }
+
+      size_t get_elem_size() const override { return sizeof(T); }
+      size_t get_num_elems() const override { return this->size(); }
+      const void *get_data() const override { return this->data(); }
+      void *get_data() override { return this->data(); }
+   };
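+   /* Hypothetical usage sketch (the kernel text and values are illustrative,
+    * not part of the harness). Note the entry point must be named main_test,
+    * since that is what configure() passes to clc_to_dxil():
+    *
+    *    ShaderArg<uint32_t> buf({ 1, 2, 3, 4 }, SHADER_ARG_INOUT);
+    *    run_shader("kernel void main_test(global uint *b) "
+    *               "{ b[get_global_id(0)] *= 2; }",
+    *               4, 1, 1, buf);
+    *    EXPECT_EQ(buf[3], 8u);
+    */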
+
+   struct CompileArgs
+   {
+      unsigned x, y, z;
+      std::vector<const char *> compiler_command_line;
+      clc_work_properties_data work_props;
+   };
+
+private:
+   void gather_args(std::vector<RawShaderArg *> &args) { }
+
+   template <typename T, typename... Rest>
+   void gather_args(std::vector<RawShaderArg *> &args, T &arg, Rest&... rest)
+   {
+      args.push_back(&arg);
+      gather_args(args, rest...);
+   }
+
+   void run_shader_with_raw_args(Shader shader,
+                                 const CompileArgs &compile_args,
+                                 const std::vector<RawShaderArg *> &args);
+
+protected:
+   template <typename... Args>
+   void run_shader(Shader shader,
+                   const CompileArgs &compile_args,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      run_shader_with_raw_args(shader, compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const std::vector<const char *> &sources,
+                   unsigned x, unsigned y, unsigned z,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      CompileArgs compile_args = { x, y, z };
+      run_shader_with_raw_args(compile(sources), compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const std::vector<const char *> &sources,
+                   const CompileArgs &compile_args,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      run_shader_with_raw_args(
+         compile(sources, compile_args.compiler_command_line),
+         compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const char *source,
+                   unsigned x, unsigned y, unsigned z,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      CompileArgs compile_args = { x, y, z };
+      run_shader_with_raw_args(compile({ source }), compile_args, raw_args);
+   }
+
+   IDXGIFactory4 *factory;
+   IDXGIAdapter1 *adapter;
+   ID3D12Device *dev;
+   ID3D12Fence *cmdqueue_fence;
+   ID3D12CommandQueue *cmdqueue;
+   ID3D12CommandAllocator *cmdalloc;
+   ID3D12GraphicsCommandList *cmdlist;
+   ID3D12DescriptorHeap *uav_heap;
+
+   struct clc_context *compiler_ctx;
+
+   UINT uav_heap_incr;
+   int fence_value;
+
+   HANDLE event;
+   static PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE D3D12SerializeVersionedRootSignature;
+};
diff --git a/src/microsoft/clc/meson.build b/src/microsoft/clc/meson.build
new file mode 100644 (file)
index 0000000..9dc371d
--- /dev/null
@@ -0,0 +1,59 @@
+# Copyright © Microsoft Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+clang_resource_dir = join_paths(
+  dep_clang.get_variable(cmake: 'CLANG_INCLUDE_DIRS'), '..',
+  'lib', 'clang', dep_clang.version(), 'include'
+)
+
+opencl_c_h = custom_target(
+  'opencl-c.h',
+  input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c.h')],
+  output : 'opencl-c.h.h',
+  command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_source'],
+)
+opencl_c_base_h = custom_target(
+  'opencl-c-base.h',
+  input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c-base.h')],
+  output : 'opencl-c-base.h.h',
+  command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_base_source'],
+)
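+
+# The custom targets above run the xxd-like script to embed clang's OpenCL C
+# headers as C string arrays (named via '-n'), so the compiler library can
+# hand the declarations to clang in-memory instead of depending on clang's
+# resource directory being installed at runtime.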
+
+libclc_compiler = shared_library(
+  'clglon12compiler',
+  'clc_compiler.c',
+  'clc_nir.c',
+  'clc_helpers.cpp',
+  opencl_c_h,
+  opencl_c_base_h,
+  vs_module_defs : 'clglon12compiler.def',
+  include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_compiler, inc_gallium, inc_spirv],
+  dependencies: [idep_nir_headers, dep_clang, dep_llvm, cc.find_library('version'),
+    dep_llvmspirvlib, idep_mesautil, idep_libdxil_compiler, idep_nir, dep_spirv_tools]
+)
+
+clc_compiler_test = executable('clc_compiler_test',
+  ['clc_compiler_test.cpp', 'compute_test.cpp'],
+  link_with : [libclc_compiler],
+  dependencies : [idep_gtest, idep_mesautil],
+  include_directories : [inc_include, inc_src])
+
+test('clc_compiler_test', clc_compiler_test, timeout: 120)
diff --git a/src/microsoft/compiler/dxcapi.h b/src/microsoft/compiler/dxcapi.h
new file mode 100644 (file)
index 0000000..cde8f44
--- /dev/null
@@ -0,0 +1,676 @@
+
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcapi.h                                                                  //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides declarations for the DirectX Compiler API entry point.           //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef __DXC_API__
+#define __DXC_API__
+
+#ifdef _WIN32
+#ifndef DXC_API_IMPORT
+#define DXC_API_IMPORT __declspec(dllimport)
+#endif
+#else
+#ifndef DXC_API_IMPORT
+#define DXC_API_IMPORT __attribute__ ((visibility ("default")))
+#endif
+#endif
+
+#ifdef _WIN32
+#define DECLARE_CROSS_PLATFORM_UUIDOF(T)
+#define DEFINE_CROSS_PLATFORM_UUIDOF(T)
+#else
+#include <dlfcn.h>
+#include "dxc/Support/WinAdapter.h"
+#endif
+
+struct IMalloc;
+
+struct IDxcIncludeHandler;
+
+typedef HRESULT (__stdcall *DxcCreateInstanceProc)(
+    _In_ REFCLSID   rclsid,
+    _In_ REFIID     riid,
+    _Out_ LPVOID*   ppv
+);
+
+typedef HRESULT(__stdcall *DxcCreateInstance2Proc)(
+  _In_ IMalloc    *pMalloc,
+  _In_ REFCLSID   rclsid,
+  _In_ REFIID     riid,
+  _Out_ LPVOID*   ppv
+  );
+
+/// <summary>
+/// Creates a single uninitialized object of the class associated with a specified CLSID.
+/// </summary>
+/// <param name="rclsid">
+/// The CLSID associated with the data and code that will be used to create the object.
+/// </param>
+/// <param name="riid">
+/// A reference to the identifier of the interface to be used to communicate 
+/// with the object.
+/// </param>
+/// <param name="ppv">
+/// Address of pointer variable that receives the interface pointer requested
+/// in riid. Upon successful return, *ppv contains the requested interface
+/// pointer. Upon failure, *ppv contains NULL.</param>
+/// <remarks>
+/// While this function is similar to CoCreateInstance, there is no COM involvement.
+/// </remarks>
+
+extern "C"
+DXC_API_IMPORT HRESULT __stdcall DxcCreateInstance(
+  _In_ REFCLSID   rclsid,
+  _In_ REFIID     riid,
+  _Out_ LPVOID*   ppv
+  );
+
+extern "C"
+DXC_API_IMPORT HRESULT __stdcall DxcCreateInstance2(
+  _In_ IMalloc    *pMalloc,
+  _In_ REFCLSID   rclsid,
+  _In_ REFIID     riid,
+  _Out_ LPVOID*   ppv
+);
+
+// For convenience, equivalent definitions to CP_UTF8 and CP_UTF16.
+#define DXC_CP_UTF8 65001
+#define DXC_CP_UTF16 1200
+// Use DXC_CP_ACP for: Binary;  ANSI Text;  Autodetect UTF with BOM
+#define DXC_CP_ACP 0
+
+// This flag indicates that the shader hash was computed taking into account source information (-Zss)
+#define DXC_HASHFLAG_INCLUDES_SOURCE  1
+
+// Hash digest type for ShaderHash
+typedef struct DxcShaderHash {
+  UINT32 Flags; // DXC_HASHFLAG_*
+  BYTE HashDigest[16];
+} DxcShaderHash;
+
+#define DXC_FOURCC(ch0, ch1, ch2, ch3) (                     \
+  (UINT32)(UINT8)(ch0)        | (UINT32)(UINT8)(ch1) << 8  | \
+  (UINT32)(UINT8)(ch2) << 16  | (UINT32)(UINT8)(ch3) << 24   \
+  )
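+// e.g. DXC_FOURCC('D', 'X', 'I', 'L') evaluates to 0x4C495844, i.e. the
+// bytes 'D','X','I','L' in memory, matching the part tags below.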
+#define DXC_PART_PDB                      DXC_FOURCC('I', 'L', 'D', 'B')
+#define DXC_PART_PDB_NAME                 DXC_FOURCC('I', 'L', 'D', 'N')
+#define DXC_PART_PRIVATE_DATA             DXC_FOURCC('P', 'R', 'I', 'V')
+#define DXC_PART_ROOT_SIGNATURE           DXC_FOURCC('R', 'T', 'S', '0')
+#define DXC_PART_DXIL                     DXC_FOURCC('D', 'X', 'I', 'L')
+#define DXC_PART_REFLECTION_DATA          DXC_FOURCC('R', 'D', 'A', 'T')
+#define DXC_PART_SHADER_HASH              DXC_FOURCC('H', 'A', 'S', 'H')
+#define DXC_PART_INPUT_SIGNATURE          DXC_FOURCC('I', 'S', 'G', '1')
+#define DXC_PART_OUTPUT_SIGNATURE         DXC_FOURCC('O', 'S', 'G', '1')
+#define DXC_PART_PATCH_CONSTANT_SIGNATURE DXC_FOURCC('P', 'S', 'G', '1')
+
+// Some option arguments are defined here for continuity with D3DCompile interface
+#define DXC_ARG_DEBUG L"-Zi"
+#define DXC_ARG_SKIP_VALIDATION L"-Vd"
+#define DXC_ARG_SKIP_OPTIMIZATIONS L"-Od"
+#define DXC_ARG_PACK_MATRIX_ROW_MAJOR L"-Zpr"
+#define DXC_ARG_PACK_MATRIX_COLUMN_MAJOR L"-Zpc"
+#define DXC_ARG_AVOID_FLOW_CONTROL L"-Gfa"
+#define DXC_ARG_PREFER_FLOW_CONTROL L"-Gfp"
+#define DXC_ARG_ENABLE_STRICTNESS L"-Ges"
+#define DXC_ARG_ENABLE_BACKWARDS_COMPATIBILITY L"-Gec"
+#define DXC_ARG_IEEE_STRICTNESS L"-Gis"
+#define DXC_ARG_OPTIMIZATION_LEVEL0 L"-O0"
+#define DXC_ARG_OPTIMIZATION_LEVEL1 L"-O1"
+#define DXC_ARG_OPTIMIZATION_LEVEL2 L"-O2"
+#define DXC_ARG_OPTIMIZATION_LEVEL3 L"-O3"
+#define DXC_ARG_WARNINGS_ARE_ERRORS L"-WX"
+#define DXC_ARG_RESOURCES_MAY_ALIAS L"-res_may_alias"
+#define DXC_ARG_ALL_RESOURCES_BOUND L"-all_resources_bound"
+#define DXC_ARG_DEBUG_NAME_FOR_SOURCE L"-Zss"
+#define DXC_ARG_DEBUG_NAME_FOR_BINARY L"-Zsb"
+
+// IDxcBlob is an alias of ID3D10Blob and ID3DBlob
+struct __declspec(uuid("8BA5FB08-5195-40e2-AC58-0D989C3A0102"))
+IDxcBlob : public IUnknown {
+public:
+  virtual LPVOID STDMETHODCALLTYPE GetBufferPointer(void) = 0;
+  virtual SIZE_T STDMETHODCALLTYPE GetBufferSize(void) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlob)
+};
+
+struct __declspec(uuid("7241d424-2646-4191-97c0-98e96e42fc68"))
+IDxcBlobEncoding : public IDxcBlob {
+public:
+  virtual HRESULT STDMETHODCALLTYPE GetEncoding(_Out_ BOOL *pKnown,
+                                                _Out_ UINT32 *pCodePage) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobEncoding)
+};
+
+// Notes on IDxcBlobUtf16 and IDxcBlobUtf8
+// These guarantee null-terminated text and the stated encoding.
+// GetBufferSize() will return the size in bytes, including null-terminator
+// GetStringLength() will return the length in characters, excluding the null-terminator
+// Name strings will use IDxcBlobUtf16, while other string output blobs,
+// such as errors/warnings, preprocessed HLSL, or other text will be based
+// on the -encoding option.
+
+// The API will use this interface for output name strings
+struct __declspec(uuid("A3F84EAB-0FAA-497E-A39C-EE6ED60B2D84"))
+IDxcBlobUtf16 : public IDxcBlobEncoding {
+public:
+  virtual LPCWSTR STDMETHODCALLTYPE GetStringPointer(void) = 0;
+  virtual SIZE_T STDMETHODCALLTYPE GetStringLength(void) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobUtf16)
+};
+struct __declspec(uuid("3DA636C9-BA71-4024-A301-30CBF125305B"))
+IDxcBlobUtf8 : public IDxcBlobEncoding {
+public:
+  virtual LPCSTR STDMETHODCALLTYPE GetStringPointer(void) = 0;
+  virtual SIZE_T STDMETHODCALLTYPE GetStringLength(void) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobUtf8)
+};
+
+struct __declspec(uuid("7f61fc7d-950d-467f-b3e3-3c02fb49187c"))
+IDxcIncludeHandler : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE LoadSource(
+    _In_z_ LPCWSTR pFilename,                                 // Candidate filename.
+    _COM_Outptr_result_maybenull_ IDxcBlob **ppIncludeSource  // Resultant source object for included file, nullptr if not found.
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcIncludeHandler)
+};
+
+// Structure for supplying bytes or text input to Dxc APIs.
+// Use Encoding = 0 for non-text bytes, ANSI text, or unknown with BOM.
+typedef struct DxcBuffer {
+  LPCVOID Ptr;
+  SIZE_T Size;
+  UINT Encoding;
+} DxcText;
+
+struct DxcDefine {
+  LPCWSTR Name;
+  _Maybenull_ LPCWSTR Value;
+};
+
+struct __declspec(uuid("73EFFE2A-70DC-45F8-9690-EFF64C02429D"))
+IDxcCompilerArgs : public IUnknown {
+  // Pass GetArguments() and GetCount() to Compile
+  virtual LPCWSTR* STDMETHODCALLTYPE GetArguments() = 0;
+  virtual UINT32 STDMETHODCALLTYPE GetCount() = 0;
+
+  // Add additional arguments or defines here, if desired.
+  virtual HRESULT STDMETHODCALLTYPE AddArguments(
+    _In_opt_count_(argCount) LPCWSTR *pArguments,       // Array of pointers to arguments to add
+    _In_ UINT32 argCount                                // Number of arguments to add
+  ) = 0;
+  virtual HRESULT STDMETHODCALLTYPE AddArgumentsUTF8(
+    _In_opt_count_(argCount) LPCSTR *pArguments,        // Array of pointers to UTF-8 arguments to add
+    _In_ UINT32 argCount                                // Number of arguments to add
+  ) = 0;
+  virtual HRESULT STDMETHODCALLTYPE AddDefines(
+      _In_count_(defineCount) const DxcDefine *pDefines, // Array of defines
+      _In_ UINT32 defineCount                            // Number of defines
+  ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompilerArgs)
+};
+
+//////////////////////////
+// Legacy Interfaces
+/////////////////////////
+
+// NOTE: IDxcUtils replaces IDxcLibrary
+struct __declspec(uuid("e5204dc7-d18c-4c3c-bdfb-851673980fe7"))
+IDxcLibrary : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE SetMalloc(_In_opt_ IMalloc *pMalloc) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromBlob(
+    _In_ IDxcBlob *pBlob, UINT32 offset, UINT32 length, _COM_Outptr_ IDxcBlob **ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromFile(
+    _In_z_ LPCWSTR pFileName, _In_opt_ UINT32* codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingFromPinned(
+    _In_bytecount_(size) LPCVOID pText, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingOnHeapCopy(
+    _In_bytecount_(size) LPCVOID pText, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingOnMalloc(
+    _In_bytecount_(size) LPCVOID pText, IMalloc *pIMalloc, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateIncludeHandler(
+    _COM_Outptr_ IDxcIncludeHandler **ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateStreamFromBlobReadOnly(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IStream **ppStream) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf8(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf16(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcLibrary)
+};
+
+// NOTE: IDxcResult replaces IDxcOperationResult
+struct __declspec(uuid("CEDB484A-D4E9-445A-B991-CA21CA157DC2"))
+IDxcOperationResult : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetStatus(_Out_ HRESULT *pStatus) = 0;
+
+  // GetResult returns the main result of the operation.
+  // This corresponds to:
+  // DXC_OUT_OBJECT - Compile() with shader or library target
+  // DXC_OUT_DISASSEMBLY - Disassemble()
+  // DXC_OUT_HLSL - Compile() with -P
+  // DXC_OUT_ROOT_SIGNATURE - Compile() with rootsig_* target
+  virtual HRESULT STDMETHODCALLTYPE GetResult(_COM_Outptr_result_maybenull_ IDxcBlob **ppResult) = 0;
+
+  // GetErrorBuffer corresponds to DXC_OUT_ERRORS.
+  virtual HRESULT STDMETHODCALLTYPE GetErrorBuffer(_COM_Outptr_result_maybenull_ IDxcBlobEncoding **ppErrors) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOperationResult)
+};
+
+// NOTE: IDxcCompiler3 replaces IDxcCompiler and IDxcCompiler2
+struct __declspec(uuid("8c210bf3-011f-4422-8d70-6f9acb8db617"))
+IDxcCompiler : public IUnknown {
+  // Compile a single entry point to the target shader model
+  virtual HRESULT STDMETHODCALLTYPE Compile(
+    _In_ IDxcBlob *pSource,                       // Source text to compile
+    _In_opt_z_ LPCWSTR pSourceName,               // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_z_ LPCWSTR pEntryPoint,               // entry point name
+    _In_z_ LPCWSTR pTargetProfile,                // shader profile to compile
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,                  // Array of defines
+    _In_ UINT32 defineCount,                      // Number of defines
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Compiler output status, buffer, and errors
+  ) = 0;
+
+  // Preprocess source text
+  virtual HRESULT STDMETHODCALLTYPE Preprocess(
+    _In_ IDxcBlob *pSource,                       // Source text to preprocess
+    _In_opt_z_ LPCWSTR pSourceName,               // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,                  // Array of defines
+    _In_ UINT32 defineCount,                      // Number of defines
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Preprocessor output status, buffer, and errors
+  ) = 0;
+
+  // Disassemble a program.
+  virtual HRESULT STDMETHODCALLTYPE Disassemble(
+    _In_ IDxcBlob *pSource,                         // Program to disassemble.
+    _COM_Outptr_ IDxcBlobEncoding **ppDisassembly   // Disassembly text.
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler)
+};
+
+// NOTE: IDxcCompiler3 replaces IDxcCompiler and IDxcCompiler2
+struct __declspec(uuid("A005A9D9-B8BB-4594-B5C9-0E633BEC4D37"))
+IDxcCompiler2 : public IDxcCompiler {
+  // Compile a single entry point to the target shader model with debug information.
+  virtual HRESULT STDMETHODCALLTYPE CompileWithDebug(
+    _In_ IDxcBlob *pSource,                       // Source text to compile
+    _In_opt_z_ LPCWSTR pSourceName,               // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_z_ LPCWSTR pEntryPoint,               // Entry point name
+    _In_z_ LPCWSTR pTargetProfile,                // Shader profile to compile
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,                  // Array of defines
+    _In_ UINT32 defineCount,                      // Number of defines
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _COM_Outptr_ IDxcOperationResult **ppResult,  // Compiler output status, buffer, and errors
+    _Outptr_opt_result_z_ LPWSTR *ppDebugBlobName,// Suggested file name for debug blob. (Must be HeapFree()'d!)
+    _COM_Outptr_opt_ IDxcBlob **ppDebugBlob       // Debug blob
+  ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler2)
+};
+
+struct __declspec(uuid("F1B5BE2A-62DD-4327-A1C2-42AC1E1E78E6"))
+IDxcLinker : public IUnknown {
+public:
+  // Register a library with name to ref it later.
+  virtual HRESULT RegisterLibrary(
+    _In_opt_ LPCWSTR pLibName,          // Name of the library.
+    _In_ IDxcBlob *pLib                 // Library blob.
+  ) = 0;
+
+  // Links the shader and produces a shader blob that the Direct3D runtime can
+  // use.
+  virtual HRESULT STDMETHODCALLTYPE Link(
+    _In_opt_ LPCWSTR pEntryName,        // Entry point name
+    _In_ LPCWSTR pTargetProfile,        // shader profile to link
+    _In_count_(libCount)
+        const LPCWSTR *pLibNames,       // Array of library names to link
+    _In_ UINT32 libCount,               // Number of libraries to link
+    _In_opt_count_(argCount) const LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,               // Number of arguments
+    _COM_Outptr_
+        IDxcOperationResult **ppResult  // Linker output status, buffer, and errors
+  ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcLinker)
+};
+
+/////////////////////////
+// Latest interfaces. Please use these
+////////////////////////
+
+// NOTE: IDxcUtils replaces IDxcLibrary
+struct __declspec(uuid("4605C4CB-2019-492A-ADA4-65F20BB7D67F"))
+IDxcUtils : public IUnknown {
+  // Create a sub-blob that holds a reference to the outer blob and points to its memory.
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromBlob(
+    _In_ IDxcBlob *pBlob, UINT32 offset, UINT32 length, _COM_Outptr_ IDxcBlob **ppResult) = 0;
+
+  // For codePage, use 0 (or DXC_CP_ACP) for raw binary or ANSI code page
+
+  // Creates a blob referencing existing memory, with no copy.
+  // User must manage the memory lifetime separately.
+  // (was: CreateBlobWithEncodingFromPinned)
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromPinned(
+    _In_bytecount_(size) LPCVOID pData, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  // Create blob, taking ownership of memory allocated with supplied allocator.
+  // (was: CreateBlobWithEncodingOnMalloc)
+  virtual HRESULT STDMETHODCALLTYPE MoveToBlob(
+    _In_bytecount_(size) LPCVOID pData, IMalloc *pIMalloc, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  ////
+  // New blobs and copied contents are allocated with the current allocator
+
+  // Copy blob contents to memory owned by the new blob.
+  // (was: CreateBlobWithEncodingOnHeapCopy)
+  virtual HRESULT STDMETHODCALLTYPE CreateBlob(
+    _In_bytecount_(size) LPCVOID pData, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  // (was: CreateBlobFromFile)
+  virtual HRESULT STDMETHODCALLTYPE LoadFile(
+    _In_z_ LPCWSTR pFileName, _In_opt_ UINT32* pCodePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  virtual HRESULT STDMETHODCALLTYPE CreateReadOnlyStreamFromBlob(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IStream **ppStream) = 0;
+
+  // Create default file-based include handler
+  virtual HRESULT STDMETHODCALLTYPE CreateDefaultIncludeHandler(
+    _COM_Outptr_ IDxcIncludeHandler **ppResult) = 0;
+
+  // Convert or return matching encoded text blobs
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf8(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobUtf8 **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf16(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobUtf16 **pBlobEncoding) = 0;
+
+  virtual HRESULT STDMETHODCALLTYPE GetDxilContainerPart(
+    _In_ const DxcBuffer *pShader,
+    _In_ UINT32 DxcPart,
+    _Outptr_result_nullonfailure_ void **ppPartData,
+    _Out_ UINT32 *pPartSizeInBytes) = 0;
+
+  // Create reflection interface from serialized Dxil container, or DXC_PART_REFLECTION_DATA.
+  // TBD: Require part header for RDAT?  (leaning towards yes)
+  virtual HRESULT STDMETHODCALLTYPE CreateReflection(
+    _In_ const DxcBuffer *pData, REFIID iid, void **ppvReflection) = 0;
+
+  virtual HRESULT STDMETHODCALLTYPE BuildArguments(
+    _In_opt_z_ LPCWSTR pSourceName,               // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_z_ LPCWSTR pEntryPoint,               // Entry point name. (-E)
+    _In_z_ LPCWSTR pTargetProfile,                // Shader profile to compile. (-T)
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,                  // Array of defines
+    _In_ UINT32 defineCount,                      // Number of defines
+    _COM_Outptr_ IDxcCompilerArgs **ppArgs        // Arguments you can use with Compile() method
+  ) = 0;
+
+  // Takes the shader PDB and returns the hash and the container inside it
+  virtual HRESULT STDMETHODCALLTYPE GetPDBContents(
+    _In_ IDxcBlob *pPDBBlob, _COM_Outptr_ IDxcBlob **ppHash, _COM_Outptr_ IDxcBlob **ppContainer) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcUtils)
+};
+
+// For use with IDxcResult::[Has|Get]Output dxcOutKind argument
+// Note: text outputs returned from version 2 APIs are UTF-8 or UTF-16 based on -encoding option
+typedef enum DXC_OUT_KIND {
+  DXC_OUT_NONE = 0,
+  DXC_OUT_OBJECT = 1,         // IDxcBlob - Shader or library object
+  DXC_OUT_ERRORS = 2,         // IDxcBlobUtf8 or IDxcBlobUtf16
+  DXC_OUT_PDB = 3,            // IDxcBlob
+  DXC_OUT_SHADER_HASH = 4,    // IDxcBlob - DxcShaderHash of shader or shader with source info (-Zsb/-Zss)
+  DXC_OUT_DISASSEMBLY = 5,    // IDxcBlobUtf8 or IDxcBlobUtf16 - from Disassemble
+  DXC_OUT_HLSL = 6,           // IDxcBlobUtf8 or IDxcBlobUtf16 - from Preprocessor or Rewriter
+  DXC_OUT_TEXT = 7,           // IDxcBlobUtf8 or IDxcBlobUtf16 - other text, such as -ast-dump or -Odump
+  DXC_OUT_REFLECTION = 8,     // IDxcBlob - RDAT part with reflection data
+  DXC_OUT_ROOT_SIGNATURE = 9, // IDxcBlob - Serialized root signature output
+
+  DXC_OUT_FORCE_DWORD = 0xFFFFFFFF
+} DXC_OUT_KIND;
+
+struct __declspec(uuid("58346CDA-DDE7-4497-9461-6F87AF5E0659"))
+IDxcResult : public IDxcOperationResult {
+  virtual BOOL STDMETHODCALLTYPE HasOutput(_In_ DXC_OUT_KIND dxcOutKind) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetOutput(_In_ DXC_OUT_KIND dxcOutKind,
+    _In_ REFIID iid, _COM_Outptr_opt_result_maybenull_ void **ppvObject,
+    _COM_Outptr_ IDxcBlobUtf16 **ppOutputName) = 0;
+
+  virtual UINT32 GetNumOutputs() = 0;
+  virtual DXC_OUT_KIND GetOutputByIndex(UINT32 Index) = 0;
+  virtual DXC_OUT_KIND PrimaryOutput() = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcResult)
+};
+
+struct __declspec(uuid("228B4687-5A6A-4730-900C-9702B2203F54"))
+IDxcCompiler3 : public IUnknown {
+  // Compile a single entry point to the target shader model,
+  // Compile a library to a library target (-T lib_*),
+  // Compile a root signature (-T rootsig_*), or
+  // Preprocess HLSL source (-P)
+  virtual HRESULT STDMETHODCALLTYPE Compile(
+    _In_ const DxcBuffer *pSource,                // Source text to compile
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _In_ REFIID riid, _Out_ LPVOID *ppResult      // IDxcResult: status, buffer, and errors
+  ) = 0;
+
+  // Disassemble a program.
+  virtual HRESULT STDMETHODCALLTYPE Disassemble(
+    _In_ const DxcBuffer *pObject,                // Program to disassemble: dxil container or bitcode.
+    _In_ REFIID riid, _Out_ LPVOID *ppResult      // IDxcResult: status, disassembly text, and errors
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler3)
+};
+
+static const UINT32 DxcValidatorFlags_Default = 0;
+static const UINT32 DxcValidatorFlags_InPlaceEdit = 1;  // Validator is allowed to update shader blob in-place.
+static const UINT32 DxcValidatorFlags_RootSignatureOnly = 2;
+static const UINT32 DxcValidatorFlags_ModuleOnly = 4;
+static const UINT32 DxcValidatorFlags_ValidMask = 0x7;
+
+struct __declspec(uuid("A6E82BD2-1FD7-4826-9811-2857E797F49A"))
+IDxcValidator : public IUnknown {
+  // Validate a shader.
+  virtual HRESULT STDMETHODCALLTYPE Validate(
+    _In_ IDxcBlob *pShader,                       // Shader to validate.
+    _In_ UINT32 Flags,                            // Validation flags.
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Validation output status, buffer, and errors
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcValidator)
+};
+
+struct __declspec(uuid("334b1f50-2292-4b35-99a1-25588d8c17fe"))
+IDxcContainerBuilder : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pDxilContainerHeader) = 0;                // Loads DxilContainer to the builder
+  virtual HRESULT STDMETHODCALLTYPE AddPart(_In_ UINT32 fourCC, _In_ IDxcBlob *pSource) = 0;      // Part to add to the container
+  virtual HRESULT STDMETHODCALLTYPE RemovePart(_In_ UINT32 fourCC) = 0;                           // Remove the part with fourCC
+  virtual HRESULT STDMETHODCALLTYPE SerializeContainer(_Out_ IDxcOperationResult **ppResult) = 0; // Builds a container of the given container builder state
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcContainerBuilder)
+};
+
+struct __declspec(uuid("091f7a26-1c1f-4948-904b-e6e3a8a771d5"))
+IDxcAssembler : public IUnknown {
+  // Assemble dxil in ll or llvm bitcode to DXIL container.
+  virtual HRESULT STDMETHODCALLTYPE AssembleToContainer(
+    _In_ IDxcBlob *pShader,                       // Shader to assemble.
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Assembly output status, buffer, and errors
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcAssembler)
+};
+
+struct __declspec(uuid("d2c21b26-8350-4bdc-976a-331ce6f4c54c"))
+IDxcContainerReflection : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pContainer) = 0; // Container to load.
+  virtual HRESULT STDMETHODCALLTYPE GetPartCount(_Out_ UINT32 *pResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetPartKind(UINT32 idx, _Out_ UINT32 *pResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetPartContent(UINT32 idx, _COM_Outptr_ IDxcBlob **ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE FindFirstPartKind(UINT32 kind, _Out_ UINT32 *pResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetPartReflection(UINT32 idx, REFIID iid, void **ppvObject) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcContainerReflection)
+};
+
+struct __declspec(uuid("AE2CD79F-CC22-453F-9B6B-B124E7A5204C"))
+IDxcOptimizerPass : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetOptionName(_COM_Outptr_ LPWSTR *ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetDescription(_COM_Outptr_ LPWSTR *ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetOptionArgCount(_Out_ UINT32 *pCount) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetOptionArgName(UINT32 argIndex, _COM_Outptr_ LPWSTR *ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetOptionArgDescription(UINT32 argIndex, _COM_Outptr_ LPWSTR *ppResult) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOptimizerPass)
+};
+
+struct __declspec(uuid("25740E2E-9CBA-401B-9119-4FB42F39F270"))
+IDxcOptimizer : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetAvailablePassCount(_Out_ UINT32 *pCount) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetAvailablePass(UINT32 index, _COM_Outptr_ IDxcOptimizerPass** ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE RunOptimizer(IDxcBlob *pBlob,
+    _In_count_(optionCount) LPCWSTR *ppOptions, UINT32 optionCount,
+    _COM_Outptr_ IDxcBlob **pOutputModule,
+    _COM_Outptr_opt_ IDxcBlobEncoding **ppOutputText) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOptimizer)
+};
+
+static const UINT32 DxcVersionInfoFlags_None = 0;
+static const UINT32 DxcVersionInfoFlags_Debug = 1; // Matches VS_FF_DEBUG
+static const UINT32 DxcVersionInfoFlags_Internal = 2; // Internal Validator (non-signing)
+
+struct __declspec(uuid("b04f5b50-2059-4f12-a8ff-a1e0cde1cc7e"))
+IDxcVersionInfo : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetVersion(_Out_ UINT32 *pMajor, _Out_ UINT32 *pMinor) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetFlags(_Out_ UINT32 *pFlags) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcVersionInfo)
+};
+
+struct __declspec(uuid("fb6904c4-42f0-4b62-9c46-983af7da7c83"))
+IDxcVersionInfo2 : public IDxcVersionInfo {
+  virtual HRESULT STDMETHODCALLTYPE GetCommitInfo(_Out_ UINT32 *pCommitCount, _Out_ char **pCommitHash) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcVersionInfo2)
+};
+
+// Note: __declspec(selectany) requires 'extern'
+// On Linux __declspec(selectany) is removed and using 'extern' results in link error.
+#ifdef _MSC_VER
+#define CLSID_SCOPE __declspec(selectany) extern
+#else
+#define CLSID_SCOPE
+#endif
+
+CLSID_SCOPE const CLSID CLSID_DxcCompiler = {
+    0x73e22d93,
+    0xe6ce,
+    0x47f3,
+    {0xb5, 0xbf, 0xf0, 0x66, 0x4f, 0x39, 0xc1, 0xb0}};
+
+// {EF6A8087-B0EA-4D56-9E45-D07E1A8B7806}
+CLSID_SCOPE const GUID CLSID_DxcLinker = {
+    0xef6a8087,
+    0xb0ea,
+    0x4d56,
+    {0x9e, 0x45, 0xd0, 0x7e, 0x1a, 0x8b, 0x78, 0x6}};
+
+// {CD1F6B73-2AB0-484D-8EDC-EBE7A43CA09F}
+CLSID_SCOPE const CLSID CLSID_DxcDiaDataSource = {
+    0xcd1f6b73,
+    0x2ab0,
+    0x484d,
+    {0x8e, 0xdc, 0xeb, 0xe7, 0xa4, 0x3c, 0xa0, 0x9f}};
+
+// {3E56AE82-224D-470F-A1A1-FE3016EE9F9D}
+CLSID_SCOPE const CLSID CLSID_DxcCompilerArgs = {
+    0x3e56ae82,
+    0x224d,
+    0x470f,
+    {0xa1, 0xa1, 0xfe, 0x30, 0x16, 0xee, 0x9f, 0x9d}};
+
+// {6245D6AF-66E0-48FD-80B4-4D271796748C}
+CLSID_SCOPE const GUID CLSID_DxcLibrary = {
+    0x6245d6af,
+    0x66e0,
+    0x48fd,
+    {0x80, 0xb4, 0x4d, 0x27, 0x17, 0x96, 0x74, 0x8c}};
+
+CLSID_SCOPE const GUID CLSID_DxcUtils = CLSID_DxcLibrary;
+
+// {8CA3E215-F728-4CF3-8CDD-88AF917587A1}
+CLSID_SCOPE const GUID CLSID_DxcValidator = {
+    0x8ca3e215,
+    0xf728,
+    0x4cf3,
+    {0x8c, 0xdd, 0x88, 0xaf, 0x91, 0x75, 0x87, 0xa1}};
+
+// {D728DB68-F903-4F80-94CD-DCCF76EC7151}
+CLSID_SCOPE const GUID CLSID_DxcAssembler = {
+    0xd728db68,
+    0xf903,
+    0x4f80,
+    {0x94, 0xcd, 0xdc, 0xcf, 0x76, 0xec, 0x71, 0x51}};
+
+// {b9f54489-55b8-400c-ba3a-1675e4728b91}
+CLSID_SCOPE const GUID CLSID_DxcContainerReflection = {
+    0xb9f54489,
+    0x55b8,
+    0x400c,
+    {0xba, 0x3a, 0x16, 0x75, 0xe4, 0x72, 0x8b, 0x91}};
+
+// {AE2CD79F-CC22-453F-9B6B-B124E7A5204C}
+CLSID_SCOPE const GUID CLSID_DxcOptimizer = {
+    0xae2cd79f,
+    0xcc22,
+    0x453f,
+    {0x9b, 0x6b, 0xb1, 0x24, 0xe7, 0xa5, 0x20, 0x4c}};
+
+// {94134294-411f-4574-b4d0-8741e25240d2}
+CLSID_SCOPE const GUID CLSID_DxcContainerBuilder = {
+    0x94134294,
+    0x411f,
+    0x4574,
+    {0xb4, 0xd0, 0x87, 0x41, 0xe2, 0x52, 0x40, 0xd2}};
+#endif
diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c
index e6c7a09..d08fd52 100644 (file)
 #include "util/u_math.h"
 
 static void
+cl_type_size_align(const struct glsl_type *type, unsigned *size,
+                   unsigned *align)
+{
+   *size = glsl_get_cl_size(type);
+   *align = glsl_get_cl_alignment(type);
+}
+
+static void
 extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32,
                          unsigned dst_bit_size,
                          nir_ssa_def **dst_comps,
@@ -62,6 +70,116 @@ extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32,
 }
 
 static nir_ssa_def *
+load_comps_to_vec32(nir_builder *b, unsigned src_bit_size,
+                    nir_ssa_def **src_comps, unsigned num_src_comps)
+{
+   unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32);
+   unsigned step = DIV_ROUND_UP(src_bit_size, 32);
+   unsigned comps_per32b = 32 / src_bit_size;
+   nir_ssa_def *vec32comps[4];
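+
+   /* Pack the source components into 32-bit words. For example, four u8
+    * components a,b,c,d become the single u32 (a | b << 8 | c << 16 | d << 24),
+    * while each u64 component is split into two u32 words.
+    */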
+
+   for (unsigned i = 0; i < num_vec32comps; i += step) {
+      switch (src_bit_size) {
+      case 64:
+         vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]);
+         vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]);
+         break;
+      case 32:
+         vec32comps[i] = src_comps[i];
+         break;
+      case 16:
+      case 8: {
+         /* braces needed: a declaration cannot directly follow a case label in C */
+         unsigned src_offs = i * comps_per32b;
+
+         vec32comps[i] = nir_u2u32(b, src_comps[src_offs]);
+         for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) {
+            nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]),
+                                           nir_imm_int(b, j * src_bit_size));
+            vec32comps[i] = nir_ior(b, vec32comps[i], tmp);
+         }
+         break;
+      }
+      }
+   }
+
+   return nir_vec(b, vec32comps, num_vec32comps);
+}
+
+static nir_ssa_def *
+build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx)
+{
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ptr_dxil);
+
+   load->num_components = 1;
+   load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+   load->src[1] = nir_src_for_ssa(idx);
+   nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+   return &load->dest.ssa;
+}
+
+static bool
+lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+   if (!nir_deref_mode_is(deref, nir_var_shader_temp))
+      return false;
+   nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align));
+   nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3)));
+
+   assert(intr->dest.is_ssa);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned load_size = MAX2(32, bit_size);
+   unsigned num_bits = num_components * bit_size;
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+   nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */));
+
+   /* Split loads into 32-bit chunks */
+   for (unsigned i = 0; i < num_bits; i += load_size) {
+      unsigned subload_num_bits = MIN2(num_bits - i, load_size);
+      nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32));
+      nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx);
+
+      if (load_size == 64) {
+         idx = nir_iadd(b, idx, nir_imm_int(b, 1));
+         vec32 = nir_vec2(b, vec32,
+                             build_load_ptr_dxil(b, path.path[0], idx));
+      }
+
+      /* If we have 2 bytes or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
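+      /* For example, a single u16 at byte offset 2: the aligned word is
+       * fetched at offset 0 and shifted right by (2 & 3) * 8 = 16 bits so
+       * the half-word lands in the LSBs.
+       */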
+      if (subload_num_bits <= 16) {
+         nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)),
+                                          nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      /* And now comes the pack/unpack step to match the original type. */
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
+                               subload_num_bits / bit_size);
+      comp_idx += subload_num_bits / bit_size;
+   }
+
+   nir_deref_path_finish(&path);
+   assert(comp_idx == num_components);
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static nir_ssa_def *
 ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
                           nir_ssa_def *offset, unsigned num_bytes)
 {
@@ -155,3 +273,1107 @@ build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
    assert(comp_idx == num_components);
    return nir_vec(b, comps, num_components);
 }
+
+static bool
+lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_ssa_def *buffer = intr->src[0].ssa;
+   nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3UL));
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned num_bits = num_components * bit_size;
+
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   /* We need to split loads into 16byte chunks because that's the optimal
+    * granularity of bufferLoad(). Minimum alignment is 4byte, which saves
+    * us from extra complexity to extract >= 32 bit components.
+    */
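+   /* For example, a 16-component u8 load (128 bits) becomes a single
+    * 4-component 32-bit load_ssbo, while an 8-component u32 load (256 bits)
+    * becomes two of them.
+    */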
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
+       * load.
+       */
+      unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_load_ssbo);
+
+      /* The number of components to store depends on the number of bytes. */
+      load->num_components = DIV_ROUND_UP(subload_num_bits, 32);
+      load->src[0] = nir_src_for_ssa(buffer);
+      load->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i / 8)));
+      nir_ssa_dest_init(&load->instr, &load->dest, load->num_components,
+                        32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+
+      nir_ssa_def *vec32 = &load->dest.ssa;
+
+      /* If we have 2 bytes or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (subload_num_bits <= 16) {
+         nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)),
+                                          nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      nir_intrinsic_set_align(load, 4, 0);
+
+      /* And now comes the pack/unpack step to match the original type. */
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
+                               subload_num_bits / bit_size);
+      comp_idx += subload_num_bits / bit_size;
+   }
+
+   assert(comp_idx == num_components);
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+   assert(intr->src[2].is_ssa);
+
+   nir_ssa_def *val = intr->src[0].ssa;
+   nir_ssa_def *buffer = intr->src[1].ssa;
+   nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3UL));
+
+   unsigned bit_size = val->bit_size;
+   unsigned num_components = val->num_components;
+   unsigned num_bits = num_components * bit_size;
+
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   for (unsigned i = 0; i < num_components; i++)
+      comps[i] = nir_channel(b, val, i);
+
+   /* We split stores into 16byte chunks because that's the optimal
+    * granularity of bufferStore(). Minimum alignment is 4byte, which saves
+    * us from extra complexity to store >= 32 bit components.
+    */
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
+       * store.
+       */
+      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
+      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
+                                               substore_num_bits / bit_size);
+      nir_intrinsic_instr *store;
+
+      if (substore_num_bits < 32) {
+         nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1);
+
+         /* If we have 16 bits or less to store we need to place them
+          * correctly in the u32 component. Anything greater than 16 bits
+          * (including uchar3) is naturally aligned on 32bits.
+          */
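+         /* For example, a lone u8 stored at byte offset 3: pos = 3, so both
+          * the value and the 0xff mask are shifted left by 24 bits into the
+          * top byte of the 32-bit slot before the masked store below.
+          */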
+         if (substore_num_bits <= 16) {
+            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3));
+            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
+
+            vec32 = nir_ishl(b, vec32, shift);
+            mask = nir_ishl(b, mask, shift);
+         }
+
+         store = nir_intrinsic_instr_create(b->shader,
+                                            nir_intrinsic_store_ssbo_masked_dxil);
+         store->src[0] = nir_src_for_ssa(vec32);
+         store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
+         store->src[2] = nir_src_for_ssa(buffer);
+         store->src[3] = nir_src_for_ssa(local_offset);
+      } else {
+         store = nir_intrinsic_instr_create(b->shader,
+                                            nir_intrinsic_store_ssbo);
+         store->src[0] = nir_src_for_ssa(vec32);
+         store->src[1] = nir_src_for_ssa(buffer);
+         store->src[2] = nir_src_for_ssa(local_offset);
+
+         nir_intrinsic_set_align(store, 4, 0);
+      }
+
+      /* The number of components to store depends on the number of bits. */
+      store->num_components = DIV_ROUND_UP(substore_num_bits, 32);
+      nir_builder_instr_insert(b, &store->instr);
+      comp_idx += substore_num_bits / bit_size;
+   }
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static void
+lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op)
+{
+   for (unsigned i = 0; i < num_comps; i++) {
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader, op);
+
+      load->num_components = 1;
+      load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
+      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+      comps[i] = &load->dest.ssa;
+   }
+}
+
+static bool
+lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned num_bits = num_components * bit_size;
+
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_intrinsic_op op = intr->intrinsic;
+
+   assert(intr->src[0].is_ssa);
+   nir_ssa_def *offset = intr->src[0].ssa;
+   if (op == nir_intrinsic_load_shared) {
+      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
+      op = nir_intrinsic_load_shared_dxil;
+   } else {
+      offset = nir_u2u32(b, offset);
+      op = nir_intrinsic_load_scratch_dxil;
+   }
+   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];
+
+   /* We need to split loads into 32-bit accesses because the buffer
+    * is an i32 array and DXIL does not support type casts.
+    */
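+   /* For example, a 2-component u64 shared load (128 bits) turns into four
+    * 32-bit load_shared_dxil ops whose results are repacked to the original
+    * bit size below.
+    */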
+   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
+   lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op);
+   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);
+
+   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
+      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
+      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
+      nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);
+
+      /* If we have 16 bits or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (num_bits <= 16) {
+         nir_ssa_def *shift =
+            nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)),
+                        nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      /* And now comes the pack/unpack step to match the original type. */
+      unsigned dest_index = i * 32 / bit_size;
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps);
+   }
+
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+
+   return true;
+}
+
+static void
+lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op)
+{
+   for (unsigned i = 0; i < vec32->num_components; i++) {
+      nir_intrinsic_instr *store =
+         nir_intrinsic_instr_create(b->shader, op);
+
+      store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i));
+      store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
+      store->num_components = 1;
+      nir_builder_instr_insert(b, &store->instr);
+   }
+}
+
+static void
+lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index,
+                         nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op)
+{
+   nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1);
+
+   /* If we have 16 bits or less to store we need to place them correctly in
+    * the u32 component. Anything greater than 16 bits (including uchar3) is
+    * naturally aligned on 32bits.
+    */
+   if (num_bits <= 16) {
+      nir_ssa_def *shift =
+         nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8);
+
+      vec32 = nir_ishl(b, vec32, shift);
+      mask = nir_ishl(b, mask, shift);
+   }
+
+   if (op == nir_intrinsic_store_shared_dxil) {
+      /* Use the dedicated masked intrinsic */
+      nir_intrinsic_instr *store =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_store_shared_masked_dxil);
+      store->src[0] = nir_src_for_ssa(vec32);
+      store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
+      store->src[2] = nir_src_for_ssa(index);
+      store->num_components = 1;
+      nir_builder_instr_insert(b, &store->instr);
+   } else {
+      /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
+      nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_scratch_dxil);
+      load->src[0] = nir_src_for_ssa(index);
+      load->num_components = 1;
+      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+
+      nir_ssa_def *new_val = nir_ior(b, vec32,
+                                     nir_iand(b,
+                                              nir_inot(b, mask),
+                                              &load->dest.ssa));
+
+      lower_store_vec32(b, index, new_val, op);
+   }
+}
+
+static bool
+lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->src[0].is_ssa);
+   unsigned num_components = nir_src_num_components(intr->src[0]);
+   unsigned bit_size = nir_src_bit_size(intr->src[0]);
+   unsigned num_bits = num_components * bit_size;
+
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_intrinsic_op op = intr->intrinsic;
+
+   nir_ssa_def *offset = intr->src[1].ssa;
+   if (op == nir_intrinsic_store_shared) {
+      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
+      op = nir_intrinsic_store_shared_dxil;
+   } else {
+      offset = nir_u2u32(b, offset);
+      op = nir_intrinsic_store_scratch_dxil;
+   }
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+
+   unsigned comp_idx = 0;
+   for (unsigned i = 0; i < num_components; i++)
+      comps[i] = nir_channel(b, intr->src[0].ssa, i);
+
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 16-byte chunk (or smaller remainder) we generate a series
+       * of 32-bit scalar stores.
+       */
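+      /* e.g. a vec8 of 32-bit values (num_bits = 256) is split into two
+       * passes of 128 bits each.
+       */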
+      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
+      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
+                                               substore_num_bits / bit_size);
+      nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2));
+
+      /* For anything smaller than 32 bits we need to use the masked version
+       * of the intrinsic to preserve data living in the same 32-bit slot.
+       */
+      if (num_bits < 32) {
+         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op);
+      } else {
+         lower_store_vec32(b, index, vec32, op);
+      }
+
+      comp_idx += substore_num_bits / bit_size;
+   }
+
+   nir_instr_remove(&intr->instr);
+
+   return true;
+}
+
+static void
+ubo_to_temp_patch_deref_mode(nir_deref_instr *deref)
+{
+   deref->modes = nir_var_shader_temp;
+   nir_foreach_use(use_src, &deref->dest.ssa) {
+      if (use_src->parent_instr->type != nir_instr_type_deref)
+         continue;
+
+      nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr);
+      ubo_to_temp_patch_deref_mode(parent);
+   }
+}
+
+static void
+ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he)
+{
+   assert(nir_deref_mode_is(deref, nir_var_mem_constant));
+   assert(deref->dest.is_ssa);
+   assert(he->data);
+
+   nir_foreach_use(use_src, &deref->dest.ssa) {
+      if (use_src->parent_instr->type == nir_instr_type_deref) {
+         ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he);
+      } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr);
+         if (intr->intrinsic != nir_intrinsic_load_deref)
+            he->data = NULL;
+      } else {
+         he->data = NULL;
+      }
+
+      if (!he->data)
+         break;
+   }
+}
+
+bool
+dxil_nir_lower_ubo_to_temp(nir_shader *nir)
+{
+   struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL);
+   bool progress = false;
+
+   /* First pass: collect all UBO accesses that could be turned into
+    * shader temp accesses.
+    */
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+            if (!nir_deref_mode_is(deref, nir_var_mem_constant) ||
+                deref->deref_type != nir_deref_type_var)
+               continue;
+
+            struct hash_entry *he =
+               _mesa_hash_table_search(ubo_to_temp, deref->var);
+
+            if (!he)
+               he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var);
+
+            if (!he->data)
+               continue;
+
+            ubo_to_temp_update_entry(deref, he);
+         }
+      }
+   }
+
+   hash_table_foreach(ubo_to_temp, he) {
+      nir_variable *var = he->data;
+
+      if (!var)
+         continue;
+
+      /* Change the variable mode. */
+      var->data.mode = nir_var_shader_temp;
+
+      /* Make sure the variable has a name.
+       * DXIL variables must have names.
+       */
+      if (!var->name)
+         var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables));
+
+      progress = true;
+   }
+   _mesa_hash_table_destroy(ubo_to_temp, NULL);
+
+   /* Second pass: patch all derefs that were accessing the converted UBO
+    * variables.
+    */
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+            if (nir_deref_mode_is(deref, nir_var_mem_constant) &&
+                deref->deref_type == nir_deref_type_var &&
+                deref->var->data.mode == nir_var_shader_temp)
+               ubo_to_temp_patch_deref_mode(deref);
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_ssa_def *result =
+      build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+bool
+dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+            case nir_intrinsic_load_deref:
+               progress |= lower_load_deref(&b, intr);
+               break;
+            case nir_intrinsic_load_shared:
+            case nir_intrinsic_load_scratch:
+               progress |= lower_32b_offset_load(&b, intr);
+               break;
+            case nir_intrinsic_load_ssbo:
+               progress |= lower_load_ssbo(&b, intr);
+               break;
+            case nir_intrinsic_load_ubo:
+               progress |= lower_load_ubo(&b, intr);
+               break;
+            case nir_intrinsic_store_shared:
+            case nir_intrinsic_store_scratch:
+               progress |= lower_32b_offset_store(&b, intr);
+               break;
+            case nir_intrinsic_store_ssbo:
+               progress |= lower_store_ssbo(&b, intr);
+               break;
+            default:
+               break;
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr,
+                    nir_intrinsic_op dxil_op)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+
+   assert(intr->src[0].is_ssa);
+   nir_ssa_def *offset =
+      nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr)));
+   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
+
+   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op);
+   atomic->src[0] = nir_src_for_ssa(index);
+   assert(intr->src[1].is_ssa);
+   atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa);
+   if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) {
+      assert(intr->src[2].is_ssa);
+      atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa);
+   }
+   atomic->num_components = 0;
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, intr->dest.ssa.name);
+
+   nir_builder_instr_insert(b, &atomic->instr);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(&atomic->dest.ssa));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+bool
+dxil_nir_lower_atomics_to_dxil(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+
+#define ATOMIC(op)                                                            \
+  case nir_intrinsic_shared_atomic_##op:                                     \
+     progress |= lower_shared_atomic(&b, intr,                                \
+                                     nir_intrinsic_shared_atomic_##op##_dxil); \
+     break
+
+            ATOMIC(add);
+            ATOMIC(imin);
+            ATOMIC(umin);
+            ATOMIC(imax);
+            ATOMIC(umax);
+            ATOMIC(and);
+            ATOMIC(or);
+            ATOMIC(xor);
+            ATOMIC(exchange);
+            ATOMIC(comp_swap);
+
+#undef ATOMIC
+            default:
+               break;
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref)
+{
+   assert(nir_deref_mode_is(deref, nir_var_mem_ssbo));
+   assert(deref->deref_type == nir_deref_type_var ||
+          deref->deref_type == nir_deref_type_cast);
+   nir_variable *var = deref->var;
+
+   b->cursor = nir_before_instr(&deref->instr);
+
+   if (deref->deref_type == nir_deref_type_var) {
+      /* We turn all deref_var into deref_cast and build a pointer value based on
+       * the var binding which encodes the UAV id.
+       */
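+      /* e.g. an SSBO bound at binding 2 yields the pointer constant
+       * 0x0000000200000000, with the UAV id in the upper 32 bits.
+       */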
+      nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32);
+      nir_deref_instr *deref_cast =
+         nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type,
+                              glsl_get_explicit_stride(var->type));
+      nir_ssa_def_rewrite_uses(&deref->dest.ssa,
+                               nir_src_for_ssa(&deref_cast->dest.ssa));
+      nir_instr_remove(&deref->instr);
+
+      return true;
+   }
+   return false;
+}
+
+bool
+dxil_nir_lower_deref_ssbo(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+            if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) ||
+                (deref->deref_type != nir_deref_type_var &&
+                 deref->deref_type != nir_deref_type_cast))
+               continue;
+
+            progress |= lower_deref_ssbo(&b, deref);
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu)
+{
+   const nir_op_info *info = &nir_op_infos[alu->op];
+   bool progress = false;
+
+   b->cursor = nir_before_instr(&alu->instr);
+
+   for (unsigned i = 0; i < info->num_inputs; i++) {
+      nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src);
+
+      if (!deref)
+         continue;
+
+      nir_deref_path path;
+      nir_deref_path_init(&path, deref, NULL);
+      nir_deref_instr *root_deref = path.path[0];
+      nir_deref_path_finish(&path);
+
+      if (root_deref->deref_type != nir_deref_type_cast)
+         continue;
+
+      nir_ssa_def *ptr =
+         nir_iadd(b, root_deref->parent.ssa,
+                     nir_build_deref_offset(b, deref, cl_type_size_align));
+      nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr));
+      progress = true;
+   }
+
+   return progress;
+}
+
+bool
+dxil_nir_opt_alu_deref_srcs(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_alu)
+               continue;
+
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            progress |= lower_alu_deref_srcs(&b, alu);
+         }
+      }
+   }
+
+   return progress;
+}
+
+static nir_ssa_def *
+memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent,
+                       nir_ssa_def *index)
+{
+   nir_deref_instr *deref;
+
+   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
+   assert(parent->deref_type == nir_deref_type_cast);
+   deref = nir_build_deref_ptr_as_array(b, parent, index);
+
+   return nir_load_deref(b, deref);
+}
+
+static void
+memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent,
+                        nir_ssa_def *index, nir_ssa_def *value)
+{
+   nir_deref_instr *deref;
+
+   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
+   assert(parent->deref_type == nir_deref_type_cast);
+   deref = nir_build_deref_ptr_as_array(b, parent, index);
+   nir_store_deref(b, deref, value, 1);
+}
+
+static bool
+lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]);
+   nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]);
+   assert(intr->src[2].is_ssa);
+   nir_ssa_def *num_bytes = intr->src[2].ssa;
+
+   assert(dst_deref && src_deref);
+
+   b->cursor = nir_after_instr(&intr->instr);
+
+   dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes,
+                                       glsl_uint8_t_type(), 1);
+   src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes,
+                                       glsl_uint8_t_type(), 1);
+
+   /*
+    * We want to avoid 64b instructions, so let's assume we'll always be
+    * passed a value that fits in a 32b type and truncate the 64b value.
+    */
+   num_bytes = nir_u2u32(b, num_bytes);
+
+   nir_variable *loop_index_var =
+     nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index");
+   nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var);
+   nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1);
+
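+   /* The NIR built below is equivalent to a simple byte-copy loop:
+    *
+    *    for (uint32_t i = 0; ; i++) {
+    *       if (i >= num_bytes)
+    *          break;
+    *       dst[i] = src[i];
+    *    }
+    */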
+   nir_loop *loop = nir_push_loop(b);
+   nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref);
+   nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes);
+   nir_if *loop_check = nir_push_if(b, cmp);
+   nir_jump(b, nir_jump_break);
+   nir_pop_if(b, loop_check);
+   nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index);
+   memcpy_store_deref_elem(b, dst_deref, loop_index, val);
+   nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1);
+   nir_pop_loop(b, loop);
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+bool
+dxil_nir_lower_memcpy_deref(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            if (intr->intrinsic == nir_intrinsic_memcpy_deref)
+               progress |= lower_memcpy_deref(&b, intr);
+         }
+      }
+   }
+
+   return progress;
+}
+
+static void
+cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size)
+{
+   nir_phi_instr *lowered = nir_phi_instr_create(b->shader);
+   int num_components = 0;
+   int old_bit_size = phi->dest.ssa.bit_size;
+
+   nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size,
+                                             nir_type_uint | new_bit_size,
+                                             nir_rounding_mode_undef);
+   nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size,
+                                               nir_type_uint | old_bit_size,
+                                               nir_rounding_mode_undef);
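+   /* e.g. upcasting an 8-bit phi with new_bit_size = 16 uses u2u16 on every
+    * source and u2u8 on the result, so consumers still see the original type.
+    */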
+
+   nir_foreach_phi_src(src, phi) {
+      assert(num_components == 0 || num_components == src->src.ssa->num_components);
+      num_components = src->src.ssa->num_components;
+
+      b->cursor = nir_after_instr(src->src.ssa->parent_instr);
+
+      nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL);
+
+      nir_phi_src *new_src = rzalloc(lowered, nir_phi_src);
+      new_src->pred = src->pred;
+      new_src->src = nir_src_for_ssa(cast);
+      exec_list_push_tail(&lowered->srcs, &new_src->node);
+   }
+
+   nir_ssa_dest_init(&lowered->instr, &lowered->dest,
+                     num_components, new_bit_size, NULL);
+
+   b->cursor = nir_before_instr(&phi->instr);
+   nir_builder_instr_insert(b, &lowered->instr);
+
+   b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor));
+   nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL);
+
+   nir_ssa_def_rewrite_uses(&phi->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&phi->instr);
+}
+
+static bool
+upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   bool progress = false;
+
+   nir_foreach_block_reverse(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_phi)
+            continue;
+
+         nir_phi_instr *phi = nir_instr_as_phi(instr);
+         assert(phi->dest.is_ssa);
+
+         if (phi->dest.ssa.bit_size == 1 ||
+             phi->dest.ssa.bit_size >= min_bit_size)
+            continue;
+
+         cast_phi(&b, phi, min_bit_size);
+         progress = true;
+      }
+   }
+
+   if (progress) {
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   }
+
+   return progress;
+}
+
+bool
+dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (function->impl)
+         progress |= upcast_phi_impl(function->impl, min_bit_size);
+   }
+
+   return progress;
+}
+
+/* The following float-to-half conversion routines are based on the "half" library:
+ * https://sourceforge.net/projects/half/
+ *
+ * half - IEEE 754-based half-precision floating-point library.
+ *
+ * Copyright (c) 2012-2019 Christian Rau <rauy@users.sourceforge.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Version 2.1.0
+ */
+
+
+static bool
+lower_fp16_casts_filter(const nir_instr *instr, const void *data)
+{
+   if (instr->type == nir_instr_type_alu) {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+      /* TODO: DXIL has instructions for f2f16_rtz. For CL, it's not precise enough
+       * due to denorm handling. If the f2f16 instruction has undef rounding mode,
+       * we could map that too, but for CL, f2f16 is implied to mean rtne.
+       */
+      switch (alu->op) {
+      case nir_op_f2f16:
+      case nir_op_f2f16_rtne:
+      case nir_op_f2f16_rtz:
+         return true;
+      default:
+         break;
+      }
+   } else if (instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      return intrin->intrinsic == nir_intrinsic_convert_alu_types &&
+         nir_intrinsic_dest_type(intrin) == nir_type_float16;
+   }
+   return false;
+}
+
+static nir_ssa_def *
+half_rounded(nir_builder *b, nir_ssa_def *value, nir_ssa_def *guard, nir_ssa_def *sticky,
+             nir_ssa_def *sign, nir_rounding_mode mode)
+{
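+   /* 'guard' is the first bit dropped by the fp32 -> fp16 truncation and
+    * 'sticky' the OR of all lower dropped bits. RTNE increments 'value'
+    * when guard is set and either sticky or the LSB of 'value' is set;
+    * the directed modes increment only when the sign matches the rounding
+    * direction and any dropped bit is set.
+    */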
+   switch (mode) {
+   case nir_rounding_mode_rtne:
+      return nir_iadd(b, value, nir_iand(b, guard, nir_ior(b, sticky, value)));
+   case nir_rounding_mode_ru:
+      sign = nir_ushr(b, sign, nir_imm_int(b, 31));
+      return nir_iadd(b, value, nir_iand(b, nir_inot(b, sign),
+                                            nir_ior(b, guard, sticky)));
+   case nir_rounding_mode_rd:
+      sign = nir_ushr(b, sign, nir_imm_int(b, 31));
+      return nir_iadd(b, value, nir_iand(b, sign,
+                                            nir_ior(b, guard, sticky)));
+   default:
+      return value;
+   }
+}
+
+static nir_ssa_def *
+float_to_half_impl(nir_builder *b, nir_ssa_def *src, nir_rounding_mode mode)
+{
+   nir_ssa_def *f32infinity = nir_imm_int(b, 255 << 23);
+   nir_ssa_def *f16max = nir_imm_int(b, (127 + 16) << 23);
+   nir_ssa_def *denorm_magic = nir_imm_int(b, ((127 - 15) + (23 - 10) + 1) << 23);
+   nir_ssa_def *sign = nir_iand(b, src, nir_imm_int(b, 0x80000000));
+   nir_ssa_def *one = nir_imm_int(b, 1);
+
+   nir_ssa_def *abs = nir_iand(b, src, nir_imm_int(b, 0x7FFFFFFF));
+   /* NaN or INF. For rtne, overflow also becomes INF, so combine the comparisons */
+   nir_push_if(b, nir_ige(b, abs, mode == nir_rounding_mode_rtne ? f16max : f32infinity));
+   nir_ssa_def *inf_nanfp16 = nir_bcsel(b,
+                                    nir_ilt(b, f32infinity, abs),
+                                    nir_imm_int(b, 0x7E00),
+                                    nir_imm_int(b, 0x7C00));
+   nir_push_else(b, NULL);
+
+   nir_ssa_def *overflowed_fp16 = NULL;
+   if (mode != nir_rounding_mode_rtne) {
+      /* Handle overflow */
+      nir_push_if(b, nir_ige(b, abs, f16max));
+      switch (mode) {
+      case nir_rounding_mode_rtz:
+         overflowed_fp16 = nir_imm_int(b, 0x7BFF);
+         break;
+      case nir_rounding_mode_ru:
+         /* Negative becomes max float, positive becomes inf */
+         overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7BFF), nir_imm_int(b, 0x7C00));
+         break;
+      case nir_rounding_mode_rd:
+         /* Negative becomes inf, positive becomes max float */
+         overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7C00), nir_imm_int(b, 0x7BFF));
+         break;
+      default: unreachable("Should've been handled already");
+      }
+      nir_push_else(b, NULL);
+   }
+
+   nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 113 << 23)));
+
+   /* FP16 will be normal */
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+   nir_ssa_def *value = nir_ior(b,
+                                nir_ishl(b,
+                                         nir_isub(b,
+                                                  nir_ushr(b, abs, nir_imm_int(b, 23)),
+                                                  nir_imm_int(b, 112)),
+                                         nir_imm_int(b, 10)),
+                                nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 13)), nir_imm_int(b, 0x3FFF)));
+   nir_ssa_def *guard = nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 12)), one);
+   nir_ssa_def *sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, abs, nir_imm_int(b, 0xFFF)), zero), one, zero);
+   nir_ssa_def *normal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
+
+   nir_push_else(b, NULL);
+   nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 102 << 23)));
+
+   /* FP16 will be denormal */
+   nir_ssa_def *i = nir_isub(b, nir_imm_int(b, 125), nir_ushr(b, abs, nir_imm_int(b, 23)));
+   nir_ssa_def *masked = nir_ior(b, nir_iand(b, abs, nir_imm_int(b, 0x7FFFFF)), nir_imm_int(b, 0x800000));
+   value = nir_ushr(b, masked, nir_iadd(b, i, one));
+   guard = nir_iand(b, nir_ushr(b, masked, i), one);
+   sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, masked, nir_isub(b, nir_ishl(b, one, i), one)), zero), one, zero);
+   nir_ssa_def *denormal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
+
+   nir_push_else(b, NULL);
+
+   /* Handle underflow. Nonzero values need to shift up or down for round-up or round-down */
+   nir_ssa_def *underflowed_fp16 = zero;
+   if (mode == nir_rounding_mode_ru ||
+       mode == nir_rounding_mode_rd) {
+      nir_push_if(b, nir_i2b1(b, abs));
+
+      if (mode == nir_rounding_mode_ru)
+         underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), zero, one);
+      else
+         underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), one, zero);
+
+      nir_push_else(b, NULL);
+      nir_pop_if(b, NULL);
+      underflowed_fp16 = nir_if_phi(b, underflowed_fp16, zero);
+   }
+
+   nir_pop_if(b, NULL);
+   nir_ssa_def *underflowed_or_denorm_fp16 = nir_if_phi(b, denormal_fp16, underflowed_fp16);
+
+   nir_pop_if(b, NULL);
+   nir_ssa_def *finite_fp16 = nir_if_phi(b, normal_fp16, underflowed_or_denorm_fp16);
+
+   nir_ssa_def *finite_or_overflowed_fp16 = finite_fp16;
+   if (mode != nir_rounding_mode_rtne) {
+      nir_pop_if(b, NULL);
+      finite_or_overflowed_fp16 = nir_if_phi(b, overflowed_fp16, finite_fp16);
+   }
+
+   nir_pop_if(b, NULL);
+   nir_ssa_def *fp16 = nir_if_phi(b, inf_nanfp16, finite_or_overflowed_fp16);
+
+   return nir_u2u16(b, nir_ior(b, fp16, nir_ushr(b, sign, nir_imm_int(b, 16))));
+}
+
+static nir_ssa_def *
+lower_fp16_cast_impl(nir_builder *b, nir_instr *instr, void *data)
+{
+   nir_ssa_def *src, *dst;
+   uint8_t *swizzle = NULL;
+   nir_rounding_mode mode = nir_rounding_mode_rtne;
+
+   if (instr->type == nir_instr_type_alu) {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+      src = alu->src[0].src.ssa;
+      swizzle = alu->src[0].swizzle;
+      dst = &alu->dest.dest.ssa;
+      assert(src->bit_size == 32);
+      switch (alu->op) {
+      case nir_op_f2f16:
+      case nir_op_f2f16_rtne:
+         break;
+      case nir_op_f2f16_rtz:
+         mode = nir_rounding_mode_rtz;
+         break;
+      default:
+         unreachable("unexpected opcode");
+      }
+   } else {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      assert(nir_intrinsic_src_type(intrin) == nir_type_float32);
+      src = intrin->src[0].ssa;
+      dst = &intrin->dest.ssa;
+      mode = nir_intrinsic_rounding_mode(intrin);
+   }
+
+   nir_ssa_def *rets[NIR_MAX_VEC_COMPONENTS] = { NULL };
+
+   for (unsigned i = 0; i < dst->num_components; i++) {
+      nir_ssa_def *comp = nir_channel(b, src, swizzle ? swizzle[i] : i);
+      rets[i] = float_to_half_impl(b, comp, mode);
+   }
+
+   return nir_vec(b, rets, dst->num_components);
+}
+
+bool
+dxil_nir_lower_fp16_casts(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(shader,
+                                        lower_fp16_casts_filter,
+                                        lower_fp16_cast_impl,
+                                        NULL);
+}
index b20b632..3dc8b4f 100644 (file)
@@ -32,6 +32,14 @@ bool dxil_nir_lower_8bit_conv(nir_shader *shader);
 bool dxil_nir_lower_16bit_conv(nir_shader *shader);
 bool dxil_nir_lower_x2b(nir_shader *shader);
 bool dxil_nir_lower_inot(nir_shader *shader);
+bool dxil_nir_lower_ubo_to_temp(nir_shader *shader);
+bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader);
+bool dxil_nir_lower_atomics_to_dxil(nir_shader *shader);
+bool dxil_nir_lower_deref_ssbo(nir_shader *shader);
+bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader);
+bool dxil_nir_lower_memcpy_deref(nir_shader *shader);
+bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size);
+bool dxil_nir_lower_fp16_casts(nir_shader *shader);
 
 nir_ssa_def *
 build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
index 4439179..f5fef4f 100644 (file)
@@ -97,8 +97,10 @@ nir_options = {
    .lower_pack_32_2x16_split = true,
    .lower_unpack_64_2x32_split = true,
    .lower_unpack_32_2x16_split = true,
+   .use_scoped_barrier = true,
    .vertex_id_zero_based = true,
    .lower_base_vertex = true,
+   .has_cs_global_id = true,
 };
 
 const nir_shader_compiler_options*
@@ -809,6 +811,48 @@ emit_srv(struct ntd_context *ctx, nir_variable *var, unsigned binding, unsigned
 }
 
 static bool
+emit_globals(struct ntd_context *ctx, nir_shader *s, unsigned size)
+{
+   nir_foreach_variable_with_modes(var, s, nir_var_mem_ssbo)
+      size++;
+
+   if (!size)
+      return true;
+
+   const struct dxil_type *type = dxil_module_get_int_type(&ctx->mod, 32);
+   if (!type)
+      return false;
+
+   const struct dxil_type *struct_type =
+      dxil_module_get_struct_type(&ctx->mod, NULL, &type, 1);
+   if (!struct_type)
+      return false;
+
+   const struct dxil_type *array_type =
+      dxil_module_get_array_type(&ctx->mod, struct_type, size);
+   if (!array_type)
+      return false;
+
+   resource_array_layout layout = {0, 0, size};
+   const struct dxil_mdnode *uav_meta =
+      emit_uav_metadata(&ctx->mod, array_type, "globals", &layout,
+                        DXIL_COMP_TYPE_INVALID,
+                        DXIL_RESOURCE_KIND_RAW_BUFFER);
+   if (!uav_meta)
+      return false;
+
+   ctx->uav_metadata_nodes[ctx->num_uav_arrays++] = uav_meta;
+   if (ctx->num_uav_arrays > 8)
+      ctx->mod.feats.use_64uavs = 1;
+   /* Handles to UAVs used for kernel globals are created on-demand */
+   ctx->num_uavs += size;
+   add_resource(ctx, DXIL_RES_UAV_RAW, &layout);
+   ctx->mod.raw_and_structured_buffers = true;
+   return true;
+}
+
+static bool
 emit_uav(struct ntd_context *ctx, nir_variable *var, unsigned count)
 {
    assert(ctx->num_uav_arrays < ARRAY_SIZE(ctx->uav_metadata_nodes));
@@ -937,6 +981,53 @@ var_fill_const_array(struct ntd_context *ctx, const struct nir_constant *c,
 }
 
 static bool
+emit_global_consts(struct ntd_context *ctx, nir_shader *s)
+{
+   nir_foreach_variable_with_modes(var, s, nir_var_shader_temp) {
+      bool success;
+
+      assert(var->constant_initializer);
+
+      unsigned int num_members = DIV_ROUND_UP(glsl_get_cl_size(var->type), 4);
+      uint32_t *const_ints = ralloc_array(ctx->ralloc_ctx, uint32_t, num_members);
+      success = var_fill_const_array(ctx, var->constant_initializer, var->type,
+                                     const_ints, 0);
+      if (!success)
+         return false;
+      const struct dxil_value **const_vals =
+         ralloc_array(ctx->ralloc_ctx, const struct dxil_value *, num_members);
+      if (!const_vals)
+         return false;
+      for (unsigned i = 0; i < num_members; i++)
+         const_vals[i] = dxil_module_get_int32_const(&ctx->mod, const_ints[i]);
+
+      const struct dxil_type *elt_type = dxil_module_get_int_type(&ctx->mod, 32);
+      if (!elt_type)
+         return false;
+      const struct dxil_type *type =
+         dxil_module_get_array_type(&ctx->mod, elt_type, num_members);
+      if (!type)
+         return false;
+      const struct dxil_value *agg_vals =
+         dxil_module_get_array_const(&ctx->mod, type, const_vals);
+      if (!agg_vals)
+         return false;
+
+      const struct dxil_value *gvar = dxil_add_global_ptr_var(&ctx->mod, var->name, type,
+                                                              DXIL_AS_DEFAULT, 4,
+                                                              agg_vals);
+      if (!gvar)
+         return false;
+
+      if (!_mesa_hash_table_insert(ctx->consts, var, (void *)gvar))
+         return false;
+   }
+
+   return true;
+}
+
+static bool
 emit_cbv(struct ntd_context *ctx, unsigned binding,
          unsigned size, char *name)
 {
@@ -1882,6 +1973,8 @@ emit_alu(struct ntd_context *ctx, nir_alu_instr *alu)
    case nir_op_flog2: return emit_unary_intin(ctx, alu, DXIL_INTR_FLOG2, src[0]);
    case nir_op_ffloor: return emit_unary_intin(ctx, alu, DXIL_INTR_ROUND_NI, src[0]);
    case nir_op_ffract: return emit_unary_intin(ctx, alu, DXIL_INTR_FRC, src[0]);
+   case nir_op_fisnormal: return emit_unary_intin(ctx, alu, DXIL_INTR_ISNORMAL, src[0]);
+   case nir_op_fisfinite: return emit_unary_intin(ctx, alu, DXIL_INTR_ISFINITE, src[0]);
 
    case nir_op_fddx:
    case nir_op_fddx_coarse: return emit_unary_intin(ctx, alu, DXIL_INTR_DDX_COARSE, src[0]);
@@ -1967,6 +2060,120 @@ load_ubo(struct ntd_context *ctx, const struct dxil_value *handle,
 }
 
 static bool
+emit_barrier(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *opcode, *mode;
+   const struct dxil_func *func;
+   uint32_t flags = 0;
+
+   if (nir_intrinsic_execution_scope(intr) == NIR_SCOPE_WORKGROUP)
+      flags |= DXIL_BARRIER_MODE_SYNC_THREAD_GROUP;
+
+   nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
+   nir_scope mem_scope = nir_intrinsic_memory_scope(intr);
+
+   if (modes & ~(nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_shared))
+      return false;
+
+   if (mem_scope != NIR_SCOPE_DEVICE && mem_scope != NIR_SCOPE_WORKGROUP)
+      return false;
+
+   if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
+      if (mem_scope == NIR_SCOPE_DEVICE)
+         flags |= DXIL_BARRIER_MODE_UAV_FENCE_GLOBAL;
+      else
+         flags |= DXIL_BARRIER_MODE_UAV_FENCE_THREAD_GROUP;
+   }
+
+   if (modes & nir_var_mem_shared)
+      flags |= DXIL_BARRIER_MODE_UAV_FENCE_THREAD_GROUP;
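+   /* e.g. a scoped_barrier with workgroup execution scope and shared-memory
+    * semantics maps to SYNC_THREAD_GROUP | UAV_FENCE_THREAD_GROUP.
+    */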
+
+   func = dxil_get_function(&ctx->mod, "dx.op.barrier", DXIL_NONE);
+   if (!func)
+      return false;
+
+   opcode = dxil_module_get_int32_const(&ctx->mod, DXIL_INTR_BARRIER);
+   if (!opcode)
+      return false;
+
+   mode = dxil_module_get_int32_const(&ctx->mod, flags);
+   if (!mode)
+      return false;
+
+   const struct dxil_value *args[] = { opcode, mode };
+
+   return dxil_emit_call_void(&ctx->mod, func,
+                              args, ARRAY_SIZE(args));
+}
+
+static bool
+emit_load_global_invocation_id(struct ntd_context *ctx,
+                               nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa);
+
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      if (comps & (1 << i)) {
+         const struct dxil_value *idx = dxil_module_get_int32_const(&ctx->mod, i);
+         if (!idx)
+            return false;
+         const struct dxil_value *globalid = emit_threadid_call(ctx, idx);
+
+         if (!globalid)
+            return false;
+
+         store_dest_value(ctx, &intr->dest, i, globalid);
+      }
+   }
+   return true;
+}
+
+static bool
+emit_load_local_invocation_id(struct ntd_context *ctx,
+                              nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa);
+
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      if (comps & (1 << i)) {
+         const struct dxil_value *idx =
+            dxil_module_get_int32_const(&ctx->mod, i);
+         if (!idx)
+            return false;
+         const struct dxil_value *threadidingroup =
+            emit_threadidingroup_call(ctx, idx);
+         if (!threadidingroup)
+            return false;
+         if (!threadidingroup)
+            return false;
+         store_dest_value(ctx, &intr->dest, i, threadidingroup);
+      }
+   }
+   return true;
+}
+
+static bool
+emit_load_local_work_group_id(struct ntd_context *ctx,
+                              nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa);
+
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      if (comps & (1 << i)) {
+         const struct dxil_value *idx = dxil_module_get_int32_const(&ctx->mod, i);
+         if (!idx)
+            return false;
+         const struct dxil_value *groupid = emit_groupid_call(ctx, idx);
+         if (!groupid)
+            return false;
+         store_dest_value(ctx, &intr->dest, i, groupid);
+      }
+   }
+   return true;
+}
+
+static bool
 emit_load_primitiveid(struct ntd_context *ctx,
                       nir_intrinsic_instr *intr)
 {
@@ -2000,6 +2207,249 @@ get_int32_undef(struct dxil_module *m)
    return dxil_module_get_undef(m, int32_type);
 }
 
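+/* The shared/scratch backing stores are plain i32 arrays, so byte offsets
+ * and array indices are interchanged with shifts by log2(bit_size / 8),
+ * e.g. index = offset >> 2 for 32-bit elements.
+ */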
+static const struct dxil_value *
+offset_to_index(struct dxil_module *m, const struct dxil_value *offset,
+                unsigned bit_size)
+{
+   unsigned shift_amt = util_logbase2(bit_size / 8);
+   const struct dxil_value *shift =
+      dxil_module_get_int32_const(m, shift_amt);
+   if (!shift)
+      return NULL;
+
+   return dxil_emit_binop(m, DXIL_BINOP_LSHR, offset, shift, 0);
+}
+
+static const struct dxil_value *
+index_to_offset(struct dxil_module *m, const struct dxil_value *index,
+                unsigned bit_size)
+{
+   unsigned shift_amt = util_logbase2(bit_size / 8);
+   const struct dxil_value *shift =
+      dxil_module_get_int32_const(m, shift_amt);
+   if (!shift)
+      return NULL;
+
+   return dxil_emit_binop(m, DXIL_BINOP_SHL, index, shift, 0);
+}
+
+static const struct dxil_value *
+emit_gep_for_index(struct ntd_context *ctx, const nir_variable *var,
+                   const struct dxil_value *index)
+{
+   assert(var->data.mode == nir_var_shader_temp);
+
+   struct hash_entry *he = _mesa_hash_table_search(ctx->consts, var);
+   assert(he != NULL);
+   const struct dxil_value *ptr = he->data;
+
+   const struct dxil_value *zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return NULL;
+
+   const struct dxil_value *ops[] = { ptr, zero, index };
+   return dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+}
+
+static bool
+emit_load_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod);
+   const struct dxil_value *buffer =
+      get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   const struct dxil_value *offset =
+      get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   if (!int32_undef || !buffer || !offset)
+      return false;
+
+   assert(nir_src_bit_size(intr->src[0]) == 32);
+   assert(nir_intrinsic_dest_components(intr) <= 4);
+
+   const struct dxil_value *handle =
+      emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer,
+                             nir_src_is_const(intr->src[0]));
+   if (!handle)
+      return false;
+
+   const struct dxil_value *coord[2] = {
+      offset,
+      int32_undef
+   };
+
+   const struct dxil_value *load = emit_bufferload_call(ctx, handle, coord);
+   if (!load)
+      return false;
+
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      const struct dxil_value *val =
+         dxil_emit_extractval(&ctx->mod, load, i);
+      if (!val)
+         return false;
+      store_dest_value(ctx, &intr->dest, i, val);
+   }
+   return true;
+}
+
+static bool
+emit_store_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *buffer =
+      get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   const struct dxil_value *offset =
+      get_src(ctx, &intr->src[2], 0, nir_type_uint);
+   if (!buffer || !offset)
+      return false;
+
+   const struct dxil_value *handle =
+      emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer,
+                             nir_src_is_const(intr->src[1]));
+   if (!handle)
+      return false;
+
+   assert(nir_src_bit_size(intr->src[0]) == 32);
+   unsigned num_components = nir_src_num_components(intr->src[0]);
+   assert(num_components <= 4);
+   const struct dxil_value *value[4];
+   for (unsigned i = 0; i < num_components; ++i) {
+      value[i] = get_src(ctx, &intr->src[0], i, nir_type_uint);
+      if (!value[i])
+         return false;
+   }
+
+   const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod);
+   if (!int32_undef)
+      return false;
+
+   const struct dxil_value *coord[2] = {
+      offset,
+      int32_undef
+   };
+
+   for (int i = num_components; i < 4; ++i)
+      value[i] = int32_undef;
+
+   const struct dxil_value *write_mask =
+      dxil_module_get_int8_const(&ctx->mod, (1u << num_components) - 1);
+   if (!write_mask)
+      return false;
+
+   return emit_bufferstore_call(ctx, handle, coord, value, write_mask, DXIL_I32);
+}
+
+static bool
+emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *value =
+      get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   const struct dxil_value *mask =
+      get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   const struct dxil_value *buffer =
+      get_src(ctx, &intr->src[2], 0, nir_type_uint);
+   const struct dxil_value *offset =
+      get_src(ctx, &intr->src[3], 0, nir_type_uint);
+   if (!value || !mask || !buffer || !offset)
+      return false;
+
+   const struct dxil_value *handle =
+      emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer,
+                             nir_src_is_const(intr->src[2]));
+   if (!handle)
+      return false;
+
+   const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod);
+   if (!int32_undef)
+      return false;
+
+   const struct dxil_value *coord[3] = {
+      offset, int32_undef, int32_undef
+   };
+
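+   /* A masked store is emitted as two atomics: AND with the mask (which is
+    * assumed to arrive pre-inverted, i.e. bits to preserve set) clears the
+    * target bits, then OR merges in the new value.
+    */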
+   return
+      emit_atomic_binop(ctx, handle, DXIL_ATOMIC_AND, coord, mask) != NULL &&
+      emit_atomic_binop(ctx, handle, DXIL_ATOMIC_OR, coord, value) != NULL;
+}
+
+static bool
+emit_store_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *zero, *index;
+   unsigned bit_size = nir_src_bit_size(intr->src[0]);
+
+   /* All shared mem accesses should have been lowered to scalar 32bit
+    * accesses.
+    */
+   assert(bit_size == 32);
+   assert(nir_src_num_components(intr->src[0]) == 1);
+
+   zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return false;
+
+   if (intr->intrinsic == nir_intrinsic_store_shared_dxil)
+      index = get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   else
+      index = get_src(ctx, &intr->src[2], 0, nir_type_uint);
+   if (!index)
+      return false;
+
+   const struct dxil_value *ops[] = { ctx->sharedvars, zero, index };
+   const struct dxil_value *ptr, *value;
+
+   ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+   if (!ptr)
+      return false;
+
+   value = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+
+   if (intr->intrinsic == nir_intrinsic_store_shared_dxil)
+      return dxil_emit_store(&ctx->mod, value, ptr, 4, false);
+
+   const struct dxil_value *mask = get_src(ctx, &intr->src[1], 0, nir_type_uint);
+
+   if (!dxil_emit_atomicrmw(&ctx->mod, mask, ptr, DXIL_RMWOP_AND, false,
+                            DXIL_ATOMIC_ORDERING_ACQREL,
+                            DXIL_SYNC_SCOPE_CROSSTHREAD))
+      return false;
+
+   if (!dxil_emit_atomicrmw(&ctx->mod, value, ptr, DXIL_RMWOP_OR, false,
+                            DXIL_ATOMIC_ORDERING_ACQREL,
+                            DXIL_SYNC_SCOPE_CROSSTHREAD))
+      return false;
+
+   return true;
+}
+
+static bool
+emit_store_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *zero, *index;
+   unsigned bit_size = nir_src_bit_size(intr->src[0]);
+
+   /* All scratch mem accesses should have been lowered to scalar 32bit
+    * accesses.
+    */
+   assert(bit_size == 32);
+   assert(nir_src_num_components(intr->src[0]) == 1);
+
+   zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return false;
+
+   index = get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   if (!index)
+      return false;
+
+   const struct dxil_value *ops[] = { ctx->scratchvars, zero, index };
+   const struct dxil_value *ptr, *value;
+
+   ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+   if (!ptr)
+      return false;
+
+   value = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   return dxil_emit_store(&ctx->mod, value, ptr, 4, false);
+}
+
 static bool
 emit_load_ubo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
 {
@@ -2225,6 +2675,97 @@ emit_load_input(struct ntd_context *ctx, nir_intrinsic_instr *intr,
 }
 
 static bool
+emit_load_ptr(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   struct nir_variable *var =
+      nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
+   const struct dxil_value *index =
+      get_src(ctx, &intr->src[1], 0, nir_type_uint);
+
+   const struct dxil_value *ptr = emit_gep_for_index(ctx, var, index);
+   if (!ptr)
+      return false;
+
+   const struct dxil_value *retval =
+      dxil_emit_load(&ctx->mod, ptr, 4, false);
+
+   store_dest(ctx, &intr->dest, 0, retval, nir_type_uint);
+   return true;
+}
+
+static bool
+emit_load_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *zero, *index;
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned align = bit_size / 8;
+
+   /* All shared mem accesses should have been lowered to scalar 32bit
+    * accesses.
+    */
+   assert(bit_size == 32);
+   assert(nir_dest_num_components(intr->dest) == 1);
+
+   zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return false;
+
+   index = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   if (!index)
+      return false;
+
+   const struct dxil_value *ops[] = { ctx->sharedvars, zero, index };
+   const struct dxil_value *ptr, *retval;
+
+   ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+   if (!ptr)
+      return false;
+
+   retval = dxil_emit_load(&ctx->mod, ptr, align, false);
+   if (!retval)
+      return false;
+
+   store_dest(ctx, &intr->dest, 0, retval, nir_type_uint);
+   return true;
+}
+
+static bool
+emit_load_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *zero, *index;
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned align = bit_size / 8;
+
+   /* All scratch mem accesses should have been lowered to scalar 32bit
+    * accesses.
+    */
+   assert(bit_size == 32);
+   assert(nir_dest_num_components(intr->dest) == 1);
+
+   zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return false;
+
+   index = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   if (!index)
+      return false;
+
+   const struct dxil_value *ops[] = { ctx->scratchvars, zero, index };
+   const struct dxil_value *ptr, *retval;
+
+   ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+   if (!ptr)
+      return false;
+
+   retval = dxil_emit_load(&ctx->mod, ptr, align, false);
+   if (!retval)
+      return false;
+
+   store_dest(ctx, &intr->dest, 0, retval, nir_type_uint);
+   return true;
+}
+
+static bool
 emit_load_deref(struct ntd_context *ctx, nir_intrinsic_instr *intr)
 {
    assert(intr->src[0].is_ssa);
@@ -2573,10 +3114,31 @@ static bool
 emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
 {
    switch (intr->intrinsic) {
+   case nir_intrinsic_load_global_invocation_id:
+   case nir_intrinsic_load_global_invocation_id_zero_base:
+      return emit_load_global_invocation_id(ctx, intr);
+   case nir_intrinsic_load_local_invocation_id:
+      return emit_load_local_invocation_id(ctx, intr);
+   case nir_intrinsic_load_work_group_id:
+   case nir_intrinsic_load_work_group_id_zero_base:
+      return emit_load_local_work_group_id(ctx, intr);
+   case nir_intrinsic_load_ssbo:
+      return emit_load_ssbo(ctx, intr);
+   case nir_intrinsic_store_ssbo:
+      return emit_store_ssbo(ctx, intr);
+   case nir_intrinsic_store_ssbo_masked_dxil:
+      return emit_store_ssbo_masked(ctx, intr);
    case nir_intrinsic_store_deref:
       return emit_store_deref(ctx, intr);
+   case nir_intrinsic_store_shared_dxil:
+   case nir_intrinsic_store_shared_masked_dxil:
+      return emit_store_shared(ctx, intr);
+   case nir_intrinsic_store_scratch_dxil:
+      return emit_store_scratch(ctx, intr);
    case nir_intrinsic_load_deref:
       return emit_load_deref(ctx, intr);
+   case nir_intrinsic_load_ptr_dxil:
+      return emit_load_ptr(ctx, intr);
    case nir_intrinsic_load_ubo:
       return emit_load_ubo(ctx, intr);
    case nir_intrinsic_load_ubo_dxil:
@@ -2592,6 +3154,10 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
                                           ctx->system_value[SYSTEM_VALUE_INSTANCE_ID]);
    case nir_intrinsic_load_primitive_id:
       return emit_load_primitiveid(ctx, intr);
+   case nir_intrinsic_load_shared_dxil:
+      return emit_load_shared(ctx, intr);
+   case nir_intrinsic_load_scratch_dxil:
+      return emit_load_scratch(ctx, intr);
    case nir_intrinsic_discard_if:
       return emit_discard_if(ctx, intr);
    case nir_intrinsic_discard:
@@ -2600,7 +3166,55 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
       return emit_emit_vertex(ctx, intr);
    case nir_intrinsic_end_primitive:
       return emit_end_primitive(ctx, intr);
-
+   case nir_intrinsic_scoped_barrier:
+      return emit_barrier(ctx, intr);
+   case nir_intrinsic_ssbo_atomic_add:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_ADD, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_imin:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_IMIN, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_umin:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_UMIN, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_imax:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_IMAX, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_umax:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_UMAX, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_and:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_AND, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_or:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_OR, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_xor:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_XOR, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_exchange:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_EXCHANGE, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      return emit_ssbo_atomic_comp_swap(ctx, intr);
+   case nir_intrinsic_shared_atomic_add_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_ADD, nir_type_int);
+   case nir_intrinsic_shared_atomic_imin_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_MIN, nir_type_int);
+   case nir_intrinsic_shared_atomic_umin_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_UMIN, nir_type_uint);
+   case nir_intrinsic_shared_atomic_imax_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_MAX, nir_type_int);
+   case nir_intrinsic_shared_atomic_umax_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_UMAX, nir_type_uint);
+   case nir_intrinsic_shared_atomic_and_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_AND, nir_type_uint);
+   case nir_intrinsic_shared_atomic_or_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_OR, nir_type_uint);
+   case nir_intrinsic_shared_atomic_xor_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_XOR, nir_type_uint);
+   case nir_intrinsic_shared_atomic_exchange_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_XCHG, nir_type_int);
+   case nir_intrinsic_shared_atomic_comp_swap_dxil:
+      return emit_shared_atomic_comp_swap(ctx, intr);
+   case nir_intrinsic_image_store:
+      return emit_image_store(ctx, intr);
+   case nir_intrinsic_image_size:
+      return emit_image_size(ctx, intr);
+
+   case nir_intrinsic_load_num_work_groups:
+   case nir_intrinsic_load_local_group_size:
    default:
       NIR_INSTR_UNSUPPORTED(&intr->instr);
       assert("Unimplemented intrinsic instruction");
@@ -3266,18 +3880,88 @@ prepare_phi_values(struct ntd_context *ctx, nir_shader *shader)
 static bool
 emit_cbvs(struct ntd_context *ctx, nir_shader *s)
 {
-   for (int i = ctx->opts->ubo_binding_offset; i < s->info.num_ubos; ++i) {
-      char name[64];
-      snprintf(name, sizeof(name), "__ubo%d", i);
-      if (!emit_cbv(ctx, i, 16384 /*4096 vec4's*/, name))
+   if (s->info.stage == MESA_SHADER_KERNEL) {
+      nir_foreach_variable_with_modes(var, s, nir_var_mem_ubo) {
+         if (!emit_ubo_var(ctx, var))
+            return false;
+      }
+   } else {
+      for (int i = ctx->opts->ubo_binding_offset; i < s->info.num_ubos; ++i) {
+         char name[64];
+         snprintf(name, sizeof(name), "__ubo%d", i);
+         if (!emit_cbv(ctx, i, 16384 /*4096 vec4's*/, name))
+            return false;
+      }
+   }
+
+   return true;
+}
+
+static bool
+emit_scratch(struct ntd_context *ctx, nir_shader *s)
+{
+   if (s->scratch_size) {
+      /*
+       * We always allocate a u32 array, no matter the actual variable types.
+       * According to the DXIL spec, the minimum load/store granularity is
+       * 32-bit, anything smaller requires using a read-extract/read-write-modify
+       * approach.
+       */
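+      /* e.g. scratch_size = 6 is padded to 8 bytes and allocated as a
+       * two-element i32 array.
+       */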
+      unsigned size = ALIGN_POT(s->scratch_size, sizeof(uint32_t));
+      const struct dxil_type *int32 = dxil_module_get_int_type(&ctx->mod, 32);
+      const struct dxil_value *array_length = dxil_module_get_int32_const(&ctx->mod, size / sizeof(uint32_t));
+      if (!int32 || !array_length)
+         return false;
+
+      const struct dxil_type *type = dxil_module_get_array_type(
+         &ctx->mod, int32, size / sizeof(uint32_t));
+      if (!type)
+         return false;
+
+      ctx->scratchvars = dxil_emit_alloca(&ctx->mod, type, int32, array_length, 4);
+      if (!ctx->scratchvars)
          return false;
    }
 
    return true;
 }
 
+/* The DXIL validator complains if we declare a groupshared variable that no
+ * instruction references, so only emit it when shared ops actually exist. */
+static bool
+shader_has_shared_ops(struct nir_shader *s)
+{
+   nir_foreach_function(func, s) {
+      if (!func->impl)
+         continue;
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_load_shared_dxil:
+            case nir_intrinsic_store_shared_dxil:
+            case nir_intrinsic_shared_atomic_add_dxil:
+            case nir_intrinsic_shared_atomic_and_dxil:
+            case nir_intrinsic_shared_atomic_comp_swap_dxil:
+            case nir_intrinsic_shared_atomic_exchange_dxil:
+            case nir_intrinsic_shared_atomic_imax_dxil:
+            case nir_intrinsic_shared_atomic_imin_dxil:
+            case nir_intrinsic_shared_atomic_or_dxil:
+            case nir_intrinsic_shared_atomic_umax_dxil:
+            case nir_intrinsic_shared_atomic_umin_dxil:
+            case nir_intrinsic_shared_atomic_xor_dxil:
+               return true;
+            default: break;
+            }
+         }
+      }
+   }
+   return false;
+}
+
 static bool
-emit_module(struct ntd_context *ctx, nir_shader *s)
+emit_module(struct ntd_context *ctx, nir_shader *s, const struct nir_to_dxil_options *opts)
 {
    unsigned binding;
 
@@ -3314,6 +3998,45 @@ emit_module(struct ntd_context *ctx, nir_shader *s)
       }
    }
 
+   if (s->info.cs.shared_size && shader_has_shared_ops(s)) {
+      const struct dxil_type *type;
+      unsigned size;
+
+      /*
+       * We always allocate a u32 array, no matter the actual variable types.
+       * According to the DXIL spec, the minimum load/store granularity is
+       * 32 bits; anything smaller requires a read-extract or
+       * read-modify-write approach. Non-atomic 64-bit accesses are allowed,
+       * but the GEP(cast(gvar, u64[] *), offset) and
+       * cast(GEP(gvar, offset), u64 *) sequences don't seem to be accepted
+       * by the DXIL validator when the pointer is in the groupshared address
+       * space, making the 32-bit -> 64-bit pointer cast impossible.
+       */
+      size = ALIGN_POT(s->info.cs.shared_size, sizeof(uint32_t));
+      type = dxil_module_get_array_type(&ctx->mod,
+                                        dxil_module_get_int_type(&ctx->mod, 32),
+                                        size / sizeof(uint32_t));
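+      /* ffs(sizeof(uint64_t)) == 4, matching LLVM bitcode's log2(align) + 1
+       * encoding of an 8-byte alignment (assuming dxil_add_global_ptr_var
+       * takes the encoded form). */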
+      ctx->sharedvars = dxil_add_global_ptr_var(&ctx->mod, "shared", type,
+                                                DXIL_AS_GROUPSHARED,
+                                                ffs(sizeof(uint64_t)),
+                                                NULL);
+   }
+
+   if (!emit_scratch(ctx, s))
+      return false;
+
+   /* UAVs */
+   if (s->info.stage == MESA_SHADER_KERNEL) {
+      if (!emit_globals(ctx, s, opts->num_kernel_globals))
+         return false;
+
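+      /* Presumably maps NIR constant variables to the DXIL globals that
+       * emit_global_consts() creates for them. */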
+      ctx->consts = _mesa_pointer_hash_table_create(ctx->ralloc_ctx);
+      if (!ctx->consts)
+         return false;
+      if (!emit_global_consts(ctx, s))
+         return false;
+   }
+
    nir_foreach_variable_with_modes(var, s, nir_var_uniform) {
       unsigned count = glsl_type_get_image_count(var->type);
       if (var->data.mode == nir_var_uniform && count) {
@@ -3383,6 +4106,7 @@ get_dxil_shader_kind(struct nir_shader *s)
       return DXIL_GEOMETRY_SHADER;
    case MESA_SHADER_FRAGMENT:
       return DXIL_PIXEL_SHADER;
+   case MESA_SHADER_KERNEL:
    case MESA_SHADER_COMPUTE:
       return DXIL_COMPUTE_SHADER;
    default:
@@ -3437,11 +4161,16 @@ optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
       NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
       NIR_PASS(progress, s, nir_opt_algebraic);
       NIR_PASS(progress, s, dxil_nir_lower_x2b);
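+      /* Lower whatever 64-bit integer ops are flagged in
+       * lower_int64_options before the generic ALU lowering below. */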
+      if (s->options->lower_int64_options)
+         NIR_PASS(progress, s, nir_lower_int64);
       NIR_PASS(progress, s, nir_lower_alu);
       NIR_PASS(progress, s, dxil_nir_lower_inot);
       NIR_PASS(progress, s, nir_opt_constant_folding);
       NIR_PASS(progress, s, nir_opt_undef);
+      NIR_PASS(progress, s, nir_lower_undef_to_zero);
       NIR_PASS(progress, s, nir_opt_deref);
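+      /* Presumably widens phis narrower than the minimum supported bit size
+       * (32 when 16-bit types are lowered, 16 otherwise) and splits 64-bit
+       * phis into 32-bit halves. */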
+      NIR_PASS(progress, s, dxil_nir_lower_upcast_phis, opts->lower_int16 ? 32 : 16);
+      NIR_PASS(progress, s, nir_lower_64bit_phis);
       NIR_PASS_V(s, nir_lower_system_values);
    } while (progress);
 
@@ -3602,7 +4331,7 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
    if (debug_dxil & DXIL_DEBUG_VERBOSE)
       nir_print_shader(s, stderr);
 
-   if (!emit_module(ctx, s)) {
+   if (!emit_module(ctx, s, opts)) {
       debug_printf("D3D12: dxil_container_add_module failed\n");
       retval = false;
       goto out;
index d0d7d16..654fc9d 100644 (file)
@@ -52,6 +52,7 @@ struct nir_to_dxil_options {
    bool disable_math_refactoring;
    unsigned ubo_binding_offset;
    unsigned provoking_vertex;
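+   /* Number of global buffers bound as UAVs for kernels; consumed by
+    * emit_globals(). */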
+   unsigned num_kernel_globals;
 };
 
 bool
index ed218ad..05d8683 100644 (file)
@@ -20,4 +20,7 @@
 # IN THE SOFTWARE.
 
 subdir('compiler')
+if with_microsoft_clc
+  subdir('clc')
+endif
 subdir('resource_state_manager')