error('ACO tests require Radv')
endif
+_microsoft_clc = get_option('microsoft-clc')
+if _microsoft_clc == 'auto'
+ with_microsoft_clc = false
+else
+ with_microsoft_clc = _microsoft_clc == 'true'
+endif
+
+if with_microsoft_clc
+ with_clc = true
+ dep_clang = dependency(
+ 'clang',
+ method: 'cmake',
+ static: true,
+ modules: [
+ 'clangBasic', 'clangCodeGen', 'clangDriver', 'clangFrontend', 'clangFrontendTool',
+ 'clangHandleCXX', 'clangHandleLLVM',
+ ],
+ )
+endif
+
if host_machine.system() == 'darwin'
with_dri_platform = 'apple'
pre_args += '-DBUILDING_MESA'
'lto', 'option', 'objcarcopts', 'profiledata',
]
endif
+if with_microsoft_clc
+ llvm_modules += ['target', 'linker', 'irreader', 'option', 'libdriver']
+endif
-if with_amd_vk or with_gallium_radeonsi or with_gallium_opencl
+if with_microsoft_clc
+ _llvm_version = '>= 10.0.0'
+elif with_amd_vk or with_gallium_radeonsi or with_gallium_opencl
_llvm_version = '>= 8.0.0'
elif with_gallium_swr
_llvm_version = '>= 6.0.0'
optional_modules : llvm_optional_modules,
required : (
with_amd_vk or with_gallium_radeonsi or with_gallium_swr or
- with_gallium_opencl or _llvm == 'enabled'
+ with_gallium_opencl or with_microsoft_clc or _llvm == 'enabled'
),
static : not _shared_llvm,
method : _llvm_method,
error('The following drivers require LLVM: Radv, RadeonSI, SWR. One of these is enabled, but LLVM is disabled.')
elif with_gallium_opencl
error('The OpenCL "Clover" state tracker requires LLVM, but LLVM is disabled.')
+elif with_microsoft_clc
+ error('The Microsoft CLC compiler requires LLVM, but LLVM is disabled.')
endif
-with_opencl_spirv = _opencl != 'disabled' and get_option('opencl-spirv')
+with_opencl_spirv = (_opencl != 'disabled' and get_option('opencl-spirv')) or with_microsoft_clc
if with_opencl_spirv
chosen_llvm_version_array = dep_llvm.version().split('.')
chosen_llvm_version_major = chosen_llvm_version_array[0].to_int()
description : 'Enable GLVND support.'
)
option(
+ 'microsoft-clc',
+ type : 'combo',
+ value : 'auto',
+ choices : ['auto', 'true', 'false'],
+ description : 'Build support for the Microsoft CLC to DXIL compiler'
+)
+option(
'glx-read-only-text',
type : 'boolean',
value : false,
intrinsic("bindless_resource_ir3", [1], dest_comp=1, indices=[DESC_SET], flags=[CAN_ELIMINATE, CAN_REORDER])
# DXIL specific intrinsics
+# src[] = { value, mask, index, offset }.
+intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1])
+# src[] = { value, index }.
+intrinsic("store_shared_dxil", [1, 1])
+# src[] = { value, mask, index }.
+intrinsic("store_shared_masked_dxil", [1, 1, 1])
+# src[] = { value, index }.
+intrinsic("store_scratch_dxil", [1, 1])
+# src[] = { index }.
+load("shared_dxil", [1], [], [CAN_ELIMINATE])
+# src[] = { index }.
+load("scratch_dxil", [1], [], [CAN_ELIMINATE])
+# src[] = { deref_var, offset }
+load("ptr_dxil", [1, 1], [], [])
# src[] = { index, 16-byte-based-offset }
load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE])
+# DXIL Shared atomic intrinsics
+#
+# All of the shared variable atomic memory operations read a value from
+# memory, compute a new value using one of the operations below, write the
+# new value to memory, and return the original value read.
+#
+# All operations take 2 sources:
+#
+# 0: The index in the i32 array used by the shared memory region
+# 1: The data parameter to the atomic function (i.e. the value to add
+# in shared_atomic_add, etc).
+intrinsic("shared_atomic_add_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_imin_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_umin_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_imax_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_umax_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_and_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_or_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_xor_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_exchange_dxil", src_comp=[1, 1], dest_comp=1)
+intrinsic("shared_atomic_comp_swap_dxil", src_comp=[1, 1, 1], dest_comp=1)
+
# Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
# within a blend shader to read/write the raw value from the tile buffer,
# without applying any format conversion in the process. If the shader needs
if with_any_intel
subdir('intel')
endif
-if with_gallium_d3d12
+if with_microsoft_clc or with_gallium_d3d12
subdir('microsoft')
endif
subdir('mesa')
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_serialize.h"
+#include "glsl_types.h"
+#include "nir_types.h"
+#include "clc_compiler.h"
+#include "clc_helpers.h"
+#include "clc_nir.h"
+#include "../compiler/dxil_nir.h"
+#include "../compiler/dxil_nir_lower_int_samplers.h"
+#include "../compiler/nir_to_dxil.h"
+
+#include "util/u_debug.h"
+#include <util/u_math.h>
+#include "spirv/nir_spirv.h"
+#include "nir_builder.h"
+#include "nir_builtin_builder.h"
+
+#include "git_sha1.h"
+
+enum clc_debug_flags {
+ CLC_DEBUG_DUMP_SPIRV = 1 << 0,
+ CLC_DEBUG_VERBOSE = 1 << 1,
+};
+
+static const struct debug_named_value debug_options[] = {
+ { "dump_spirv", CLC_DEBUG_DUMP_SPIRV, "Dump spirv blobs" },
+ { "verbose", CLC_DEBUG_VERBOSE, NULL },
+ DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(debug_clc, "CLC_DEBUG", debug_options, 0)
+
+static void
+clc_print_kernels_info(const struct clc_object *obj)
+{
+ fprintf(stdout, "Kernels:\n");
+ for (unsigned i = 0; i < obj->num_kernels; i++) {
+ const struct clc_kernel_arg *args = obj->kernels[i].args;
+ bool first = true;
+
+ fprintf(stdout, "\tvoid %s(", obj->kernels[i].name);
+ for (unsigned j = 0; j < obj->kernels[i].num_args; j++) {
+ if (!first)
+ fprintf(stdout, ", ");
+ else
+ first = false;
+
+ switch (args[j].address_qualifier) {
+ case CLC_KERNEL_ARG_ADDRESS_GLOBAL:
+ fprintf(stdout, "__global ");
+ break;
+ case CLC_KERNEL_ARG_ADDRESS_LOCAL:
+ fprintf(stdout, "__local ");
+ break;
+ case CLC_KERNEL_ARG_ADDRESS_CONSTANT:
+ fprintf(stdout, "__constant ");
+ break;
+ default:
+ break;
+ }
+
+ if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_VOLATILE)
+ fprintf(stdout, "volatile ");
+ if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_CONST)
+ fprintf(stdout, "const ");
+ if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_RESTRICT)
+ fprintf(stdout, "restrict ");
+
+ fprintf(stdout, "%s %s", args[j].type_name, args[j].name);
+ }
+ fprintf(stdout, ");\n");
+ }
+}
+
+struct clc_image_lower_context
+{
+ struct clc_dxil_metadata *metadata;
+ unsigned *num_srvs;
+ unsigned *num_uavs;
+ nir_deref_instr *deref;
+ unsigned num_buf_ids;
+ int metadata_index;
+};
+
+static int
+lower_image_deref_impl(nir_builder *b, struct clc_image_lower_context *context,
+ const struct glsl_type *new_var_type,
+ unsigned *num_bindings)
+{
+ nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+ nir_variable *uniform = nir_variable_create(b->shader, nir_var_uniform, new_var_type, NULL);
+ uniform->data.access = in_var->data.access;
+ uniform->data.binding = in_var->data.binding;
+ if (context->num_buf_ids > 0) {
+ // Need to assign a new binding
+ context->metadata->args[context->metadata_index].
+ image.buf_ids[context->num_buf_ids] = uniform->data.binding = (*num_bindings)++;
+ }
+ context->num_buf_ids++;
+ return uniform->data.binding;
+}
+
+static int
+lower_read_only_image_deref(nir_builder *b, struct clc_image_lower_context *context,
+ nir_alu_type image_type)
+{
+ nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+
+ // Non-writeable images should be converted to samplers,
+ // since they may have texture operations done on them
+ const struct glsl_type *new_var_type =
+ glsl_sampler_type(glsl_get_sampler_dim(in_var->type),
+ false, glsl_sampler_type_is_array(in_var->type),
+ nir_get_glsl_base_type_for_nir_type(image_type | 32));
+ return lower_image_deref_impl(b, context, new_var_type, context->num_srvs);
+}
+
+static int
+lower_read_write_image_deref(nir_builder *b, struct clc_image_lower_context *context,
+ nir_alu_type image_type)
+{
+ nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+ const struct glsl_type *new_var_type =
+ glsl_image_type(glsl_get_sampler_dim(in_var->type),
+ glsl_sampler_type_is_array(in_var->type),
+ nir_get_glsl_base_type_for_nir_type(image_type | 32));
+ return lower_image_deref_impl(b, context, new_var_type, context->num_uavs);
+}
+
+static void
+clc_lower_input_image_deref(nir_builder *b, struct clc_image_lower_context *context)
+{
+ // The input variable here isn't actually an image, it's just the
+ // image format data.
+ //
+ // For every use of an image in a different way, we'll add an
+ // appropriate uniform to match it. That can result in up to
+ // 3 uniforms (float4, int4, uint4) for each image. Only one of these
+ // formats will actually produce correct data, but a single kernel
+ // could use runtime conditionals to potentially access any of them.
+ //
+ // If the image is used in a query that doesn't have a corresponding
+ // DXIL intrinsic (CL image channel order or channel format), then
+ // we'll add a kernel input for that data that'll be lowered by the
+ // explicit IO pass later on.
+ //
+ // After all that, we can remove the image input variable and deref.
+
+ enum image_uniform_type {
+ FLOAT4,
+ INT4,
+ UINT4,
+ IMAGE_UNIFORM_TYPE_COUNT
+ };
+
+ int image_bindings[IMAGE_UNIFORM_TYPE_COUNT] = {-1, -1, -1};
+ nir_ssa_def *format_deref_dest = NULL, *order_deref_dest = NULL;
+
+ nir_variable *in_var = nir_deref_instr_get_variable(context->deref);
+ enum gl_access_qualifier access = in_var->data.access;
+
+ context->metadata_index = 0;
+ while (context->metadata->args[context->metadata_index].image.buf_ids[0] != in_var->data.binding)
+ context->metadata_index++;
+
+ context->num_buf_ids = 0;
+
+ /* Do this in 2 passes:
+ * 1. When encountering a strongly-typed access (load/store), replace the deref
+ * with one that references an appropriately typed variable. When encountering
+ * an untyped access (size query), if we have a strongly-typed variable already,
+ * replace the deref to point to it.
+ * 2. If there's any references left, they should all be untyped. If we found
+ * a strongly-typed access later in the 1st pass, then just replace the reference.
+ * If we didn't, e.g. the resource is only used for a size query, then pick an
+ * arbitrary type for it.
+ */
+ for (int pass = 0; pass < 2; ++pass) {
+ nir_foreach_use_safe(src, &context->deref->dest.ssa) {
+ enum image_uniform_type type;
+
+ if (src->parent_instr->type == nir_instr_type_intrinsic) {
+ nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(src->parent_instr);
+ enum nir_alu_type dest_type;
+
+ b->cursor = nir_before_instr(&intrinsic->instr);
+
+ switch (intrinsic->intrinsic) {
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_store: {
+ dest_type = intrinsic->intrinsic == nir_intrinsic_image_deref_load ?
+ nir_intrinsic_dest_type(intrinsic) : nir_intrinsic_src_type(intrinsic);
+
+ switch (nir_alu_type_get_base_type(dest_type)) {
+ case nir_type_float: type = FLOAT4; break;
+ case nir_type_int: type = INT4; break;
+ case nir_type_uint: type = UINT4; break;
+ default: unreachable("Unsupported image type for load.");
+ }
+
+ int image_binding = image_bindings[type];
+ if (image_binding < 0) {
+ image_binding = image_bindings[type] =
+ lower_read_write_image_deref(b, context, dest_type);
+ }
+
+ assert((in_var->data.access & ACCESS_NON_WRITEABLE) == 0);
+ nir_rewrite_image_intrinsic(intrinsic, nir_imm_int(b, image_binding), false);
+ break;
+ }
+
+ case nir_intrinsic_image_deref_size: {
+ int image_binding = -1;
+ for (unsigned i = 0; i < IMAGE_UNIFORM_TYPE_COUNT; ++i) {
+ if (image_bindings[i] >= 0) {
+ image_binding = image_bindings[i];
+ break;
+ }
+ }
+ if (image_binding < 0) {
+ // Skip for now and come back to it
+ if (pass == 0)
+ break;
+
+ type = FLOAT4;
+ image_binding = image_bindings[type] =
+ lower_read_write_image_deref(b, context, nir_type_float32);
+ }
+
+ assert((in_var->data.access & ACCESS_NON_WRITEABLE) == 0);
+ nir_rewrite_image_intrinsic(intrinsic, nir_imm_int(b, image_binding), false);
+ break;
+ }
+
+ case nir_intrinsic_image_deref_format:
+ case nir_intrinsic_image_deref_order: {
+ nir_ssa_def **cached_deref = intrinsic->intrinsic == nir_intrinsic_image_deref_format ?
+ &format_deref_dest : &order_deref_dest;
+ if (!*cached_deref) {
+ nir_variable *new_input = nir_variable_create(b->shader, nir_var_uniform, glsl_uint_type(), NULL);
+ new_input->data.driver_location = in_var->data.driver_location;
+ if (intrinsic->intrinsic == nir_intrinsic_image_deref_format) {
+ /* Match cl_image_format { image_channel_order, image_channel_data_type }; */
+ new_input->data.driver_location += glsl_get_cl_size(new_input->type);
+ }
+
+ b->cursor = nir_after_instr(&context->deref->instr);
+ *cached_deref = nir_load_var(b, new_input);
+ }
+
+ /* No actual intrinsic needed here, just reference the loaded variable */
+ nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, nir_src_for_ssa(*cached_deref));
+ nir_instr_remove(&intrinsic->instr);
+ break;
+ }
+
+ default:
+ unreachable("Unsupported image intrinsic");
+ }
+ } else if (src->parent_instr->type == nir_instr_type_tex) {
+ assert(in_var->data.access & ACCESS_NON_WRITEABLE);
+ nir_tex_instr *tex = nir_instr_as_tex(src->parent_instr);
+
+ switch (nir_alu_type_get_base_type(tex->dest_type)) {
+ case nir_type_float: type = FLOAT4; break;
+ case nir_type_int: type = INT4; break;
+ case nir_type_uint: type = UINT4; break;
+ default: unreachable("Unsupported image format for sample.");
+ }
+
+ int image_binding = image_bindings[type];
+ if (image_binding < 0) {
+ image_binding = image_bindings[type] =
+ lower_read_only_image_deref(b, context, tex->dest_type);
+ }
+
+ nir_tex_instr_remove_src(tex, nir_tex_instr_src_index(tex, nir_tex_src_texture_deref));
+ tex->texture_index = image_binding;
+ }
+ }
+ }
+
+ context->metadata->args[context->metadata_index].image.num_buf_ids = context->num_buf_ids;
+
+ nir_instr_remove(&context->deref->instr);
+ exec_node_remove(&in_var->node);
+}
+
+static void
+clc_lower_images(nir_shader *nir, struct clc_image_lower_context *context)
+{
+ nir_foreach_function(func, nir) {
+ if (!func->is_entrypoint)
+ continue;
+ assert(func->impl);
+
+ nir_builder b;
+ nir_builder_init(&b, func->impl);
+
+ nir_foreach_block(block, func->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type == nir_instr_type_deref) {
+ context->deref = nir_instr_as_deref(instr);
+
+ if (glsl_type_is_image(context->deref->type)) {
+ assert(context->deref->deref_type == nir_deref_type_var);
+ clc_lower_input_image_deref(&b, context);
+ }
+ }
+ }
+ }
+ }
+}
+
+static void
+clc_lower_64bit_semantics(nir_shader *nir)
+{
+ nir_foreach_function(func, nir) {
+ nir_builder b;
+ nir_builder_init(&b, func->impl);
+
+ nir_foreach_block(block, func->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type == nir_instr_type_intrinsic) {
+ nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+ switch (intrinsic->intrinsic) {
+ case nir_intrinsic_load_global_invocation_id:
+ case nir_intrinsic_load_global_invocation_id_zero_base:
+ case nir_intrinsic_load_base_global_invocation_id:
+ case nir_intrinsic_load_local_invocation_id:
+ case nir_intrinsic_load_work_group_id:
+ case nir_intrinsic_load_work_group_id_zero_base:
+ case nir_intrinsic_load_base_work_group_id:
+ case nir_intrinsic_load_num_work_groups:
+ break;
+ default:
+ continue;
+ }
+
+ if (nir_instr_ssa_def(instr)->bit_size != 64)
+ continue;
+
+ intrinsic->dest.ssa.bit_size = 32;
+ b.cursor = nir_after_instr(instr);
+
+ nir_ssa_def *i64 = nir_u2u64(&b, &intrinsic->dest.ssa);
+ nir_ssa_def_rewrite_uses_after(
+ &intrinsic->dest.ssa,
+ nir_src_for_ssa(i64),
+ i64->parent_instr);
+ }
+ }
+ }
+ }
+}
+
+static void
+clc_lower_nonnormalized_samplers(nir_shader *nir,
+ const dxil_wrap_sampler_state *states)
+{
+ nir_foreach_function(func, nir) {
+ if (!func->is_entrypoint)
+ continue;
+ assert(func->impl);
+
+ nir_builder b;
+ nir_builder_init(&b, func->impl);
+
+ nir_foreach_block(block, func->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_tex)
+ continue;
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+ int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
+ if (sampler_src_idx == -1)
+ continue;
+
+ nir_src *sampler_src = &tex->src[sampler_src_idx].src;
+ assert(sampler_src->is_ssa && sampler_src->ssa->parent_instr->type == nir_instr_type_deref);
+ nir_variable *sampler = nir_deref_instr_get_variable(
+ nir_instr_as_deref(sampler_src->ssa->parent_instr));
+
+ // If the sampler returns ints, we'll handle this in the int lowering pass
+ if (nir_alu_type_get_base_type(tex->dest_type) != nir_type_float)
+ continue;
+
+ // If sampler uses normalized coords, nothing to do
+ if (!states[sampler->data.binding].is_nonnormalized_coords)
+ continue;
+
+ b.cursor = nir_before_instr(&tex->instr);
+
+ int coords_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
+ assert(coords_idx != -1);
+ nir_ssa_def *coords =
+ nir_ssa_for_src(&b, tex->src[coords_idx].src, tex->coord_components);
+
+ nir_ssa_def *txs = nir_i2f32(&b, nir_get_texture_size(&b, tex));
+
+ // Normalize coords for tex
+ nir_ssa_def *scale = nir_frcp(&b, txs);
+ nir_ssa_def *comps[4];
+ for (unsigned i = 0; i < coords->num_components; ++i) {
+ comps[i] = nir_channel(&b, coords, i);
+ if (tex->is_array && i == coords->num_components - 1) {
+ // Don't scale the array index, but do clamp it
+ comps[i] = nir_fround_even(&b, comps[i]);
+ comps[i] = nir_fmax(&b, comps[i], nir_imm_float(&b, 0.0f));
+ comps[i] = nir_fmin(&b, comps[i], nir_fsub(&b, nir_channel(&b, txs, i), nir_imm_float(&b, 1.0f)));
+ break;
+ }
+
+ // The CTS is pretty clear that this value has to be floored for nearest sampling
+ // but must not be for linear sampling.
+ if (!states[sampler->data.binding].is_linear_filtering)
+ comps[i] = nir_fadd_imm(&b, nir_ffloor(&b, comps[i]), 0.5f);
+ comps[i] = nir_fmul(&b, comps[i], nir_channel(&b, scale, i));
+ }
+ nir_ssa_def *normalized_coords = nir_vec(&b, comps, coords->num_components);
+ nir_instr_rewrite_src(&tex->instr,
+ &tex->src[coords_idx].src,
+ nir_src_for_ssa(normalized_coords));
+ }
+ }
+ }
+}
+
+
+static void
+clc_context_optimize(nir_shader *s)
+{
+ bool progress;
+ do {
+ progress = false;
+ NIR_PASS(progress, s, nir_split_var_copies);
+ NIR_PASS(progress, s, nir_opt_copy_prop_vars);
+ NIR_PASS(progress, s, nir_lower_var_copies);
+ NIR_PASS(progress, s, nir_lower_vars_to_ssa);
+ NIR_PASS(progress, s, nir_copy_prop);
+ NIR_PASS(progress, s, nir_opt_remove_phis);
+ NIR_PASS(progress, s, nir_opt_dce);
+ NIR_PASS(progress, s, nir_opt_if, true);
+ NIR_PASS(progress, s, nir_opt_dead_cf);
+ NIR_PASS(progress, s, nir_opt_cse);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, s, nir_opt_algebraic);
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ NIR_PASS(progress, s, nir_opt_undef);
+ NIR_PASS(progress, s, nir_lower_undef_to_zero);
+ NIR_PASS(progress, s, nir_opt_deref);
+ } while (progress);
+}
+
+struct clc_context *
+clc_context_new(const struct clc_logger *logger, const struct clc_context_options *options)
+{
+ struct clc_context *ctx = rzalloc(NULL, struct clc_context);
+ if (!ctx) {
+ clc_error(logger, "D3D12: failed to allocate a clc_context");
+ return NULL;
+ }
+
+ const struct spirv_to_nir_options libclc_spirv_options = {
+ .environment = NIR_SPIRV_OPENCL,
+ .create_library = true,
+ .constant_addr_format = nir_address_format_32bit_index_offset_pack64,
+ .global_addr_format = nir_address_format_32bit_index_offset_pack64,
+ .shared_addr_format = nir_address_format_32bit_offset_as_64bit,
+ .temp_addr_format = nir_address_format_32bit_offset_as_64bit,
+ .float_controls_execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32,
+ .caps = {
+ .address = true,
+ .float64 = true,
+ .int8 = true,
+ .int16 = true,
+ .int64 = true,
+ .kernel = true,
+ },
+ };
+ const struct nir_shader_compiler_options *libclc_nir_options =
+ dxil_get_nir_compiler_options();
+
+ glsl_type_singleton_init_or_ref();
+ nir_shader *s = nir_load_libclc_shader(64, NULL, &libclc_spirv_options, libclc_nir_options);
+ if (!s) {
+ clc_error(logger, "D3D12: spirv_to_nir failed on libclc blob");
+ ralloc_free(ctx);
+ return NULL;
+ }
+
+ if (options && options->optimize)
+ clc_context_optimize(s);
+
+ ctx->libclc_nir = s;
+ ralloc_steal(ctx, ctx->libclc_nir);
+
+ return ctx;
+}
+
+/* Destroy a context created by clc_context_new() or clc_context_deserialize():
+ * ralloc_free() releases the context and the libclc NIR shader it owns (the
+ * shader was ralloc_steal'ed onto ctx), then the glsl_type singleton reference
+ * taken at creation time is dropped.
+ */
+void
+clc_free_context(struct clc_context *ctx)
+{
+ ralloc_free(ctx);
+ glsl_type_singleton_decref();
+}
+
+/* Serialize the context's libclc NIR shader into a heap buffer.
+ * On return, *serialized points to the buffer and *serialized_size holds its
+ * length; the caller owns it and must release it with
+ * clc_context_free_serialized().
+ */
+void clc_context_serialize(struct clc_context *context,
+ void **serialized,
+ size_t *serialized_size)
+{
+ struct blob tmp;
+ blob_init(&tmp);
+ nir_serialize(&tmp, context->libclc_nir, true);
+
+ /* Hands the blob's underlying buffer to the caller (no copy). */
+ blob_finish_get_buffer(&tmp, serialized, serialized_size);
+}
+
+/* Release a buffer returned by clc_context_serialize(). */
+void clc_context_free_serialized(void *serialized)
+{
+ free(serialized);
+}
+
+/* Recreate a clc_context from a blob produced by clc_context_serialize().
+ * Takes a glsl_type singleton reference (paired with the decref in
+ * clc_free_context()). Returns NULL on allocation or deserialization failure.
+ */
+struct clc_context *
+clc_context_deserialize(const void *serialized, size_t serialized_size)
+{
+ struct clc_context *ctx = rzalloc(NULL, struct clc_context);
+ if (!ctx) {
+ return NULL;
+ }
+ const struct nir_shader_compiler_options *libclc_nir_options =
+ dxil_get_nir_compiler_options();
+
+ glsl_type_singleton_init_or_ref();
+
+ struct blob_reader tmp;
+ blob_reader_init(&tmp, serialized, serialized_size);
+
+ ctx->libclc_nir = nir_deserialize(NULL, libclc_nir_options, &tmp);
+ if (!ctx->libclc_nir) {
+ /* ctx was allocated with rzalloc(), so it must be released with
+ * ralloc_free(), not free(); also drop the singleton ref taken above
+ * so the refcount stays balanced on the error path.
+ */
+ glsl_type_singleton_decref();
+ ralloc_free(ctx);
+ return NULL;
+ }
+
+ ralloc_steal(ctx, ctx->libclc_nir);
+
+ return ctx;
+}
+
+struct clc_object *
+clc_compile(struct clc_context *ctx,
+ const struct clc_compile_args *args,
+ const struct clc_logger *logger)
+{
+ struct clc_object *obj;
+ int ret;
+
+ obj = calloc(1, sizeof(*obj));
+ if (!obj) {
+ clc_error(logger, "D3D12: failed to allocate a clc_object");
+ return NULL;
+ }
+
+ ret = clc_to_spirv(args, &obj->spvbin, logger);
+ if (ret < 0) {
+ free(obj);
+ return NULL;
+ }
+
+ if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV)
+ clc_dump_spirv(&obj->spvbin, stdout);
+
+ return obj;
+}
+
+struct clc_object *
+clc_link(struct clc_context *ctx,
+ const struct clc_linker_args *args,
+ const struct clc_logger *logger)
+{
+ struct clc_object *out_obj;
+ int ret;
+
+ out_obj = malloc(sizeof(*out_obj));
+ if (!out_obj) {
+ clc_error(logger, "failed to allocate a clc_object");
+ return NULL;
+ }
+
+ ret = clc_link_spirv_binaries(args, &out_obj->spvbin, logger);
+ if (ret < 0) {
+ free(out_obj);
+ return NULL;
+ }
+
+ if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV)
+ clc_dump_spirv(&out_obj->spvbin, stdout);
+
+ out_obj->kernels = clc_spirv_get_kernels_info(&out_obj->spvbin,
+ &out_obj->num_kernels);
+
+ if (debug_get_option_debug_clc() & CLC_DEBUG_VERBOSE)
+ clc_print_kernels_info(out_obj);
+
+ return out_obj;
+}
+
+/* Release a clc_object returned by clc_compile() or clc_link(): frees the
+ * kernels metadata (NULL/0 for compile-only objects, which calloc-zero it),
+ * the SPIR-V binary, and the object itself.
+ */
+void clc_free_object(struct clc_object *obj)
+{
+ clc_free_kernels_info(obj->kernels, obj->num_kernels);
+ clc_free_spirv_binary(&obj->spvbin);
+ free(obj);
+}
+
+static nir_variable *
+add_kernel_inputs_var(struct clc_dxil_object *dxil, nir_shader *nir,
+ unsigned *cbv_id)
+{
+ if (!dxil->kernel->num_args)
+ return NULL;
+
+ struct clc_dxil_metadata *metadata = &dxil->metadata;
+ unsigned size = 0;
+
+ nir_foreach_variable_with_modes(var, nir, nir_var_uniform)
+ size = MAX2(size,
+ var->data.driver_location +
+ glsl_get_cl_size(var->type));
+
+ size = align(size, 4);
+
+ nir_variable *var =
+ nir_variable_create(nir, nir_var_mem_ubo,
+ glsl_array_type(glsl_uint_type(),
+ size / 4, 0),
+ "kernel_inputs");
+ var->data.binding = (*cbv_id)++;
+ var->data.how_declared = nir_var_hidden;
+ return var;
+}
+
+static nir_variable *
+add_work_properties_var(struct clc_dxil_object *dxil,
+ struct nir_shader *nir, unsigned *cbv_id)
+{
+ struct clc_dxil_metadata *metadata = &dxil->metadata;
+ nir_variable *var =
+ nir_variable_create(nir, nir_var_mem_ubo,
+ glsl_array_type(glsl_uint_type(),
+ sizeof(struct clc_work_properties_data) / sizeof(unsigned),
+ 0),
+ "kernel_work_properies");
+ var->data.binding = (*cbv_id)++;
+ var->data.how_declared = nir_var_hidden;
+ return var;
+}
+
+static void
+clc_lower_constant_to_ssbo(nir_shader *nir,
+ const struct clc_kernel_info *kerninfo, unsigned *uav_id)
+{
+ /* Update constant vars and assign them an SSBO binding. */
+ nir_foreach_variable_with_modes(var, nir, nir_var_mem_constant) {
+ var->data.mode = nir_var_mem_ssbo;
+ var->data.binding = (*uav_id)++;
+ }
+
+ /* And finally patch all the derefs referencing the constant
+ * variables/pointers.
+ */
+ nir_foreach_function(func, nir) {
+ if (!func->is_entrypoint)
+ continue;
+
+ assert(func->impl);
+
+ nir_builder b;
+ nir_builder_init(&b, func->impl);
+
+ nir_foreach_block(block, func->impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_deref)
+ continue;
+
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+ if (deref->modes != nir_var_mem_constant)
+ continue;
+
+ deref->modes = nir_var_mem_ssbo;
+ }
+ }
+ }
+}
+
+static void
+clc_lower_global_to_ssbo(nir_shader *nir)
+{
+ nir_foreach_function(func, nir) {
+ if (!func->is_entrypoint)
+ continue;
+
+ assert(func->impl);
+
+ nir_foreach_block(block, func->impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_deref)
+ continue;
+
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+ if (deref->modes != nir_var_mem_global)
+ continue;
+
+ deref->modes = nir_var_mem_ssbo;
+ }
+ }
+ }
+}
+
+static void
+copy_const_initializer(const nir_constant *constant, const struct glsl_type *type,
+ uint8_t *data)
+{
+ unsigned size = glsl_get_cl_size(type);
+
+ if (glsl_type_is_array(type)) {
+ const struct glsl_type *elm_type = glsl_get_array_element(type);
+ unsigned step_size = glsl_get_explicit_stride(type);
+
+ for (unsigned i = 0; i < constant->num_elements; i++) {
+ copy_const_initializer(constant->elements[i], elm_type,
+ data + (i * step_size));
+ }
+ } else if (glsl_type_is_struct(type)) {
+ for (unsigned i = 0; i < constant->num_elements; i++) {
+ const struct glsl_type *elm_type = glsl_get_struct_field(type, i);
+ int offset = glsl_get_struct_field_offset(type, i);
+ copy_const_initializer(constant->elements[i], elm_type, data + offset);
+ }
+ } else {
+ assert(glsl_type_is_vector_or_scalar(type));
+
+ for (unsigned i = 0; i < glsl_get_components(type); i++) {
+ switch (glsl_get_bit_size(type)) {
+ case 64:
+ *((uint64_t *)data) = constant->values[i].u64;
+ break;
+ case 32:
+ *((uint32_t *)data) = constant->values[i].u32;
+ break;
+ case 16:
+ *((uint16_t *)data) = constant->values[i].u16;
+ break;
+ case 8:
+ *((uint8_t *)data) = constant->values[i].u8;
+ break;
+ default:
+ unreachable("Invalid base type");
+ }
+
+ data += glsl_get_bit_size(type) / 8;
+ }
+ }
+}
+
+/* Return the signed-integer glsl type whose width matches bit_size
+ * (8/16/32/64). Used to build deref casts when splitting unaligned
+ * loads/stores into naturally-aligned scalar accesses. Aborts on any
+ * other bit size.
+ */
+static const struct glsl_type *
+get_cast_type(unsigned bit_size)
+{
+ switch (bit_size) {
+ case 64:
+ return glsl_int64_t_type();
+ case 32:
+ return glsl_int_type();
+ case 16:
+ return glsl_int16_t_type();
+ case 8:
+ return glsl_int8_t_type();
+ }
+ unreachable("Invalid bit_size");
+}
+
+static void
+split_unaligned_load(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment)
+{
+ enum gl_access_qualifier access = nir_intrinsic_access(intrin);
+ nir_ssa_def *srcs[NIR_MAX_VEC_COMPONENTS * NIR_MAX_VEC_COMPONENTS * sizeof(int64_t) / 8];
+ unsigned comp_size = intrin->dest.ssa.bit_size / 8;
+ unsigned num_comps = intrin->dest.ssa.num_components;
+
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]);
+
+ const struct glsl_type *cast_type = get_cast_type(alignment * 8);
+ nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type, alignment);
+
+ unsigned num_loads = DIV_ROUND_UP(comp_size * num_comps, alignment);
+ for (unsigned i = 0; i < num_loads; ++i) {
+ nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->dest.ssa.bit_size));
+ srcs[i] = nir_load_deref_with_access(b, elem, access);
+ }
+
+ nir_ssa_def *new_dest = nir_extract_bits(b, srcs, num_loads, 0, num_comps, intrin->dest.ssa.bit_size);
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(new_dest));
+ nir_instr_remove(&intrin->instr);
+}
+
+static void
+split_unaligned_store(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment)
+{
+ enum gl_access_qualifier access = nir_intrinsic_access(intrin);
+
+ assert(intrin->src[1].is_ssa);
+ nir_ssa_def *value = intrin->src[1].ssa;
+ unsigned comp_size = value->bit_size / 8;
+ unsigned num_comps = value->num_components;
+
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]);
+
+ const struct glsl_type *cast_type = get_cast_type(alignment * 8);
+ nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type, alignment);
+
+ unsigned num_stores = DIV_ROUND_UP(comp_size * num_comps, alignment);
+ for (unsigned i = 0; i < num_stores; ++i) {
+ nir_ssa_def *substore_val = nir_extract_bits(b, &value, 1, i * alignment * 8, 1, alignment * 8);
+ nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->dest.ssa.bit_size));
+ nir_store_deref_with_access(b, elem, substore_val, ~0, access);
+ }
+
+ nir_instr_remove(&intrin->instr);
+}
+
+/* Walk all load_deref/store_deref intrinsics and split any access whose
+ * known alignment is below both the hardware minimum (4 bytes generally,
+ * 16 bytes for UBOs/CBs) and the access's natural alignment into a series
+ * of smaller aligned accesses (see split_unaligned_load/store).
+ */
+static bool
+split_unaligned_loads_stores(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, function->impl);
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_deref &&
+                intrin->intrinsic != nir_intrinsic_store_deref)
+               continue;
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+            /* If the alignment can't be determined, don't touch the access:
+             * previously an unknown alignment left align_mul == 0, and a
+             * zero alignment would divide by zero in the split helpers
+             * (DIV_ROUND_UP by the alignment).
+             */
+            unsigned align_mul = 0, align_offset = 0;
+            if (!nir_get_explicit_deref_align(deref, true, &align_mul, &align_offset))
+               continue;
+
+            unsigned alignment = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
+
+            /* We can load anything at 4-byte alignment, except for
+             * UBOs (AKA CBs where the granularity is 16 bytes).
+             */
+            if (alignment >= (deref->modes == nir_var_mem_ubo ? 16 : 4))
+               continue;
+
+            nir_ssa_def *val;
+            if (intrin->intrinsic == nir_intrinsic_load_deref) {
+               assert(intrin->dest.is_ssa);
+               val = &intrin->dest.ssa;
+            } else {
+               assert(intrin->src[1].is_ssa);
+               val = intrin->src[1].ssa;
+            }
+
+            /* vec3 occupies a vec4 slot, hence the padding to 4 components. */
+            unsigned natural_alignment =
+               val->bit_size / 8 *
+               (val->num_components == 3 ? 4 : val->num_components);
+
+            if (alignment >= natural_alignment)
+               continue;
+
+            if (intrin->intrinsic == nir_intrinsic_load_deref)
+               split_unaligned_load(&b, intrin, alignment);
+            else
+               split_unaligned_store(&b, intrin, alignment);
+            progress = true;
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* Map an OpenCL sampler addressing mode onto the matching gallium wrap
+ * mode.  NONE/CLAMP (and anything unrecognized) return -1, meaning "no
+ * wrap mode": OpenCL's only border color is all zeros, and D3D defines
+ * out-of-bounds loads to return zero, so nothing needs to be applied.
+ */
+static enum pipe_tex_wrap
+wrap_from_cl_addressing(unsigned addressing_mode)
+{
+   switch (addressing_mode)
+   {
+   case SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
+      return PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   case SAMPLER_ADDRESSING_MODE_REPEAT:
+      return PIPE_TEX_WRAP_REPEAT;
+   case SAMPLER_ADDRESSING_MODE_REPEAT_MIRRORED:
+      return PIPE_TEX_WRAP_MIRROR_REPEAT;
+   case SAMPLER_ADDRESSING_MODE_NONE:
+   case SAMPLER_ADDRESSING_MODE_CLAMP:
+   default:
+      return (enum pipe_tex_wrap)-1;
+   }
+}
+
+/* Returns true if any entrypoint contains a float ALU op producing a
+ * 64-bit (double) result.  Used by clc_to_dxil() to reject shaders with
+ * doubles until double lowering is hooked up.
+ */
+static bool shader_has_double(nir_shader *nir)
+{
+   /* (removed an unused `progress` local — this is a pure query, not a pass) */
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_alu)
+               continue;
+
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            const nir_op_info *info = &nir_op_infos[alu->op];
+
+            /* float base type and a 64-bit destination => a double op */
+            if (info->output_type & nir_type_float &&
+                nir_dest_bit_size(alu->dest.dest) == 64)
+               return true;
+         }
+      }
+   }
+
+   return false;
+}
+
+/* For every fdiv, pre-scale both operands by the same power of two when
+ * the divisor's magnitude is huge or tiny, so the divisor stays in a
+ * range where its reciprocal is representable; scaling numerator and
+ * denominator identically leaves the quotient unchanged.
+ * NOTE(review): the 0x7e800000 / 0x00800000 immediates are raw float bit
+ * patterns (~2^126 and FLT_MIN) compared via flt — confirm against the
+ * backend's fdiv lowering.
+ */
+static bool
+scale_fdiv(nir_shader *nir)
+{
+   bool any_scaled = false;
+
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_alu)
+               continue;
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            if (alu->op != nir_op_fdiv)
+               continue;
+
+            b.cursor = nir_before_instr(instr);
+
+            nir_ssa_def *num = alu->src[0].src.ssa;
+            nir_ssa_def *den = alu->src[1].src.ssa;
+
+            /* Classify the divisor's magnitude. */
+            nir_ssa_def *den_mag = nir_fabs(&b, den);
+            nir_ssa_def *is_huge = nir_flt(&b, nir_imm_int(&b, 0x7e800000), den_mag);
+            nir_ssa_def *is_tiny = nir_flt(&b, den_mag, nir_imm_int(&b, 0x00800000));
+
+            /* Scale by 1/4 when huge, by 2^24 when tiny, else keep as-is. */
+            nir_ssa_def *num_fixed =
+               nir_bcsel(&b, is_huge, nir_fmul_imm(&b, num, 0.25),
+                         nir_bcsel(&b, is_tiny, nir_fmul_imm(&b, num, 16777216.0), num));
+            nir_ssa_def *den_fixed =
+               nir_bcsel(&b, is_huge, nir_fmul_imm(&b, den, 0.25),
+                         nir_bcsel(&b, is_tiny, nir_fmul_imm(&b, den, 16777216.0), den));
+
+            nir_instr_rewrite_src(instr, &alu->src[0].src, nir_src_for_ssa(num_fixed));
+            nir_instr_rewrite_src(instr, &alu->src[1].src, nir_src_for_ssa(den_fixed));
+            any_scaled = true;
+         }
+      }
+   }
+
+   return any_scaled;
+}
+
+/* Compile one kernel from a SPIR-V clc_object down to DXIL.
+ *
+ * The returned clc_dxil_object owns its metadata (including the args
+ * array and any constant-initializer copies) and the DXIL binary; free
+ * it with clc_free_dxil_object().  Returns NULL on failure, reporting
+ * the reason through the logger.
+ */
+struct clc_dxil_object *
+clc_to_dxil(struct clc_context *ctx,
+            const struct clc_object *obj,
+            const char *entrypoint,
+            const struct clc_runtime_kernel_conf *conf,
+            const struct clc_logger *logger)
+{
+   struct clc_dxil_object *dxil;
+   struct nir_shader *nir;
+
+   dxil = calloc(1, sizeof(*dxil));
+   if (!dxil) {
+      clc_error(logger, "failed to allocate the dxil object");
+      return NULL;
+   }
+
+   for (unsigned i = 0; i < obj->num_kernels; i++) {
+      if (!strcmp(obj->kernels[i].name, entrypoint)) {
+         dxil->kernel = &obj->kernels[i];
+         break;
+      }
+   }
+
+   if (!dxil->kernel) {
+      clc_error(logger, "no '%s' kernel found", entrypoint);
+      goto err_free_dxil;
+   }
+
+   const struct spirv_to_nir_options spirv_options = {
+      .environment = NIR_SPIRV_OPENCL,
+      .clc_shader = ctx->libclc_nir,
+      .constant_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .global_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .shared_addr_format = nir_address_format_32bit_offset_as_64bit,
+      .temp_addr_format = nir_address_format_32bit_offset_as_64bit,
+      .float_controls_execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32,
+      .caps = {
+         .address = true,
+         .float64 = true,
+         .int8 = true,
+         .int16 = true,
+         .int64 = true,
+         .kernel = true,
+         .kernel_image = true,
+         .literal_sampler = true,
+      },
+   };
+   nir_shader_compiler_options nir_options =
+      *dxil_get_nir_compiler_options();
+
+   if (conf && conf->lower_bit_size & 64) {
+      nir_options.lower_pack_64_2x32_split = false;
+      nir_options.lower_unpack_64_2x32_split = false;
+      nir_options.lower_int64_options = ~0;
+   }
+
+   if (conf && conf->lower_bit_size & 16)
+      nir_options.support_16bit_alu = true;
+
+   glsl_type_singleton_init_or_ref();
+
+   nir = spirv_to_nir(obj->spvbin.data, obj->spvbin.size / 4,
+                      NULL, 0,
+                      MESA_SHADER_KERNEL, entrypoint,
+                      &spirv_options,
+                      &nir_options);
+   if (!nir) {
+      clc_error(logger, "spirv_to_nir() failed");
+      /* nir was never created, so only drop the glsl-type ref */
+      glsl_type_singleton_decref();
+      goto err_free_dxil;
+   }
+   nir->info.cs.local_size_variable = true;
+
+   NIR_PASS_V(nir, nir_lower_goto_ifs);
+   NIR_PASS_V(nir, nir_opt_dead_cf);
+
+   struct clc_dxil_metadata *metadata = &dxil->metadata;
+
+   metadata->args = calloc(dxil->kernel->num_args,
+                           sizeof(*metadata->args));
+   if (!metadata->args) {
+      clc_error(logger, "failed to allocate arg positions");
+      goto err_free_nir;
+   }
+
+   // Calculate input offsets/metadata.
+   unsigned uav_id = 0, sampler_id = 0, offset = 0;
+   dxil_wrap_sampler_state int_sampler_states[PIPE_MAX_SHADER_SAMPLER_VIEWS] = {{{0}}};
+   nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
+      int i = var->data.location;
+      if (i < 0)
+         continue;
+
+      unsigned size = glsl_get_cl_size(var->type);
+      offset = align(offset, glsl_get_cl_alignment(var->type));
+      var->data.driver_location = offset;
+
+      metadata->args[i].offset = offset;
+      metadata->args[i].size = size;
+      metadata->kernel_inputs_buf_size = MAX2(metadata->kernel_inputs_buf_size,
+                                              offset + size);
+      if ((dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL ||
+           dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_CONSTANT) &&
+          // Ignore images during this pass - global memory buffers need to have contiguous bindings
+          !glsl_type_is_image(var->type)) {
+         metadata->args[i].globconstptr.buf_id = uav_id++;
+      } else if (glsl_type_is_sampler(var->type)) {
+         unsigned address_mode = conf ? conf->args[i].sampler.addressing_mode : 0u;
+         int_sampler_states[sampler_id].wrap[0] =
+            int_sampler_states[sampler_id].wrap[1] =
+            int_sampler_states[sampler_id].wrap[2] = wrap_from_cl_addressing(address_mode);
+         int_sampler_states[sampler_id].is_nonnormalized_coords =
+            conf ? !conf->args[i].sampler.normalized_coords : 0;
+         int_sampler_states[sampler_id].is_linear_filtering =
+            conf ? conf->args[i].sampler.linear_filtering : 0;
+         metadata->args[i].sampler.sampler_id = var->data.binding = sampler_id++;
+      }
+      offset += size;
+   }
+
+   unsigned num_global_inputs = uav_id;
+
+   // Second pass over inputs to calculate image bindings
+   unsigned srv_id = 0;
+   nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
+      int i = var->data.location;
+      if (i < 0)
+         continue;
+
+      if (glsl_type_is_image(var->type)) {
+         if (var->data.access == ACCESS_NON_WRITEABLE) {
+            metadata->args[i].image.buf_ids[0] = srv_id++;
+         } else {
+            // Write or read-write are UAVs
+            metadata->args[i].image.buf_ids[0] = uav_id++;
+         }
+
+         metadata->args[i].image.num_buf_ids = 1;
+         var->data.binding = metadata->args[i].image.buf_ids[0];
+      }
+   }
+
+   {
+      bool progress;
+      do
+      {
+         progress = false;
+         NIR_PASS(progress, nir, nir_copy_prop);
+         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+         NIR_PASS(progress, nir, nir_opt_deref);
+         NIR_PASS(progress, nir, nir_opt_dce);
+         NIR_PASS(progress, nir, nir_opt_undef);
+         NIR_PASS(progress, nir, nir_opt_constant_folding);
+         NIR_PASS(progress, nir, nir_opt_cse);
+         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+         NIR_PASS(progress, nir, nir_opt_algebraic);
+      } while (progress);
+   }
+
+   // Inline all functions first.
+   // according to the comment on nir_inline_functions
+   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS_V(nir, nir_lower_returns);
+   NIR_PASS_V(nir, nir_lower_libclc, ctx->libclc_nir);
+   NIR_PASS_V(nir, nir_inline_functions);
+
+   // Pick off the single entrypoint that we want.
+   foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         exec_node_remove(&func->node);
+   }
+   assert(exec_list_length(&nir->functions) == 1);
+
+   {
+      bool progress;
+      do
+      {
+         progress = false;
+         NIR_PASS(progress, nir, nir_copy_prop);
+         NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+         NIR_PASS(progress, nir, nir_opt_deref);
+         NIR_PASS(progress, nir, nir_opt_dce);
+         NIR_PASS(progress, nir, nir_opt_undef);
+         NIR_PASS(progress, nir, nir_opt_constant_folding);
+         NIR_PASS(progress, nir, nir_opt_cse);
+         NIR_PASS(progress, nir, nir_split_var_copies);
+         NIR_PASS(progress, nir, nir_lower_var_copies);
+         NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+         NIR_PASS(progress, nir, nir_opt_algebraic);
+         NIR_PASS(progress, nir, nir_opt_if, true);
+         NIR_PASS(progress, nir, nir_opt_dead_cf);
+         NIR_PASS(progress, nir, nir_opt_remove_phis);
+         NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
+         NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
+      } while (progress);
+   }
+
+   // Before removing dead uniforms, dedupe constant samplers to make more dead uniforms
+   NIR_PASS_V(nir, clc_nir_dedupe_const_samplers);
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | nir_var_mem_constant | nir_var_function_temp, NULL);
+
+   NIR_PASS_V(nir, scale_fdiv);
+
+   // Assign bindings for constant samplers
+   nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
+      if (glsl_type_is_sampler(var->type) && var->data.sampler.is_inline_sampler) {
+         /* Fixed a copy-paste bug: wrap[0] used to be assigned three times,
+          * leaving wrap[1]/wrap[2] zeroed; mirror the runtime-sampler path
+          * above and set all three axes.
+          */
+         int_sampler_states[sampler_id].wrap[0] =
+            int_sampler_states[sampler_id].wrap[1] =
+            int_sampler_states[sampler_id].wrap[2] =
+            wrap_from_cl_addressing(var->data.sampler.addressing_mode);
+         int_sampler_states[sampler_id].is_nonnormalized_coords =
+            !var->data.sampler.normalized_coordinates;
+         int_sampler_states[sampler_id].is_linear_filtering =
+            var->data.sampler.filter_mode == SAMPLER_FILTER_MODE_LINEAR;
+         var->data.binding = sampler_id++;
+
+         assert(metadata->num_const_samplers < CLC_MAX_SAMPLERS);
+         metadata->const_samplers[metadata->num_const_samplers].sampler_id = var->data.binding;
+         metadata->const_samplers[metadata->num_const_samplers].addressing_mode = var->data.sampler.addressing_mode;
+         metadata->const_samplers[metadata->num_const_samplers].normalized_coords = var->data.sampler.normalized_coordinates;
+         metadata->const_samplers[metadata->num_const_samplers].filter_mode = var->data.sampler.filter_mode;
+         metadata->num_const_samplers++;
+      }
+   }
+
+   NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_function_temp | nir_var_shader_temp));
+
+   // Lower memcpy
+   NIR_PASS_V(nir, dxil_nir_lower_memcpy_deref);
+
+   bool has_printf = false;
+   //NIR_PASS(has_printf, nir, clc_nir_lower_printf, uav_id);
+   metadata->printf_uav_id = has_printf ? uav_id++ : -1;
+
+   // copy propagate to prepare for lower_explicit_io
+   NIR_PASS_V(nir, nir_split_var_copies);
+   NIR_PASS_V(nir, nir_opt_copy_prop_vars);
+   NIR_PASS_V(nir, nir_lower_var_copies);
+   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+   NIR_PASS_V(nir, nir_lower_alu);
+   NIR_PASS_V(nir, nir_opt_dce);
+   NIR_PASS_V(nir, nir_opt_deref);
+
+   // Needs to come before lower_explicit_io
+   NIR_PASS_V(nir, nir_lower_cl_images_to_tex);
+   struct clc_image_lower_context image_lower_context = { metadata, &srv_id, &uav_id };
+   NIR_PASS_V(nir, clc_lower_images, &image_lower_context);
+   NIR_PASS_V(nir, clc_lower_nonnormalized_samplers, int_sampler_states);
+   NIR_PASS_V(nir, nir_lower_samplers);
+   NIR_PASS_V(nir, dxil_lower_sample_to_txf_for_integer_tex,
+              int_sampler_states, NULL, 14.0f);
+
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
+   assert(nir->scratch_size == 0);
+
+   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
+              nir_var_mem_shared | nir_var_function_temp | nir_var_uniform | nir_var_mem_global | nir_var_mem_constant,
+              glsl_get_cl_type_size_align);
+
+   NIR_PASS_V(nir, dxil_nir_lower_ubo_to_temp);
+   NIR_PASS_V(nir, clc_lower_constant_to_ssbo, dxil->kernel, &uav_id);
+   NIR_PASS_V(nir, clc_lower_global_to_ssbo);
+   NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo);
+
+   NIR_PASS_V(nir, split_unaligned_loads_stores);
+
+   assert(nir->info.cs.ptr_size == 64);
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
+              nir_address_format_32bit_index_offset_pack64);
+   NIR_PASS_V(nir, nir_lower_explicit_io,
+              nir_var_mem_shared | nir_var_function_temp | nir_var_uniform,
+              nir_address_format_32bit_offset_as_64bit);
+
+   NIR_PASS_V(nir, nir_lower_system_values);
+
+   nir_lower_compute_system_values_options compute_options = {
+      .has_base_global_invocation_id = (conf && conf->support_global_work_id_offsets),
+      .has_base_work_group_id = (conf && conf->support_work_group_id_offsets),
+   };
+   NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_options);
+
+   NIR_PASS_V(nir, clc_lower_64bit_semantics);
+
+   NIR_PASS_V(nir, nir_opt_deref);
+   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+
+   unsigned cbv_id = 0;
+
+   nir_variable *inputs_var =
+      add_kernel_inputs_var(dxil, nir, &cbv_id);
+   nir_variable *work_properties_var =
+      add_work_properties_var(dxil, nir, &cbv_id);
+
+   // Patch the localsize before calling clc_nir_lower_system_values().
+   if (conf) {
+      for (unsigned i = 0; i < ARRAY_SIZE(nir->info.cs.local_size); i++) {
+         if (!conf->local_size[i] ||
+             conf->local_size[i] == nir->info.cs.local_size[i])
+            continue;
+
+         if (nir->info.cs.local_size[i] &&
+             nir->info.cs.local_size[i] != conf->local_size[i]) {
+            debug_printf("D3D12: runtime local size does not match reqd_work_group_size() values\n");
+            goto err_free_nir;
+         }
+
+         nir->info.cs.local_size[i] = conf->local_size[i];
+      }
+   }
+
+   NIR_PASS_V(nir, clc_nir_lower_kernel_input_loads, inputs_var);
+   NIR_PASS_V(nir, split_unaligned_loads_stores);
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
+              nir_address_format_32bit_index_offset);
+   NIR_PASS_V(nir, clc_nir_lower_system_values, work_properties_var);
+   NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil);
+   NIR_PASS_V(nir, dxil_nir_opt_alu_deref_srcs);
+   NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil);
+   NIR_PASS_V(nir, dxil_nir_lower_fp16_casts);
+   NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
+
+   // Convert pack to pack_split
+   NIR_PASS_V(nir, nir_lower_pack);
+   // Lower pack_split to bit math
+   NIR_PASS_V(nir, nir_opt_algebraic);
+
+   NIR_PASS_V(nir, nir_opt_dce);
+
+   nir_validate_shader(nir, "Validate before feeding NIR to the DXIL compiler");
+   struct nir_to_dxil_options opts = {
+      .interpolate_at_vertex = false,
+      .lower_int16 = (conf && (conf->lower_bit_size & 16) != 0),
+      .ubo_binding_offset = 0,
+      .disable_math_refactoring = true,
+      .num_kernel_globals = num_global_inputs,
+   };
+
+   for (unsigned i = 0; i < dxil->kernel->num_args; i++) {
+      if (dxil->kernel->args[i].address_qualifier != CLC_KERNEL_ARG_ADDRESS_LOCAL)
+         continue;
+
+      /* If we don't have the runtime conf yet, we just create a dummy variable.
+       * This will be adjusted when clc_to_dxil() is called with a conf
+       * argument.
+       */
+      unsigned size = 4;
+      if (conf && conf->args)
+         size = conf->args[i].localptr.size;
+
+      /* The alignment required for the pointee type is not easy to get from
+       * here, so let's base our logic on the size itself. Anything bigger than
+       * the maximum alignment constraint (which is 128 bytes, since ulong16 or
+       * doubl16 size are the biggest base types) should be aligned on this
+       * maximum alignment constraint. For smaller types, we use the size
+       * itself to calculate the alignment.
+       */
+      unsigned alignment = size < 128 ? (1 << (ffs(size) - 1)) : 128;
+
+      nir->info.cs.shared_size = align(nir->info.cs.shared_size, alignment);
+      metadata->args[i].localptr.sharedmem_offset = nir->info.cs.shared_size;
+      nir->info.cs.shared_size += size;
+   }
+
+   metadata->local_mem_size = nir->info.cs.shared_size;
+   metadata->priv_mem_size = nir->scratch_size;
+
+   /* DXIL double math is too limited compared to what NIR expects. Let's refuse
+    * to compile a shader when it contains double operations until we have
+    * double lowering hooked up.
+    */
+   if (shader_has_double(nir)) {
+      clc_error(logger, "NIR shader contains doubles, which we don't support yet");
+      goto err_free_nir;
+   }
+
+   struct blob tmp;
+   if (!nir_to_dxil(nir, &opts, &tmp)) {
+      debug_printf("D3D12: nir_to_dxil failed\n");
+      goto err_free_nir;
+   }
+
+   memcpy(metadata->local_size, nir->info.cs.local_size,
+          sizeof(metadata->local_size));
+   /* Use the hint array's own size (previously sizeof(local_size); same
+    * byte count today, but fragile if the arrays ever diverge). */
+   memcpy(metadata->local_size_hint, nir->info.cs.local_size_hint,
+          sizeof(metadata->local_size_hint));
+
+   nir_foreach_variable_with_modes(var, nir, nir_var_mem_ssbo) {
+      if (var->constant_initializer) {
+         if (glsl_type_is_array(var->type)) {
+            int size = align(glsl_get_cl_size(var->type), 4);
+            uint8_t *data = malloc(size);
+            if (!data) {
+               /* release the DXIL blob we just produced */
+               blob_finish(&tmp);
+               goto err_free_nir;
+            }
+
+            copy_const_initializer(var->constant_initializer, var->type, data);
+            metadata->consts[metadata->num_consts].data = data;
+            metadata->consts[metadata->num_consts].size = size;
+            metadata->consts[metadata->num_consts].uav_id = var->data.binding;
+            metadata->num_consts++;
+         } else
+            unreachable("unexpected constant initializer");
+      }
+   }
+
+   metadata->kernel_inputs_cbv_id = inputs_var ? inputs_var->data.binding : 0;
+   metadata->work_properties_cbv_id = work_properties_var->data.binding;
+   metadata->num_uavs = uav_id;
+   metadata->num_srvs = srv_id;
+   metadata->num_samplers = sampler_id;
+
+   ralloc_free(nir);
+   glsl_type_singleton_decref();
+
+   blob_finish_get_buffer(&tmp, &dxil->binary.data,
+                          &dxil->binary.size);
+   return dxil;
+
+err_free_nir:
+   /* Error paths taken after spirv_to_nir() succeeded used to leak the
+    * NIR shader and the glsl-type singleton reference. */
+   ralloc_free(nir);
+   glsl_type_singleton_decref();
+err_free_dxil:
+   clc_free_dxil_object(dxil);
+   return NULL;
+}
+
+/* Release a clc_dxil_object produced by clc_to_dxil(): the per-argument
+ * metadata array, constant-initializer copies, and the DXIL binary.
+ * Safe to call with NULL.
+ */
+void clc_free_dxil_object(struct clc_dxil_object *dxil)
+{
+   if (!dxil)
+      return;
+
+   for (unsigned i = 0; i < dxil->metadata.num_consts; i++)
+      free(dxil->metadata.consts[i].data);
+
+   /* metadata.args is calloc'd in clc_to_dxil() and used to be leaked here */
+   free(dxil->metadata.args);
+   free(dxil->binary.data);
+   free(dxil);
+}
+
+/* Derive a numeric compiler version from the Mesa git identifier.
+ * MESA_GIT_SHA1 looks like "git-<sha1>"; the hex digits after the dash
+ * are parsed as the version.  Returns 0 when no dash is present.
+ */
+uint64_t clc_compiler_get_version(void)
+{
+   /* "()" declared an unspecified parameter list in C; "(void)" is the
+    * correct empty prototype and matches how callers use it. */
+   const char sha1[] = MESA_GIT_SHA1;
+   const char* dash = strchr(sha1, '-');
+   if (dash) {
+      return strtoull(dash + 1, NULL, 16);
+   }
+   return 0;
+}
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CLC_COMPILER_H
+#define CLC_COMPILER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* A named string blob, e.g. an in-memory header or source file. */
+struct clc_named_value {
+   const char *name;
+   const char *value;
+};
+
+/* Inputs to clc_compile(): one in-memory source file, optional in-memory
+ * headers it may include, and extra compiler command-line arguments. */
+struct clc_compile_args {
+   const struct clc_named_value *headers;
+   unsigned num_headers;
+   struct clc_named_value source;
+   const char * const *args;
+   unsigned num_args;
+};
+
+/* Inputs to clc_link(): the objects to link and whether to produce a
+ * library rather than a fully-linked object. */
+struct clc_linker_args {
+   const struct clc_object * const *in_objs;
+   unsigned num_in_objs;
+   unsigned create_library;
+};
+
+/* Callback reporting a single diagnostic message; priv is the logger's
+ * user pointer passed back verbatim. */
+typedef void (*clc_msg_callback)(void *priv, const char *msg);
+
+/* Error/warning sinks used by the compile, link and to-DXIL entrypoints. */
+struct clc_logger {
+   void *priv;
+   clc_msg_callback error;
+   clc_msg_callback warning;
+};
+
+/* A SPIR-V module; size is in bytes (consumers divide by 4 for words). */
+struct spirv_binary {
+   uint32_t *data;
+   size_t size;
+};
+
+/* Bitmask of type qualifiers on a kernel argument. */
+enum clc_kernel_arg_type_qualifier {
+   CLC_KERNEL_ARG_TYPE_CONST = 1 << 0,
+   CLC_KERNEL_ARG_TYPE_RESTRICT = 1 << 1,
+   CLC_KERNEL_ARG_TYPE_VOLATILE = 1 << 2,
+};
+
+/* Bitmask of access qualifiers (read_only / write_only / read_write). */
+enum clc_kernel_arg_access_qualifier {
+   CLC_KERNEL_ARG_ACCESS_READ = 1 << 0,
+   CLC_KERNEL_ARG_ACCESS_WRITE = 1 << 1,
+};
+
+/* OpenCL address space of a kernel argument. */
+enum clc_kernel_arg_address_qualifier {
+   CLC_KERNEL_ARG_ADDRESS_PRIVATE,
+   CLC_KERNEL_ARG_ADDRESS_CONSTANT,
+   CLC_KERNEL_ARG_ADDRESS_LOCAL,
+   CLC_KERNEL_ARG_ADDRESS_GLOBAL,
+};
+
+/* Reflection data for one kernel argument. */
+struct clc_kernel_arg {
+   const char *name;
+   const char *type_name;
+   unsigned type_qualifier;    /* mask of clc_kernel_arg_type_qualifier */
+   unsigned access_qualifier;  /* mask of clc_kernel_arg_access_qualifier */
+   enum clc_kernel_arg_address_qualifier address_qualifier;
+};
+
+/* Element type of the kernel's vec_type_hint attribute (presumably; see
+ * clc_kernel_info::vec_hint_* — confirm against the frontend). */
+enum clc_vec_hint_type {
+   CLC_VEC_HINT_TYPE_CHAR = 0,
+   CLC_VEC_HINT_TYPE_SHORT = 1,
+   CLC_VEC_HINT_TYPE_INT = 2,
+   CLC_VEC_HINT_TYPE_LONG = 3,
+   CLC_VEC_HINT_TYPE_HALF = 4,
+   CLC_VEC_HINT_TYPE_FLOAT = 5,
+   CLC_VEC_HINT_TYPE_DOUBLE = 6
+};
+
+/* Reflection data for one kernel within a clc_object. */
+struct clc_kernel_info {
+   const char *name;
+   size_t num_args;
+   const struct clc_kernel_arg *args;
+
+   unsigned vec_hint_size;
+   enum clc_vec_hint_type vec_hint_type;
+};
+
+/* A compiled/linked object: the SPIR-V module plus kernel reflection. */
+struct clc_object {
+   struct spirv_binary spvbin;
+   const struct clc_kernel_info *kernels;
+   unsigned num_kernels;
+};
+
+#define CLC_MAX_CONSTS 32
+#define CLC_MAX_BINDINGS_PER_ARG 3
+#define CLC_MAX_SAMPLERS 16
+
+/* Everything the runtime needs to bind resources and lay out inputs for
+ * a kernel compiled by clc_to_dxil(). */
+struct clc_dxil_metadata {
+   /* One entry per kernel argument, same order as clc_kernel_info::args;
+    * which union member is valid depends on the argument's kind. */
+   struct {
+      unsigned offset;  /* byte offset within the kernel-inputs buffer */
+      unsigned size;    /* argument size in bytes */
+      union {
+         struct {
+            /* SRV ids for read-only images, UAV ids otherwise */
+            unsigned buf_ids[CLC_MAX_BINDINGS_PER_ARG];
+            unsigned num_buf_ids;
+         } image;
+         struct {
+            unsigned sampler_id;
+         } sampler;
+         struct {
+            unsigned buf_id;  /* UAV id for global/constant pointers */
+         } globconstptr;
+         struct {
+            /* byte offset into the shared (local) memory block */
+            unsigned sharedmem_offset;
+         } localptr;
+      };
+   } *args;
+   unsigned kernel_inputs_cbv_id;
+   unsigned kernel_inputs_buf_size;  /* in bytes */
+   unsigned work_properties_cbv_id;
+   size_t num_uavs;
+   size_t num_srvs;
+   size_t num_samplers;
+
+   /* Copies of __constant initializers to upload into the given UAVs. */
+   struct {
+      void *data;
+      size_t size;
+      unsigned uav_id;
+   } consts[CLC_MAX_CONSTS];
+   size_t num_consts;
+
+   /* Inline (literal) samplers declared in the kernel source. */
+   struct {
+      unsigned sampler_id;
+      unsigned addressing_mode;
+      unsigned normalized_coords;
+      unsigned filter_mode;
+   } const_samplers[CLC_MAX_SAMPLERS];
+   size_t num_const_samplers;
+   size_t local_mem_size;  /* total shared memory, bytes */
+   size_t priv_mem_size;   /* scratch/private memory, bytes */
+
+   uint16_t local_size[3];
+   uint16_t local_size_hint[3];
+
+   int printf_uav_id;  /* -1 when the kernel has no printf */
+};
+
+/* Result of clc_to_dxil(): the DXIL binary plus its binding metadata.
+ * kernel points into the source clc_object and is not owned. */
+struct clc_dxil_object {
+   const struct clc_kernel_info *kernel;
+   struct clc_dxil_metadata metadata;
+   struct {
+      void *data;
+      size_t size;
+   } binary;
+};
+
+/* Shared compiler context; holds the libclc NIR used to resolve libclc
+ * calls during compilation. */
+struct clc_context {
+   const void *libclc_nir;
+};
+
+/* Options for clc_context_new(). */
+struct clc_context_options {
+   unsigned optimize;
+};
+
+struct clc_context *clc_context_new(const struct clc_logger *logger, const struct clc_context_options *options);
+
+void clc_free_context(struct clc_context *ctx);
+
+void clc_context_serialize(struct clc_context *ctx, void **serialized, size_t *size);
+void clc_context_free_serialized(void *serialized);
+struct clc_context *clc_context_deserialize(void *serialized, size_t size);
+
+struct clc_object *
+clc_compile(struct clc_context *ctx,
+ const struct clc_compile_args *args,
+ const struct clc_logger *logger);
+
+struct clc_object *
+clc_link(struct clc_context *ctx,
+ const struct clc_linker_args *args,
+ const struct clc_logger *logger);
+
+void clc_free_object(struct clc_object *obj);
+
+/* Per-argument runtime information; which member is valid depends on the
+ * argument kind (local pointer vs. sampler). */
+struct clc_runtime_arg_info {
+   union {
+      struct {
+         unsigned size;  /* size in bytes of the __local allocation */
+      } localptr;
+      struct {
+         unsigned normalized_coords;
+         unsigned addressing_mode; /* See SPIR-V spec for value meanings */
+         unsigned linear_filtering;
+      } sampler;
+   };
+};
+
+/* Runtime configuration passed to clc_to_dxil(); may be NULL for a
+ * conf-less pre-compile. */
+struct clc_runtime_kernel_conf {
+   uint16_t local_size[3];               /* 0 entries keep the compiled size */
+   struct clc_runtime_arg_info *args;
+   unsigned lower_bit_size;              /* bitmask; 16 and 64 are honored */
+   unsigned support_global_work_id_offsets;
+   unsigned support_work_group_id_offsets;
+};
+
+struct clc_dxil_object *
+clc_to_dxil(struct clc_context *ctx,
+ const struct clc_object *obj,
+ const char *entrypoint,
+ const struct clc_runtime_kernel_conf *conf,
+ const struct clc_logger *logger);
+
+void clc_free_dxil_object(struct clc_dxil_object *dxil);
+
+/* This struct describes the layout of data expected in the CB bound at global_work_offset_cbv_id */
+struct clc_work_properties_data {
+   /* Returned from get_global_offset(), and added into get_global_id() */
+   unsigned global_offset_x;
+   unsigned global_offset_y;
+   unsigned global_offset_z;
+   /* Returned from get_work_dim() */
+   unsigned work_dim;
+   /* The number of work groups being launched (i.e. the parameters to Dispatch).
+    * If the requested global size doesn't fit in a single Dispatch, these values should
+    * indicate the total number of groups that *should* have been launched. */
+   unsigned group_count_total_x;
+   unsigned group_count_total_y;
+   unsigned group_count_total_z;
+   /* keeps the following triple 16-byte aligned for CB layout — confirm */
+   unsigned padding;
+   /* If the requested global size doesn't fit in a single Dispatch, subsequent dispatches
+    * should fill out these offsets to indicate how many groups have already been launched */
+   unsigned group_id_offset_x;
+   unsigned group_id_offset_y;
+   unsigned group_id_offset_z;
+};
+
+uint64_t clc_compiler_get_version();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdexcept>
+#include <vector>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <gtest/gtest.h>
+#include <wrl.h>
+
+#include "compute_test.h"
+
+using std::vector;
+
+/* Copies structs into a private array at a runtime index and reads them
+ * back, exercising struct copy (memcpy) lowering with dynamic indices. */
+TEST_F(ComputeTest, runtime_memcpy)
+{
+   struct shift { uint8_t val; uint8_t shift; uint16_t ret; };
+   const char *kernel_source =
+   "struct shift { uchar val; uchar shift; ushort ret; };\n\
+   __kernel void main_test(__global struct shift *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      uint id2 = id + get_global_id(1);\n\
+      struct shift lc[4] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }};\n\
+      lc[id] = inout[id];\n\
+      inout[id2].ret = (ushort) lc[id2].val << (ushort) lc[id2].shift;\n\
+   }\n";
+
+   auto inout = ShaderArg<struct shift>({
+         { 0x10, 1, 0xffff },
+         { 0x20, 2, 0xffff },
+         { 0x30, 3, 0xffff },
+         { 0x40, 4, 0xffff },
+      },
+      SHADER_ARG_INOUT);
+   const uint16_t expected[] = { 0x20, 0x80, 0x180, 0x400 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i].ret, expected[i]);
+}
+
+/* Two __global buffer arguments must get distinct, correct UAV bindings. */
+TEST_F(ComputeTest, two_global_arrays)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *g1, __global uint *g2)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      g1[idx] -= g2[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto g2 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected[] = {
+      9, 18, 27, 36
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, g1, g2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected[i]);
+}
+
+/* Round-trips int64 values through a __local float array; expected
+ * values reflect float32 rounding (ties-to-even) and saturation of
+ * INT64_MAX on the way back. */
+TEST_F(ComputeTest, i64tof32)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global long *out, __constant long *in)\n\
+   {\n\
+      __local float tmp[12];\n\
+      uint idx = get_global_id(0);\n\
+      tmp[idx] = in[idx];\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      out[idx] = tmp[idx + get_global_id(1)];\n\
+   }\n";
+   auto in = ShaderArg<int64_t>({ 0x100000000LL,
+                                  -0x100000000LL,
+                                  0x7fffffffffffffffLL,
+                                  0x4000004000000000LL,
+                                  0x4000003fffffffffLL,
+                                  0x4000004000000001LL,
+                                  -1,
+                                  -0x4000004000000000LL,
+                                  -0x4000003fffffffffLL,
+                                  -0x4000004000000001LL,
+                                  0,
+                                  INT64_MIN },
+                                 SHADER_ARG_INPUT);
+   auto out = ShaderArg<int64_t>(std::vector<int64_t>(12, 0xdeadbeed), SHADER_ARG_OUTPUT);
+   const int64_t expected[] = {
+      0x100000000LL,
+      -0x100000000LL,
+      0x7fffffffffffffffLL,
+      0x4000000000000000LL,
+      0x4000000000000000LL,
+      0x4000008000000000LL,
+      -1,
+      -0x4000000000000000LL,
+      -0x4000000000000000LL,
+      -0x4000008000000000LL,
+      0,
+      INT64_MIN,
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ((int64_t)out[i], expected[i]);
+   }
+}
+/* Two __constant buffers (one large enough to stay a buffer rather than
+ * being inlined) interleaved with a __global argument. */
+TEST_F(ComputeTest, two_constant_arrays)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__constant uint *c1, __global uint *g1, __constant uint *c2)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      g1[idx] -= c1[idx] + c2[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto c1 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   auto c2 = ShaderArg<uint32_t>(std::vector<uint32_t>(16384, 5), SHADER_ARG_INPUT);
+   const uint32_t expected[] = {
+      4, 13, 22, 31
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, c1, g1, c2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected[i]);
+}
+
+/* A __constant pointer argument may legally be NULL; the kernel's
+ * null-check must select the fallback array in that case. */
+TEST_F(ComputeTest, null_constant_ptr)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *g1, __constant uint *c1)\n\
+   {\n\
+      __constant uint fallback[] = {2, 3, 4, 5};\n\
+      __constant uint *c = c1 ? c1 : fallback;\n\
+      uint idx = get_global_id(0);\n\
+      g1[idx] -= c[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto c1 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected1[] = {
+      9, 18, 27, 36
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, g1, c1);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected1[i]);
+
+   const uint32_t expected2[] = {
+      8, 17, 26, 35
+   };
+
+   g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto c2 = NullShaderArg();
+   run_shader(kernel_source, g1.size(), 1, 1, g1, c2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected2[i]);
+}
+
+/* This test seems to fail on older versions of WARP. */
+/* Same as null_constant_ptr but with a NULL __global pointer argument. */
+TEST_F(ComputeTest, DISABLED_null_global_ptr)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *g1, __global uint *g2)\n\
+   {\n\
+      __constant uint fallback[] = {2, 3, 4, 5};\n\
+      uint idx = get_global_id(0);\n\
+      g1[idx] -= g2 ? g2[idx] : fallback[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto g2 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected1[] = {
+      9, 18, 27, 36
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, g1, g2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected1[i]);
+
+   const uint32_t expected2[] = {
+      8, 17, 26, 35
+   };
+
+   g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto g2null = NullShaderArg();
+   run_shader(kernel_source, g1.size(), 1, 1, g1, g2null);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected2[i]);
+}
+
+/* Stores __constant pointers into a __global struct; the expected_ptr
+ * values reflect the pointer encoding (buffer id in the upper 32 bits —
+ * NOTE(review): matches 32bit_index_offset_pack64, confirm). */
+TEST_F(ComputeTest, ret_constant_ptr)
+{
+   struct s { uint64_t ptr; uint32_t val; };
+   const char *kernel_source =
+   "struct s { __constant uint *ptr; uint val; };\n\
+   __kernel void main_test(__global struct s *out, __constant uint *in)\n\
+   {\n\
+      __constant uint foo[] = { 1, 2 };\n\
+      uint idx = get_global_id(0);\n\
+      if (idx == 0)\n\
+         out[idx].ptr = foo;\n\
+      else\n\
+         out[idx].ptr = in;\n\
+      out[idx].val = out[idx].ptr[idx];\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT);
+   auto in = ShaderArg<uint32_t>({ 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected_val[] = {
+      1, 4
+   };
+   const uint64_t expected_ptr[] = {
+      2ull << 32, 1ull << 32
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].val, expected_val[i]);
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+   }
+}
+
+/* Stores a __global pointer (selected between two kernel args) into a struct
+ * and dereferences it. expected_ptr encodes the bound-buffer index in the
+ * upper 32 bits — NOTE(review): encoding inferred from values, confirm. */
+TEST_F(ComputeTest, ret_global_ptr)
+{
+   struct s { uint64_t ptr; uint32_t val; };
+   const char *kernel_source =
+   "struct s { __global uint *ptr; uint val; };\n\
+   __kernel void main_test(__global struct s *out, __global uint *in1, __global uint *in2)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      out[idx].ptr = idx ? in2 : in1;\n\
+      out[idx].val = out[idx].ptr[idx];\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT);
+   auto in1 = ShaderArg<uint32_t>({ 1, 2 }, SHADER_ARG_INPUT);
+   auto in2 = ShaderArg<uint32_t>({ 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected_val[] = {
+      1, 4
+   };
+   const uint64_t expected_ptr[] = {
+      1ull << 32, 2ull << 32
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in1, in2);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].val, expected_val[i]);
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+   }
+}
+
+/* Writes the address of a __local array element to global memory.
+ * expected_ptr values (0, 4) are byte offsets into the shared-local block —
+ * assumes locals are lowered to zero-based byte addresses; TODO confirm. */
+TEST_F(ComputeTest, ret_local_ptr)
+{
+   struct s { uint64_t ptr; };
+   const char *kernel_source =
+   "struct s { __local uint *ptr; };\n\
+   __kernel void main_test(__global struct s *out)\n\
+   {\n\
+      __local uint tmp[2];\n\
+      uint idx = get_global_id(0);\n\
+      tmp[idx] = idx;\n\
+      out[idx].ptr = &tmp[idx];\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT);
+   const uint64_t expected_ptr[] = {
+      0, 4,
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+   }
+}
+
+/* Writes the address of a __private array element to global memory and reads
+ * the value back through that pointer. Fix: the original declared
+ * expected_value[] but never asserted it, leaving the readback value
+ * untested; the loop now checks both members. expected_ptr (0, 4) assumes
+ * privates are lowered to zero-based byte offsets — TODO confirm. */
+TEST_F(ComputeTest, ret_private_ptr)
+{
+   struct s { uint64_t ptr; uint32_t value; };
+   const char *kernel_source =
+   "struct s { __private uint *ptr; uint value; };\n\
+   __kernel void main_test(__global struct s *out)\n\
+   {\n\
+      uint tmp[2] = {1, 2};\n\
+      uint idx = get_global_id(0);\n\
+      out[idx].ptr = &tmp[idx];\n\
+      out[idx].value = *out[idx].ptr;\n\
+   }\n";
+   auto out = ShaderArg<struct s>(std::vector<struct s>(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT);
+   const uint64_t expected_ptr[] = {
+      0, 4,
+   };
+   const uint32_t expected_value[] = {
+      1, 2
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
+      EXPECT_EQ(out[i].value, expected_value[i]);
+   }
+}
+
+/* 8-bit global loads/stores must round-trip: each element is incremented. */
+TEST_F(ComputeTest, globals_8bit)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global unsigned char *inout)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = inout[idx] + 1;\n\
+   }\n";
+   const uint8_t expected[] = { 101, 111, 121, 131 };
+   auto buf = ShaderArg<uint8_t>({ 100, 110, 120, 130 }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* 16-bit global loads/stores must round-trip: each element is incremented. */
+TEST_F(ComputeTest, globals_16bit)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global unsigned short *inout)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = inout[idx] + 1;\n\
+   }\n";
+   const uint16_t expected[] = { 10001, 10011, 10021, 10031 };
+   auto buf = ShaderArg<uint16_t>({ 10000, 10010, 10020, 10030 }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* 64-bit global loads/stores: values above 2^50 so truncation to 32 bits
+ * would be detected. */
+TEST_F(ComputeTest, DISABLED_globals_64bit)
+{
+   /* Test disabled, because we need a fixed version of WARP that hasn't
+      been officially shipped yet */
+
+   const char *kernel_source =
+   "__kernel void main_test(__global unsigned long *inout)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = inout[idx] + 1;\n\
+   }\n";
+   uint64_t base = 1ull << 50;
+   auto inout = ShaderArg<uint64_t>({ base, base + 10, base + 20, base + 30 },
+                                    SHADER_ARG_INOUT);
+   const uint64_t expected[] = {
+      base + 1, base + 11, base + 21, base + 31
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* get_global_id(0) must equal the linear invocation index on a 1-D grid. */
+TEST_F(ComputeTest, built_ins_global_id)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      output[get_global_id(0)] = get_global_id(0);\n\
+   }\n";
+   const uint32_t expected[] = { 0, 1, 2, 3 };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* Read-modify-write keyed on get_global_id: output[id] *= (id + 1). */
+TEST_F(ComputeTest, built_ins_global_id_rmw)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      output[id] = output[id] * (id + 1);\n\
+   }\n";
+   const uint32_t expected[] = {
+      0x00000001, 0x20000002, 0x00060006, 0x1004080c
+   };
+   auto data = ShaderArg<uint32_t>({0x00000001, 0x10000001, 0x00020002, 0x04010203},
+                                   SHADER_ARG_INOUT);
+   run_shader(kernel_source, data.size(), 1, 1, data);
+   for (size_t n = 0; n < data.size(); ++n)
+      EXPECT_EQ(data[n], expected[n]);
+}
+
+/* Basic float arithmetic and float->uint truncating conversion. */
+TEST_F(ComputeTest, types_float_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      output[get_global_id(0)] = (uint)((float)get_global_id(0) + 1.5f);\n\
+   }\n";
+   const uint32_t expected[] = { 1, 2, 3, 4 };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* Basic double arithmetic and double->uint truncating conversion. */
+TEST_F(ComputeTest, DISABLED_types_double_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      output[get_global_id(0)] = (uint)((double)get_global_id(0) + 1.5);\n\
+   }\n";
+   const uint32_t expected[] = { 1, 2, 3, 4 };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* Basic 16-bit signed arithmetic with widening back to uint. */
+TEST_F(ComputeTest, types_short_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      output[get_global_id(0)] = (uint)((short)get_global_id(0) + (short)1);\n\
+   }\n";
+   const uint32_t expected[] = { 1, 2, 3, 4 };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* Basic 8-bit signed arithmetic with widening back to uint. */
+TEST_F(ComputeTest, types_char_basics)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      output[get_global_id(0)] = (uint)((char)get_global_id(0) + (char)1);\n\
+   }\n";
+   const uint32_t expected[] = { 1, 2, 3, 4 };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* Divergent if/else: lane 0 writes a sentinel, others write ~idx. */
+TEST_F(ComputeTest, types_if_statement)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      int idx = get_global_id(0);\n\
+      if (idx > 0)\n\
+         output[idx] = ~idx;\n\
+      else\n\
+         output[0] = 0xff;\n\
+   }\n";
+   const uint32_t expected[] = { 0xff, ~1u, ~2u, ~3u };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* do/while loop computing factorial of the invocation index (n=0 and n=1
+ * both yield 1 because the body runs at least once). */
+TEST_F(ComputeTest, types_do_while_loop)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      int value = 1;\n\
+      int i = 1, n = get_global_id(0);\n\
+      do {\n\
+         value *= i++;\n\
+      } while (i <= n);\n\
+      output[n] = value;\n\
+   }\n";
+   const uint32_t expected[] = { 1, 1, 1*2, 1*2*3, 1*2*3*4 };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(5, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* for loop computing factorial of the invocation index. */
+TEST_F(ComputeTest, types_for_loop)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      int value = 1;\n\
+      int n = get_global_id(0);\n\
+      for (int i = 1; i <= n; ++i)\n\
+         value *= i;\n\
+      output[n] = value;\n\
+   }\n";
+   const uint32_t expected[] = { 1, 1, 1*2, 1*2*3, 1*2*3*4 };
+   auto results = ShaderArg<uint32_t>(std::vector<uint32_t>(5, 0xdeadbeef),
+                                      SHADER_ARG_OUTPUT);
+   run_shader(kernel_source, results.size(), 1, 1, results);
+   for (size_t n = 0; n < results.size(); ++n)
+      EXPECT_EQ(results[n], expected[n]);
+}
+
+/* Private array of 64-bit values copied to a ulong global buffer.
+ * Fix: the test is named "long" and the buffer is ulong, but the original
+ * declared `ushort tmp[]` (silently truncating the 32-bit constants) and a
+ * uint16_t expected[] with 32-bit initializers — a narrowing conversion that
+ * is ill-formed in braced initialization. Use ulong/uint64_t throughout. */
+TEST_F(ComputeTest, DISABLED_complex_types_local_array_long)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global ulong *inout)\n\
+   {\n\
+      ulong tmp[] = {\n\
+         get_global_id(1) + 0x00000000,\n\
+         get_global_id(1) + 0x10000001,\n\
+         get_global_id(1) + 0x20000020,\n\
+         get_global_id(1) + 0x30000300,\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = tmp[idx];\n\
+   }\n";
+   auto inout = ShaderArg<uint64_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint64_t expected[] = {
+      0x00000000, 0x10000001, 0x20000020, 0x30000300,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* Private array of 16-bit values copied element-wise to a ushort buffer;
+ * get_global_id(1) is 0 on this 1-D dispatch, so outputs are the offsets. */
+TEST_F(ComputeTest, complex_types_local_array_short)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global ushort *inout)\n\
+   {\n\
+      ushort tmp[] = {\n\
+         get_global_id(1) + 0x00,\n\
+         get_global_id(1) + 0x10,\n\
+         get_global_id(1) + 0x20,\n\
+         get_global_id(1) + 0x30,\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = tmp[idx];\n\
+   }\n";
+   auto inout = ShaderArg<uint16_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint16_t expected[] = {
+      0x00, 0x10, 0x20, 0x30,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* Private array of structs whose float2 member is misaligned by the leading
+ * uchar/ushort fields; checks mixed-type member access and float indexing.
+ * Fix: expected[] was uint16_t while the buffer is uint32_t — element type
+ * now matches the data being compared (values happen to fit, but the
+ * narrower type invites silent truncation if constants change). */
+TEST_F(ComputeTest, complex_types_local_array_struct_vec_float_misaligned)
+{
+   const char *kernel_source =
+   "struct has_vecs { uchar c; ushort s; float2 f; };\n\
+   __kernel void main_test(__global uint *inout)\n\
+   {\n\
+      struct has_vecs tmp[] = {\n\
+         { 10 + get_global_id(0), get_global_id(1), { 10.0f, 1.0f } },\n\
+         { 19 + get_global_id(0), get_global_id(1), { 20.0f, 4.0f } },\n\
+         { 28 + get_global_id(0), get_global_id(1), { 30.0f, 9.0f } },\n\
+         { 37 + get_global_id(0), get_global_id(1), { 40.0f, 16.0f } },\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      uint mul = (tmp[idx].c + tmp[idx].s) * trunc(tmp[idx].f[0]);\n\
+      inout[idx] = mul + trunc(tmp[idx].f[1]);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint32_t expected[] = { 101, 404, 909, 1616 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* Private array of 32-bit values copied element-wise to a uint buffer. */
+TEST_F(ComputeTest, complex_types_local_array)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      uint tmp[] = {\n\
+         get_global_id(1) + 0x00,\n\
+         get_global_id(1) + 0x10,\n\
+         get_global_id(1) + 0x20,\n\
+         get_global_id(1) + 0x30,\n\
+      };\n\
+      uint idx = get_global_id(0);\n\
+      inout[idx] = tmp[idx];\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      0x00, 0x10, 0x20, 0x30,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* Global array of two-member structs: member-wise read-modify-write. The
+ * host-side struct mirrors the kernel's layout (two uints, no padding). */
+TEST_F(ComputeTest, complex_types_global_struct_array)
+{
+   struct two_vals { uint32_t add; uint32_t mul; };
+   const char *kernel_source =
+   "struct two_vals { uint add; uint mul; };\n\
+   __kernel void main_test(__global struct two_vals *in_out)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      in_out[id].add = in_out[id].add + id;\n\
+      in_out[id].mul = in_out[id].mul * id;\n\
+   }\n";
+   auto inout = ShaderArg<struct two_vals>({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } },
+                                           SHADER_ARG_INOUT);
+   const struct two_vals expected[] = {
+      { 8 + 0, 8 * 0 },
+      { 16 + 1, 16 * 1 },
+      { 64 + 2, 64 * 2 },
+      { 65536 + 3, 65536 * 3 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].add, expected[i].add);
+      EXPECT_EQ(inout[i].mul, expected[i].mul);
+   }
+}
+
+/* OpenCL uint2 vector buffer; the host struct shadows the builtin name and
+ * matches its 8-byte layout. */
+TEST_F(ComputeTest, complex_types_global_uint2)
+{
+   struct uint2 { uint32_t x; uint32_t y; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint2 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].x = inout[id].x + id;\n\
+      inout[id].y = inout[id].y * id;\n\
+   }\n";
+   auto inout = ShaderArg<struct uint2>({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } },
+                                        SHADER_ARG_INOUT);
+   const struct uint2 expected[] = {
+      { 8 + 0, 8 * 0 },
+      { 16 + 1, 16 * 1 },
+      { 64 + 2, 64 * 2 },
+      { 65536 + 3, 65536 * 3 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].x, expected[i].x);
+      EXPECT_EQ(inout[i].y, expected[i].y);
+   }
+}
+
+/* OpenCL ushort2 vector buffer; 65536 deliberately wraps to 0 in uint16_t
+ * on both the input and the expected side. */
+TEST_F(ComputeTest, complex_types_global_ushort2)
+{
+   struct ushort2 { uint16_t x; uint16_t y; };
+   const char *kernel_source =
+   "__kernel void main_test(__global ushort2 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].x = inout[id].x + id;\n\
+      inout[id].y = inout[id].y * id;\n\
+   }\n";
+   auto inout = ShaderArg<struct ushort2>({ { 8, 8 }, { 16, 16 }, { 64, 64 },
+                                            { (uint16_t)65536, (uint16_t)65536 } },
+                                          SHADER_ARG_INOUT);
+   const struct ushort2 expected[] = {
+      { 8 + 0, 8 * 0 },
+      { 16 + 1, 16 * 1 },
+      { 64 + 2, 64 * 2 },
+      { (uint16_t)(65536 + 3), (uint16_t)(65536 * 3) }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].x, expected[i].x);
+      EXPECT_EQ(inout[i].y, expected[i].y);
+   }
+}
+
+/* OpenCL uchar3 vector buffer; host struct carries an explicit pad byte
+ * because uchar3 occupies 4 bytes. 255-based values wrap mod 256. */
+TEST_F(ComputeTest, complex_types_global_uchar3)
+{
+   struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uchar3 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].x = inout[id].x + id;\n\
+      inout[id].y = inout[id].y * id;\n\
+      inout[id].z = inout[id].y + inout[id].x;\n\
+   }\n";
+   auto inout = ShaderArg<struct uchar3>({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } },
+                                         SHADER_ARG_INOUT);
+   const struct uchar3 expected[] = {
+      { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) },
+      { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) },
+      { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) },
+      { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].x, expected[i].x);
+      EXPECT_EQ(inout[i].y, expected[i].y);
+      EXPECT_EQ(inout[i].z, expected[i].z);
+   }
+}
+
+/* Same as complex_types_global_uchar3 but reads through a __constant
+ * buffer and writes a separate output buffer. */
+TEST_F(ComputeTest, complex_types_constant_uchar3)
+{
+   struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; };
+   const char *kernel_source =
+   "__kernel void main_test(__global uchar3 *out, __constant uchar3 *in)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      out[id].x = in[id].x + id;\n\
+      out[id].y = in[id].y * id;\n\
+      out[id].z = out[id].y + out[id].x;\n\
+   }\n";
+   auto in = ShaderArg<struct uchar3>({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } },
+                                      SHADER_ARG_INPUT);
+   auto out = ShaderArg<struct uchar3>(std::vector<struct uchar3>(4, { 0xff, 0xff, 0xff }),
+                                       SHADER_ARG_OUTPUT);
+   const struct uchar3 expected[] = {
+      { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) },
+      { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) },
+      { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) },
+      { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) }
+   };
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].x, expected[i].x);
+      EXPECT_EQ(out[i].y, expected[i].y);
+      EXPECT_EQ(out[i].z, expected[i].z);
+   }
+}
+
+/* OpenCL uint8 vector buffer accessed through the .s01234567 swizzle;
+ * host struct mirrors the 8-lane layout. Single workitem. */
+TEST_F(ComputeTest, complex_types_global_uint8)
+{
+   struct uint8 {
+      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
+      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
+   };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint8 *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      inout[id].s01234567 = inout[id].s01234567 * 2;\n\
+   }\n";
+   auto inout = ShaderArg<struct uint8>({ { 1, 2, 3, 4, 5, 6, 7, 8 } },
+                                        SHADER_ARG_INOUT);
+   const struct uint8 expected[] = {
+      { 2, 4, 6, 8, 10, 12, 14, 16 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_EQ(inout[i].s0, expected[i].s0);
+      EXPECT_EQ(inout[i].s1, expected[i].s1);
+      EXPECT_EQ(inout[i].s2, expected[i].s2);
+      EXPECT_EQ(inout[i].s3, expected[i].s3);
+      EXPECT_EQ(inout[i].s4, expected[i].s4);
+      EXPECT_EQ(inout[i].s5, expected[i].s5);
+      EXPECT_EQ(inout[i].s6, expected[i].s6);
+      EXPECT_EQ(inout[i].s7, expected[i].s7);
+   }
+}
+
+/* ulong16 staged through __local memory with a barrier; all lanes then
+ * read lane 0's value. Only 15 initializers are given on both the input
+ * and expected side, so element 16 is zero in both — consistent (0*2==0). */
+TEST_F(ComputeTest, complex_types_local_ulong16)
+{
+   struct ulong16 {
+      uint64_t values[16];
+   };
+   const char *kernel_source =
+   R"(__kernel void main_test(__global ulong16 *inout)
+   {
+      __local ulong16 local_array[2];
+      uint id = get_global_id(0);
+      local_array[id] = inout[id];
+      barrier(CLK_LOCAL_MEM_FENCE);
+      inout[id] = local_array[0] * 2;
+   })";
+   auto inout = ShaderArg<struct ulong16>({ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } },
+                                          SHADER_ARG_INOUT);
+   const struct ulong16 expected[] = {
+      { 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      for (int j = 0; j < 16; ++j) {
+         EXPECT_EQ(inout[i].values[j], expected[i].values[j]);
+      }
+   }
+}
+
+/* uint8 swizzle sourced from a __constant buffer into a __global one.
+ * NOTE(review): `out` is write-only in the kernel but declared
+ * SHADER_ARG_INOUT, unlike sibling complex_types_constant_uchar3 which
+ * uses SHADER_ARG_OUTPUT — possibly intentional to upload the 0xff seed;
+ * confirm before changing. */
+TEST_F(ComputeTest, complex_types_constant_uint8)
+{
+   struct uint8 {
+      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
+      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
+   };
+   const char *kernel_source =
+   "__kernel void main_test(__global uint8 *out, __constant uint8 *in)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      out[id].s01234567 = in[id].s01234567 * 2;\n\
+   }\n";
+   auto in = ShaderArg<struct uint8>({ { 1, 2, 3, 4, 5, 6, 7, 8 } },
+                                     SHADER_ARG_INPUT);
+   auto out = ShaderArg<struct uint8>({ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } },
+                                      SHADER_ARG_INOUT);
+   const struct uint8 expected[] = {
+      { 2, 4, 6, 8, 10, 12, 14, 16 }
+   };
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ(out[i].s0, expected[i].s0);
+      EXPECT_EQ(out[i].s1, expected[i].s1);
+      EXPECT_EQ(out[i].s2, expected[i].s2);
+      EXPECT_EQ(out[i].s3, expected[i].s3);
+      EXPECT_EQ(out[i].s4, expected[i].s4);
+      EXPECT_EQ(out[i].s5, expected[i].s5);
+      EXPECT_EQ(out[i].s6, expected[i].s6);
+      EXPECT_EQ(out[i].s7, expected[i].s7);
+   }
+}
+
+/* Kernel-local const array lowered to an immediate constant buffer (ICB). */
+TEST_F(ComputeTest, DISABLED_complex_types_const_array)
+{
+   /* DISABLED because current release versions of WARP either return
+    * rubbish from reads or crash: they are not prepared to handle
+    * non-float global constants */
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      const uint foo[] = { 100, 101, 102, 103 };\n\
+      output[get_global_id(0)] = foo[get_global_id(0) % 4];\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      100, 101, 102, 103
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+/* Dependent loads/stores through a private array must not be reordered:
+ * the final value of output[tid] is only tid if each step observes the
+ * previous one. Fix: expected[] was uint16_t while the output buffer is
+ * uint32_t — element types now match. */
+TEST_F(ComputeTest, mem_access_load_store_ordering)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      uint foo[4];\n\
+      foo[0] = 0x11111111;\n\
+      foo[1] = 0x22222222;\n\
+      foo[2] = 0x44444444;\n\
+      foo[3] = 0x88888888;\n\
+      foo[get_global_id(1)] -= 0x11111111; // foo[0] = 0 \n\
+      foo[0] += get_global_id(0); // foo[0] = tid\n\
+      foo[foo[get_global_id(1)]] = get_global_id(0); // foo[tid] = tid\n\
+      output[get_global_id(0)] = foo[get_global_id(0)]; // output[tid] = tid\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      0, 1, 2, 3
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+/* Two distinct kernel-local constant arrays indexed by the same id. */
+TEST_F(ComputeTest, DISABLED_two_const_arrays)
+{
+   /* DISABLED because current release versions of WARP either return
+    * rubbish from reads or crash: they are not prepared to handle
+    * non-float global constants */
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *output)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      uint foo[4] = {100, 101, 102, 103};\n\
+      uint bar[4] = {1, 2, 3, 4};\n\
+      output[id] = foo[id] * bar[id];\n\
+   }\n";
+   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                     SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = {
+      100, 202, 306, 412
+   };
+   run_shader(kernel_source, output.size(), 1, 1, output);
+   for (int i = 0; i < output.size(); ++i)
+      EXPECT_EQ(output[i], expected[i]);
+}
+
+/* Signed modulo by a positive divisor: C semantics, result takes the
+ * dividend's sign. */
+TEST_F(ComputeTest, imod_pos)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global int *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = inout[get_global_id(0)] % 3;\n\
+   }\n";
+   const int32_t expected[] = { -1, 0, -2, -1, 0, 1, 2, 0, 1 };
+   auto buf = ShaderArg<int32_t>({ -4, -3, -2, -1, 0, 1, 2, 3, 4 },
+                                 SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* Signed modulo by a negative divisor: same results as a positive divisor
+ * under C semantics (sign follows the dividend). */
+TEST_F(ComputeTest, imod_neg)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global int *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = inout[get_global_id(0)] % -3;\n\
+   }\n";
+   const int32_t expected[] = { -1, 0, -2, -1, 0, 1, 2, 0, 1 };
+   auto buf = ShaderArg<int32_t>({ -4, -3, -2, -1, 0, 1, 2, 3, 4 },
+                                 SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* Unsigned modulo near UINT_MAX: values below the divisor pass through. */
+TEST_F(ComputeTest, umod)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = inout[get_global_id(0)] % 0xfffffffc;\n\
+   }\n";
+   const uint32_t expected[] = { 0xfffffffa, 0xfffffffb, 0, 1, 2 };
+   auto buf = ShaderArg<uint32_t>({ 0xfffffffa, 0xfffffffb, 0xfffffffc, 0xfffffffd, 0xfffffffe },
+                                  SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* OpenCL rotate() is a left rotate; shift amount scales with the lane. */
+TEST_F(ComputeTest, rotate)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = rotate(inout[get_global_id(0)], (uint)get_global_id(0) * 4);\n\
+   }\n";
+   const uint32_t expected[] = { 0xdeadbeef, 0xeadbeefd, 0xadbeefde, 0xdbeefdea };
+   auto buf = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+                                  SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* popcount over 0, sparse patterns, and all-ones. */
+TEST_F(ComputeTest, popcount)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = popcount(inout[get_global_id(0)]);\n\
+   }\n";
+   const uint32_t expected[] = { 0, 1, 2, 2, 4, 32 };
+   auto buf = ShaderArg<uint32_t>({ 0, 0x1, 0x3, 0x101, 0x110011, ~0u },
+                                  SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* hadd(a, b) = (a + b) >> 1 without overflow; the 1ull widening in the
+ * last four expected entries models the carry that hadd must preserve. */
+TEST_F(ComputeTest, hadd)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = hadd(inout[get_global_id(0)], 1u << 31);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff },
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      (1u << 31) >> 1,
+      ((1u << 31) + 1) >> 1,
+      ((1u << 31) + 2) >> 1,
+      ((1u << 31) + 3) >> 1,
+      ((1ull << 31) + 0xfffffffc) >> 1,
+      ((1ull << 31) + 0xfffffffd) >> 1,
+      ((1ull << 31) + 0xfffffffe) >> 1,
+      ((1ull << 31) + 0xffffffff) >> 1,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* rhadd(a, b) = (a + b + 1) >> 1 (rounds up); expected values computed in
+ * 64-bit to keep the carry. */
+TEST_F(ComputeTest, rhadd)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = rhadd(inout[get_global_id(0)], 1u << 31);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff },
+                                    SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      ((1u << 31) + 1) >> 1,
+      ((1u << 31) + 2) >> 1,
+      ((1u << 31) + 3) >> 1,
+      ((1u << 31) + 4) >> 1,
+      ((1ull << 31) + 0xfffffffd) >> 1,
+      ((1ull << 31) + 0xfffffffe) >> 1,
+      ((1ull << 31) + 0xffffffff) >> 1,
+      ((1ull << 31) + (1ull << 32)) >> 1,
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* add_sat clamps at UINT_MAX instead of wrapping. */
+TEST_F(ComputeTest, add_sat)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = add_sat(inout[get_global_id(0)], 2u);\n\
+   }\n";
+   const uint32_t expected[] = {
+      0xffffffff - 1, 0xffffffff, 0xffffffff, 0xffffffff
+   };
+   auto buf = ShaderArg<uint32_t>({ 0xffffffff - 3, 0xffffffff - 2, 0xffffffff - 1, 0xffffffff },
+                                  SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* sub_sat clamps at 0 instead of wrapping below zero. */
+TEST_F(ComputeTest, sub_sat)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = sub_sat(inout[get_global_id(0)], 2u);\n\
+   }\n";
+   const uint32_t expected[] = { 0, 0, 0, 1 };
+   auto buf = ShaderArg<uint32_t>({ 0, 1, 2, 3 }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* mul_hi returns the high 32 bits of the 64-bit product. */
+TEST_F(ComputeTest, mul_hi)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = mul_hi(inout[get_global_id(0)], 1u << 31);\n\
+   }\n";
+   const uint32_t expected[] = { 0, 0, 1, 1, (1u << 30) };
+   auto buf = ShaderArg<uint32_t>({ 0, 1, 2, 3, (1u << 31) }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_EQ(buf[n], expected[n]);
+}
+
+/* ldexp with a constant exponent; reference values from the host libm. */
+TEST_F(ComputeTest, ldexp_x)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], 5);\n\
+   }\n";
+   const float expected[] = {
+      ldexp(0.0f, 5), ldexp(0.5f, 5), ldexp(1.0f, 5), ldexp(2.0f, 5)
+   };
+   auto buf = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 2.0f }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_FLOAT_EQ(buf[n], expected[n]);
+}
+
+/* ldexp with a per-lane exponent; reference values from the host libm. */
+TEST_F(ComputeTest, ldexp_y)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], get_global_id(0));\n\
+   }\n";
+   const float expected[] = {
+      ldexp(0.25f, 0), ldexp(0.5f, 1), ldexp(0.75f, 2), ldexp(1.0f, 3)
+   };
+   auto buf = ShaderArg<float>({ 0.25f, 0.5f, 0.75f, 1.0f }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_FLOAT_EQ(buf[n], expected[n]);
+}
+
+/* frexp return value: mantissa normalized into [0.5, 1), 0 stays 0. */
+TEST_F(ComputeTest, frexp_ret)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      int exp;\n\
+      inout[get_global_id(0)] = frexp(inout[get_global_id(0)], &exp);\n\
+   }\n";
+   const float expected[] = { 0.0f, 0.5f, 0.5f, 0.75f };
+   auto buf = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_FLOAT_EQ(buf[n], expected[n]);
+}
+
+/* frexp exponent out-parameter (written through a __private pointer). */
+TEST_F(ComputeTest, frexp_exp)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      int exp;\n\
+      frexp(inout[get_global_id(0)], &exp);\n\
+      inout[get_global_id(0)] = (float)exp;\n\
+   }\n";
+   const float expected[] = { 0.0f, 0.0f, 1.0f, 2.0f };
+   auto buf = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_FLOAT_EQ(buf[n], expected[n]);
+}
+
+/* clz counts leading zero bits; clz(0) is defined as the bit width (32).
+ * Fix: the original compared the integer results with EXPECT_FLOAT_EQ,
+ * which needlessly round-trips through float — use EXPECT_EQ. */
+TEST_F(ComputeTest, clz)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = clz(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<uint32_t>({ 0, 1, 0xffff, (1u << 30), (1u << 31) }, SHADER_ARG_INOUT);
+   const uint32_t expected[] = {
+      32, 31, 16, 1, 0
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* Checks sin() and native_sin() against the host libm. Fix: the original
+ * loop compared inout[i].in with itself and inout[i].clc with itself —
+ * tautologies that left the expected[] table unused. Compare against
+ * expected[], and keep the native/precise cross-check within the DXIL
+ * tolerance. */
+TEST_F(ComputeTest, sin)
+{
+   struct sin_vals { float in; float clc; float native; };
+   const char *kernel_source =
+   "struct sin_vals { float in; float clc; float native; };\n\
+   __kernel void main_test(__global struct sin_vals *inout)\n\
+   {\n\
+      inout[get_global_id(0)].clc = sin(inout[get_global_id(0)].in);\n\
+      inout[get_global_id(0)].native = native_sin(inout[get_global_id(0)].in);\n\
+   }\n";
+   const vector<sin_vals> input = {
+      { 0.0f, 0.0f, 0.0f },
+      { 1.0f, 0.0f, 0.0f },
+      { 2.0f, 0.0f, 0.0f },
+      { 3.0f, 0.0f, 0.0f },
+   };
+   auto inout = ShaderArg<sin_vals>(input, SHADER_ARG_INOUT);
+   const struct sin_vals expected[] = {
+      { 0.0f, 0.0f, 0.0f },
+      { 1.0f, sin(1.0f), sin(1.0f) },
+      { 2.0f, sin(2.0f), sin(2.0f) },
+      { 3.0f, sin(3.0f), sin(3.0f) },
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_FLOAT_EQ(inout[i].in, expected[i].in);
+      EXPECT_FLOAT_EQ(inout[i].clc, expected[i].clc);
+      EXPECT_NEAR(inout[i].clc, inout[i].native, 0.008f); // range from DXIL spec
+   }
+}
+
+/* cosh against the host libm. */
+TEST_F(ComputeTest, DISABLED_cosh)
+{
+   /* Disabled because of WARP failures, where we fetch incorrect results when
+    * sourcing from non-float ICBs */
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = cosh(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      cosh(0.0f), cosh(1.0f), cosh(2.0f), cosh(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+/* native_exp against the host libm exp. */
+TEST_F(ComputeTest, exp)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = native_exp(inout[get_global_id(0)]);\n\
+   }\n";
+   const float expected[] = {
+      exp(0.0f), exp(1.0f), exp(2.0f), exp(3.0f)
+   };
+   auto buf = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_FLOAT_EQ(buf[n], expected[n]);
+}
+
+/* native_exp10 against host pow(10, x). */
+TEST_F(ComputeTest, exp10)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = native_exp10(inout[get_global_id(0)]);\n\
+   }\n";
+   const float expected[] = {
+      pow(10.0f, 0.0f), pow(10.0f, 1.0f), pow(10.0f, 2.0f), pow(10.0f, 3.0f)
+   };
+   auto buf = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   run_shader(kernel_source, buf.size(), 1, 1, buf);
+   for (size_t n = 0; n < buf.size(); ++n)
+      EXPECT_FLOAT_EQ(buf[n], expected[n]);
+}
+
+TEST_F(ComputeTest, exp2)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = native_exp2(inout[get_global_id(0)]);\n\
+ }\n";
+ auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ pow(2.0f, 0.0f), pow(2.0f, 1.0f), pow(2.0f, 2.0f), pow(2.0f, 3.0f)
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = native_log(inout[get_global_id(0)]);\n\
+ }\n";
+ auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ log(0.0f), log(1.0f), log(2.0f), log(3.0f)
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log10)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = native_log10(inout[get_global_id(0)]);\n\
+ }\n";
+ auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ log10(0.0f), log10(1.0f), log10(2.0f), log10(3.0f)
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log2)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = native_log2(inout[get_global_id(0)]);\n\
+ }\n";
+ auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ log(0.0f) / log(2), log(1.0f) / log(2), log(2.0f) / log(2), log(3.0f) / log(2)
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, rint)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = rint(inout[get_global_id(0)]);\n\
+ }\n";
+
+ auto inout = ShaderArg<float>({ 0.5f, 1.5f, -0.5f, -1.5f, 1.4f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ 0.0f, 2.0f, 0.0f, -2.0f, 1.0f,
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, round)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = round(inout[get_global_id(0)]);\n\
+ }\n";
+ auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
+ SHADER_ARG_INOUT);
+ const float expected[] = {
+ 0.0f, 0.0f, -0.0f, 1.0f, -1.0f, 1.0f, -1.0f
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, arg_by_val)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout, float mul)\n\
+ {\n\
+ inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\
+ }\n";
+ auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
+ SHADER_ARG_INOUT);
+ auto mul = ShaderArg<float>(10.0f, SHADER_ARG_INPUT);
+ const float expected[] = {
+ 0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout, mul);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, uint8_by_val)
+{
+ struct uint8 {
+ uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
+ uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
+ };
+ const char *kernel_source =
+ "__kernel void main_test(__global uint *out, uint8 val)\n\
+ {\n\
+ out[get_global_id(0)] = val.s0 + val.s1 + val.s2 + val.s3 +\n\
+ val.s4 + val.s5 + val.s6 + val.s7;\n\
+ }\n";
+ auto out = ShaderArg<uint32_t>({ 0 }, SHADER_ARG_OUTPUT);
+ auto val = ShaderArg<struct uint8>({ {0, 1, 2, 3, 4, 5, 6, 7 }}, SHADER_ARG_INPUT);
+ const uint32_t expected[] = { 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 };
+ run_shader(kernel_source, out.size(), 1, 1, out, val);
+ for (int i = 0; i < out.size(); ++i)
+ EXPECT_EQ(out[i], expected[i]);
+}
+
+TEST_F(ComputeTest, link)
+{
+ const char *foo_src =
+ "float foo(float in)\n\
+ {\n\
+ return in * in;\n\
+ }\n";
+ const char *kernel_source =
+ "float foo(float in);\n\
+ __kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\
+ }\n";
+ std::vector<const char *> srcs = { foo_src, kernel_source };
+ auto inout = ShaderArg<float>({ 2.0f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ 4.0f,
+ };
+ run_shader(srcs, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, link_library)
+{
+ const char *bar_src =
+ "float bar(float in)\n\
+ {\n\
+ return in * 5;\n\
+ }\n";
+ const char *foo_src =
+ "float bar(float in);\n\
+ float foo(float in)\n\
+ {\n\
+ return in * bar(in);\n\
+ }\n";
+ const char *kernel_source =
+ "float foo(float in);\n\
+ __kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\
+ }\n";
+ std::vector<Shader> libraries = {
+ compile({ bar_src, kernel_source }, {}, true),
+ compile({ foo_src }, {}, true)
+ };
+ Shader exe = link(libraries);
+ auto inout = ShaderArg<float>({ 2.0f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ 20.0f,
+ };
+ run_shader(exe, { (unsigned)inout.size(), 1, 1 }, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, localvar)
+{
+ const char *kernel_source =
+ "__kernel __attribute__((reqd_work_group_size(2, 1, 1)))\n\
+ void main_test(__global float *inout)\n\
+ {\n\
+ __local float2 tmp[2];\n\
+ tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+ tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+ barrier(CLK_LOCAL_MEM_FENCE);\n\
+ inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
+ }\n";
+
+ auto inout = ShaderArg<float>({ 2.0f, 4.0f }, SHADER_ARG_INOUT);
+ const float expected[] = {
+ 9.0f, 5.0f
+ };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, localvar_uchar2)
+{
+ const char *kernel_source =
+ "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+ __kernel void main_test(__global uchar *inout)\n\
+ {\n\
+ __local uchar2 tmp[2];\n\
+ tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+ tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+ barrier(CLK_LOCAL_MEM_FENCE);\n\
+ inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
+ }\n";
+
+ auto inout = ShaderArg<uint8_t>({ 2, 4 }, SHADER_ARG_INOUT);
+ const uint8_t expected[] = { 9, 5 };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, work_group_size_hint)
+{
+ const char *kernel_source =
+ "__attribute__((work_group_size_hint(2, 1, 1)))\n\
+ __kernel void main_test(__global uint *output)\n\
+ {\n\
+ output[get_global_id(0)] = get_local_id(0);\n\
+ }\n";
+ auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+ SHADER_ARG_OUTPUT);
+ const uint32_t expected[] = {
+ 0, 1, 2, 3
+ };
+ run_shader(kernel_source, output.size(), 1, 1, output);
+ for (int i = 0; i < output.size(); ++i)
+ EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, reqd_work_group_size)
+{
+ const char *kernel_source =
+ "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+ __kernel void main_test(__global uint *output)\n\
+ {\n\
+ output[get_global_id(0)] = get_local_id(0);\n\
+ }\n";
+ auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
+ SHADER_ARG_OUTPUT);
+ const uint32_t expected[] = {
+ 0, 1, 0, 1
+ };
+ run_shader(kernel_source, output.size(), 1, 1, output);
+ for (int i = 0; i < output.size(); ++i)
+ EXPECT_EQ(output[i], expected[i]);
+}
+
+TEST_F(ComputeTest, image)
+{
+ const char* kernel_source =
+ "__kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
+ {\n\
+ int2 coords = (int2)(get_global_id(0), get_global_id(1));\n\
+ write_imagef(output, coords, read_imagef(input, coords));\n\
+ }\n";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+}
+
+TEST_F(ComputeTest, image_two_reads)
+{
+ const char* kernel_source =
+ "__kernel void main_test(image2d_t image, int is_float, __global float* output)\n\
+ {\n\
+ if (is_float)\n\
+ output[get_global_id(0)] = read_imagef(image, (int2)(0, 0)).x;\n\
+ else \n\
+ output[get_global_id(0)] = (float)read_imagei(image, (int2)(0, 0)).x;\n\
+ }\n";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+}
+
+TEST_F(ComputeTest, sampler)
+{
+ const char* kernel_source =
+ "__kernel void main_test(image2d_t image, sampler_t sampler, __global float* output)\n\
+ {\n\
+ output[get_global_id(0)] = read_imagef(image, sampler, (int2)(0, 0)).x;\n\
+ }\n";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+}
+
+TEST_F(ComputeTest, image_dims)
+{
+ const char* kernel_source =
+ "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\
+ {\n\
+ output[get_global_id(0)] = get_image_width(roimage);\n\
+ output[get_global_id(0) + 1] = get_image_width(woimage);\n\
+ }\n";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+}
+
+TEST_F(ComputeTest, image_format)
+{
+ const char* kernel_source =
+ "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\
+ {\n\
+ output[get_global_id(0)] = get_image_channel_data_type(roimage);\n\
+ output[get_global_id(0) + 1] = get_image_channel_order(woimage);\n\
+ }\n";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+}
+
+TEST_F(ComputeTest, image1d_buffer_t)
+{
+ const char* kernel_source =
+ "__kernel void main_test(read_only image1d_buffer_t input, write_only image1d_buffer_t output)\n\
+ {\n\
+ write_imageui(output, get_global_id(0), read_imageui(input, get_global_id(0)));\n\
+ }\n";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+}
+
+/* Exercises a __local pointer kernel argument: each work item stages values
+ * in local memory, synchronizes, then combines its neighbor's values. */
+TEST_F(ComputeTest, local_ptr)
+{
+   struct uint2 { uint32_t x, y; };
+   const char *kernel_source =
+      "__kernel void main_test(__global uint *inout, __local uint2 *tmp)\n\
+      {\n\
+         tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+         tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+         barrier(CLK_LOCAL_MEM_FENCE);\n\
+         inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
+      }\n";
+   auto inout = ShaderArg<uint32_t>({ 2, 4 }, SHADER_ARG_INOUT);
+   auto tmp = ShaderArg<struct uint2>(std::vector<struct uint2>(4096), SHADER_ARG_INPUT);
+   /* Match the element type of 'inout'; the previous uint8_t only worked
+    * because the results happen to fit in a byte. */
+   const uint32_t expected[] = { 9, 5 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout, tmp);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* Same as local_ptr but with two distinct __local pointer arguments, to
+ * verify that multiple local buffers are bound and addressed independently. */
+TEST_F(ComputeTest, two_local_ptrs)
+{
+   struct uint2 { uint32_t x, y; };
+   const char *kernel_source =
+      "__kernel void main_test(__global uint *inout, __local uint2 *tmp, __local uint *tmp2)\n\
+      {\n\
+         tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
+         tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
+         tmp2[get_local_id(0)] = get_global_id(0);\n\
+         barrier(CLK_LOCAL_MEM_FENCE);\n\
+         inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y + tmp2[get_local_id(0) % 2];\n\
+      }\n";
+   auto inout = ShaderArg<uint32_t>({ 2, 4 }, SHADER_ARG_INOUT);
+   auto tmp = ShaderArg<struct uint2>(std::vector<struct uint2>(1024), SHADER_ARG_INPUT);
+   auto tmp2 = ShaderArg<uint32_t>(std::vector<uint32_t>(1024), SHADER_ARG_INPUT);
+   /* Match the element type of 'inout'; the previous uint8_t only worked
+    * because the results happen to fit in a byte. */
+   const uint32_t expected[] = { 9, 6 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout, tmp, tmp2);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, int8_to_float)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global char* in, __global float* out)\n\
+ {\n\
+ uint pos = get_global_id(0);\n\
+ out[pos] = in[pos] / 100.0f;\n\
+ }";
+ auto in = ShaderArg<char>({ 10, 20, 30, 40 }, SHADER_ARG_INPUT);
+ auto out = ShaderArg<float>(std::vector<float>(4, std::numeric_limits<float>::infinity()), SHADER_ARG_OUTPUT);
+ const float expected[] = { 0.1f, 0.2f, 0.3f, 0.4f };
+ run_shader(kernel_source, in.size(), 1, 1, in, out);
+ for (int i = 0; i < in.size(); ++i)
+ EXPECT_FLOAT_EQ(out[i], expected[i]);
+}
+
+TEST_F(ComputeTest, vec_hint_float4)
+{
+ const char *kernel_source =
+ "__kernel __attribute__((vec_type_hint(float4))) void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
+ }";
+ Shader shader = compile({ kernel_source });
+ EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 4);
+ EXPECT_EQ(shader.obj->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_FLOAT);
+}
+
+TEST_F(ComputeTest, vec_hint_uchar2)
+{
+ const char *kernel_source =
+ "__kernel __attribute__((vec_type_hint(uchar2))) void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
+ }";
+ Shader shader = compile({ kernel_source });
+ EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 2);
+ EXPECT_EQ(shader.obj->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_CHAR);
+}
+
+TEST_F(ComputeTest, vec_hint_none)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout)\n\
+ {\n\
+ inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
+ }";
+ Shader shader = compile({ kernel_source });
+ EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 0);
+}
+
+TEST_F(ComputeTest, DISABLED_debug_layer_failure)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *inout, float mul)\n\
+ {\n\
+ inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\
+ }\n";
+ auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
+ SHADER_ARG_INOUT);
+ auto mul = ShaderArg<float>(10.0f, SHADER_ARG_INPUT);
+ const float expected[] = {
+ 0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f
+ };
+ ComPtr<ID3D12InfoQueue> info_queue;
+ dev->QueryInterface(info_queue.ReleaseAndGetAddressOf());
+ if (!info_queue) {
+ GTEST_SKIP() << "No info queue";
+ return;
+ }
+
+ info_queue->AddApplicationMessage(D3D12_MESSAGE_SEVERITY_ERROR, "This should cause the test to fail");
+ run_shader(kernel_source, inout.size(), 1, 1, inout, mul);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+/* Verifies that options passed on the compiler command line are honored:
+ * OUT_VAL0 comes from -D, and __OPENCL_C_VERSION__ reflects -cl-std. */
+TEST_F(ComputeTest, compiler_defines)
+{
+   const char *kernel_source =
+      "__kernel void main_test(__global int* out)\n\
+      {\n\
+         out[0] = OUT_VAL0;\n\
+         out[1] = __OPENCL_C_VERSION__;\n\
+      }";
+   auto out = ShaderArg<int>(std::vector<int>(2, 0), SHADER_ARG_OUTPUT);
+   CompileArgs compile_args = { 1, 1, 1 };
+   compile_args.compiler_command_line = { "-DOUT_VAL0=5", "-cl-std=cl" };
+   /* (Removed an unused 'raw_args' vector left over from an older
+    * run_shader() calling convention.) */
+   run_shader({ kernel_source }, compile_args, out);
+   EXPECT_EQ(out[0], 5);
+   EXPECT_EQ(out[1], 100);
+}
+
+/* There's a bug in WARP turning atomic_add(ptr, x) into
+ * atomic_add(ptr, x * 4). Works fine on intel HW.
+ */
+TEST_F(ComputeTest, DISABLED_global_atomic_add)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global int *inout, __global int *old)\n\
+ {\n\
+ old[get_global_id(0)] = atomic_add(inout + get_global_id(0), 3);\n\
+ }\n";
+ auto inout = ShaderArg<int32_t>({ 2, 4 }, SHADER_ARG_INOUT);
+ auto old = ShaderArg<int32_t>(std::vector<int32_t>(2, 0xdeadbeef), SHADER_ARG_OUTPUT);
+ const int32_t expected_inout[] = { 5, 7 };
+ const int32_t expected_old[] = { 2, 4 };
+ run_shader(kernel_source, inout.size(), 1, 1, inout, old);
+ for (int i = 0; i < inout.size(); ++i) {
+ EXPECT_EQ(inout[i], expected_inout[i]);
+ EXPECT_EQ(old[i], expected_old[i]);
+ }
+}
+
+TEST_F(ComputeTest, global_atomic_imin)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global int *inout, __global int *old)\n\
+ {\n\
+ old[get_global_id(0)] = atomic_min(inout + get_global_id(0), 1);\n\
+ }\n";
+ auto inout = ShaderArg<int32_t>({ 0, 2, -1 }, SHADER_ARG_INOUT);
+ auto old = ShaderArg<int32_t>(std::vector<int32_t>(3, 0xdeadbeef), SHADER_ARG_OUTPUT);
+ const int32_t expected_inout[] = { 0, 1, -1 };
+ const int32_t expected_old[] = { 0, 2, -1 };
+ run_shader(kernel_source, inout.size(), 1, 1, inout, old);
+ for (int i = 0; i < inout.size(); ++i) {
+ EXPECT_EQ(inout[i], expected_inout[i]);
+ EXPECT_EQ(old[i], expected_old[i]);
+ }
+}
+
+TEST_F(ComputeTest, global_atomic_and_or)
+{
+ const char *kernel_source =
+ "__attribute__((reqd_work_group_size(3, 1, 1)))\n\
+ __kernel void main_test(__global int *inout)\n\
+ {\n\
+ atomic_and(inout, ~(1 << get_global_id(0)));\n\
+ atomic_or(inout, (1 << (get_global_id(0) + 4)));\n\
+ }\n";
+ auto inout = ShaderArg<int32_t>(0xf, SHADER_ARG_INOUT);
+ const int32_t expected[] = { 0x78 };
+ run_shader(kernel_source, 3, 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, global_atomic_cmpxchg)
+{
+ const char *kernel_source =
+ "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+ __kernel void main_test(__global int *inout)\n\
+ {\n\
+ while (atomic_cmpxchg(inout, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\
+ ;\n\
+ }\n";
+ auto inout = ShaderArg<int32_t>(0, SHADER_ARG_INOUT);
+ const int32_t expected_inout[] = { 2 };
+ run_shader(kernel_source, 2, 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_EQ(inout[i], expected_inout[i]);
+}
+
+TEST_F(ComputeTest, local_atomic_and_or)
+{
+ const char *kernel_source =
+ "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+ __kernel void main_test(__global ushort *inout)\n\
+ {\n\
+ __local ushort tmp;\n\
+ atomic_and(&tmp, ~(0xff << (get_global_id(0) * 8)));\n\
+ atomic_or(&tmp, inout[get_global_id(0)] << (get_global_id(0) * 8));\n\
+ barrier(CLK_LOCAL_MEM_FENCE);\n\
+ inout[get_global_id(0)] = tmp;\n\
+ }\n";
+ auto inout = ShaderArg<uint16_t>({ 2, 4 }, SHADER_ARG_INOUT);
+ const uint16_t expected[] = { 0x402, 0x402 };
+ run_shader(kernel_source, inout.size(), 1, 1, inout);
+ for (int i = 0; i < inout.size(); ++i)
+ EXPECT_EQ(inout[i], expected[i]);
+}
+
+/* Two work items take turns bumping a __local counter via atomic_cmpxchg;
+ * after both succeed the counter must read 2. */
+TEST_F(ComputeTest, local_atomic_cmpxchg)
+{
+   const char *kernel_source =
+      "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
+      __kernel void main_test(__global int *out)\n\
+      {\n\
+         __local uint tmp;\n\
+         tmp = 0;\n\
+         barrier(CLK_LOCAL_MEM_FENCE);\n\
+         while (atomic_cmpxchg(&tmp, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\
+            ;\n\
+         barrier(CLK_LOCAL_MEM_FENCE);\n\
+         out[0] = tmp;\n\
+      }\n";
+
+   auto out = ShaderArg<uint32_t>(0xdeadbeef, SHADER_ARG_OUTPUT);
+   /* Match the element type of 'out' (was uint16_t). */
+   const uint32_t expected[] = { 2 };
+   run_shader(kernel_source, 2, 1, 1, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected[i]);
+}
+
+TEST_F(ComputeTest, constant_sampler)
+{
+ const char* kernel_source =
+ "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_LINEAR;\n\
+ __kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
+ {\n\
+ int2 coordsi = (int2)(get_global_id(0), get_global_id(1));\n\
+ float2 coordsf = (float2)((float)coordsi.x / get_image_width(input), (float)coordsi.y / get_image_height(input));\n\
+ write_imagef(output, coordsi, \n\
+ read_imagef(input, sampler, coordsf) + \n\
+ read_imagef(input, sampler, coordsf + (float2)(0.1, 0.1)));\n\
+ }\n";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+ EXPECT_EQ(shader.dxil->metadata.num_const_samplers, 1);
+}
+
+TEST_F(ComputeTest, hi)
+{
+ const char *kernel_source = R"(
+ __kernel void main_test(__global char3 *srcA, __global char2 *dst)
+ {
+ int tid = get_global_id(0);
+
+ char2 tmp = srcA[tid].hi;
+ dst[tid] = tmp;
+ })";
+ Shader shader = compile(std::vector<const char*>({ kernel_source }));
+ validate(shader);
+}
+
+/* Reads back all the work-item/work-group system values, first with default
+ * dispatch properties and then with explicit offsets/counts set. */
+TEST_F(ComputeTest, system_values)
+{
+   const char *kernel_source =
+      "__kernel void main_test(__global uint* outputs)\n\
+      {\n\
+         outputs[0] = get_work_dim();\n\
+         outputs[1] = get_global_size(0);\n\
+         outputs[2] = get_local_size(0);\n\
+         outputs[3] = get_num_groups(0);\n\
+         outputs[4] = get_group_id(0);\n\
+         outputs[5] = get_global_offset(0);\n\
+         outputs[6] = get_global_id(0);\n\
+      }\n";
+   /* The kernel writes 7 values: a buffer of 6 (as before) made outputs[6]
+    * an out-of-bounds write and left the last expected value unchecked.
+    * Also type the expected array to match 'out' (was uint16_t). */
+   auto out = ShaderArg<uint32_t>(std::vector<uint32_t>(7, 0xdeadbeef), SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = { 3, 1, 1, 1, 0, 0, 0, };
+   CompileArgs args = { 1, 1, 1 };
+   Shader shader = compile({ kernel_source });
+   run_shader(shader, args, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected[i]);
+
+   args.work_props.work_dim = 2;
+   args.work_props.global_offset_x = 100;
+   args.work_props.group_id_offset_x = 2;
+   args.work_props.group_count_total_x = 5;
+   const uint32_t expected_withoffsets[] = { 2, 5, 1, 5, 2, 100, 102 };
+   run_shader(shader, args, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected_withoffsets[i]);
+}
+
+TEST_F(ComputeTest, convert_round_sat)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float *f, __global uchar *u)\n\
+ {\n\
+ uint idx = get_global_id(0);\n\
+ u[idx] = convert_uchar_sat_rtp(f[idx]);\n\
+ }\n";
+ auto f = ShaderArg<float>({ -1.0f, 1.1f, 20.0f, 255.5f }, SHADER_ARG_INPUT);
+ auto u = ShaderArg<uint8_t>({ 255, 0, 0, 0 }, SHADER_ARG_OUTPUT);
+ const uint8_t expected[] = {
+ 0, 2, 20, 255
+ };
+
+ run_shader(kernel_source, f.size(), 1, 1, f, u);
+ for (int i = 0; i < u.size(); ++i)
+ EXPECT_EQ(u[i], expected[i]);
+}
+
+TEST_F(ComputeTest, convert_round_sat_vec)
+{
+ const char *kernel_source =
+ "__kernel void main_test(__global float16 *f, __global uchar16 *u)\n\
+ {\n\
+ uint idx = get_global_id(0);\n\
+ u[idx] = convert_uchar16_sat_rtp(f[idx]);\n\
+ }\n";
+ auto f = ShaderArg<float>({
+ -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+ -0.5f, 1.9f, 20.0f, 254.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+ 0.0f, 1.3f, 20.0f, 255.1f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+ -0.0f, 1.5555f, 20.0f, 254.9f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+ }, SHADER_ARG_INPUT);
+ auto u = ShaderArg<uint8_t>({
+ 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+ 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+ 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+ 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+ }, SHADER_ARG_OUTPUT);
+ const uint8_t expected[] = {
+ 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+ 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+ 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+ 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
+ };
+
+ run_shader(kernel_source, 4, 1, 1, f, u);
+ for (int i = 0; i < u.size(); ++i)
+ EXPECT_EQ(u[i], expected[i]);
+}
+
+TEST_F(ComputeTest, convert_char2_uchar2)
+{
+ const char *kernel_source =
+ "__kernel void main_test( __global char2 *src, __global uchar2 *dest )\n\
+ {\n\
+ size_t i = get_global_id(0);\n\
+ dest[i] = convert_uchar2_sat( src[i] );\n\
+ }\n";
+
+ auto c = ShaderArg<int8_t>({ -127, -4, 0, 4, 126, 127, 16, 32 }, SHADER_ARG_INPUT);
+ auto u = ShaderArg<uint8_t>({ 99, 99, 99, 99, 99, 99, 99, 99 }, SHADER_ARG_OUTPUT);
+ const uint8_t expected[] = { 0, 0, 0, 4, 126, 127, 16, 32 };
+ run_shader(kernel_source, 4, 1, 1, c, u);
+ for (int i = 0; i < u.size(); i++)
+ EXPECT_EQ(u[i], expected[i]);
+}
+
+TEST_F(ComputeTest, async_copy)
+{
+ const char *kernel_source = R"(
+ __kernel void main_test( const __global char *src, __global char *dst, __local char *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )
+ {
+ int i;
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (char)(char)0;
+ barrier( CLK_LOCAL_MEM_FENCE );
+ event_t event;
+ event = async_work_group_copy( (__local char*)localBuffer, (__global const char*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, 0 );
+ wait_group_events( 1, &event );
+ for(i=0; i<copiesPerWorkItem; i++)
+ dst[ get_global_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ];
+ })";
+ Shader shader = compile({ kernel_source });
+ validate(shader);
+}
+
+TEST_F(ComputeTest, packed_struct_global)
+{
+#pragma pack(push, 1)
+ struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+ const char *kernel_source =
+ "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+ __kernel void main_test(__global struct s *inout, global uint *size)\n\
+ {\n\
+ uint idx = get_global_id(0);\n\
+ inout[idx].uc = idx + 1;\n\
+ inout[idx].ul = ((ulong)(idx + 1 + 0xfbfcfdfe) << 32) | 0x12345678;\n\
+ inout[idx].us = ((ulong)(idx + 1 + 0xa0) << 8) | 0x12;\n\
+ *size = sizeof(struct s);\n\
+ }\n";
+ auto inout = ShaderArg<struct s>({0, 0, 0}, SHADER_ARG_OUTPUT);
+ auto size = ShaderArg<uint32_t>(0, SHADER_ARG_OUTPUT);
+ const struct s expected[] = {
+ { 1, 0xfbfcfdff12345678, 0xa112 }
+ };
+
+ run_shader(kernel_source, inout.size(), 1, 1, inout, size);
+ for (int i = 0; i < inout.size(); ++i) {
+ EXPECT_EQ(inout[i].uc, expected[i].uc);
+ EXPECT_EQ(inout[i].ul, expected[i].ul);
+ EXPECT_EQ(inout[i].us, expected[i].us);
+ }
+ EXPECT_EQ(size, sizeof(struct s));
+}
+
+TEST_F(ComputeTest, packed_struct_arg)
+{
+#pragma pack(push, 1)
+ struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+ const char *kernel_source =
+ "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+ __kernel void main_test(__global struct s *out, struct s in)\n\
+ {\n\
+ uint idx = get_global_id(0);\n\
+ out[idx].uc = in.uc + 0x12;\n\
+ out[idx].ul = in.ul + 0x123456789abcdef;\n\
+ out[idx].us = in.us + 0x1234;\n\
+ }\n";
+ auto out = ShaderArg<struct s>({0, 0, 0}, SHADER_ARG_OUTPUT);
+ auto in = ShaderArg<struct s>({1, 2, 3}, SHADER_ARG_INPUT);
+ const struct s expected[] = {
+ { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 }
+ };
+
+ run_shader(kernel_source, out.size(), 1, 1, out, in);
+ for (int i = 0; i < out.size(); ++i) {
+ EXPECT_EQ(out[i].uc, expected[i].uc);
+ EXPECT_EQ(out[i].ul, expected[i].ul);
+ EXPECT_EQ(out[i].us, expected[i].us);
+ }
+}
+
+TEST_F(ComputeTest, packed_struct_local)
+{
+#pragma pack(push, 1)
+ struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+ const char *kernel_source =
+ "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+ __kernel void main_test(__global struct s *out, __constant struct s *in)\n\
+ {\n\
+ uint idx = get_global_id(0);\n\
+ __local struct s tmp[2];\n\
+ tmp[get_local_id(0)] = in[idx];\n\
+ barrier(CLK_LOCAL_MEM_FENCE);\n\
+ out[idx] = tmp[(get_local_id(0) + 1) % 2];\n\
+ }\n";
+ auto out = ShaderArg<struct s>({{0, 0, 0}, {0, 0, 0}}, SHADER_ARG_OUTPUT);
+ auto in = ShaderArg<struct s>({{1, 2, 3}, {0x12, 0x123456789abcdef, 0x1234} }, SHADER_ARG_INPUT);
+ const struct s expected[] = {
+ { 0x12, 0x123456789abcdef, 0x1234 },
+ { 1, 2, 3 },
+ };
+
+ run_shader(kernel_source, out.size(), 1, 1, out, in);
+ for (int i = 0; i < out.size(); ++i) {
+ EXPECT_EQ(out[i].uc, expected[i].uc);
+ EXPECT_EQ(out[i].ul, expected[i].ul);
+ EXPECT_EQ(out[i].us, expected[i].us);
+ }
+}
+
+/* DISABLED because current release versions of WARP either return
+ * rubbish from reads or crash: they are not prepared to handle
+ * non-float global constants */
+TEST_F(ComputeTest, DISABLED_packed_struct_const)
+{
+#pragma pack(push, 1)
+ struct s { uint8_t uc; uint64_t ul; uint16_t us; };
+#pragma pack(pop)
+
+ const char *kernel_source =
+ "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
+ __kernel void main_test(__global struct s *out, struct s in)\n\
+ {\n\
+ __constant struct s base[] = {\n\
+ {0x12, 0x123456789abcdef, 0x1234},\n\
+ {0x11, 0x123456789abcdee, 0x1233},\n\
+ };\n\
+ uint idx = get_global_id(0);\n\
+ out[idx].uc = base[idx % 2].uc + in.uc;\n\
+ out[idx].ul = base[idx % 2].ul + in.ul;\n\
+ out[idx].us = base[idx % 2].us + in.us;\n\
+ }\n";
+ auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0, 0, 0}), SHADER_ARG_OUTPUT);
+ auto in = ShaderArg<struct s>({1, 2, 3}, SHADER_ARG_INPUT);
+ const struct s expected[] = {
+ { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 },
+ { 0x11 + 1, 0x123456789abcdee + 2, 0x1233 + 3 },
+ };
+
+ run_shader(kernel_source, out.size(), 1, 1, out, in);
+ for (int i = 0; i < out.size(); ++i) {
+ EXPECT_EQ(out[i].uc, expected[i].uc);
+ EXPECT_EQ(out[i].ul, expected[i].ul);
+ EXPECT_EQ(out[i].us, expected[i].us);
+ }
+}
+
+TEST_F(ComputeTest, DISABLED_printf)
+{
+ const char *kernel_source = R"(
+ __kernel void main_test(__global float *src, __global uint *dest)
+ {
+ __constant char *format_str = "%s: %f";
+ __constant char *str_val = "Test";
+ *dest = printf(format_str, str_val, src[0]);
+ })";
+
+ auto src = ShaderArg<float>({ 1.0f }, SHADER_ARG_INPUT);
+ auto dest = ShaderArg<uint32_t>({ 0xdeadbeef }, SHADER_ARG_OUTPUT);
+ run_shader(kernel_source, 1, 1, 1, src, dest);
+ EXPECT_EQ(dest[0], 0);
+}
+
+TEST_F(ComputeTest, vload_half)
+{
+ const char *kernel_source = R"(
+ __kernel void main_test(__global half *src, __global float4 *dest)
+ {
+ int offset = get_global_id(0);
+ dest[offset] = vload_half4(offset, src);
+ })";
+ auto src = ShaderArg<uint16_t>({ 0x3c00, 0x4000, 0x4200, 0x4400,
+ 0x4500, 0x4600, 0x4700, 0x4800 }, SHADER_ARG_INPUT);
+ auto dest = ShaderArg<float>({ FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
+ FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }, SHADER_ARG_OUTPUT);
+ run_shader(kernel_source, 2, 1, 1, src, dest);
+ for (unsigned i = 0; i < 8; ++i)
+ EXPECT_FLOAT_EQ(dest[i], (float)(i + 1));
+}
+
+TEST_F(ComputeTest, vstore_half)
+{
+ const char *kernel_source = R"(
+ __kernel void main_test(__global half *dst, __global float4 *src)
+ {
+ int offset = get_global_id(0);
+ vstore_half4(src[offset], offset, dst);
+ })";
+ auto dest = ShaderArg<uint16_t>({0xdead, 0xdead, 0xdead, 0xdead,
+ 0xdead, 0xdead, 0xdead, 0xdead}, SHADER_ARG_OUTPUT);
+ auto src = ShaderArg<float>({ 1.0, 2.0, 3.0, 4.0,
+ 5.0, 6.0, 7.0, 8.0 }, SHADER_ARG_INPUT);
+ run_shader(kernel_source, 2, 1, 1, dest, src);
+ const uint16_t expected[] = { 0x3c00, 0x4000, 0x4200, 0x4400,
+ 0x4500, 0x4600, 0x4700, 0x4800 };
+ for (unsigned i = 0; i < 8; ++i)
+ EXPECT_EQ(dest[i], expected[i]);
+}
--- /dev/null
+//
+// Copyright 2012-2016 Francisco Jerez
+// Copyright 2012-2016 Advanced Micro Devices, Inc.
+// Copyright 2014-2016 Jan Vesely
+// Copyright 2014-2015 Serge Martin
+// Copyright 2015 Zoltan Gilian
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <sstream>
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/IR/DiagnosticPrinter.h>
+#include <llvm/IR/DiagnosticInfo.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Type.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm-c/Core.h>
+#include <llvm-c/Target.h>
+#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
+
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Lex/PreprocessorOptions.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/TextDiagnosticBuffer.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Basic/TargetInfo.h>
+
+#include <spirv-tools/libspirv.hpp>
+#include <spirv-tools/linker.hpp>
+
+#include "util/macros.h"
+#include "glsl_types.h"
+#include "nir.h"
+#include "nir_types.h"
+
+#include "clc_helpers.h"
+#include "spirv.h"
+
+#include "opencl-c.h.h"
+#include "opencl-c-base.h.h"
+
+using ::llvm::Function;
+using ::llvm::LLVMContext;
+using ::llvm::Module;
+using ::llvm::raw_string_ostream;
+
+// LLVM diagnostic handler: appends the diagnostic's text to the
+// std::string passed through |data|.
+static void
+llvm_log_handler(const ::llvm::DiagnosticInfo &di, void *data) {
+   std::string *log = reinterpret_cast<std::string *>(data);
+   raw_string_ostream stream(*log);
+   ::llvm::DiagnosticPrinterRawOStream printer(stream);
+   di.print(printer);
+}
+
+// Metadata for one kernel argument harvested from the SPIR-V module:
+// result ids, the names found via OpName / the "kernel_arg_type." string,
+// and the OpenCL address/access/type qualifiers.
+class SPIRVKernelArg {
+public:
+ SPIRVKernelArg(uint32_t id, uint32_t typeId) : id(id), typeId(typeId),
+ addrQualifier(CLC_KERNEL_ARG_ADDRESS_PRIVATE),
+ accessQualifier(0),
+ typeQualifier(0) { }
+ ~SPIRVKernelArg() { }
+
+ // Result id of the OpFunctionParameter.
+ uint32_t id;
+ // Result id of the parameter's type instruction.
+ uint32_t typeId;
+ // Argument name from OpName; may remain empty.
+ std::string name;
+ // OpenCL type name from the "kernel_arg_type." OpString, if present.
+ std::string typeName;
+ // Address space, derived from the pointer type's storage class.
+ enum clc_kernel_arg_address_qualifier addrQualifier;
+ // Bitmask of CLC_KERNEL_ARG_ACCESS_* (set for image types).
+ unsigned accessQualifier;
+ // Bitmask of CLC_KERNEL_ARG_TYPE_* (const/restrict/volatile).
+ unsigned typeQualifier;
+};
+
+// Per-kernel information: the OpEntryPoint function id and name, the
+// parsed argument list, and the optional VecTypeHint execution mode
+// (packed as <vector-size:16 | element-type:16>).
+class SPIRVKernelInfo {
+public:
+ SPIRVKernelInfo(uint32_t fid, const char *nm) : funcId(fid), name(nm), vecHint(0) { }
+ ~SPIRVKernelInfo() { }
+
+ uint32_t funcId;
+ std::string name;
+ std::vector<SPIRVKernelArg> args;
+ // 0 when no VecTypeHint execution mode was seen.
+ unsigned vecHint;
+};
+
+// Walks a SPIR-V binary with the spirv-tools low-level parser and collects,
+// for every kernel entry point, the information exposed through
+// struct clc_kernel_info: argument ids, names, type names and qualifiers.
+class SPIRVKernelParser {
+public:
+   SPIRVKernelParser() : curKernel(NULL)
+   {
+      ctx = spvContextCreate(SPV_ENV_UNIVERSAL_1_0);
+   }
+
+   ~SPIRVKernelParser()
+   {
+      spvContextDestroy(ctx);
+   }
+
+   // OpEntryPoint: record each kernel's function id and name exactly once
+   // (the binary is parsed several times, see parseBinary()).
+   void parseEntryPoint(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands >= 3);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
+      for (auto &iter : kernels) {
+         if (funcId == iter.funcId)
+            return;
+      }
+
+      op = &ins->operands[2];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      const char *name = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      kernels.push_back(SPIRVKernelInfo(funcId, name));
+   }
+
+   // OpFunction: make the matching kernel current so the following
+   // OpFunctionParameter instructions attach to it. Only done while the
+   // kernel's argument list is still empty so repeated parse passes do not
+   // duplicate arguments. (An unused local "kernel" pointer was removed.)
+   void parseFunction(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands == 4);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
+      for (auto &kernel : kernels) {
+         if (funcId == kernel.funcId && !kernel.args.size()) {
+            curKernel = &kernel;
+            return;
+         }
+      }
+   }
+
+   // OpFunctionParameter: append an (id, typeId) argument to the current
+   // kernel, if any.
+   void parseFunctionParam(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t id, typeId;
+
+      if (!curKernel)
+         return;
+
+      assert(ins->num_operands == 2);
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_TYPE_ID);
+      typeId = ins->words[op->offset];
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      id = ins->words[op->offset];
+      curKernel->args.push_back(SPIRVKernelArg(id, typeId));
+   }
+
+   // OpName: fill in the argument name for the id it names.
+   void parseName(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      const char *name;
+      uint32_t id;
+
+      assert(ins->num_operands == 2);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      id = ins->words[op->offset];
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      name = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.id == id && arg.name.empty()) {
+               arg.name = name;
+               break;
+            }
+         }
+      }
+   }
+
+   // OpTypePointer: translate the storage class into a CL address
+   // qualifier for every argument of that pointer type. (Unused locals
+   // "targetTypeId" and "typeName" were removed.)
+   void parseTypePointer(const spv_parsed_instruction_t *ins)
+   {
+      enum clc_kernel_arg_address_qualifier addrQualifier;
+      uint32_t typeId, storageClass;
+      const spv_parsed_operand_t *op;
+
+      assert(ins->num_operands == 3);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      typeId = ins->words[op->offset];
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_STORAGE_CLASS);
+      storageClass = ins->words[op->offset];
+      switch (storageClass) {
+      case SpvStorageClassCrossWorkgroup:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL;
+         break;
+      case SpvStorageClassWorkgroup:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_LOCAL;
+         break;
+      case SpvStorageClassUniformConstant:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_CONSTANT;
+         break;
+      default:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_PRIVATE;
+         break;
+      }
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.typeId == typeId)
+               arg.addrQualifier = addrQualifier;
+         }
+      }
+   }
+
+   // OpString "kernel_arg_type.<kernel>.<t1>,<t2>,...": recover the
+   // argument type names clang records for each kernel.
+   void parseOpString(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      std::string str;
+
+      assert(ins->num_operands == 2);
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      str = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      if (str.find("kernel_arg_type.") != 0)
+         return;
+
+      size_t start = sizeof("kernel_arg_type.") - 1;
+
+      for (auto &kernel : kernels) {
+         // The prefix must be followed by "<kernel-name>." exactly.
+         // (The original checked the trailing '.' twice; once suffices.)
+         if (str.compare(start, kernel.name.size(), kernel.name) != 0 ||
+             str[start + kernel.name.size()] != '.')
+            continue;
+
+         size_t pos = start + kernel.name.size() + 1;
+
+         for (auto &arg : kernel.args) {
+            if (arg.name.empty())
+               break;
+
+            size_t typeEnd = str.find(',', pos);
+            if (typeEnd == std::string::npos)
+               break;
+
+            arg.typeName = str.substr(pos, typeEnd - pos);
+            pos = typeEnd + 1;
+         }
+      }
+   }
+
+   // Apply one decoration to all arguments whose id matches; decoration
+   // groups are expanded recursively through their member ids.
+   void applyDecoration(uint32_t id, const spv_parsed_instruction_t *ins)
+   {
+      auto iter = decorationGroups.find(id);
+      if (iter != decorationGroups.end()) {
+         for (uint32_t entry : iter->second)
+            applyDecoration(entry, ins);
+         return;
+      }
+
+      const spv_parsed_operand_t *op;
+      uint32_t decoration;
+
+      assert(ins->num_operands >= 2);
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_DECORATION);
+      decoration = ins->words[op->offset];
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.id == id) {
+               switch (decoration) {
+               case SpvDecorationVolatile:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_VOLATILE;
+                  break;
+               case SpvDecorationConstant:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST;
+                  break;
+               case SpvDecorationRestrict:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT;
+                  break;
+               case SpvDecorationFuncParamAttr:
+                  op = &ins->operands[2];
+                  assert(op->type == SPV_OPERAND_TYPE_FUNCTION_PARAMETER_ATTRIBUTE);
+                  switch (ins->words[op->offset]) {
+                  case SpvFunctionParameterAttributeNoAlias:
+                     arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT;
+                     break;
+                  case SpvFunctionParameterAttributeNoWrite:
+                     arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST;
+                     break;
+                  }
+                  break;
+               }
+            }
+         }
+      }
+   }
+
+   // OpDecorate: resolve the decorated id and apply the decoration.
+   // (An unused "decoration" local from the original was dropped.)
+   void parseOpDecorate(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t id;
+
+      assert(ins->num_operands >= 2);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      id = ins->words[op->offset];
+
+      applyDecoration(id, ins);
+   }
+
+   // OpGroupDecorate: remember a decoration group's member ids so
+   // applyDecoration() can expand the group later.
+   void parseOpGroupDecorate(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands >= 2);
+
+      const spv_parsed_operand_t *op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      uint32_t groupId = ins->words[op->offset];
+
+      auto lowerBound = decorationGroups.lower_bound(groupId);
+      if (lowerBound != decorationGroups.end() &&
+          lowerBound->first == groupId)
+         // Group already filled out
+         return;
+
+      auto iter = decorationGroups.emplace_hint(lowerBound, groupId, std::vector<uint32_t>{});
+      auto& vec = iter->second;
+      vec.reserve(ins->num_operands - 1);
+      for (uint32_t i = 1; i < ins->num_operands; ++i) {
+         op = &ins->operands[i];
+         assert(op->type == SPV_OPERAND_TYPE_ID);
+         vec.push_back(ins->words[op->offset]);
+      }
+   }
+
+   // OpTypeImage: image arguments get an access qualifier (defaulting to
+   // read-only) and are forced into the global address space.
+   void parseOpTypeImage(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t typeId;
+      unsigned accessQualifier = CLC_KERNEL_ARG_ACCESS_READ;
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      typeId = ins->words[op->offset];
+
+      // Operand 8 is the optional AccessQualifier.
+      if (ins->num_operands >= 9) {
+         op = &ins->operands[8];
+         assert(op->type == SPV_OPERAND_TYPE_ACCESS_QUALIFIER);
+         switch (ins->words[op->offset]) {
+         case SpvAccessQualifierReadOnly:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_READ;
+            break;
+         case SpvAccessQualifierWriteOnly:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE;
+            break;
+         case SpvAccessQualifierReadWrite:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE |
+                              CLC_KERNEL_ARG_ACCESS_READ;
+            break;
+         }
+      }
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.typeId == typeId) {
+               arg.accessQualifier = accessQualifier;
+               arg.addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL;
+            }
+         }
+      }
+   }
+
+   // OpExecutionMode VecTypeHint: stash the packed hint on the kernel.
+   void parseExecutionMode(const spv_parsed_instruction_t *ins)
+   {
+      uint32_t executionMode = ins->words[ins->operands[1].offset];
+      if (executionMode != SpvExecutionModeVecTypeHint)
+         return;
+
+      uint32_t funcId = ins->words[ins->operands[0].offset];
+      uint32_t vecHint = ins->words[ins->operands[2].offset];
+      for (auto& kernel : kernels) {
+         if (kernel.funcId == funcId)
+            kernel.vecHint = vecHint;
+      }
+   }
+
+   // spvBinaryParse() instruction callback: dispatch on opcode.
+   static spv_result_t
+   parseInstruction(void *data, const spv_parsed_instruction_t *ins)
+   {
+      SPIRVKernelParser *parser = reinterpret_cast<SPIRVKernelParser *>(data);
+
+      switch (ins->opcode) {
+      case SpvOpName:
+         parser->parseName(ins);
+         break;
+      case SpvOpEntryPoint:
+         parser->parseEntryPoint(ins);
+         break;
+      case SpvOpFunction:
+         parser->parseFunction(ins);
+         break;
+      case SpvOpFunctionParameter:
+         parser->parseFunctionParam(ins);
+         break;
+      case SpvOpFunctionEnd:
+      case SpvOpLabel:
+         // Past the parameter list of the current function.
+         parser->curKernel = NULL;
+         break;
+      case SpvOpTypePointer:
+         parser->parseTypePointer(ins);
+         break;
+      case SpvOpTypeImage:
+         parser->parseOpTypeImage(ins);
+         break;
+      case SpvOpString:
+         parser->parseOpString(ins);
+         break;
+      case SpvOpDecorate:
+         parser->parseOpDecorate(ins);
+         break;
+      case SpvOpGroupDecorate:
+         parser->parseOpGroupDecorate(ins);
+         break;
+      case SpvOpExecutionMode:
+         parser->parseExecutionMode(ins);
+         break;
+      default:
+         break;
+      }
+
+      return SPV_SUCCESS;
+   }
+
+   // True once every kernel has a name and every argument has both a name
+   // and a type name.
+   bool parsingComplete()
+   {
+      for (auto &kernel : kernels) {
+         if (kernel.name.empty())
+            return false;
+
+         for (auto &arg : kernel.args) {
+            if (arg.name.empty() || arg.typeName.empty())
+               return false;
+         }
+      }
+
+      return true;
+   }
+
+   void parseBinary(const struct spirv_binary &spvbin)
+   {
+      /* 3 passes should be enough to retrieve all kernel information:
+       * 1st pass: all entry point name and number of args
+       * 2nd pass: argument names and type names
+       * 3rd pass: pointer type names
+       */
+      for (unsigned pass = 0; pass < 3; pass++) {
+         spvBinaryParse(ctx, reinterpret_cast<void *>(this),
+                        spvbin.data, spvbin.size / 4,
+                        NULL, parseInstruction, NULL);
+
+         if (parsingComplete())
+            return;
+      }
+
+      assert(0);
+   }
+
+   std::vector<SPIRVKernelInfo> kernels;
+   // Decoration-group id -> member ids, from OpGroupDecorate.
+   std::map<uint32_t, std::vector<uint32_t>> decorationGroups;
+   // Kernel whose OpFunctionParameters are currently being collected.
+   SPIRVKernelInfo *curKernel;
+   spv_context ctx;
+};
+
+/* Parse |spvbin| and build a calloc()ed array of clc_kernel_info. Returns
+ * NULL when the module exposes no kernels or on allocation failure;
+ * *num_kernels receives the array length (0 on NULL). Free the result with
+ * clc_free_kernels_info(). */
+const struct clc_kernel_info *
+clc_spirv_get_kernels_info(const struct spirv_binary *spvbin,
+                           unsigned *num_kernels)
+{
+   struct clc_kernel_info *kernels;
+
+   SPIRVKernelParser parser;
+
+   parser.parseBinary(*spvbin);
+   *num_kernels = parser.kernels.size();
+   if (!*num_kernels)
+      return NULL;
+
+   kernels = reinterpret_cast<struct clc_kernel_info *>(calloc(*num_kernels,
+                                                               sizeof(*kernels)));
+   /* Handle OOM explicitly instead of with assert(), which compiles out in
+    * release builds and would leave a NULL dereference. */
+   if (!kernels) {
+      *num_kernels = 0;
+      return NULL;
+   }
+
+   for (unsigned i = 0; i < parser.kernels.size(); i++) {
+      kernels[i].name = strdup(parser.kernels[i].name.c_str());
+      kernels[i].num_args = parser.kernels[i].args.size();
+      /* VecTypeHint is packed as <vector-size:16 | element-type:16>. */
+      kernels[i].vec_hint_size = parser.kernels[i].vecHint >> 16;
+      kernels[i].vec_hint_type = (enum clc_vec_hint_type)(parser.kernels[i].vecHint & 0xFFFF);
+      if (!kernels[i].num_args)
+         continue;
+
+      struct clc_kernel_arg *args;
+
+      args = reinterpret_cast<struct clc_kernel_arg *>(calloc(kernels[i].num_args,
+                                                              sizeof(*kernels->args)));
+      kernels[i].args = args;
+      if (!args) {
+         /* Entries past |i| are zero from calloc, so freeing the whole
+          * array is safe. */
+         clc_free_kernels_info(kernels, *num_kernels);
+         *num_kernels = 0;
+         return NULL;
+      }
+      for (unsigned j = 0; j < kernels[i].num_args; j++) {
+         if (!parser.kernels[i].args[j].name.empty())
+            args[j].name = strdup(parser.kernels[i].args[j].name.c_str());
+         args[j].type_name = strdup(parser.kernels[i].args[j].typeName.c_str());
+         args[j].address_qualifier = parser.kernels[i].args[j].addrQualifier;
+         args[j].type_qualifier = parser.kernels[i].args[j].typeQualifier;
+         args[j].access_qualifier = parser.kernels[i].args[j].accessQualifier;
+      }
+   }
+
+   return kernels;
+}
+
+/* Free an array returned by clc_spirv_get_kernels_info(), including all
+ * strdup()ed names and the per-kernel argument arrays. */
+void
+clc_free_kernels_info(const struct clc_kernel_info *kernels,
+                      unsigned num_kernels)
+{
+   if (!kernels)
+      return;
+
+   for (unsigned i = 0; i < num_kernels; i++) {
+      if (kernels[i].args) {
+         for (unsigned j = 0; j < kernels[i].num_args; j++) {
+            free((void *)kernels[i].args[j].name);
+            free((void *)kernels[i].args[j].type_name);
+         }
+         /* Fix: the args array itself was previously leaked. */
+         free((void *)kernels[i].args);
+      }
+      free((void *)kernels[i].name);
+   }
+
+   free((void *)kernels);
+}
+
+/* Compile the OpenCL C source in |args| to SPIR-V: clang (spir64 triple,
+ * EmitLLVMOnly) produces LLVM IR, which llvm::writeSpirv() translates into
+ * |spvbin|. Returns 0 on success, -1 on failure; diagnostics accumulate in
+ * a local log string and are forwarded through |logger|. */
+int
+clc_to_spirv(const struct clc_compile_args *args,
+ struct spirv_binary *spvbin,
+ const struct clc_logger *logger)
+{
+ LLVMInitializeAllTargets();
+ LLVMInitializeAllTargetInfos();
+ LLVMInitializeAllTargetMCs();
+ LLVMInitializeAllAsmPrinters();
+
+ std::string log;
+ std::unique_ptr<LLVMContext> llvm_ctx { new LLVMContext };
+ llvm_ctx->setDiagnosticHandlerCallBack(llvm_log_handler, &log);
+
+ // NOTE(review): the raw_string_ostream allocated with `new` below is
+ // never freed — presumably acceptable because it must outlive the
+ // diagnostics engine, but confirm it is an intentional leak.
+ std::unique_ptr<clang::CompilerInstance> c { new clang::CompilerInstance };
+ clang::DiagnosticsEngine diag { new clang::DiagnosticIDs,
+ new clang::DiagnosticOptions,
+ new clang::TextDiagnosticPrinter(*new raw_string_ostream(log),
+ &c->getDiagnosticOpts(), true)};
+
+ std::vector<const char *> clang_opts = {
+ args->source.name,
+ "-triple", "spir64-unknown-unknown",
+ // By default, clang prefers to use modules to pull in the default headers,
+ // which doesn't work with our technique of embedding the headers in our binary
+ "-finclude-default-header",
+ // Add a default CL compiler version. Clang will pick the last one specified
+ // on the command line, so the app can override this one.
+ "-cl-std=cl1.2",
+ // The LLVM-SPIRV-Translator doesn't support memset with variable size
+ "-fno-builtin-memset",
+ // LLVM's optimizations can produce code that the translator can't translate
+ "-O0",
+ };
+ // We assume there's appropriate defines for __OPENCL_VERSION__ and __IMAGE_SUPPORT__
+ // being provided by the caller here.
+ clang_opts.insert(clang_opts.end(), args->args, args->args + args->num_args);
+
+ // Clang 10 changed CreateFromArgs() to take an ArrayRef instead of a
+ // begin/end pointer pair.
+ if (!clang::CompilerInvocation::CreateFromArgs(c->getInvocation(),
+#if LLVM_VERSION_MAJOR >= 10
+ clang_opts,
+#else
+ clang_opts.data(),
+ clang_opts.data() + clang_opts.size(),
+#endif
+ diag)) {
+ // NOTE(review): log is passed as the *format* string here (and in the
+ // other clc_error calls below); a '%' in a clang diagnostic would be
+ // misinterpreted. Consider clc_error(logger, "%s", log.c_str()).
+ log += "Couldn't create Clang invocation.\n";
+ clc_error(logger, log.c_str());
+ return -1;
+ }
+
+ if (diag.hasErrorOccurred()) {
+ log += "Errors occurred during Clang invocation.\n";
+ clc_error(logger, log.c_str());
+ return -1;
+ }
+
+ // This is a workaround for a Clang bug which causes the number
+ // of warnings and errors to be printed to stderr.
+ // http://www.llvm.org/bugs/show_bug.cgi?id=19735
+ c->getDiagnosticOpts().ShowCarets = false;
+
+ c->createDiagnostics(new clang::TextDiagnosticPrinter(
+ *new raw_string_ostream(log),
+ &c->getDiagnosticOpts(), true));
+
+ c->setTarget(clang::TargetInfo::CreateTargetInfo(
+ c->getDiagnostics(), c->getInvocation().TargetOpts));
+
+ // Emit LLVM IR only; headers come exclusively from the remapped
+ // in-memory buffers set up below.
+ c->getFrontendOpts().ProgramAction = clang::frontend::EmitLLVMOnly;
+ c->getHeaderSearchOpts().UseBuiltinIncludes = false;
+ c->getHeaderSearchOpts().UseStandardSystemIncludes = false;
+
+ // Add opencl-c generic search path
+ {
+ ::llvm::SmallString<128> system_header_path;
+ ::llvm::sys::path::system_temp_directory(true, system_header_path);
+ ::llvm::sys::path::append(system_header_path, "openclon12");
+ c->getHeaderSearchOpts().AddPath(system_header_path.str(),
+ clang::frontend::Angled,
+ false, false);
+
+ // Map the embedded opencl-c.h / opencl-c-base.h sources (linked into
+ // this binary) to pseudo-paths under the temp dir, so the compiler
+ // finds them without any on-disk installation.
+ ::llvm::sys::path::append(system_header_path, "opencl-c.h");
+ c->getPreprocessorOpts().addRemappedFile(system_header_path.str(),
+ ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_source, _countof(opencl_c_source) - 1)).release());
+
+ ::llvm::sys::path::remove_filename(system_header_path);
+ ::llvm::sys::path::append(system_header_path, "opencl-c-base.h");
+ c->getPreprocessorOpts().addRemappedFile(system_header_path.str(),
+ ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_base_source, _countof(opencl_c_base_source) - 1)).release());
+ }
+
+ // Caller-supplied headers are remapped the same way, for #include "...".
+ if (args->num_headers) {
+ ::llvm::SmallString<128> tmp_header_path;
+ ::llvm::sys::path::system_temp_directory(true, tmp_header_path);
+ ::llvm::sys::path::append(tmp_header_path, "openclon12");
+
+ c->getHeaderSearchOpts().AddPath(tmp_header_path.str(),
+ clang::frontend::Quoted,
+ false, false);
+
+ for (size_t i = 0; i < args->num_headers; i++) {
+ auto path_copy = tmp_header_path;
+ ::llvm::sys::path::append(path_copy, ::llvm::sys::path::convert_to_slash(args->headers[i].name));
+ c->getPreprocessorOpts().addRemappedFile(path_copy.str(),
+ ::llvm::MemoryBuffer::getMemBufferCopy(args->headers[i].value).release());
+ }
+ }
+
+ // The main source file is also provided from memory.
+ c->getPreprocessorOpts().addRemappedFile(
+ args->source.name,
+ ::llvm::MemoryBuffer::getMemBufferCopy(std::string(args->source.value)).release());
+
+ // Compile the code
+ clang::EmitLLVMOnlyAction act(llvm_ctx.get());
+ if (!c->ExecuteAction(act)) {
+ log += "Error executing LLVM compilation action.\n";
+ clc_error(logger, log.c_str());
+ return -1;
+ }
+
+ auto mod = act.takeModule();
+ std::ostringstream spv_stream;
+ if (!::llvm::writeSpirv(mod.get(), spv_stream, log)) {
+ log += "Translation from LLVM IR to SPIR-V failed.\n";
+ clc_error(logger, log.c_str());
+ return -1;
+ }
+
+ // NOTE(review): malloc result is not checked before memcpy — crash on
+ // OOM; consider returning -1 instead.
+ const std::string spv_out = spv_stream.str();
+ spvbin->size = spv_out.size();
+ spvbin->data = static_cast<uint32_t *>(malloc(spvbin->size));
+ memcpy(spvbin->data, spv_out.data(), spvbin->size);
+
+ return 0;
+}
+
+// Human-readable name for a spirv-tools result code; unknown codes map to
+// "unknown error".
+static const char *
+spv_result_to_str(spv_result_t res)
+{
+   struct entry { spv_result_t res; const char *str; };
+   static const struct entry names[] = {
+      { SPV_SUCCESS, "success" },
+      { SPV_UNSUPPORTED, "unsupported" },
+      { SPV_END_OF_STREAM, "end of stream" },
+      { SPV_WARNING, "warning" },
+      { SPV_FAILED_MATCH, "failed match" },
+      { SPV_REQUESTED_TERMINATION, "requested termination" },
+      { SPV_ERROR_INTERNAL, "internal error" },
+      { SPV_ERROR_OUT_OF_MEMORY, "out of memory" },
+      { SPV_ERROR_INVALID_POINTER, "invalid pointer" },
+      { SPV_ERROR_INVALID_BINARY, "invalid binary" },
+      { SPV_ERROR_INVALID_TEXT, "invalid text" },
+      { SPV_ERROR_INVALID_TABLE, "invalid table" },
+      { SPV_ERROR_INVALID_VALUE, "invalid value" },
+      { SPV_ERROR_INVALID_DIAGNOSTIC, "invalid diagnostic" },
+      { SPV_ERROR_INVALID_LOOKUP, "invalid lookup" },
+      { SPV_ERROR_INVALID_ID, "invalid id" },
+      { SPV_ERROR_INVALID_CFG, "invalid config" },
+      { SPV_ERROR_INVALID_LAYOUT, "invalid layout" },
+      { SPV_ERROR_INVALID_CAPABILITY, "invalid capability" },
+      { SPV_ERROR_INVALID_DATA, "invalid data" },
+      { SPV_ERROR_MISSING_EXTENSION, "missing extension" },
+      { SPV_ERROR_WRONG_VERSION, "wrong version" },
+   };
+
+   for (const struct entry &e : names) {
+      if (e.res == res)
+         return e.str;
+   }
+   return "unknown error";
+}
+
+class SPIRVMessageConsumer {
+public:
+ SPIRVMessageConsumer(const struct clc_logger *logger): logger(logger) {}
+
+ void operator()(spv_message_level_t level, const char *src,
+ const spv_position_t &pos, const char *msg)
+ {
+ switch(level) {
+ case SPV_MSG_FATAL:
+ case SPV_MSG_INTERNAL_ERROR:
+ case SPV_MSG_ERROR:
+ clc_error(logger, "(file=%s,line=%ld,column=%ld,index=%ld): %s",
+ src, pos.line, pos.column, pos.index, msg);
+ break;
+
+ case SPV_MSG_WARNING:
+ clc_warning(logger, "(file=%s,line=%ld,column=%ld,index=%ld): %s",
+ src, pos.line, pos.column, pos.index, msg);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+private:
+ const struct clc_logger *logger;
+};
+
+/* Link the SPIR-V objects in |args| into |dst_bin| using spirv-tools.
+ * When args->create_library is set, partial linkage is allowed and a
+ * library is produced. Returns 0 on success, -1 on failure (link errors
+ * are reported through |logger| via SPIRVMessageConsumer). */
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+                        struct spirv_binary *dst_bin,
+                        const struct clc_logger *logger)
+{
+   std::vector<std::vector<uint32_t>> binaries;
+
+   binaries.reserve(args->num_in_objs);
+   for (unsigned i = 0; i < args->num_in_objs; i++) {
+      const struct spirv_binary &spvbin = args->in_objs[i]->spvbin;
+      /* size is in bytes, the linker wants 32-bit words. */
+      binaries.emplace_back(spvbin.data, spvbin.data + (spvbin.size / 4));
+   }
+
+   SPIRVMessageConsumer msgconsumer(logger);
+   spvtools::Context context(SPV_ENV_UNIVERSAL_1_0);
+   context.SetMessageConsumer(msgconsumer);
+   spvtools::LinkerOptions options;
+   options.SetAllowPartialLinkage(args->create_library);
+   options.SetCreateLibrary(args->create_library);
+   std::vector<uint32_t> linkingResult;
+   spv_result_t status = spvtools::Link(context, binaries, &linkingResult, options);
+   if (status != SPV_SUCCESS)
+      return -1;
+
+   dst_bin->size = linkingResult.size() * 4;
+   dst_bin->data = static_cast<uint32_t *>(malloc(dst_bin->size));
+   /* Fix: the malloc result was previously passed to memcpy unchecked. */
+   if (!dst_bin->data) {
+      dst_bin->size = 0;
+      return -1;
+   }
+   memcpy(dst_bin->data, linkingResult.data(), dst_bin->size);
+
+   return 0;
+}
+
+// Write a human-readable disassembly of |spvbin| to |f|.
+void
+clc_dump_spirv(const struct spirv_binary *spvbin, FILE *f)
+{
+   spvtools::SpirvTools tools(SPV_ENV_UNIVERSAL_1_0);
+   std::vector<uint32_t> bin(spvbin->data, spvbin->data + (spvbin->size / 4));
+   std::string out;
+   const uint32_t disasm_options = SPV_BINARY_TO_TEXT_OPTION_INDENT |
+                                   SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES;
+   tools.Disassemble(bin, &out, disasm_options);
+   fwrite(out.c_str(), out.size(), 1, f);
+}
+
+/* Free the word buffer owned by |spvbin| and reset it, so a stale pointer
+ * cannot be double-freed or reused by the caller. */
+void
+clc_free_spirv_binary(struct spirv_binary *spvbin)
+{
+   free(spvbin->data);
+   spvbin->data = NULL;
+   spvbin->size = 0;
+}
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CLC_TO_NIR_H
+#define CLC_TO_NIR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "nir_types.h"
+
+#include "clc_compiler.h"
+#include "util/u_string.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+
+/* Parse |spvbin| and return a heap-allocated array of kernel descriptions
+ * (NULL if the module exposes no kernels); *num_kernels receives the array
+ * length. Free the result with clc_free_kernels_info(). */
+const struct clc_kernel_info *
+clc_spirv_get_kernels_info(const struct spirv_binary *spvbin,
+ unsigned *num_kernels);
+
+/* Release an array returned by clc_spirv_get_kernels_info(). */
+void
+clc_free_kernels_info(const struct clc_kernel_info *kernels,
+ unsigned num_kernels);
+
+/* Compile OpenCL C source to SPIR-V (clang + llvm-spirv).
+ * Returns 0 on success, -1 on failure; details go to |logger|. */
+int
+clc_to_spirv(const struct clc_compile_args *args,
+ struct spirv_binary *spvbin,
+ const struct clc_logger *logger);
+
+/* Link SPIR-V objects into |dst_bin| with spirv-tools.
+ * Returns 0 on success, -1 on failure. */
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+ struct spirv_binary *dst_bin,
+ const struct clc_logger *logger);
+
+/* Write a disassembly of |spvbin| to |f|. */
+void
+clc_dump_spirv(const struct spirv_binary *spvbin, FILE *f);
+
+/* Free the data owned by |spvbin|. */
+void
+clc_free_spirv_binary(struct spirv_binary *spvbin);
+
+#define clc_log(logger, level, fmt, ...) do { \
+ if (!logger || !logger->level) break; \
+ char *msg = NULL; \
+ asprintf(&msg, fmt, __VA_ARGS__); \
+ assert(msg); \
+ logger->level(logger->priv, msg); \
+ free(msg); \
+ } while (0)
+
+#define clc_error(logger, fmt, ...) clc_log(logger, error, fmt, __VA_ARGS__)
+#define clc_warning(logger, fmt, ...) clc_log(logger, warning, fmt, __VA_ARGS__)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "u_math.h"
+#include "nir.h"
+#include "glsl_types.h"
+#include "nir_types.h"
+#include "nir_builder.h"
+
+#include "clc_nir.h"
+#include "clc_compiler.h"
+#include "../compiler/dxil_nir.h"
+
+/* Replace nir_intrinsic_load_base_global_invocation_id with a load of the
+ * global_offset_{x,y,z} words from the work-properties UBO |var|. */
+static bool
+lower_load_base_global_invocation_id(nir_builder *b, nir_intrinsic_instr *intr,
+                                     nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *ubo_idx = nir_imm_int(b, var->data.binding);
+   nir_ssa_def *ubo_offset =
+      nir_imm_int(b, offsetof(struct clc_work_properties_data,
+                              global_offset_x));
+   nir_ssa_def *offset =
+      build_load_ubo_dxil(b, ubo_idx, ubo_offset,
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(offset));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Replace nir_intrinsic_load_work_dim with a load of the work_dim word
+ * from the work-properties UBO |var|. */
+static bool
+lower_load_work_dim(nir_builder *b, nir_intrinsic_instr *intr,
+                    nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *ubo_idx = nir_imm_int(b, var->data.binding);
+   nir_ssa_def *ubo_offset =
+      nir_imm_int(b, offsetof(struct clc_work_properties_data, work_dim));
+   nir_ssa_def *dim =
+      build_load_ubo_dxil(b, ubo_idx, ubo_offset,
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(dim));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Replace nir_intrinsic_load_local_group_size with an immediate built from
+ * the compile-time local size recorded in shader_info. */
+static bool
+lower_load_local_group_size(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_const_value v[3];
+   for (unsigned i = 0; i < 3; i++)
+      v[i] = nir_const_value_for_int(b->shader->info.cs.local_size[i], 32);
+
+   nir_ssa_def *size = nir_build_imm(b, 3, 32, v);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(size));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Replace nir_intrinsic_load_num_work_groups with a load of the
+ * group_count_total_{x,y,z} words from the work-properties UBO |var|. */
+static bool
+lower_load_num_work_groups(nir_builder *b, nir_intrinsic_instr *intr,
+                           nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *ubo_idx = nir_imm_int(b, var->data.binding);
+   nir_ssa_def *ubo_offset =
+      nir_imm_int(b, offsetof(struct clc_work_properties_data,
+                              group_count_total_x));
+   nir_ssa_def *count =
+      build_load_ubo_dxil(b, ubo_idx, ubo_offset,
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(count));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Replace nir_intrinsic_load_base_work_group_id with a load of the
+ * group_id_offset_{x,y,z} words from the work-properties UBO |var|. */
+static bool
+lower_load_base_work_group_id(nir_builder *b, nir_intrinsic_instr *intr,
+                              nir_variable *var)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *ubo_idx = nir_imm_int(b, var->data.binding);
+   nir_ssa_def *ubo_offset =
+      nir_imm_int(b, offsetof(struct clc_work_properties_data,
+                              group_id_offset_x));
+   nir_ssa_def *offset =
+      build_load_ubo_dxil(b, ubo_idx, ubo_offset,
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(offset));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Lower the CL system-value intrinsics that D3D12 cannot express directly:
+ * dispatch offsets, work dim and group counts become loads from the
+ * work-properties UBO |var|; the local group size becomes an immediate.
+ * Returns true if any instruction was rewritten. */
+bool
+clc_nir_lower_system_values(nir_shader *nir, nir_variable *var)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            /* Fix: the last three cases previously discarded the helper's
+             * return value, so the pass could report no progress despite
+             * having rewritten instructions. */
+            switch (intr->intrinsic) {
+            case nir_intrinsic_load_base_global_invocation_id:
+               progress |= lower_load_base_global_invocation_id(&b, intr, var);
+               break;
+            case nir_intrinsic_load_work_dim:
+               progress |= lower_load_work_dim(&b, intr, var);
+               break;
+            case nir_intrinsic_load_local_group_size:
+               progress |= lower_load_local_group_size(&b, intr);
+               break;
+            case nir_intrinsic_load_num_work_groups:
+               progress |= lower_load_num_work_groups(&b, intr, var);
+               break;
+            case nir_intrinsic_load_base_work_group_id:
+               progress |= lower_load_base_work_group_id(&b, intr, var);
+               break;
+            default: break;
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* Rewrite nir_intrinsic_load_kernel_input as a UBO deref load from the
+ * kernel-inputs buffer |var|, preserving the load's component count, bit
+ * size and alignment. (An unused "load" local was removed.) */
+static bool
+lower_load_kernel_input(nir_builder *b, nir_intrinsic_instr *intr,
+                        nir_variable *var)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   enum glsl_base_type base_type;
+
+   /* Pick the unsigned GLSL base type matching the load's bit size. The
+    * default case fixes a potential use of an uninitialized base_type. */
+   switch (bit_size) {
+   case 64:
+      base_type = GLSL_TYPE_UINT64;
+      break;
+   case 32:
+      base_type = GLSL_TYPE_UINT;
+      break;
+   case 16:
+      base_type = GLSL_TYPE_UINT16;
+      break;
+   case 8:
+      base_type = GLSL_TYPE_UINT8;
+      break;
+   default:
+      unreachable("invalid bit_size for load_kernel_input");
+   }
+
+   const struct glsl_type *type =
+      glsl_vector_type(base_type, nir_dest_num_components(intr->dest));
+   /* vec2 pointer: (UBO binding, 32-bit byte offset). */
+   nir_ssa_def *ptr = nir_vec2(b, nir_imm_int(b, var->data.binding),
+                               nir_u2u(b, intr->src[0].ssa, 32));
+   nir_deref_instr *deref = nir_build_deref_cast(b, ptr, nir_var_mem_ubo, type,
+                                                 bit_size / 8);
+   deref->cast.align_mul = nir_intrinsic_align_mul(intr);
+   deref->cast.align_offset = nir_intrinsic_align_offset(intr);
+
+   nir_ssa_def *result = nir_load_deref(b, deref);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Rewrite every nir_intrinsic_load_kernel_input in the entry point(s) as a
+ * UBO load from |var|. Returns true if anything changed. */
+bool
+clc_nir_lower_kernel_input_loads(nir_shader *nir, nir_variable *var)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+            if (intr->intrinsic != nir_intrinsic_load_kernel_input)
+               continue;
+
+            progress |= lower_load_kernel_input(&b, intr, var);
+         }
+      }
+   }
+
+   return progress;
+}
+
+
+/* Create the SSBO backing the printf ring buffer and bind it to |uav_id|.
+ * Word 0 of the buffer is the write cursor (see lower_printf_impl). */
+static nir_variable *
+add_printf_var(struct nir_shader *nir, unsigned uav_id)
+{
+   /* This size is arbitrary. Minimum required per spec is 1MB */
+   const unsigned max_printf_size = 1 * 1024 * 1024;
+   const unsigned printf_array_size = max_printf_size / sizeof(unsigned);
+   nir_variable *var =
+      nir_variable_create(nir, nir_var_mem_ssbo,
+                          glsl_array_type(glsl_uint_type(), printf_array_size, sizeof(unsigned)),
+                          /* Fix: the debug name was the misspelled
+                           * copy-paste "kernel_work_properies". */
+                          "printf_buffer");
+   var->data.binding = uav_id;
+   return var;
+}
+
+static void
+lower_printf_impl(nir_builder *b, nir_intrinsic_instr *instr, nir_variable *var)
+{
+   /* Atomic add a buffer size counter to determine where to write.
+    * If overflowed, return -1, otherwise, store the arguments and return 0.
+    */
+   b->cursor = nir_before_instr(&instr->instr);
+   nir_deref_instr *ssbo_deref = nir_build_deref_var(b, var);
+   /* Element 0 of the printf SSBO acts as the running allocation counter. */
+   nir_deref_instr *counter_deref = nir_build_deref_array_imm(b, ssbo_deref, 0);
+   /* src[1] derefs the struct packing this printf call's arguments. */
+   nir_deref_instr *struct_deref = nir_instr_as_deref(instr->src[1].ssa->parent_instr);
+   nir_variable *struct_var = nir_deref_instr_get_variable(struct_deref);
+   const struct glsl_type *struct_type = struct_var->type;
+   /* Align the struct size to 4 for natural SSBO alignment */
+   int struct_size = align(glsl_get_cl_size(struct_type), 4);
+
+   /* Hardcoding 64bit pointers to simplify some code below */
+   assert(instr->src[0].ssa->num_components == 1 && instr->src[0].ssa->bit_size == 64);
+
+   /* Reserve room for the 64-bit format-string pointer plus the packed args. */
+   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, nir_intrinsic_deref_atomic_add);
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);
+   atomic->src[0] = nir_src_for_ssa(&counter_deref->dest.ssa);
+   atomic->src[1] = nir_src_for_ssa(nir_imm_int(b, struct_size + sizeof(uint64_t)));
+   nir_builder_instr_insert(b, &atomic->instr);
+
+   int max_valid_offset =
+      glsl_get_cl_size(var->type) - /* buffer size */
+      struct_size - /* printf args size */
+      sizeof(uint64_t) - /* format string */
+      sizeof(int); /* the first int in the buffer is for the counter */
+   nir_push_if(b, nir_ilt(b, &atomic->dest.ssa, nir_imm_int(b, max_valid_offset)));
+   nir_ssa_def *printf_succ_val = nir_imm_int(b, 0);
+
+   /* Data starts right after the counter, at the offset the atomic returned. */
+   nir_ssa_def *start_offset = nir_u2u64(b, nir_iadd(b, &atomic->dest.ssa, nir_imm_int(b, sizeof(int))));
+   nir_deref_instr *as_byte_array = nir_build_deref_cast(b, &ssbo_deref->dest.ssa, nir_var_mem_ssbo, glsl_uint8_t_type(), 1);
+   nir_deref_instr *as_offset_byte_array = nir_build_deref_ptr_as_array(b, as_byte_array, start_offset);
+   /* First store the format-string pointer... */
+   nir_deref_instr *format_string_write_deref =
+      nir_build_deref_cast(b, &as_offset_byte_array->dest.ssa, nir_var_mem_ssbo, glsl_uint64_t_type(), 8);
+   nir_store_deref(b, format_string_write_deref, instr->src[0].ssa, ~0);
+
+   /* ...then copy each struct field to its offset inside the reservation. */
+   for (unsigned i = 0; i < glsl_get_length(struct_type); ++i) {
+      nir_ssa_def *field_offset_from_start = nir_imm_int64(b, glsl_get_struct_field_offset(struct_type, i) + sizeof(uint64_t));
+      nir_ssa_def *field_offset = nir_iadd(b, start_offset, field_offset_from_start);
+
+      const struct glsl_type *field_type = glsl_get_struct_field(struct_type, i);
+      nir_deref_instr *field_read_deref = nir_build_deref_struct(b, struct_deref, i);
+      nir_ssa_def *field_value = nir_load_deref(b, field_read_deref);
+
+      /* Clang does promotion of arguments to their "native" size. That means that any floats
+       * have been converted to doubles for the call to printf. Since we don't support doubles,
+       * convert them back here; copy-prop and other optimizations should remove all hint of doubles.
+       */
+      if (glsl_get_base_type(field_type) == GLSL_TYPE_DOUBLE) {
+         field_value = nir_f2f32(b, field_value);
+         field_type = glsl_float_type();
+      }
+
+      as_offset_byte_array = nir_build_deref_ptr_as_array(b, as_byte_array, field_offset);
+      nir_deref_instr *field_write_deref =
+         nir_build_deref_cast(b, &as_offset_byte_array->dest.ssa, nir_var_mem_ssbo, field_type, glsl_get_cl_size(field_type));
+
+      nir_store_deref(b, field_write_deref, field_value, ~0);
+   }
+
+   nir_push_else(b, NULL);
+   nir_ssa_def *printf_fail_val = nir_imm_int(b, -1);
+   nir_pop_if(b, NULL);
+
+   /* The printf call's result: 0 on success, -1 if the buffer was full. */
+   nir_ssa_def *return_value = nir_if_phi(b, printf_succ_val, printf_fail_val);
+   nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(return_value));
+   nir_instr_remove(&instr->instr);
+}
+
+/* Return the first inline-sampler uniform whose state matches 'sampler'.
+ * Since 'sampler' itself is such a uniform, a match always exists. */
+static nir_variable *
+find_identical_const_sampler(nir_shader *nir, nir_variable *sampler)
+{
+   nir_foreach_variable_with_modes(candidate, nir, nir_var_uniform) {
+      if (!glsl_type_is_sampler(candidate->type))
+         continue;
+      if (!candidate->data.sampler.is_inline_sampler)
+         continue;
+
+      bool same_addressing =
+         candidate->data.sampler.addressing_mode ==
+         sampler->data.sampler.addressing_mode;
+      bool same_normalization =
+         candidate->data.sampler.normalized_coordinates ==
+         sampler->data.sampler.normalized_coordinates;
+      bool same_filtering =
+         candidate->data.sampler.filter_mode ==
+         sampler->data.sampler.filter_mode;
+
+      if (same_addressing && same_normalization && same_filtering)
+         return candidate;
+   }
+   unreachable("Should have at least found the input sampler");
+}
+
+/* Rewrite every texture instruction that references an inline sampler to
+ * use the first uniform with identical sampler state, so redundant inline
+ * samplers become unused (and can be removed by dead-variable passes).
+ *
+ * Returns true if any instruction was rewritten.
+ */
+bool
+clc_nir_dedupe_const_samplers(nir_shader *nir)
+{
+   bool progress = false;
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      /* Track progress per-impl.  The previous version tested the
+       * shader-wide cumulative flag, so once any impl was modified,
+       * metadata was needlessly invalidated for every impl after it. */
+      bool func_progress = false;
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_tex)
+               continue;
+
+            nir_tex_instr *tex = nir_instr_as_tex(instr);
+            int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
+            if (sampler_idx == -1)
+               continue;
+
+            nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_idx].src);
+            nir_variable *sampler = nir_deref_instr_get_variable(deref);
+            if (!sampler)
+               continue;
+
+            assert(sampler->data.mode == nir_var_uniform);
+
+            if (!sampler->data.sampler.is_inline_sampler)
+               continue;
+
+            nir_variable *replacement = find_identical_const_sampler(nir, sampler);
+            if (replacement == sampler)
+               continue;
+
+            b.cursor = nir_before_instr(&tex->instr);
+            nir_deref_instr *replacement_deref = nir_build_deref_var(&b, replacement);
+            nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_idx].src,
+                                  nir_src_for_ssa(&replacement_deref->dest.ssa));
+            nir_deref_instr_remove_if_unused(deref);
+            func_progress = true;
+         }
+      }
+
+      if (func_progress) {
+         nir_metadata_preserve(func->impl, nir_metadata_block_index | nir_metadata_dominance);
+      }
+      progress |= func_progress;
+   }
+   return progress;
+}
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CLC_NIR_H
+#define CLC_NIR_H
+
+#include <stdbool.h>
+#include "nir.h"
+
+bool
+clc_nir_lower_system_values(nir_shader *nir, nir_variable *var);
+
+/* Note: this was previously declared as dxil_nir_lower_kernel_input_loads,
+ * which never matched the clc_nir.c definition of the same pass; the
+ * prototype below matches the actual implementation. */
+bool
+clc_nir_lower_kernel_input_loads(nir_shader *nir, nir_variable *var);
+
+bool
+clc_nir_lower_printf(nir_shader *nir, unsigned uav_id);
+
+bool
+clc_nir_dedupe_const_samplers(nir_shader *nir);
+
+#endif
--- /dev/null
+EXPORTS
+ clc_context_new
+ clc_free_context
+ clc_context_serialize
+ clc_context_free_serialized
+ clc_context_deserialize
+ clc_compile
+ clc_link
+ clc_free_object
+ clc_to_dxil
+ clc_free_dxil_object
+ clc_compiler_get_version
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdexcept>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <gtest/gtest.h>
+#include <wrl.h>
+
+#include "util/u_debug.h"
+#include "clc_compiler.h"
+#include "compute_test.h"
+#include "dxcapi.h"
+
+using std::runtime_error;
+using Microsoft::WRL::ComPtr;
+
+/* Bit-flags controlled by the COMPUTE_TEST_DEBUG environment variable;
+ * names are mapped by debug_options[] below. */
+enum compute_test_debug_flags {
+   COMPUTE_DEBUG_EXPERIMENTAL_SHADERS = 1 << 0,
+   COMPUTE_DEBUG_USE_HW_D3D = 1 << 1,
+   COMPUTE_DEBUG_OPTIMIZE_LIBCLC = 1 << 2,
+   COMPUTE_DEBUG_SERIALIZE_LIBCLC = 1 << 3,
+};
+
+/* Option-name -> flag mapping consumed by the debug-option parser. */
+static const struct debug_named_value debug_options[] = {
+   { "experimental_shaders", COMPUTE_DEBUG_EXPERIMENTAL_SHADERS, "Enable experimental shaders" },
+   { "use_hw_d3d", COMPUTE_DEBUG_USE_HW_D3D, "Use a hardware D3D device" },
+   { "optimize_libclc", COMPUTE_DEBUG_OPTIMIZE_LIBCLC, "Optimize the clc_context before using it" },
+   { "serialize_libclc", COMPUTE_DEBUG_SERIALIZE_LIBCLC, "Serialize and deserialize the clc_context" },
+   DEBUG_NAMED_VALUE_END
+};
+
+/* Defines debug_get_option_debug_compute(), which reads COMPUTE_TEST_DEBUG
+ * (once, per the u_debug "GET_ONCE" convention) into a flag mask. */
+DEBUG_GET_ONCE_FLAGS_OPTION(debug_compute, "COMPUTE_TEST_DEBUG", debug_options, 0)
+
+/* clc_logger callback: forward compiler warnings to stderr. */
+static void warning_callback(void *priv, const char *msg)
+{
+   (void)priv; /* no per-logger state */
+   fprintf(stderr, "WARNING: %s\n", msg);
+}
+
+/* clc_logger callback: forward compiler errors to stderr. */
+static void error_callback(void *priv, const char *msg)
+{
+   (void)priv; /* no per-logger state */
+   fprintf(stderr, "ERROR: %s\n", msg);
+}
+
+/* Diagnostics sink handed to the CLC compiler (field layout defined by
+ * struct clc_logger in clc_compiler.h; first member assumed to be the
+ * private-data pointer — confirm against the header). */
+static const struct clc_logger logger = {
+   NULL,
+   error_callback,
+   warning_callback,
+};
+
+/* Best-effort: turn on the D3D12 debug layer if D3D12.DLL and the debug
+ * interface are available.  Failures are logged but non-fatal — the tests
+ * still run without validation. */
+void
+ComputeTest::enable_d3d12_debug_layer()
+{
+   HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL");
+   if (!hD3D12Mod) {
+      fprintf(stderr, "D3D12: failed to load D3D12.DLL\n");
+      return;
+   }
+
+   typedef HRESULT(WINAPI * PFN_D3D12_GET_DEBUG_INTERFACE)(REFIID riid,
+                                                           void **ppFactory);
+   PFN_D3D12_GET_DEBUG_INTERFACE D3D12GetDebugInterface = (PFN_D3D12_GET_DEBUG_INTERFACE)GetProcAddress(hD3D12Mod, "D3D12GetDebugInterface");
+   if (!D3D12GetDebugInterface) {
+      fprintf(stderr, "D3D12: failed to load D3D12GetDebugInterface from D3D12.DLL\n");
+      return;
+   }
+
+   ID3D12Debug *debug;
+   if (FAILED(D3D12GetDebugInterface(__uuidof(ID3D12Debug), (void **)& debug))) {
+      fprintf(stderr, "D3D12: D3D12GetDebugInterface failed\n");
+      return;
+   }
+
+   debug->EnableDebugLayer();
+   /* The interface was only needed to flip the switch; release our COM
+    * reference instead of leaking it (the old code never released it). */
+   debug->Release();
+}
+
+/* Create an IDXGIFactory4 by loading DXGI.DLL dynamically (avoids a
+ * link-time dependency).  Throws runtime_error on any failure.  The
+ * returned factory is owned by the caller (released in TearDown). */
+IDXGIFactory4 *
+ComputeTest::get_dxgi_factory()
+{
+   /* IID of IDXGIFactory4, spelled out so we don't need the uuid from the
+    * import library. */
+   static const GUID IID_IDXGIFactory4 = {
+      0x1bc6ea02, 0xef36, 0x464f,
+      { 0xbf, 0x0c, 0x21, 0xca, 0x39, 0xe5, 0x16, 0x8a }
+   };
+
+   typedef HRESULT(WINAPI * PFN_CREATE_DXGI_FACTORY)(REFIID riid,
+                                                     void **ppFactory);
+   PFN_CREATE_DXGI_FACTORY CreateDXGIFactory;
+
+   /* The module handle is intentionally never freed; DXGI stays loaded for
+    * the lifetime of the test binary. */
+   HMODULE hDXGIMod = LoadLibrary("DXGI.DLL");
+   if (!hDXGIMod)
+      throw runtime_error("Failed to load DXGI.DLL");
+
+   CreateDXGIFactory = (PFN_CREATE_DXGI_FACTORY)GetProcAddress(hDXGIMod, "CreateDXGIFactory");
+   if (!CreateDXGIFactory)
+      throw runtime_error("Failed to load CreateDXGIFactory from DXGI.DLL");
+
+   IDXGIFactory4 *factory = NULL;
+   HRESULT hr = CreateDXGIFactory(IID_IDXGIFactory4, (void **)&factory);
+   if (FAILED(hr))
+      throw runtime_error("CreateDXGIFactory failed");
+
+   return factory;
+}
+
+/* Pick the adapter the tests run on: the first hardware adapter when
+ * COMPUTE_DEBUG_USE_HW_D3D is set, otherwise the WARP software rasterizer
+ * (the default, for deterministic results).  Throws on failure. */
+IDXGIAdapter1 *
+ComputeTest::choose_adapter(IDXGIFactory4 *factory)
+{
+   IDXGIAdapter1 *ret;
+
+   if (debug_get_option_debug_compute() & COMPUTE_DEBUG_USE_HW_D3D) {
+      for (unsigned i = 0; SUCCEEDED(factory->EnumAdapters1(i, &ret)); i++) {
+         DXGI_ADAPTER_DESC1 desc;
+         ret->GetDesc1(&desc);
+         /* desc.Flags holds DXGI_ADAPTER_FLAG bits.  The previous check
+          * tested against D3D_DRIVER_TYPE_SOFTWARE, a constant from an
+          * unrelated enumeration with a different value. */
+         if (!(desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE))
+            return ret;
+         /* Software adapter: drop our reference before trying the next one
+          * (previously leaked). */
+         ret->Release();
+      }
+      throw runtime_error("Failed to enum hardware adapter");
+   } else {
+      if (FAILED(factory->EnumWarpAdapter(__uuidof(IDXGIAdapter1),
+          (void **)& ret)))
+         throw runtime_error("Failed to enum warp adapter");
+      return ret;
+   }
+}
+
+/* Create a feature-level 12.0 device on 'adapter' via dynamically loaded
+ * D3D12.DLL.  When COMPUTE_DEBUG_EXPERIMENTAL_SHADERS is set, experimental
+ * shader models are enabled first (needed to run unsigned DXIL).  Throws
+ * runtime_error on any failure; caller owns the returned device. */
+ID3D12Device *
+ComputeTest::create_device(IDXGIAdapter1 *adapter)
+{
+   typedef HRESULT(WINAPI *PFN_D3D12CREATEDEVICE)(IUnknown *, D3D_FEATURE_LEVEL, REFIID, void **);
+   PFN_D3D12CREATEDEVICE D3D12CreateDevice;
+
+   HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL");
+   if (!hD3D12Mod)
+      throw runtime_error("failed to load D3D12.DLL");
+
+   if (debug_get_option_debug_compute() & COMPUTE_DEBUG_EXPERIMENTAL_SHADERS) {
+      typedef HRESULT(WINAPI *PFN_D3D12ENABLEEXPERIMENTALFEATURES)(UINT, const IID *, void *, UINT *);
+      PFN_D3D12ENABLEEXPERIMENTALFEATURES D3D12EnableExperimentalFeatures;
+      D3D12EnableExperimentalFeatures = (PFN_D3D12ENABLEEXPERIMENTALFEATURES)
+         GetProcAddress(hD3D12Mod, "D3D12EnableExperimentalFeatures");
+      /* Must be called before the first device is created. */
+      if (FAILED(D3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModels, NULL, NULL)))
+         throw runtime_error("failed to enable experimental shader models");
+   }
+
+   D3D12CreateDevice = (PFN_D3D12CREATEDEVICE)GetProcAddress(hD3D12Mod, "D3D12CreateDevice");
+   if (!D3D12CreateDevice)
+      throw runtime_error("failed to load D3D12CreateDevice from D3D12.DLL");
+
+   ID3D12Device *dev;
+   if (FAILED(D3D12CreateDevice(adapter, D3D_FEATURE_LEVEL_12_0,
+       __uuidof(ID3D12Device), (void **)& dev)))
+      throw runtime_error("D3D12CreateDevice failed");
+
+   return dev;
+}
+
+/* Build a root signature consisting of a single descriptor table covering
+ * all of the test's CBV/UAV ranges.  Throws runtime_error on failure. */
+ComPtr<ID3D12RootSignature>
+ComputeTest::create_root_signature(const ComputeTest::Resources &resources)
+{
+   D3D12_ROOT_PARAMETER1 root_param;
+   root_param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+   root_param.DescriptorTable.NumDescriptorRanges = resources.ranges.size();
+   root_param.DescriptorTable.pDescriptorRanges = resources.ranges.data();
+   root_param.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+
+   D3D12_ROOT_SIGNATURE_DESC1 root_sig_desc;
+   root_sig_desc.NumParameters = 1;
+   root_sig_desc.pParameters = &root_param;
+   root_sig_desc.NumStaticSamplers = 0;
+   root_sig_desc.pStaticSamplers = NULL;
+   root_sig_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
+
+   D3D12_VERSIONED_ROOT_SIGNATURE_DESC versioned_desc;
+   versioned_desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1;
+   versioned_desc.Desc_1_1 = root_sig_desc;
+
+   /* NOTE(review): on serialization failure 'error' may carry a message
+    * blob; we throw without printing or releasing it, which is acceptable
+    * for a test that is about to abort. */
+   ID3DBlob *sig, *error;
+   if (FAILED(D3D12SerializeVersionedRootSignature(&versioned_desc,
+                                                   &sig, &error)))
+      throw runtime_error("D3D12SerializeVersionedRootSignature failed");
+
+   ComPtr<ID3D12RootSignature> ret;
+   HRESULT hr = dev->CreateRootSignature(0,
+                                         sig->GetBufferPointer(),
+                                         sig->GetBufferSize(),
+                                         __uuidof(ret),
+                                         (void **)& ret);
+   /* The serialized blob is only an input to CreateRootSignature; release
+    * it here (it was previously leaked on every call). */
+   sig->Release();
+   if (FAILED(hr))
+      throw runtime_error("CreateRootSignature failed");
+
+   return ret;
+}
+
+/* Wrap the compiled DXIL binary in a compute PSO that uses 'root_sig'. */
+ComPtr<ID3D12PipelineState>
+ComputeTest::create_pipeline_state(ComPtr<ID3D12RootSignature> &root_sig,
+                                   const struct clc_dxil_object &dxil)
+{
+   D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc = { root_sig.Get() };
+   pipeline_desc.CS.pShaderBytecode = dxil.binary.data;
+   pipeline_desc.CS.BytecodeLength = dxil.binary.size;
+
+   ComPtr<ID3D12PipelineState> pso;
+   HRESULT hr = dev->CreateComputePipelineState(&pipeline_desc,
+                                                __uuidof(pso),
+                                                (void **)&pso);
+   if (FAILED(hr))
+      throw runtime_error("Failed to create pipeline state");
+   return pso;
+}
+
+/* Create a committed buffer resource of 'size' bytes on the given heap
+ * type.  Default-heap buffers allow UAV access; upload/readback buffers get
+ * the initial state their heap type requires.  Throws on failure. */
+ComPtr<ID3D12Resource>
+ComputeTest::create_buffer(int size, D3D12_HEAP_TYPE heap_type)
+{
+   D3D12_RESOURCE_DESC desc;
+   desc.Format = DXGI_FORMAT_UNKNOWN;
+   desc.Alignment = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT;
+   desc.Width = size;
+   desc.Height = 1;
+   desc.DepthOrArraySize = 1;
+   desc.MipLevels = 1;
+   desc.SampleDesc.Count = 1;
+   desc.SampleDesc.Quality = 0;
+   desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+   /* Only default-heap buffers are written by shaders, so only they need
+    * the UAV flag. */
+   desc.Flags = heap_type == D3D12_HEAP_TYPE_DEFAULT ? D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE;
+   desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+
+   D3D12_HEAP_PROPERTIES heap_pris = dev->GetCustomHeapProperties(0, heap_type);
+
+   /* Default heap (and anything unlisted below) starts in COMMON. */
+   D3D12_RESOURCE_STATES initial_state = D3D12_RESOURCE_STATE_COMMON;
+   switch (heap_type) {
+   case D3D12_HEAP_TYPE_UPLOAD:
+      initial_state = D3D12_RESOURCE_STATE_GENERIC_READ;
+      break;
+
+   case D3D12_HEAP_TYPE_READBACK:
+      initial_state = D3D12_RESOURCE_STATE_COPY_DEST;
+      break;
+   }
+
+   ComPtr<ID3D12Resource> res;
+   if (FAILED(dev->CreateCommittedResource(&heap_pris,
+       D3D12_HEAP_FLAG_NONE, &desc, initial_state,
+       NULL, __uuidof(ID3D12Resource), (void **)&res)))
+      throw runtime_error("CreateCommittedResource failed");
+
+   return res;
+}
+
+/* Create an upload-heap buffer and copy 'size' bytes of 'data' into it. */
+ComPtr<ID3D12Resource>
+ComputeTest::create_upload_buffer_with_data(const void *data, size_t size)
+{
+   ComPtr<ID3D12Resource> upload_res =
+      create_buffer(size, D3D12_HEAP_TYPE_UPLOAD);
+
+   D3D12_RANGE res_range = { 0, (SIZE_T)size };
+   void *mapped = NULL;
+   if (FAILED(upload_res->Map(0, &res_range, (void **)&mapped)))
+      throw runtime_error("Failed to map upload-buffer");
+   assert(mapped);
+
+   memcpy(mapped, data, size);
+   upload_res->Unmap(0, &res_range);
+   return upload_res;
+}
+
+/* Create a default-heap buffer of 'buffer_size' bytes and initialize its
+ * first 'data_size' bytes from 'data' via a staging upload buffer.  The
+ * copy is executed (and waited on) before returning. */
+ComPtr<ID3D12Resource>
+ComputeTest::create_sized_buffer_with_data(size_t buffer_size,
+                                           const void *data,
+                                           size_t data_size)
+{
+   auto upload_res = create_upload_buffer_with_data(data, data_size);
+
+   auto res = create_buffer(buffer_size, D3D12_HEAP_TYPE_DEFAULT);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST);
+   cmdlist->CopyBufferRegion(res.Get(), 0, upload_res.Get(), 0, data_size);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COMMON);
+   /* Blocks until the GPU copy finished, so upload_res may die on return. */
+   execute_cmdlist();
+
+   return res;
+}
+
+/* Copy 'size' bytes of 'res' back to CPU memory at 'buf' through a
+ * readback-heap staging buffer.  Synchronous: waits for the GPU copy. */
+void
+ComputeTest::get_buffer_data(ComPtr<ID3D12Resource> res,
+                             void *buf, size_t size)
+{
+   /* align(size, 4): presumably to match the 4-byte-granular sizes used by
+    * the UAV buffers being read back — confirm against add_uav_resource. */
+   auto readback_res = create_buffer(align(size, 4), D3D12_HEAP_TYPE_READBACK);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_SOURCE);
+   cmdlist->CopyResource(readback_res.Get(), res.Get());
+   resource_barrier(res, D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_COMMON);
+   execute_cmdlist();
+
+   void *ptr = NULL;
+   D3D12_RANGE res_range = { 0, size };
+   if (FAILED(readback_res->Map(0, &res_range, &ptr)))
+      throw runtime_error("Failed to map readback-buffer");
+
+   memcpy(buf, ptr, size);
+
+   /* Empty written-range: the CPU only read from this mapping. */
+   D3D12_RANGE empty_range = { 0, 0 };
+   readback_res->Unmap(0, &empty_range);
+}
+
+/* Record a state-transition barrier for all subresources of 'res'. */
+void
+ComputeTest::resource_barrier(ComPtr<ID3D12Resource> &res,
+                              D3D12_RESOURCE_STATES state_before,
+                              D3D12_RESOURCE_STATES state_after)
+{
+   D3D12_RESOURCE_BARRIER transition;
+   transition.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+   transition.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+   transition.Transition.StateBefore = state_before;
+   transition.Transition.StateAfter = state_after;
+   transition.Transition.pResource = res.Get();
+   transition.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+   cmdlist->ResourceBarrier(1, &transition);
+}
+
+/* Close and submit the command list, block until the GPU finishes (via the
+ * fence/event pair created in SetUp), then reset the allocator and list so
+ * recording can continue.  Throws on any D3D12 failure. */
+void
+ComputeTest::execute_cmdlist()
+{
+   if (FAILED(cmdlist->Close()))
+      throw runtime_error("Closing ID3D12GraphicsCommandList failed");
+
+   ID3D12CommandList *cmdlists[] = { cmdlist };
+   cmdqueue->ExecuteCommandLists(1, cmdlists);
+   /* Arm the event before signaling so the wait below can't miss it. */
+   cmdqueue_fence->SetEventOnCompletion(fence_value, event);
+   cmdqueue->Signal(cmdqueue_fence, fence_value);
+   fence_value++;
+   WaitForSingleObject(event, INFINITE);
+
+   /* Safe to reset now: the GPU has fully consumed the allocator. */
+   if (FAILED(cmdalloc->Reset()))
+      throw runtime_error("resetting ID3D12CommandAllocator failed");
+
+   if (FAILED(cmdlist->Reset(cmdalloc, NULL)))
+      throw runtime_error("resetting ID3D12GraphicsCommandList failed");
+}
+
+/* Write a raw (R32_TYPELESS) buffer UAV descriptor for 'res' at
+ * 'cpu_handle'.  A null 'res' produces a null descriptor, which is valid
+ * for unused binding slots. */
+void
+ComputeTest::create_uav_buffer(ComPtr<ID3D12Resource> res,
+                               size_t width, size_t byte_stride,
+                               D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle)
+{
+   D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
+   uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+   uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+   uav_desc.Buffer.FirstElement = 0;
+   /* Raw views are addressed in 32-bit words, hence the /4 rounding. */
+   uav_desc.Buffer.NumElements = DIV_ROUND_UP(width * byte_stride, 4);
+   uav_desc.Buffer.StructureByteStride = 0;
+   uav_desc.Buffer.CounterOffsetInBytes = 0;
+   uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
+
+   dev->CreateUnorderedAccessView(res.Get(), NULL, &uav_desc, cpu_handle);
+}
+
+/* Write a CBV descriptor covering the whole buffer at 'cpu_handle'.  A
+ * null 'res' yields GPU VA 0, i.e. a null CBV for unused slots. */
+void
+ComputeTest::create_cbv(ComPtr<ID3D12Resource> res, size_t size,
+                        D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle)
+{
+   D3D12_CONSTANT_BUFFER_VIEW_DESC desc;
+   desc.BufferLocation = res ? res->GetGPUVirtualAddress() : 0;
+   desc.SizeInBytes = size;
+
+   dev->CreateConstantBufferView(&desc, cpu_handle);
+}
+
+/* Allocate (and optionally initialize) a default-heap buffer, create a raw
+ * UAV for it in the next free slot of the descriptor heap, and register it
+ * in 'resources'.  A zero-sized request yields a null UAV descriptor.
+ * Returns the resource so callers can read results back later. */
+ComPtr<ID3D12Resource>
+ComputeTest::add_uav_resource(ComputeTest::Resources &resources,
+                              unsigned spaceid, unsigned resid,
+                              const void *data, size_t num_elems,
+                              size_t elem_size)
+{
+   /* Raw UAVs are addressed in 4-byte words, so round the size up. */
+   size_t size = align(elem_size * num_elems, 4);
+   D3D12_CPU_DESCRIPTOR_HANDLE handle;
+   ComPtr<ID3D12Resource> res;
+   /* Descriptors are written sequentially: slot index == descs.size(). */
+   handle = uav_heap->GetCPUDescriptorHandleForHeapStart();
+   handle = offset_cpu_handle(handle, resources.descs.size() * uav_heap_incr);
+
+   if (size) {
+      if (data)
+         res = create_buffer_with_data(data, size);
+      else
+         res = create_buffer(size, D3D12_HEAP_TYPE_DEFAULT);
+
+      resource_barrier(res, D3D12_RESOURCE_STATE_COMMON,
+                       D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+   }
+   /* 'res' may still be null here; create_uav_buffer handles that. */
+   create_uav_buffer(res, num_elems, elem_size, handle);
+   resources.add(res, D3D12_DESCRIPTOR_RANGE_TYPE_UAV, spaceid, resid);
+   return res;
+}
+
+/* Create a constant buffer initialized from 'data', padded to the 256-byte
+ * CBV size granularity, bind it in the next descriptor-heap slot and
+ * register it in 'resources'.  A zero size produces a null CBV. */
+ComPtr<ID3D12Resource>
+ComputeTest::add_cbv_resource(ComputeTest::Resources &resources,
+                              unsigned spaceid, unsigned resid,
+                              const void *data, size_t size)
+{
+   /* CBV sizes must be multiples of 256 bytes. */
+   unsigned aligned_size = align(size, 256);
+   D3D12_CPU_DESCRIPTOR_HANDLE handle;
+   ComPtr<ID3D12Resource> res;
+   handle = uav_heap->GetCPUDescriptorHandleForHeapStart();
+   handle = offset_cpu_handle(handle, resources.descs.size() * uav_heap_incr);
+
+   if (size) {
+      assert(data);
+      res = create_sized_buffer_with_data(aligned_size, data, size);
+   }
+   create_cbv(res, aligned_size, handle);
+   resources.add(res, D3D12_DESCRIPTOR_RANGE_TYPE_CBV, spaceid, resid);
+   return res;
+}
+
+/* End-to-end execution of a compiled kernel: build the runtime conf,
+ * marshal the kernel arguments into CBVs/UAVs, dispatch, and read back any
+ * output arguments.  Throws runtime_error on setup errors; D3D12 debug-
+ * layer messages are reported through gtest at the end. */
+void
+ComputeTest::run_shader_with_raw_args(Shader shader,
+                                      const CompileArgs &compile_args,
+                                      const std::vector<RawShaderArg *> &args)
+{
+   if (args.size() < 1)
+      throw runtime_error("no inputs");
+
+   static HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL");
+   if (!hD3D12Mod)
+      throw runtime_error("Failed to load D3D12.DLL");
+
+   D3D12SerializeVersionedRootSignature = (PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE)GetProcAddress(hD3D12Mod, "D3D12SerializeVersionedRootSignature");
+
+   if (args.size() != shader.dxil->kernel->num_args)
+      throw runtime_error("incorrect number of inputs");
+
+   struct clc_runtime_kernel_conf conf = { 0 };
+
+   // Older WARP and some hardware doesn't support int64, so for these tests, unconditionally lower away int64
+   // A more complex runtime can be smarter about detecting when this needs to be done
+   conf.lower_bit_size = 64;
+
+   /* A kernel-declared local size wins; otherwise fall back to the
+    * test-provided dimensions. */
+   if (!shader.dxil->metadata.local_size[0])
+      conf.local_size[0] = compile_args.x;
+   else
+      conf.local_size[0] = shader.dxil->metadata.local_size[0];
+
+   if (!shader.dxil->metadata.local_size[1])
+      conf.local_size[1] = compile_args.y;
+   else
+      conf.local_size[1] = shader.dxil->metadata.local_size[1];
+
+   if (!shader.dxil->metadata.local_size[2])
+      conf.local_size[2] = compile_args.z;
+   else
+      conf.local_size[2] = shader.dxil->metadata.local_size[2];
+
+   if (compile_args.x % conf.local_size[0] ||
+       compile_args.y % conf.local_size[1] ||
+       compile_args.z % conf.local_size[2])
+      throw runtime_error("invalid global size must be a multiple of local size");
+
+   /* Per-argument runtime info (currently only local-pointer sizes). */
+   std::vector<struct clc_runtime_arg_info> argsinfo(args.size());
+
+   conf.args = argsinfo.data();
+   conf.support_global_work_id_offsets =
+      compile_args.work_props.global_offset_x != 0 ||
+      compile_args.work_props.global_offset_y != 0 ||
+      compile_args.work_props.global_offset_z != 0;
+   conf.support_work_group_id_offsets =
+      compile_args.work_props.group_id_offset_x != 0 ||
+      compile_args.work_props.group_id_offset_y != 0 ||
+      compile_args.work_props.group_id_offset_z != 0;
+
+   for (unsigned i = 0; i < shader.dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+      size_t size = arg->get_elem_size() * arg->get_num_elems();
+
+      switch (shader.dxil->kernel->args[i].address_qualifier) {
+      case CLC_KERNEL_ARG_ADDRESS_LOCAL:
+         argsinfo[i].localptr.size = size;
+         break;
+      default:
+         break;
+      }
+   }
+
+   configure(shader, &conf);
+   validate(shader);
+
+   std::shared_ptr<struct clc_dxil_object> &dxil = shader.dxil;
+
+   /* Marshal scalar/pointer arguments into the kernel-inputs CBV image. */
+   std::vector<uint8_t> argsbuf(dxil->metadata.kernel_inputs_buf_size);
+   std::vector<ComPtr<ID3D12Resource>> argres(shader.dxil->kernel->num_args);
+   clc_work_properties_data work_props = compile_args.work_props;
+   if (!conf.support_work_group_id_offsets) {
+      work_props.group_count_total_x = compile_args.x / conf.local_size[0];
+      work_props.group_count_total_y = compile_args.y / conf.local_size[1];
+      work_props.group_count_total_z = compile_args.z / conf.local_size[2];
+   }
+   if (work_props.work_dim == 0)
+      work_props.work_dim = 3;
+   Resources resources;
+
+   for (unsigned i = 0; i < dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+      size_t size = arg->get_elem_size() * arg->get_num_elems();
+      void *slot = argsbuf.data() + dxil->metadata.args[i].offset;
+
+      switch (dxil->kernel->args[i].address_qualifier) {
+      case CLC_KERNEL_ARG_ADDRESS_CONSTANT:
+      case CLC_KERNEL_ARG_ADDRESS_GLOBAL: {
+         /* Global/constant pointers encode the UAV id in the upper 32 bits;
+          * a null argument becomes the all-ones "null pointer" value. */
+         assert(dxil->metadata.args[i].size == sizeof(uint64_t));
+         uint64_t *ptr_slot = (uint64_t *)slot;
+         if (arg->get_data())
+            *ptr_slot = (uint64_t)dxil->metadata.args[i].globconstptr.buf_id << 32;
+         else
+            *ptr_slot = ~0ull;
+         break;
+      }
+      case CLC_KERNEL_ARG_ADDRESS_LOCAL: {
+         /* Local pointers are offsets into shared memory. */
+         assert(dxil->metadata.args[i].size == sizeof(uint64_t));
+         uint64_t *ptr_slot = (uint64_t *)slot;
+         *ptr_slot = dxil->metadata.args[i].localptr.sharedmem_offset;
+         break;
+      }
+      case CLC_KERNEL_ARG_ADDRESS_PRIVATE: {
+         /* By-value arguments are copied verbatim. */
+         assert(size == dxil->metadata.args[i].size);
+         memcpy(slot, arg->get_data(), size);
+         break;
+      }
+      default:
+         assert(0);
+      }
+   }
+
+   /* Back every global/constant pointer argument with a UAV buffer. */
+   for (unsigned i = 0; i < dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+
+      if (dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL ||
+          dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_CONSTANT) {
+         argres[i] = add_uav_resource(resources, 0,
+                                      dxil->metadata.args[i].globconstptr.buf_id,
+                                      arg->get_data(), arg->get_num_elems(),
+                                      arg->get_elem_size());
+      }
+   }
+
+   /* 1MB scratch UAV for printf output, if the kernel uses printf. */
+   if (dxil->metadata.printf_uav_id > 0)
+      add_uav_resource(resources, 0, dxil->metadata.printf_uav_id, NULL, 1024 * 1024 / 4, 4);
+
+   /* Compiler-generated constant buffers (e.g. initialized globals). */
+   for (unsigned i = 0; i < dxil->metadata.num_consts; ++i)
+      add_uav_resource(resources, 0, dxil->metadata.consts[i].uav_id,
+                       dxil->metadata.consts[i].data,
+                       dxil->metadata.consts[i].size / 4, 4);
+
+   if (argsbuf.size())
+      add_cbv_resource(resources, 0, dxil->metadata.kernel_inputs_cbv_id,
+                       argsbuf.data(), argsbuf.size());
+
+   add_cbv_resource(resources, 0, dxil->metadata.work_properties_cbv_id,
+                    &work_props, sizeof(work_props));
+
+   auto root_sig = create_root_signature(resources);
+   auto pipeline_state = create_pipeline_state(root_sig, *dxil);
+
+   cmdlist->SetDescriptorHeaps(1, &uav_heap);
+   cmdlist->SetComputeRootSignature(root_sig.Get());
+   cmdlist->SetComputeRootDescriptorTable(0, uav_heap->GetGPUDescriptorHandleForHeapStart());
+   cmdlist->SetPipelineState(pipeline_state.Get());
+
+   cmdlist->Dispatch(compile_args.x / conf.local_size[0],
+                     compile_args.y / conf.local_size[1],
+                     compile_args.z / conf.local_size[2]);
+
+   /* Transition all UAVs back to COMMON so they can be copied from. */
+   for (auto &range : resources.ranges) {
+      if (range.RangeType == D3D12_DESCRIPTOR_RANGE_TYPE_UAV) {
+         /* NOTE(review): the bound looks like it should be
+          * OffsetInDescriptorsFromTableStart + NumDescriptors; as written
+          * the loop under-iterates whenever the offset is non-zero —
+          * confirm against Resources::add. */
+         for (unsigned i = range.OffsetInDescriptorsFromTableStart;
+              i < range.NumDescriptors; i++) {
+            if (!resources.descs[i].Get())
+               continue;
+
+            resource_barrier(resources.descs[i],
+                             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+                             D3D12_RESOURCE_STATE_COMMON);
+         }
+      }
+   }
+
+   execute_cmdlist();
+
+   /* Read back every output argument into its RawShaderArg storage. */
+   for (unsigned i = 0; i < args.size(); i++) {
+      if (!(args[i]->get_direction() & SHADER_ARG_OUTPUT))
+         continue;
+
+      assert(dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL);
+      get_buffer_data(argres[i], args[i]->get_data(),
+                      args[i]->get_elem_size() * args[i]->get_num_elems());
+   }
+
+   /* Fail the test if the debug layer recorded any messages; the loop only
+    * runs (and prints each message) when the EXPECT above already failed. */
+   ComPtr<ID3D12InfoQueue> info_queue;
+   dev->QueryInterface(info_queue.ReleaseAndGetAddressOf());
+   if (info_queue)
+   {
+      EXPECT_EQ(0, info_queue->GetNumStoredMessages());
+      for (unsigned i = 0; i < info_queue->GetNumStoredMessages(); ++i) {
+         SIZE_T message_size = 0;
+         info_queue->GetMessageA(i, nullptr, &message_size);
+         D3D12_MESSAGE* message = (D3D12_MESSAGE*)malloc(message_size);
+         info_queue->GetMessageA(i, message, &message_size);
+         FAIL() << message->pDescription;
+         free(message);
+      }
+   }
+}
+
+/* Per-test fixture setup: lazily create a process-wide CLC compiler context
+ * (optionally exercising the serialize/deserialize path), then build the
+ * D3D12 device, compute queue, command list, descriptor heap, and the
+ * fence/event pair used for CPU-GPU synchronization. */
+void
+ComputeTest::SetUp()
+{
+   /* The compiler context is shared by all tests in the process; it is
+    * created once and deliberately never freed (see TearDown). */
+   static struct clc_context *compiler_ctx_g = nullptr;
+
+   if (!compiler_ctx_g) {
+      clc_context_options options = { };
+      options.optimize = (debug_get_option_debug_compute() & COMPUTE_DEBUG_OPTIMIZE_LIBCLC) != 0;
+
+      compiler_ctx_g = clc_context_new(&logger, &options);
+      if (!compiler_ctx_g)
+         throw runtime_error("failed to create CLC compiler context");
+
+      /* Optional round-trip through the serialized form, to test that
+       * serialize + deserialize yields a usable context. */
+      if (debug_get_option_debug_compute() & COMPUTE_DEBUG_SERIALIZE_LIBCLC) {
+         void *serialized = nullptr;
+         size_t serialized_size = 0;
+         clc_context_serialize(compiler_ctx_g, &serialized, &serialized_size);
+         if (!serialized)
+            throw runtime_error("failed to serialize CLC compiler context");
+
+         clc_free_context(compiler_ctx_g);
+         compiler_ctx_g = nullptr;
+
+         compiler_ctx_g = clc_context_deserialize(serialized, serialized_size);
+         if (!compiler_ctx_g)
+            throw runtime_error("failed to deserialize CLC compiler context");
+
+         clc_context_free_serialized(serialized);
+      }
+   }
+   compiler_ctx = compiler_ctx_g;
+
+   enable_d3d12_debug_layer();
+
+   factory = get_dxgi_factory();
+   if (!factory)
+      throw runtime_error("failed to create DXGI factory");
+
+   adapter = choose_adapter(factory);
+   if (!adapter)
+      throw runtime_error("failed to choose adapter");
+
+   dev = create_device(adapter);
+   if (!dev)
+      throw runtime_error("failed to create device");
+
+   if (FAILED(dev->CreateFence(0, D3D12_FENCE_FLAG_NONE,
+       __uuidof(cmdqueue_fence),
+       (void **)&cmdqueue_fence)))
+      throw runtime_error("failed to create fence\n");
+
+   D3D12_COMMAND_QUEUE_DESC queue_desc;
+   queue_desc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
+   queue_desc.Priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL;
+   queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+   queue_desc.NodeMask = 0;
+   if (FAILED(dev->CreateCommandQueue(&queue_desc,
+       __uuidof(cmdqueue),
+       (void **)&cmdqueue)))
+      throw runtime_error("failed to create command queue");
+
+   if (FAILED(dev->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE,
+       __uuidof(cmdalloc), (void **)&cmdalloc)))
+      throw runtime_error("failed to create command allocator");
+
+   /* The command list is created in the recording state, ready for use. */
+   if (FAILED(dev->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE,
+       cmdalloc, NULL, __uuidof(cmdlist), (void **)&cmdlist)))
+      throw runtime_error("failed to create command list");
+
+   /* Shader-visible heap holding all CBV/UAV descriptors for a dispatch;
+    * 1000 slots is far more than any single test needs. */
+   D3D12_DESCRIPTOR_HEAP_DESC heap_desc;
+   heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+   heap_desc.NumDescriptors = 1000;
+   heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+   heap_desc.NodeMask = 0;
+   if (FAILED(dev->CreateDescriptorHeap(&heap_desc,
+       __uuidof(uav_heap), (void **)&uav_heap)))
+      throw runtime_error("failed to create descriptor heap");
+
+   uav_heap_incr = dev->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
+
+   /* Auto-reset event signaled by the fence in execute_cmdlist(). */
+   event = CreateEvent(NULL, FALSE, FALSE, NULL);
+   if (!event)
+      throw runtime_error("Failed to create event");
+   fence_value = 1;
+}
+
+/* Per-test fixture teardown: release all D3D objects created in SetUp, in
+ * roughly reverse creation order.  The shared CLC compiler context is
+ * intentionally not freed — it is a process-wide static reused by the
+ * next test. */
+void
+ComputeTest::TearDown()
+{
+   CloseHandle(event);
+
+   uav_heap->Release();
+   cmdlist->Release();
+   cmdalloc->Release();
+   cmdqueue->Release();
+   cmdqueue_fence->Release();
+   dev->Release();
+   adapter->Release();
+   factory->Release();
+}
+
+/* Out-of-class definition of the static member holding the root-signature
+ * serializer entry point, resolved from D3D12.DLL in
+ * run_shader_with_raw_args(). */
+PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE ComputeTest::D3D12SerializeVersionedRootSignature;
+
+/* Run the DXIL validator (DXIL.DLL) over the compiled binary.  Returns
+ * true on success, false (after printing the validator's message) on
+ * validation failure.  Throws runtime_error when the validator cannot be
+ * loaded, unless experimental shaders are enabled — then unsigned code can
+ * run anyway and validation is skipped. */
+bool
+validate_module(const struct clc_dxil_object &dxil)
+{
+   static HMODULE hmod = LoadLibrary("DXIL.DLL");
+   if (!hmod) {
+      /* Enabling experimental shaders allows us to run unsigned shader code,
+       * such as when under the debugger where we can't run the validator. */
+      if (debug_get_option_debug_compute() & COMPUTE_DEBUG_EXPERIMENTAL_SHADERS)
+         return true;
+      else
+         throw runtime_error("failed to load DXIL.DLL");
+   }
+
+   DxcCreateInstanceProc pfnDxcCreateInstance =
+      (DxcCreateInstanceProc)GetProcAddress(hmod, "DxcCreateInstance");
+   if (!pfnDxcCreateInstance)
+      throw runtime_error("failed to load DxcCreateInstance");
+
+   /* Minimal non-owning IDxcBlob wrapper over the DXIL bytes; ref-counting
+    * is stubbed out since the object lives on the stack. */
+   struct shader_blob : public IDxcBlob {
+      shader_blob(void *data, size_t size) : data(data), size(size) {}
+      LPVOID STDMETHODCALLTYPE GetBufferPointer() override { return data; }
+      SIZE_T STDMETHODCALLTYPE GetBufferSize() override { return size; }
+      HRESULT STDMETHODCALLTYPE QueryInterface(REFIID, void **) override { return E_NOINTERFACE; }
+      ULONG STDMETHODCALLTYPE AddRef() override { return 1; }
+      ULONG STDMETHODCALLTYPE Release() override { return 0; }
+      void *data;
+      size_t size;
+   } blob(dxil.binary.data, dxil.binary.size);
+
+   IDxcValidator *validator;
+   if (FAILED(pfnDxcCreateInstance(CLSID_DxcValidator, __uuidof(IDxcValidator),
+       (void **)&validator)))
+      throw runtime_error("failed to create IDxcValidator");
+
+   IDxcOperationResult *result;
+   if (FAILED(validator->Validate(&blob, DxcValidatorFlags_InPlaceEdit,
+       &result)))
+      throw runtime_error("Validate failed");
+
+   HRESULT hr;
+   if (FAILED(result->GetStatus(&hr)) ||
+       FAILED(hr)) {
+      IDxcBlobEncoding *message;
+      result->GetErrorBuffer(&message);
+      /* The message buffer is not guaranteed to be NUL-terminated, so bound
+       * the read with a precision ("%.*s").  The previous "%*s" only set a
+       * minimum field width and could read past the end of the buffer. */
+      fprintf(stderr, "D3D12: validation failed: %.*s\n",
+              (int)message->GetBufferSize(),
+              (char *)message->GetBufferPointer());
+      message->Release();
+      validator->Release();
+      result->Release();
+      return false;
+   }
+
+   validator->Release();
+   result->Release();
+   return true;
+}
+
+/* Best-effort debug dump of the DXIL container to 'path'; failure to open
+ * the file is silently ignored so dumps never break a test run. */
+static void
+dump_blob(const char *path, const struct clc_dxil_object &dxil)
+{
+   FILE *fp = fopen(path, "wb");
+   if (!fp)
+      return;
+
+   fwrite(dxil.binary.data, 1, dxil.binary.size, fp);
+   fclose(fp);
+   printf("D3D12: wrote '%s'...\n", path);
+}
+
+ComputeTest::Shader
+ComputeTest::compile(const std::vector<const char *> &sources,
+                     const std::vector<const char *> &compile_args,
+                     bool create_library)
+{
+   /* Compile each OpenCL C source string to a clc object, then link the
+    * objects together.  (Removed two dead locals from the original: an
+    * unused 'dxil' pointer and an outer 'shader' that was shadowed by the
+    * loop-local one.) */
+   struct clc_compile_args args = { 0 };
+   args.args = compile_args.data();
+   args.num_args = (unsigned)compile_args.size();
+
+   std::vector<Shader> shaders;
+
+   args.source.name = "obj.cl";
+
+   for (unsigned i = 0; i < sources.size(); i++) {
+      args.source.value = sources[i];
+
+      auto obj = clc_compile(compiler_ctx, &args, &logger);
+      if (!obj)
+         throw runtime_error("failed to compile object!");
+
+      Shader shader;
+      shader.obj = std::shared_ptr<struct clc_object>(obj, clc_free_object);
+      shaders.push_back(shader);
+   }
+
+   /* A single compiled object is already a valid library; skip the link. */
+   if (shaders.size() == 1 && create_library)
+      return shaders[0];
+
+   return link(shaders, create_library);
+}
+
+ComputeTest::Shader
+ComputeTest::link(const std::vector<Shader> &sources,
+                  bool create_library)
+{
+   /* Gather the raw clc_object pointers backing each input Shader. */
+   std::vector<const clc_object*> in_objs;
+   in_objs.reserve(sources.size());
+   for (unsigned i = 0; i < sources.size(); i++)
+      in_objs.push_back(sources[i].obj.get());
+
+   struct clc_linker_args link_args = {};
+   link_args.in_objs = in_objs.data();
+   link_args.num_in_objs = (unsigned)in_objs.size();
+   link_args.create_library = create_library;
+
+   struct clc_object *linked = clc_link(compiler_ctx, &link_args, &logger);
+   if (!linked)
+      throw runtime_error("failed to link objects!");
+
+   Shader result;
+   result.obj = std::shared_ptr<struct clc_object>(linked, clc_free_object);
+
+   /* A non-library (executable) link is immediately lowered to DXIL. */
+   if (!create_library)
+      configure(result, NULL);
+
+   return result;
+}
+
+void
+ComputeTest::configure(Shader &shader,
+                       const struct clc_runtime_kernel_conf *conf)
+{
+   /* Lower the linked object to a DXIL kernel named "main_test" under the
+    * given runtime configuration (NULL selects the defaults). */
+   struct clc_dxil_object *kernel =
+      clc_to_dxil(compiler_ctx, shader.obj.get(), "main_test", conf, &logger);
+   if (!kernel)
+      throw runtime_error("failed to compile kernel!");
+
+   shader.dxil = std::shared_ptr<struct clc_dxil_object>(kernel, clc_free_dxil_object);
+}
+
+void
+ComputeTest::validate(ComputeTest::Shader &shader)
+{
+   /* Dump the container before validation, run the DXIL validator (which
+    * signs the blob in place -- InPlaceEdit), then dump the signed result
+    * so both forms are available for offline inspection. */
+   dump_blob("unsigned.cso", *shader.dxil);
+   if (!validate_module(*shader.dxil))
+      throw runtime_error("failed to validate module!");
+
+   dump_blob("signed.cso", *shader.dxil);
+}
--- /dev/null
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdexcept>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <gtest/gtest.h>
+#include <wrl.h>
+
+#include "clc_compiler.h"
+
+using std::runtime_error;
+using Microsoft::WRL::ComPtr;
+
+/* Return a copy of 'handle' advanced by 'offset' bytes; the argument is
+ * taken by value, so the caller's handle is left untouched. */
+inline D3D12_CPU_DESCRIPTOR_HANDLE
+offset_cpu_handle(D3D12_CPU_DESCRIPTOR_HANDLE handle, UINT offset)
+{
+   D3D12_CPU_DESCRIPTOR_HANDLE result = handle;
+   result.ptr += offset;
+   return result;
+}
+
+/* Round 'value' up to the next multiple of 'alignment'.  'alignment' must
+ * be non-zero but need not be a power of two. */
+inline size_t
+align(size_t value, unsigned alignment)
+{
+   assert(alignment > 0);
+   size_t padded = value + alignment - 1;
+   return padded - (padded % alignment);
+}
+
+/* gtest fixture driving compiled CLC kernels through a D3D12 compute
+ * queue.  SetUp() builds the device/queue/heap plumbing, run_shader()
+ * compiles + dispatches, TearDown() releases everything. */
+class ComputeTest : public ::testing::Test {
+protected:
+   /* A compiled kernel: 'obj' is the portable clc object; 'dxil' is only
+    * populated once the object has been lowered for D3D12 (configure()). */
+   struct Shader {
+      std::shared_ptr<struct clc_object> obj;
+      std::shared_ptr<struct clc_dxil_object> dxil;
+   };
+
+   static void
+   enable_d3d12_debug_layer();
+
+   static IDXGIFactory4 *
+   get_dxgi_factory();
+
+   static IDXGIAdapter1 *
+   choose_adapter(IDXGIFactory4 *factory);
+
+   static ID3D12Device *
+   create_device(IDXGIAdapter1 *adapter);
+
+   /* Tracks the resources bound to a dispatch plus the descriptor ranges
+    * describing them; registers contiguous within one space are merged
+    * into a single range. */
+   struct Resources {
+      void add(ComPtr<ID3D12Resource> res,
+               D3D12_DESCRIPTOR_RANGE_TYPE type,
+               unsigned spaceid,
+               unsigned resid)
+      {
+         descs.push_back(res);
+
+         /* Extend the previous range when this register is contiguous
+          * with it. */
+         if(!ranges.empty() &&
+            ranges.back().RangeType == type &&
+            ranges.back().RegisterSpace == spaceid &&
+            ranges.back().BaseShaderRegister + ranges.back().NumDescriptors == resid) {
+            ranges.back().NumDescriptors++;
+            return;
+         }
+
+         D3D12_DESCRIPTOR_RANGE1 range;
+
+         range.RangeType = type;
+         range.NumDescriptors = 1;
+         range.BaseShaderRegister = resid;
+         range.RegisterSpace = spaceid;
+         range.OffsetInDescriptorsFromTableStart = descs.size() - 1;
+         range.Flags = D3D12_DESCRIPTOR_RANGE_FLAG_DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS;
+         ranges.push_back(range);
+      }
+
+      std::vector<D3D12_DESCRIPTOR_RANGE1> ranges;
+      std::vector<ComPtr<ID3D12Resource>> descs;
+   };
+
+   ComPtr<ID3D12RootSignature>
+   create_root_signature(const Resources &resources);
+
+   ComPtr<ID3D12PipelineState>
+   create_pipeline_state(ComPtr<ID3D12RootSignature> &root_sig,
+                         const struct clc_dxil_object &dxil);
+
+   ComPtr<ID3D12Resource>
+   create_buffer(int size, D3D12_HEAP_TYPE heap_type);
+
+   ComPtr<ID3D12Resource>
+   create_upload_buffer_with_data(const void *data, size_t size);
+
+   ComPtr<ID3D12Resource>
+   create_sized_buffer_with_data(size_t buffer_size, const void *data,
+                                 size_t data_size);
+
+   ComPtr<ID3D12Resource>
+   create_buffer_with_data(const void *data, size_t size)
+   {
+      return create_sized_buffer_with_data(size, data, size);
+   }
+
+   void
+   get_buffer_data(ComPtr<ID3D12Resource> res,
+                   void *buf, size_t size);
+
+   void
+   resource_barrier(ComPtr<ID3D12Resource> &res,
+                    D3D12_RESOURCE_STATES state_before,
+                    D3D12_RESOURCE_STATES state_after);
+
+   void
+   execute_cmdlist();
+
+   void
+   create_uav_buffer(ComPtr<ID3D12Resource> res,
+                     size_t width, size_t byte_stride,
+                     D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle);
+
+   void create_cbv(ComPtr<ID3D12Resource> res, size_t size,
+                   D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle);
+
+   ComPtr<ID3D12Resource>
+   add_uav_resource(Resources &resources, unsigned spaceid, unsigned resid,
+                    const void *data = NULL, size_t num_elems = 0,
+                    size_t elem_size = 0);
+
+   ComPtr<ID3D12Resource>
+   add_cbv_resource(Resources &resources, unsigned spaceid, unsigned resid,
+                    const void *data, size_t size);
+
+   void
+   SetUp() override;
+
+   void
+   TearDown() override;
+
+   Shader
+   compile(const std::vector<const char *> &sources,
+           const std::vector<const char *> &compile_args = {},
+           bool create_library = false);
+
+   Shader
+   link(const std::vector<Shader> &sources,
+        bool create_library = false);
+
+   void
+   configure(Shader &shader,
+             const struct clc_runtime_kernel_conf *conf);
+
+   void
+   validate(Shader &shader);
+
+   /* How a ShaderArg's buffer is used by the kernel: uploaded before the
+    * dispatch, read back after it, or both. */
+   enum ShaderArgDirection {
+      SHADER_ARG_INPUT = 1,
+      SHADER_ARG_OUTPUT = 2,
+      SHADER_ARG_INOUT = SHADER_ARG_INPUT | SHADER_ARG_OUTPUT,
+   };
+
+   /* Type-erased view of one kernel argument buffer. */
+   class RawShaderArg {
+   public:
+      RawShaderArg(enum ShaderArgDirection dir) : dir(dir) { }
+      virtual size_t get_elem_size() const = 0;
+      virtual size_t get_num_elems() const = 0;
+      virtual const void *get_data() const = 0;
+      virtual void *get_data() = 0;
+      enum ShaderArgDirection get_direction() { return dir; }
+   private:
+      enum ShaderArgDirection dir;
+   };
+
+   /* Placeholder for an argument that carries no data. */
+   class NullShaderArg : public RawShaderArg {
+   public:
+      NullShaderArg() : RawShaderArg(SHADER_ARG_INPUT) { }
+      size_t get_elem_size() const override { return 0; }
+      size_t get_num_elems() const override { return 0; }
+      const void *get_data() const override { return NULL; }
+      void *get_data() override { return NULL; }
+   };
+
+   /* Typed kernel argument: behaves like a std::vector<T> and converts to
+    * T& for the common single-element case. */
+   template <typename T>
+   class ShaderArg : public std::vector<T>, public RawShaderArg
+   {
+   public:
+      ShaderArg(const T &v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>({ v }), RawShaderArg(dir) { }
+      ShaderArg(const std::vector<T> &v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>(v), RawShaderArg(dir) { }
+      ShaderArg(const std::initializer_list<T> v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>(v), RawShaderArg(dir) { }
+
+      ShaderArg<T>& operator =(const T &v)
+      {
+         this->clear();
+         this->push_back(v);
+         return *this;
+      }
+
+      operator T&() { return this->at(0); }
+      operator const T&() const { return this->at(0); }
+
+      ShaderArg<T>& operator =(const std::vector<T> &v)
+      {
+         /* Delegate to the base-class assignment; the original wrote
+          * '*this = v', which reselects this very operator and recurses
+          * forever. */
+         std::vector<T>::operator=(v);
+         return *this;
+      }
+
+      ShaderArg<T>& operator =(std::initializer_list<T> v)
+      {
+         /* Same recursion hazard as above: assign through the base. */
+         std::vector<T>::operator=(v);
+         return *this;
+      }
+
+      size_t get_elem_size() const override { return sizeof(T); }
+      size_t get_num_elems() const override { return this->size(); }
+      const void *get_data() const override { return this->data(); }
+      void *get_data() override { return this->data(); }
+   };
+
+   /* Dispatch dimensions plus extra compiler/runtime configuration. */
+   struct CompileArgs
+   {
+      unsigned x, y, z;
+      std::vector<const char *> compiler_command_line;
+      clc_work_properties_data work_props;
+   };
+
+private:
+   /* Recursion terminator for the variadic gather below. */
+   void gather_args(std::vector<RawShaderArg *> &args) { }
+
+   template <typename T, typename... Rest>
+   void gather_args(std::vector<RawShaderArg *> &args, T &arg, Rest&... rest)
+   {
+      args.push_back(&arg);
+      gather_args(args, rest...);
+   }
+
+   void run_shader_with_raw_args(Shader shader,
+                                 const CompileArgs &compile_args,
+                                 const std::vector<RawShaderArg *> &args);
+
+protected:
+   template <typename... Args>
+   void run_shader(Shader shader,
+                   const CompileArgs &compile_args,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      run_shader_with_raw_args(shader, compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const std::vector<const char *> &sources,
+                   unsigned x, unsigned y, unsigned z,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      CompileArgs compile_args = { x, y, z };
+      run_shader_with_raw_args(compile(sources), compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const std::vector<const char *> &sources,
+                   const CompileArgs &compile_args,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      run_shader_with_raw_args(
+         compile(sources, compile_args.compiler_command_line),
+         compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const char *source,
+                   unsigned x, unsigned y, unsigned z,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      CompileArgs compile_args = { x, y, z };
+      run_shader_with_raw_args(compile({ source }), compile_args, raw_args);
+   }
+
+   /* Null-initialize all raw handles so TearDown() can distinguish
+    * never-created objects when SetUp() aborted part-way through. */
+   IDXGIFactory4 *factory = nullptr;
+   IDXGIAdapter1 *adapter = nullptr;
+   ID3D12Device *dev = nullptr;
+   ID3D12Fence *cmdqueue_fence = nullptr;
+   ID3D12CommandQueue *cmdqueue = nullptr;
+   ID3D12CommandAllocator *cmdalloc = nullptr;
+   ID3D12GraphicsCommandList *cmdlist = nullptr;
+   ID3D12DescriptorHeap *uav_heap = nullptr;
+
+   struct clc_context *compiler_ctx = nullptr;
+
+   UINT uav_heap_incr = 0;
+   int fence_value = 0;
+
+   HANDLE event = nullptr;
+   static PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE D3D12SerializeVersionedRootSignature;
+};
--- /dev/null
+# Copyright © Microsoft Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# Locate clang's bundled resource headers.  The CMake-reported include dir
+# points at <prefix>/include; the builtin headers live in the sibling
+# lib/clang/<version>/include directory.
+clang_resource_dir = join_paths(
+  dep_clang.get_variable(cmake: 'CLANG_INCLUDE_DIRS'), '..',
+  'lib', 'clang', dep_clang.version(), 'include'
+)
+
+# Embed clang's OpenCL C declaration headers as C string arrays (via the
+# xxd-style script) so the compiler can feed them to clang at runtime
+# without depending on an on-disk clang installation.
+opencl_c_h = custom_target(
+  'opencl-c.h',
+  input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c.h')],
+  output : 'opencl-c.h.h',
+  command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_source'],
+)
+opencl_c_base_h = custom_target(
+  'opencl-c-base.h',
+  input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c-base.h')],
+  output : 'opencl-c-base.h.h',
+  command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_base_source'],
+)
+
+# The CLC -> DXIL compiler frontend, built as a DLL with its exports listed
+# in the .def file.  'version' is the Win32 version.lib import library.
+libclc_compiler = shared_library(
+  'clglon12compiler',
+  'clc_compiler.c',
+  'clc_nir.c',
+  'clc_helpers.cpp',
+  opencl_c_h,
+  opencl_c_base_h,
+  vs_module_defs : 'clglon12compiler.def',
+  include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_compiler, inc_gallium, inc_spirv],
+  dependencies: [idep_nir_headers, dep_clang, dep_llvm, cc.find_library('version'),
+                 dep_llvmspirvlib, idep_mesautil, idep_libdxil_compiler, idep_nir, dep_spirv_tools]
+)
+
+# gtest-based end-to-end tests that compile and execute kernels.
+clc_compiler_test = executable('clc_compiler_test',
+  ['clc_compiler_test.cpp', 'compute_test.cpp'],
+  link_with : [libclc_compiler],
+  dependencies : [idep_gtest, idep_mesautil],
+  include_directories : [inc_include, inc_src])
+
+# Generous timeout: the tests invoke clang/LLVM and the D3D12 runtime.
+test('clc_compiler_test', clc_compiler_test, timeout: 120)
--- /dev/null
+
+///////////////////////////////////////////////////////////////////////////////
+// //
+// dxcapi.h //
+// Copyright (C) Microsoft Corporation. All rights reserved. //
+// This file is distributed under the University of Illinois Open Source //
+// License. See LICENSE.TXT for details. //
+// //
+// Provides declarations for the DirectX Compiler API entry point. //
+// //
+///////////////////////////////////////////////////////////////////////////////
+
+/* NOTE(review): this header is vendored from the DirectX Shader Compiler
+ * project; keep local modifications minimal so future upstream syncs stay
+ * diff-clean. */
+#ifndef __DXC_API__
+#define __DXC_API__
+
+#ifdef _WIN32
+#ifndef DXC_API_IMPORT
+#define DXC_API_IMPORT __declspec(dllimport)
+#endif
+#else
+#ifndef DXC_API_IMPORT
+#define DXC_API_IMPORT __attribute__ ((visibility ("default")))
+#endif
+#endif
+
+#ifdef _WIN32
+#define DECLARE_CROSS_PLATFORM_UUIDOF(T)
+#define DEFINE_CROSS_PLATFORM_UUIDOF(T)
+#else
+#include <dlfcn.h>
+#include "dxc/Support/WinAdapter.h"
+#endif
+
+struct IMalloc;
+
+struct IDxcIncludeHandler;
+
+/* Signatures of the dxcompiler entry points; resolved via GetProcAddress
+ * by callers that load the DLL dynamically. */
+typedef HRESULT (__stdcall *DxcCreateInstanceProc)(
+  _In_ REFCLSID rclsid,
+  _In_ REFIID riid,
+  _Out_ LPVOID* ppv
+);
+
+typedef HRESULT(__stdcall *DxcCreateInstance2Proc)(
+  _In_ IMalloc *pMalloc,
+  _In_ REFCLSID rclsid,
+  _In_ REFIID riid,
+  _Out_ LPVOID* ppv
+  );
+
+/// <summary>
+/// Creates a single uninitialized object of the class associated with a specified CLSID.
+/// </summary>
+/// <param name="rclsid">
+/// The CLSID associated with the data and code that will be used to create the object.
+/// </param>
+/// <param name="riid">
+/// A reference to the identifier of the interface to be used to communicate
+/// with the object.
+/// </param>
+/// <param name="ppv">
+/// Address of pointer variable that receives the interface pointer requested
+/// in riid. Upon successful return, *ppv contains the requested interface
+/// pointer. Upon failure, *ppv contains NULL.</param>
+/// <remarks>
+/// While this function is similar to CoCreateInstance, there is no COM involvement.
+/// </remarks>
+
+extern "C"
+DXC_API_IMPORT HRESULT __stdcall DxcCreateInstance(
+  _In_ REFCLSID rclsid,
+  _In_ REFIID riid,
+  _Out_ LPVOID* ppv
+  );
+
+extern "C"
+DXC_API_IMPORT HRESULT __stdcall DxcCreateInstance2(
+  _In_ IMalloc *pMalloc,
+  _In_ REFCLSID rclsid,
+  _In_ REFIID riid,
+  _Out_ LPVOID* ppv
+);
+
+// For convenience, equivalent definitions to CP_UTF8 and CP_UTF16.
+#define DXC_CP_UTF8 65001
+#define DXC_CP_UTF16 1200
+// Use DXC_CP_ACP for: Binary;  ANSI Text;  Autodetect UTF with BOM
+#define DXC_CP_ACP 0
+
+// This flag indicates that the shader hash was computed taking into account source information (-Zss)
+#define DXC_HASHFLAG_INCLUDES_SOURCE 1
+
+// Hash digest type for ShaderHash
+typedef struct DxcShaderHash {
+  UINT32 Flags; // DXC_HASHFLAG_*
+  BYTE HashDigest[16];
+} DxcShaderHash;
+
+#define DXC_FOURCC(ch0, ch1, ch2, ch3) ( \
+  (UINT32)(UINT8)(ch0) | (UINT32)(UINT8)(ch1) << 8 | \
+  (UINT32)(UINT8)(ch2) << 16 | (UINT32)(UINT8)(ch3) << 24 \
+  )
+#define DXC_PART_PDB DXC_FOURCC('I', 'L', 'D', 'B')
+#define DXC_PART_PDB_NAME DXC_FOURCC('I', 'L', 'D', 'N')
+#define DXC_PART_PRIVATE_DATA DXC_FOURCC('P', 'R', 'I', 'V')
+#define DXC_PART_ROOT_SIGNATURE DXC_FOURCC('R', 'T', 'S', '0')
+#define DXC_PART_DXIL DXC_FOURCC('D', 'X', 'I', 'L')
+#define DXC_PART_REFLECTION_DATA DXC_FOURCC('R', 'D', 'A', 'T')
+#define DXC_PART_SHADER_HASH DXC_FOURCC('H', 'A', 'S', 'H')
+#define DXC_PART_INPUT_SIGNATURE DXC_FOURCC('I', 'S', 'G', '1')
+#define DXC_PART_OUTPUT_SIGNATURE DXC_FOURCC('O', 'S', 'G', '1')
+#define DXC_PART_PATCH_CONSTANT_SIGNATURE DXC_FOURCC('P', 'S', 'G', '1')
+
+// Some option arguments are defined here for continuity with D3DCompile interface
+#define DXC_ARG_DEBUG L"-Zi"
+#define DXC_ARG_SKIP_VALIDATION L"-Vd"
+#define DXC_ARG_SKIP_OPTIMIZATIONS L"-Od"
+#define DXC_ARG_PACK_MATRIX_ROW_MAJOR L"-Zpr"
+#define DXC_ARG_PACK_MATRIX_COLUMN_MAJOR L"-Zpc"
+#define DXC_ARG_AVOID_FLOW_CONTROL L"-Gfa"
+#define DXC_ARG_PREFER_FLOW_CONTROL L"-Gfp"
+#define DXC_ARG_ENABLE_STRICTNESS L"-Ges"
+#define DXC_ARG_ENABLE_BACKWARDS_COMPATIBILITY L"-Gec"
+#define DXC_ARG_IEEE_STRICTNESS L"-Gis"
+#define DXC_ARG_OPTIMIZATION_LEVEL0 L"-O0"
+#define DXC_ARG_OPTIMIZATION_LEVEL1 L"-O1"
+#define DXC_ARG_OPTIMIZATION_LEVEL2 L"-O2"
+#define DXC_ARG_OPTIMIZATION_LEVEL3 L"-O3"
+#define DXC_ARG_WARNINGS_ARE_ERRORS L"-WX"
+#define DXC_ARG_RESOURCES_MAY_ALIAS L"-res_may_alias"
+#define DXC_ARG_ALL_RESOURCES_BOUND L"-all_resources_bound"
+#define DXC_ARG_DEBUG_NAME_FOR_SOURCE L"-Zss"
+#define DXC_ARG_DEBUG_NAME_FOR_BINARY L"-Zsb"
+
+// IDxcBlob is an alias of ID3D10Blob and ID3DBlob
+/* Vendored DXC blob/include-handler interfaces -- kept byte-identical to
+ * upstream apart from these review notes. */
+struct __declspec(uuid("8BA5FB08-5195-40e2-AC58-0D989C3A0102"))
+IDxcBlob : public IUnknown {
+public:
+  virtual LPVOID STDMETHODCALLTYPE GetBufferPointer(void) = 0;
+  virtual SIZE_T STDMETHODCALLTYPE GetBufferSize(void) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlob)
+};
+
+struct __declspec(uuid("7241d424-2646-4191-97c0-98e96e42fc68"))
+IDxcBlobEncoding : public IDxcBlob {
+public:
+  virtual HRESULT STDMETHODCALLTYPE GetEncoding(_Out_ BOOL *pKnown,
+                                                _Out_ UINT32 *pCodePage) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobEncoding)
+};
+
+// Notes on IDxcBlobUtf16 and IDxcBlobUtf8
+// These guarantee null-terminated text and the stated encoding.
+// GetBufferSize() will return the size in bytes, including null-terminator
+// GetStringLength() will return the length in characters, excluding the null-terminator
+// Name strings will use IDxcBlobUtf16, while other string output blobs,
+// such as errors/warnings, preprocessed HLSL, or other text will be based
+// on the -encoding option.
+
+// The API will use this interface for output name strings
+struct __declspec(uuid("A3F84EAB-0FAA-497E-A39C-EE6ED60B2D84"))
+IDxcBlobUtf16 : public IDxcBlobEncoding {
+public:
+  virtual LPCWSTR STDMETHODCALLTYPE GetStringPointer(void) = 0;
+  virtual SIZE_T STDMETHODCALLTYPE GetStringLength(void) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobUtf16)
+};
+struct __declspec(uuid("3DA636C9-BA71-4024-A301-30CBF125305B"))
+IDxcBlobUtf8 : public IDxcBlobEncoding {
+public:
+  virtual LPCSTR STDMETHODCALLTYPE GetStringPointer(void) = 0;
+  virtual SIZE_T STDMETHODCALLTYPE GetStringLength(void) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobUtf8)
+};
+
+struct __declspec(uuid("7f61fc7d-950d-467f-b3e3-3c02fb49187c"))
+IDxcIncludeHandler : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE LoadSource(
+    _In_z_ LPCWSTR pFilename,                             // Candidate filename.
+    _COM_Outptr_result_maybenull_ IDxcBlob **ppIncludeSource  // Resultant source object for included file, nullptr if not found.
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcIncludeHandler)
+};
+
+// Structure for supplying bytes or text input to Dxc APIs.
+// Use Encoding = 0 for non-text bytes, ANSI text, or unknown with BOM.
+typedef struct DxcBuffer {
+  LPCVOID Ptr;
+  SIZE_T Size;
+  UINT Encoding;
+} DxcText;
+
+struct DxcDefine {
+  LPCWSTR Name;
+  _Maybenull_ LPCWSTR Value;
+};
+
+/* Builder interface for an argv-style argument list accepted by
+ * IDxcCompiler*::Compile (vendored from upstream DXC). */
+struct __declspec(uuid("73EFFE2A-70DC-45F8-9690-EFF64C02429D"))
+IDxcCompilerArgs : public IUnknown {
+  // Pass GetArguments() and GetCount() to Compile
+  virtual LPCWSTR* STDMETHODCALLTYPE GetArguments() = 0;
+  virtual UINT32 STDMETHODCALLTYPE GetCount() = 0;
+
+  // Add additional arguments or defines here, if desired.
+  virtual HRESULT STDMETHODCALLTYPE AddArguments(
+    _In_opt_count_(argCount) LPCWSTR *pArguments,       // Array of pointers to arguments to add
+    _In_ UINT32 argCount                                // Number of arguments to add
+    ) = 0;
+  virtual HRESULT STDMETHODCALLTYPE AddArgumentsUTF8(
+    _In_opt_count_(argCount)LPCSTR *pArguments,         // Array of pointers to UTF-8 arguments to add
+    _In_ UINT32 argCount                                // Number of arguments to add
+    ) = 0;
+  virtual HRESULT STDMETHODCALLTYPE AddDefines(
+    _In_count_(defineCount) const DxcDefine *pDefines,  // Array of defines
+    _In_ UINT32 defineCount                             // Number of defines
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompilerArgs)
+};
+
+//////////////////////////
+// Legacy Interfaces
+/////////////////////////
+
+// NOTE: IDxcUtils replaces IDxcLibrary
+/* Legacy blob-factory interface; superseded by IDxcUtils but still needed
+ * by older dxcompiler binaries (vendored from upstream DXC). */
+struct __declspec(uuid("e5204dc7-d18c-4c3c-bdfb-851673980fe7"))
+IDxcLibrary : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE SetMalloc(_In_opt_ IMalloc *pMalloc) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromBlob(
+    _In_ IDxcBlob *pBlob, UINT32 offset, UINT32 length, _COM_Outptr_ IDxcBlob **ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromFile(
+    _In_z_ LPCWSTR pFileName, _In_opt_ UINT32* codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingFromPinned(
+    _In_bytecount_(size) LPCVOID pText, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingOnHeapCopy(
+    _In_bytecount_(size) LPCVOID pText, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingOnMalloc(
+    _In_bytecount_(size) LPCVOID pText, IMalloc *pIMalloc, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateIncludeHandler(
+    _COM_Outptr_ IDxcIncludeHandler **ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE CreateStreamFromBlobReadOnly(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IStream **ppStream) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf8(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf16(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcLibrary)
+};
+
+// NOTE: IDxcResult replaces IDxcOperationResult
+struct __declspec(uuid("CEDB484A-D4E9-445A-B991-CA21CA157DC2"))
+IDxcOperationResult : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetStatus(_Out_ HRESULT *pStatus) = 0;
+
+  // GetResult returns the main result of the operation.
+  // This corresponds to:
+  // DXC_OUT_OBJECT - Compile() with shader or library target
+  // DXC_OUT_DISASSEMBLY - Disassemble()
+  // DXC_OUT_HLSL - Compile() with -P
+  // DXC_OUT_ROOT_SIGNATURE - Compile() with rootsig_* target
+  virtual HRESULT STDMETHODCALLTYPE GetResult(_COM_Outptr_result_maybenull_ IDxcBlob **ppResult) = 0;
+
+  // GetErrorBuffer Corresponds to DXC_OUT_ERRORS.
+  virtual HRESULT STDMETHODCALLTYPE GetErrorBuffer(_COM_Outptr_result_maybenull_ IDxcBlobEncoding **ppErrors) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOperationResult)
+};
+
+// NOTE: IDxcCompiler3 replaces IDxcCompiler and IDxcCompiler2
+/* Legacy compiler/linker interfaces (vendored from upstream DXC; newer
+ * code should prefer IDxcCompiler3). */
+struct __declspec(uuid("8c210bf3-011f-4422-8d70-6f9acb8db617"))
+IDxcCompiler : public IUnknown {
+  // Compile a single entry point to the target shader model
+  virtual HRESULT STDMETHODCALLTYPE Compile(
+    _In_ IDxcBlob *pSource,                       // Source text to compile
+    _In_opt_z_ LPCWSTR pSourceName,               // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_z_ LPCWSTR pEntryPoint,               // entry point name
+    _In_z_ LPCWSTR pTargetProfile,                // shader profile to compile
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,                  // Array of defines
+    _In_ UINT32 defineCount,                      // Number of defines
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Compiler output status, buffer, and errors
+    ) = 0;
+
+  // Preprocess source text
+  virtual HRESULT STDMETHODCALLTYPE Preprocess(
+    _In_ IDxcBlob *pSource,                       // Source text to preprocess
+    _In_opt_z_ LPCWSTR pSourceName,               // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,                  // Array of defines
+    _In_ UINT32 defineCount,                      // Number of defines
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Preprocessor output status, buffer, and errors
+    ) = 0;
+
+  // Disassemble a program.
+  virtual HRESULT STDMETHODCALLTYPE Disassemble(
+    _In_ IDxcBlob *pSource,                         // Program to disassemble.
+    _COM_Outptr_ IDxcBlobEncoding **ppDisassembly   // Disassembly text.
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler)
+};
+
+// NOTE: IDxcCompiler3 replaces IDxcCompiler and IDxcCompiler2
+struct __declspec(uuid("A005A9D9-B8BB-4594-B5C9-0E633BEC4D37"))
+IDxcCompiler2 : public IDxcCompiler {
+  // Compile a single entry point to the target shader model with debug information.
+  virtual HRESULT STDMETHODCALLTYPE CompileWithDebug(
+    _In_ IDxcBlob *pSource,                       // Source text to compile
+    _In_opt_z_ LPCWSTR pSourceName,               // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_z_ LPCWSTR pEntryPoint,               // Entry point name
+    _In_z_ LPCWSTR pTargetProfile,                // Shader profile to compile
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,                  // Array of defines
+    _In_ UINT32 defineCount,                      // Number of defines
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _COM_Outptr_ IDxcOperationResult **ppResult,  // Compiler output status, buffer, and errors
+    _Outptr_opt_result_z_ LPWSTR *ppDebugBlobName,// Suggested file name for debug blob. (Must be HeapFree()'d!)
+    _COM_Outptr_opt_ IDxcBlob **ppDebugBlob       // Debug blob
+    ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler2)
+};
+
+struct __declspec(uuid("F1B5BE2A-62DD-4327-A1C2-42AC1E1E78E6"))
+IDxcLinker : public IUnknown {
+public:
+  // Register a library with name to ref it later.
+  virtual HRESULT RegisterLibrary(
+    _In_opt_ LPCWSTR pLibName,          // Name of the library.
+    _In_ IDxcBlob *pLib                 // Library blob.
+  ) = 0;
+
+  // Links the shader and produces a shader blob that the Direct3D runtime can
+  // use.
+  virtual HRESULT STDMETHODCALLTYPE Link(
+    _In_opt_ LPCWSTR pEntryName,        // Entry point name
+    _In_ LPCWSTR pTargetProfile,        // shader profile to link
+    _In_count_(libCount)
+      const LPCWSTR *pLibNames,         // Array of library names to link
+    _In_ UINT32 libCount,               // Number of libraries to link
+    _In_opt_count_(argCount) const LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,               // Number of arguments
+    _COM_Outptr_
+      IDxcOperationResult **ppResult    // Linker output status, buffer, and errors
+  ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcLinker)
+};
+
+/////////////////////////
+// Latest interfaces. Please use these
+////////////////////////
+
+// NOTE: IDxcUtils replaces IDxcLibrary
+// Utility object providing blob creation/conversion, file loading, include
+// handling, reflection and argument-building services for the compiler.
+struct __declspec(uuid("4605C4CB-2019-492A-ADA4-65F20BB7D67F"))
+IDxcUtils : public IUnknown {
+  // Create a sub-blob that holds a reference to the outer blob and points to its memory.
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromBlob(
+    _In_ IDxcBlob *pBlob, UINT32 offset, UINT32 length, _COM_Outptr_ IDxcBlob **ppResult) = 0;
+
+  // For codePage, use 0 (or DXC_CP_ACP) for raw binary or ANSI code page
+
+  // Creates a blob referencing existing memory, with no copy.
+  // User must manage the memory lifetime separately.
+  // (was: CreateBlobWithEncodingFromPinned)
+  virtual HRESULT STDMETHODCALLTYPE CreateBlobFromPinned(
+    _In_bytecount_(size) LPCVOID pData, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  // Create blob, taking ownership of memory allocated with supplied allocator.
+  // (was: CreateBlobWithEncodingOnMalloc)
+  virtual HRESULT STDMETHODCALLTYPE MoveToBlob(
+    _In_bytecount_(size) LPCVOID pData, IMalloc *pIMalloc, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  ////
+  // New blobs and copied contents are allocated with the current allocator
+
+  // Copy blob contents to memory owned by the new blob.
+  // (was: CreateBlobWithEncodingOnHeapCopy)
+  virtual HRESULT STDMETHODCALLTYPE CreateBlob(
+    _In_bytecount_(size) LPCVOID pData, UINT32 size, UINT32 codePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  // (was: CreateBlobFromFile)
+  virtual HRESULT STDMETHODCALLTYPE LoadFile(
+    _In_z_ LPCWSTR pFileName, _In_opt_ UINT32* pCodePage,
+    _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0;
+
+  // Create an IStream view over the given blob's contents.
+  virtual HRESULT STDMETHODCALLTYPE CreateReadOnlyStreamFromBlob(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IStream **ppStream) = 0;
+
+  // Create default file-based include handler
+  virtual HRESULT STDMETHODCALLTYPE CreateDefaultIncludeHandler(
+    _COM_Outptr_ IDxcIncludeHandler **ppResult) = 0;
+
+  // Convert or return matching encoded text blobs
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf8(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobUtf8 **pBlobEncoding) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf16(
+    _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobUtf16 **pBlobEncoding) = 0;
+
+  // Locate a part (by DXC_PART fourCC) inside a serialized DXIL container.
+  virtual HRESULT STDMETHODCALLTYPE GetDxilContainerPart(
+    _In_ const DxcBuffer *pShader,
+    _In_ UINT32 DxcPart,
+    _Outptr_result_nullonfailure_ void **ppPartData,
+    _Out_ UINT32 *pPartSizeInBytes) = 0;
+
+  // Create reflection interface from serialized Dxil container, or DXC_PART_REFLECTION_DATA.
+  // TBD: Require part header for RDAT? (leaning towards yes)
+  virtual HRESULT STDMETHODCALLTYPE CreateReflection(
+    _In_ const DxcBuffer *pData, REFIID iid, void **ppvReflection) = 0;
+
+  virtual HRESULT STDMETHODCALLTYPE BuildArguments(
+    _In_opt_z_ LPCWSTR pSourceName,  // Optional file name for pSource. Used in errors and include handlers.
+    _In_opt_z_ LPCWSTR pEntryPoint,        // Entry point name. (-E)
+    _In_z_ LPCWSTR pTargetProfile,         // Shader profile to compile. (-T)
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                  // Number of arguments
+    _In_count_(defineCount)
+      const DxcDefine *pDefines,           // Array of defines
+    _In_ UINT32 defineCount,               // Number of defines
+    _COM_Outptr_ IDxcCompilerArgs **ppArgs // Arguments you can use with Compile() method
+  ) = 0;
+
+  // Takes the shader PDB and returns the hash and the container inside it
+  virtual HRESULT STDMETHODCALLTYPE GetPDBContents(
+    _In_ IDxcBlob *pPDBBlob, _COM_Outptr_ IDxcBlob **ppHash, _COM_Outptr_ IDxcBlob **ppContainer) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcUtils)
+};
+
+// For use with IDxcResult::[Has|Get]Output dxcOutKind argument
+// Note: text outputs returned from version 2 APIs are UTF-8 or UTF-16 based on -encoding option
+typedef enum DXC_OUT_KIND {
+  DXC_OUT_NONE = 0,
+  DXC_OUT_OBJECT = 1,         // IDxcBlob - Shader or library object
+  DXC_OUT_ERRORS = 2,         // IDxcBlobUtf8 or IDxcBlobUtf16
+  DXC_OUT_PDB = 3,            // IDxcBlob
+  DXC_OUT_SHADER_HASH = 4,    // IDxcBlob - DxcShaderHash of shader or shader with source info (-Zsb/-Zss)
+  DXC_OUT_DISASSEMBLY = 5,    // IDxcBlobUtf8 or IDxcBlobUtf16 - from Disassemble
+  DXC_OUT_HLSL = 6,           // IDxcBlobUtf8 or IDxcBlobUtf16 - from Preprocessor or Rewriter
+  DXC_OUT_TEXT = 7,           // IDxcBlobUtf8 or IDxcBlobUtf16 - other text, such as -ast-dump or -Odump
+  DXC_OUT_REFLECTION = 8,     // IDxcBlob - RDAT part with reflection data
+  DXC_OUT_ROOT_SIGNATURE = 9, // IDxcBlob - Serialized root signature output
+
+  // Forces the enum's underlying type to be at least 32 bits wide.
+  DXC_OUT_FORCE_DWORD = 0xFFFFFFFF
+} DXC_OUT_KIND;
+
+// Extends IDxcOperationResult with enumerable, typed outputs keyed by
+// DXC_OUT_KIND (object, errors, PDB, reflection, ...).
+struct __declspec(uuid("58346CDA-DDE7-4497-9461-6F87AF5E0659"))
+IDxcResult : public IDxcOperationResult {
+  virtual BOOL STDMETHODCALLTYPE HasOutput(_In_ DXC_OUT_KIND dxcOutKind) = 0;
+  // Retrieve one output by kind; ppOutputName receives the associated
+  // output file name, if any.
+  virtual HRESULT STDMETHODCALLTYPE GetOutput(_In_ DXC_OUT_KIND dxcOutKind,
+    _In_ REFIID iid, _COM_Outptr_opt_result_maybenull_ void **ppvObject,
+    _COM_Outptr_ IDxcBlobUtf16 **ppOutputName) = 0;
+
+  virtual UINT32 GetNumOutputs() = 0;
+  virtual DXC_OUT_KIND GetOutputByIndex(UINT32 Index) = 0;
+  virtual DXC_OUT_KIND PrimaryOutput() = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcResult)
+};
+
+// Third-generation compiler interface: single Compile() entry point driven
+// entirely by command-line-style arguments, returning an IDxcResult.
+struct __declspec(uuid("228B4687-5A6A-4730-900C-9702B2203F54"))
+IDxcCompiler3 : public IUnknown {
+  // Compile a single entry point to the target shader model,
+  // Compile a library to a library target (-T lib_*),
+  // Compile a root signature (-T rootsig_*), or
+  // Preprocess HLSL source (-P)
+  virtual HRESULT STDMETHODCALLTYPE Compile(
+    _In_ const DxcBuffer *pSource,                // Source text to compile
+    _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments
+    _In_ UINT32 argCount,                         // Number of arguments
+    _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional)
+    _In_ REFIID riid, _Out_ LPVOID *ppResult      // IDxcResult: status, buffer, and errors
+  ) = 0;
+
+  // Disassemble a program.
+  virtual HRESULT STDMETHODCALLTYPE Disassemble(
+    _In_ const DxcBuffer *pObject,           // Program to disassemble: dxil container or bitcode.
+    _In_ REFIID riid, _Out_ LPVOID *ppResult // IDxcResult: status, disassembly text, and errors
+  ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler3)
+};
+
+// Flags for IDxcValidator::Validate (bitmask; ValidMask covers all bits).
+static const UINT32 DxcValidatorFlags_Default = 0;
+static const UINT32 DxcValidatorFlags_InPlaceEdit = 1; // Validator is allowed to update shader blob in-place.
+static const UINT32 DxcValidatorFlags_RootSignatureOnly = 2;
+static const UINT32 DxcValidatorFlags_ModuleOnly = 4;
+static const UINT32 DxcValidatorFlags_ValidMask = 0x7;
+
+struct __declspec(uuid("A6E82BD2-1FD7-4826-9811-2857E797F49A"))
+IDxcValidator : public IUnknown {
+  // Validate a shader.
+  virtual HRESULT STDMETHODCALLTYPE Validate(
+    _In_ IDxcBlob *pShader,                       // Shader to validate.
+    _In_ UINT32 Flags,                            // Validation flags.
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Validation output status, buffer, and errors
+  ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcValidator)
+};
+
+// Builds/edits a DXIL container: load an existing one, add/remove parts by
+// fourCC, then serialize the result.
+struct __declspec(uuid("334b1f50-2292-4b35-99a1-25588d8c17fe"))
+IDxcContainerBuilder : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pDxilContainerHeader) = 0;                // Loads DxilContainer to the builder
+  virtual HRESULT STDMETHODCALLTYPE AddPart(_In_ UINT32 fourCC, _In_ IDxcBlob *pSource) = 0;     // Part to add to the container
+  virtual HRESULT STDMETHODCALLTYPE RemovePart(_In_ UINT32 fourCC) = 0;                          // Remove the part with fourCC
+  virtual HRESULT STDMETHODCALLTYPE SerializeContainer(_Out_ IDxcOperationResult **ppResult) = 0; // Builds a container of the given container builder state
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcContainerBuilder)
+};
+
+struct __declspec(uuid("091f7a26-1c1f-4948-904b-e6e3a8a771d5"))
+IDxcAssembler : public IUnknown {
+  // Assemble dxil in ll or llvm bitcode to DXIL container.
+  virtual HRESULT STDMETHODCALLTYPE AssembleToContainer(
+    _In_ IDxcBlob *pShader,                      // Shader to assemble.
+    _COM_Outptr_ IDxcOperationResult **ppResult  // Assembly output status, buffer, and errors
+  ) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcAssembler)
+};
+
+// Read-only enumeration of the parts of a loaded DXIL container, with
+// optional per-part reflection interfaces.
+struct __declspec(uuid("d2c21b26-8350-4bdc-976a-331ce6f4c54c"))
+IDxcContainerReflection : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pContainer) = 0; // Container to load.
+  virtual HRESULT STDMETHODCALLTYPE GetPartCount(_Out_ UINT32 *pResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetPartKind(UINT32 idx, _Out_ UINT32 *pResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetPartContent(UINT32 idx, _COM_Outptr_ IDxcBlob **ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE FindFirstPartKind(UINT32 kind, _Out_ UINT32 *pResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetPartReflection(UINT32 idx, REFIID iid, void **ppvObject) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcContainerReflection)
+};
+
+// Describes a single optimizer pass: its option name, description and
+// per-argument metadata. Enumerated via IDxcOptimizer.
+struct __declspec(uuid("AE2CD79F-CC22-453F-9B6B-B124E7A5204C"))
+IDxcOptimizerPass : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetOptionName(_COM_Outptr_ LPWSTR *ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetDescription(_COM_Outptr_ LPWSTR *ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetOptionArgCount(_Out_ UINT32 *pCount) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetOptionArgName(UINT32 argIndex, _COM_Outptr_ LPWSTR *ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetOptionArgDescription(UINT32 argIndex, _COM_Outptr_ LPWSTR *ppResult) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOptimizerPass)
+};
+
+// Enumerates available optimizer passes and runs a selected set of them on
+// a module blob.
+struct __declspec(uuid("25740E2E-9CBA-401B-9119-4FB42F39F270"))
+IDxcOptimizer : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetAvailablePassCount(_Out_ UINT32 *pCount) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetAvailablePass(UINT32 index, _COM_Outptr_ IDxcOptimizerPass** ppResult) = 0;
+  virtual HRESULT STDMETHODCALLTYPE RunOptimizer(IDxcBlob *pBlob,
+    _In_count_(optionCount) LPCWSTR *ppOptions, UINT32 optionCount,
+    _COM_Outptr_ IDxcBlob **pOutputModule,
+    _COM_Outptr_opt_ IDxcBlobEncoding **ppOutputText) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOptimizer)
+};
+
+// Flags returned by IDxcVersionInfo::GetFlags (bitmask).
+static const UINT32 DxcVersionInfoFlags_None = 0;
+static const UINT32 DxcVersionInfoFlags_Debug = 1; // Matches VS_FF_DEBUG
+static const UINT32 DxcVersionInfoFlags_Internal = 2; // Internal Validator (non-signing)
+
+struct __declspec(uuid("b04f5b50-2059-4f12-a8ff-a1e0cde1cc7e"))
+IDxcVersionInfo : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE GetVersion(_Out_ UINT32 *pMajor, _Out_ UINT32 *pMinor) = 0;
+  virtual HRESULT STDMETHODCALLTYPE GetFlags(_Out_ UINT32 *pFlags) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcVersionInfo)
+};
+
+// Adds source-control commit information to IDxcVersionInfo.
+struct __declspec(uuid("fb6904c4-42f0-4b62-9c46-983af7da7c83"))
+IDxcVersionInfo2 : public IDxcVersionInfo {
+  virtual HRESULT STDMETHODCALLTYPE GetCommitInfo(_Out_ UINT32 *pCommitCount, _Out_ char **pCommitHash) = 0;
+
+  DECLARE_CROSS_PLATFORM_UUIDOF(IDxcVersionInfo2)
+};
+
+// Note: __declspec(selectany) requires 'extern'
+// On Linux __declspec(selectany) is removed and using 'extern' results in link error.
+#ifdef _MSC_VER
+#define CLSID_SCOPE __declspec(selectany) extern
+#else
+#define CLSID_SCOPE
+#endif
+
+// Class IDs for the creatable DXC objects (pass to DxcCreateInstance).
+
+// {73E22D93-E6CE-47F3-B5BF-F0664F39C1B0}
+CLSID_SCOPE const CLSID CLSID_DxcCompiler = {
+    0x73e22d93,
+    0xe6ce,
+    0x47f3,
+    {0xb5, 0xbf, 0xf0, 0x66, 0x4f, 0x39, 0xc1, 0xb0}};
+
+// {EF6A8087-B0EA-4D56-9E45-D07E1A8B7806}
+CLSID_SCOPE const GUID CLSID_DxcLinker = {
+    0xef6a8087,
+    0xb0ea,
+    0x4d56,
+    {0x9e, 0x45, 0xd0, 0x7e, 0x1a, 0x8b, 0x78, 0x6}};
+
+// {CD1F6B73-2AB0-484D-8EDC-EBE7A43CA09F}
+CLSID_SCOPE const CLSID CLSID_DxcDiaDataSource = {
+    0xcd1f6b73,
+    0x2ab0,
+    0x484d,
+    {0x8e, 0xdc, 0xeb, 0xe7, 0xa4, 0x3c, 0xa0, 0x9f}};
+
+// {3E56AE82-224D-470F-A1A1-FE3016EE9F9D}
+CLSID_SCOPE const CLSID CLSID_DxcCompilerArgs = {
+    0x3e56ae82,
+    0x224d,
+    0x470f,
+    {0xa1, 0xa1, 0xfe, 0x30, 0x16, 0xee, 0x9f, 0x9d}};
+
+// {6245D6AF-66E0-48FD-80B4-4D271796748C}
+CLSID_SCOPE const GUID CLSID_DxcLibrary = {
+    0x6245d6af,
+    0x66e0,
+    0x48fd,
+    {0x80, 0xb4, 0x4d, 0x27, 0x17, 0x96, 0x74, 0x8c}};
+
+// IDxcUtils is created with the same CLSID as the IDxcLibrary it replaces.
+CLSID_SCOPE const GUID CLSID_DxcUtils = CLSID_DxcLibrary;
+
+// {8CA3E215-F728-4CF3-8CDD-88AF917587A1}
+CLSID_SCOPE const GUID CLSID_DxcValidator = {
+    0x8ca3e215,
+    0xf728,
+    0x4cf3,
+    {0x8c, 0xdd, 0x88, 0xaf, 0x91, 0x75, 0x87, 0xa1}};
+
+// {D728DB68-F903-4F80-94CD-DCCF76EC7151}
+CLSID_SCOPE const GUID CLSID_DxcAssembler = {
+    0xd728db68,
+    0xf903,
+    0x4f80,
+    {0x94, 0xcd, 0xdc, 0xcf, 0x76, 0xec, 0x71, 0x51}};
+
+// {b9f54489-55b8-400c-ba3a-1675e4728b91}
+CLSID_SCOPE const GUID CLSID_DxcContainerReflection = {
+    0xb9f54489,
+    0x55b8,
+    0x400c,
+    {0xba, 0x3a, 0x16, 0x75, 0xe4, 0x72, 0x8b, 0x91}};
+
+// {AE2CD79F-CC22-453F-9B6B-B124E7A5204C}
+CLSID_SCOPE const GUID CLSID_DxcOptimizer = {
+    0xae2cd79f,
+    0xcc22,
+    0x453f,
+    {0x9b, 0x6b, 0xb1, 0x24, 0xe7, 0xa5, 0x20, 0x4c}};
+
+// {94134294-411f-4574-b4d0-8741e25240d2}
+CLSID_SCOPE const GUID CLSID_DxcContainerBuilder = {
+    0x94134294,
+    0x411f,
+    0x4574,
+    {0xb4, 0xd0, 0x87, 0x41, 0xe2, 0x52, 0x40, 0xd2}};
+#endif
#include "util/u_math.h"
static void
+cl_type_size_align(const struct glsl_type *type, unsigned *size,
+                   unsigned *align)
+{
+   /* glsl_type size/alignment callback using OpenCL packing rules; passed
+    * to nir_build_deref_offset() to compute byte offsets. */
+   *size = glsl_get_cl_size(type);
+   *align = glsl_get_cl_alignment(type);
+}
+
+static void
extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32,
unsigned dst_bit_size,
nir_ssa_def **dst_comps,
}
static nir_ssa_def *
+load_comps_to_vec32(nir_builder *b, unsigned src_bit_size,
+                    nir_ssa_def **src_comps, unsigned num_src_comps)
+{
+   /* Pack an array of 8/16/32/64-bit components into a vector of 32-bit
+    * values: 64-bit comps are split in two, 32-bit comps pass through, and
+    * 8/16-bit comps are shifted+OR-ed together into each 32-bit slot.
+    * NOTE(review): vec32comps[] has room for 4 components — callers are
+    * expected to pass at most 128 bits per call.
+    */
+   unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32);
+   unsigned step = DIV_ROUND_UP(src_bit_size, 32);
+   unsigned comps_per32b = 32 / src_bit_size;
+   nir_ssa_def *vec32comps[4];
+
+   for (unsigned i = 0; i < num_vec32comps; i += step) {
+      switch (src_bit_size) {
+      case 64:
+         vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]);
+         vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]);
+         break;
+      case 32:
+         vec32comps[i] = src_comps[i];
+         break;
+      case 16:
+      case 8: {
+         /* Braced block: a declaration may not directly follow a case
+          * label in C, and it scopes src_offs/tmp to this case. */
+         unsigned src_offs = i * comps_per32b;
+
+         vec32comps[i] = nir_u2u32(b, src_comps[src_offs]);
+         for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) {
+            nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]),
+                                        nir_imm_int(b, j * src_bit_size));
+            vec32comps[i] = nir_ior(b, vec32comps[i], tmp);
+         }
+         break;
+      }
+      }
+   }
+
+   return nir_vec(b, vec32comps, num_vec32comps);
+}
+
+static nir_ssa_def *
+build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx)
+{
+   /* Emit one scalar 32-bit load_ptr_dxil intrinsic reading element `idx`
+    * behind `deref`, and return the resulting SSA def. */
+   nir_intrinsic_instr *ptr_load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ptr_dxil);
+
+   ptr_load->num_components = 1;
+   ptr_load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+   ptr_load->src[1] = nir_src_for_ssa(idx);
+   nir_ssa_dest_init(&ptr_load->instr, &ptr_load->dest, 1, 32, NULL);
+   nir_builder_instr_insert(b, &ptr_load->instr);
+
+   return &ptr_load->dest.ssa;
+}
+
+static bool
+lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   /* Lower a load_deref of a shader_temp variable into scalar 32-bit
+    * load_ptr_dxil intrinsics, then re-pack the 32-bit words into the
+    * original bit size / component count. Returns false (no change) for
+    * derefs of any other variable mode. */
+   assert(intr->dest.is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+   if (!nir_deref_mode_is(deref, nir_var_shader_temp))
+      return false;
+   /* Byte offset of the deref from the variable base (CL layout rules). */
+   nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align));
+   /* Round the byte offset down to 32-bit alignment. */
+   nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3)));
+
+   assert(intr->dest.is_ssa);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned load_size = MAX2(32, bit_size);
+   unsigned num_bits = num_components * bit_size;
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   /* path.path[0] is the root deref (the variable itself); the per-element
+    * addressing is folded into the computed byte offset above. */
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+   nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */));
+
+   /* Split loads into 32-bit chunks */
+   for (unsigned i = 0; i < num_bits; i += load_size) {
+      unsigned subload_num_bits = MIN2(num_bits - i, load_size);
+      nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32));
+      nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx);
+
+      /* 64-bit destinations need two consecutive 32-bit words. */
+      if (load_size == 64) {
+         idx = nir_iadd(b, idx, nir_imm_int(b, 1));
+         vec32 = nir_vec2(b, vec32,
+                             build_load_ptr_dxil(b, path.path[0], idx));
+      }
+
+      /* If we have 2 bytes or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (subload_num_bits <= 16) {
+         nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)),
+                                          nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      /* And now comes the pack/unpack step to match the original type. */
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
+                               subload_num_bits / bit_size);
+      comp_idx += subload_num_bits / bit_size;
+   }
+
+   nir_deref_path_finish(&path);
+   assert(comp_idx == num_components);
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static nir_ssa_def *
ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
nir_ssa_def *offset, unsigned num_bytes)
{
assert(comp_idx == num_components);
return nir_vec(b, comps, num_components);
}
+
+static bool
+lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   /* Lower an arbitrary-width load_ssbo into 32-bit-aligned, up-to-vec4
+    * 32-bit load_ssbo intrinsics, then unpack back to the original bit
+    * size / component count. Always returns true (instruction replaced). */
+   assert(intr->dest.is_ssa);
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_ssa_def *buffer = intr->src[0].ssa;
+   /* Round the byte offset down to 32-bit alignment. */
+   nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3UL));
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned num_bits = num_components * bit_size;
+
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   /* We need to split loads in 16byte chunks because that's the optimal
+    * granularity of bufferLoad(). Minimum alignment is 4byte, which saves
+    * from us from extra complexity to extract >= 32 bit components.
+    */
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
+       * load.
+       */
+      unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_load_ssbo);
+
+      /* The number of components to store depends on the number of bytes. */
+      load->num_components = DIV_ROUND_UP(subload_num_bits, 32);
+      load->src[0] = nir_src_for_ssa(buffer);
+      load->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i / 8)));
+      nir_ssa_dest_init(&load->instr, &load->dest, load->num_components,
+                        32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+
+      nir_ssa_def *vec32 = &load->dest.ssa;
+
+      /* If we have 2 bytes or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (subload_num_bits <= 16) {
+         /* Shift right by 8 * (original offset modulo 4) bits. */
+         nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)),
+                                          nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      /* Alignment metadata: 4 bytes, guaranteed by the offset masking above. */
+      nir_intrinsic_set_align(load, 4, 0);
+
+      /* And now comes the pack/unpack step to match the original type. */
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
+                               subload_num_bits / bit_size);
+      comp_idx += subload_num_bits / bit_size;
+   }
+
+   assert(comp_idx == num_components);
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   /* Lower an arbitrary-width store_ssbo into 32-bit-aligned stores.
+    * Sub-32-bit chunks use the masked DXIL store intrinsic so neighboring
+    * bytes in the same 32-bit slot are preserved. Always returns true. */
+   b->cursor = nir_before_instr(&intr->instr);
+
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+   assert(intr->src[2].is_ssa);
+
+   nir_ssa_def *val = intr->src[0].ssa;
+   nir_ssa_def *buffer = intr->src[1].ssa;
+   /* Round the byte offset down to 32-bit alignment. */
+   nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3UL));
+
+   unsigned bit_size = val->bit_size;
+   unsigned num_components = val->num_components;
+   unsigned num_bits = num_components * bit_size;
+
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   for (unsigned i = 0; i < num_components; i++)
+      comps[i] = nir_channel(b, val, i);
+
+   /* We split stores in 16byte chunks because that's the optimal granularity
+    * of bufferStore(). Minimum alignment is 4byte, which saves from us from
+    * extra complexity to store >= 32 bit components.
+    */
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
+       * store.
+       */
+      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
+      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
+                                               substore_num_bits / bit_size);
+      nir_intrinsic_instr *store;
+
+      if (substore_num_bits < 32) {
+         nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1);
+
+         /* If we have 16 bits or less to store we need to place them
+          * correctly in the u32 component. Anything greater than 16 bits
+          * (including uchar3) is naturally aligned on 32bits.
+          */
+         if (substore_num_bits <= 16) {
+            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3));
+            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
+
+            vec32 = nir_ishl(b, vec32, shift);
+            mask = nir_ishl(b, mask, shift);
+         }
+
+         /* Masked store: src[1] carries the write-protect mask (inverted). */
+         store = nir_intrinsic_instr_create(b->shader,
+                                            nir_intrinsic_store_ssbo_masked_dxil);
+         store->src[0] = nir_src_for_ssa(vec32);
+         store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
+         store->src[2] = nir_src_for_ssa(buffer);
+         store->src[3] = nir_src_for_ssa(local_offset);
+      } else {
+         store = nir_intrinsic_instr_create(b->shader,
+                                            nir_intrinsic_store_ssbo);
+         store->src[0] = nir_src_for_ssa(vec32);
+         store->src[1] = nir_src_for_ssa(buffer);
+         store->src[2] = nir_src_for_ssa(local_offset);
+
+         nir_intrinsic_set_align(store, 4, 0);
+      }
+
+      /* The number of components to store depends on the number of bits. */
+      store->num_components = DIV_ROUND_UP(substore_num_bits, 32);
+      nir_builder_instr_insert(b, &store->instr);
+      comp_idx += substore_num_bits / bit_size;
+   }
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Emit `num_comps` consecutive scalar 32-bit loads (shared- or
+ * scratch-memory DXIL intrinsic, selected by `op`) starting at 32-bit
+ * element `index`, storing the resulting SSA defs in `comps`.
+ * Made static: it is a file-local helper like its siblings and must not
+ * leak into the global namespace. */
+static void
+lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op)
+{
+   for (unsigned i = 0; i < num_comps; i++) {
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader, op);
+
+      load->num_components = 1;
+      load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
+      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+      comps[i] = &load->dest.ssa;
+   }
+}
+
+static bool
+lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   /* Lower load_shared/load_scratch to scalar 32-bit DXIL loads on an i32
+    * array, then re-pack to the original bit size / component count.
+    * Always returns true (instruction replaced). */
+   assert(intr->dest.is_ssa);
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned num_bits = num_components * bit_size;
+
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_intrinsic_op op = intr->intrinsic;
+
+   assert(intr->src[0].is_ssa);
+   nir_ssa_def *offset = intr->src[0].ssa;
+   if (op == nir_intrinsic_load_shared) {
+      /* Shared vars carry a base byte offset in the intrinsic. */
+      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
+      op = nir_intrinsic_load_shared_dxil;
+   } else {
+      offset = nir_u2u32(b, offset);
+      op = nir_intrinsic_load_scratch_dxil;
+   }
+   /* Byte offset -> index into the i32 array. */
+   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];
+
+   /* We need to split loads in 32-bit accesses because the buffer
+    * is an i32 array and DXIL does not support type casts.
+    */
+   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
+   lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op);
+   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);
+
+   /* Repack at most 4 x 32-bit words per pass into destination comps. */
+   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
+      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
+      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
+      nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);
+
+      /* If we have 16 bits or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (num_bits <= 16) {
+         nir_ssa_def *shift =
+            nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)),
+                        nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      /* And now comes the pack/unpack step to match the original type. */
+      unsigned dest_index = i * 32 / bit_size;
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps);
+   }
+
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+
+   return true;
+}
+
+static void
+lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op)
+{
+   /* Scalarize: emit one 32-bit store (shared or scratch DXIL variant) per
+    * component of vec32, at consecutive 32-bit element indices. */
+   for (unsigned c = 0; c < vec32->num_components; c++) {
+      nir_intrinsic_instr *scalar_store =
+         nir_intrinsic_instr_create(b->shader, op);
+
+      scalar_store->num_components = 1;
+      scalar_store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, c));
+      scalar_store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, c)));
+      nir_builder_instr_insert(b, &scalar_store->instr);
+   }
+}
+
+static void
+lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index,
+                         nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op)
+{
+   /* Store `num_bits` (< 32) bits of vec32 at 32-bit element `index`
+    * without clobbering the other bits of that slot. Shared memory uses a
+    * dedicated atomic masked intrinsic; scratch uses a plain
+    * read-modify-write since it is private to the invocation. */
+   nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1);
+
+   /* If we have 16 bits or less to store we need to place them correctly in
+    * the u32 component. Anything greater than 16 bits (including uchar3) is
+    * naturally aligned on 32bits.
+    */
+   if (num_bits <= 16) {
+      nir_ssa_def *shift =
+         nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8);
+
+      vec32 = nir_ishl(b, vec32, shift);
+      mask = nir_ishl(b, mask, shift);
+   }
+
+   if (op == nir_intrinsic_store_shared_dxil) {
+      /* Use the dedicated masked intrinsic */
+      nir_intrinsic_instr *store =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_store_shared_masked_dxil);
+      store->src[0] = nir_src_for_ssa(vec32);
+      store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
+      store->src[2] = nir_src_for_ssa(index);
+      store->num_components = 1;
+      nir_builder_instr_insert(b, &store->instr);
+   } else {
+      /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
+      nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_scratch_dxil);
+      load->src[0] = nir_src_for_ssa(index);
+      load->num_components = 1;
+      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+
+      /* new = (vec32 & mask is implicit) | (old & ~mask) */
+      nir_ssa_def *new_val = nir_ior(b, vec32,
+                                     nir_iand(b,
+                                              nir_inot(b, mask),
+                                              &load->dest.ssa));
+
+      lower_store_vec32(b, index, new_val, op);
+   }
+}
+
+static bool
+lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   /* Lower store_shared/store_scratch to 32-bit DXIL stores on an i32
+    * array. Sub-32-bit values go through the masked path to preserve the
+    * rest of the slot. Always returns true (instruction replaced). */
+   assert(intr->src[0].is_ssa);
+   unsigned num_components = nir_src_num_components(intr->src[0]);
+   unsigned bit_size = nir_src_bit_size(intr->src[0]);
+   unsigned num_bits = num_components * bit_size;
+
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_intrinsic_op op = intr->intrinsic;
+
+   nir_ssa_def *offset = intr->src[1].ssa;
+   if (op == nir_intrinsic_store_shared) {
+      /* Shared vars carry a base byte offset in the intrinsic. */
+      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
+      op = nir_intrinsic_store_shared_dxil;
+   } else {
+      offset = nir_u2u32(b, offset);
+      op = nir_intrinsic_store_scratch_dxil;
+   }
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+
+   unsigned comp_idx = 0;
+   for (unsigned i = 0; i < num_components; i++)
+      comps[i] = nir_channel(b, intr->src[0].ssa, i);
+
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store.
+       */
+      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
+      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
+                                               substore_num_bits / bit_size);
+      nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2));
+
+      /* For anything less than 32bits we need to use the masked version of the
+       * intrinsic to preserve data living in the same 32bit slot.
+       */
+      /* NOTE(review): this tests the TOTAL num_bits, not substore_num_bits,
+       * so a trailing sub-32-bit chunk of a larger store (e.g. 17 x 8-bit)
+       * would take the unmasked path — confirm whether that case can reach
+       * here or whether substore_num_bits was intended. */
+      if (num_bits < 32) {
+         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op);
+      } else {
+         lower_store_vec32(b, index, vec32, op);
+      }
+
+      comp_idx += substore_num_bits / bit_size;
+   }
+
+   nir_instr_remove(&intr->instr);
+
+   return true;
+}
+
+static void
+ubo_to_temp_patch_deref_mode(nir_deref_instr *deref)
+{
+   /* Rewrite this deref's mode to shader_temp, then recurse into every
+    * child deref chained off its result so the whole chain agrees. */
+   deref->modes = nir_var_shader_temp;
+   nir_foreach_use(use, &deref->dest.ssa) {
+      nir_instr *user = use->parent_instr;
+
+      if (user->type == nir_instr_type_deref)
+         ubo_to_temp_patch_deref_mode(nir_instr_as_deref(user));
+   }
+}
+
+static void
+ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he)
+{
+   /* Walk all users of a constant-mode deref chain. The hash entry's data
+    * stays non-NULL only if every user is either another deref or a plain
+    * load_deref — i.e. the variable is read-only and safe to convert to
+    * shader_temp. Any other user disqualifies it (data set to NULL). */
+   assert(nir_deref_mode_is(deref, nir_var_mem_constant));
+   assert(deref->dest.is_ssa);
+   assert(he->data);
+
+   nir_foreach_use(use_src, &deref->dest.ssa) {
+      if (use_src->parent_instr->type == nir_instr_type_deref) {
+         ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he);
+      } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr);
+         if (intr->intrinsic != nir_intrinsic_load_deref)
+            he->data = NULL;
+      } else {
+         he->data = NULL;
+      }
+
+      /* Disqualified — no need to look at the remaining users. */
+      if (!he->data)
+         break;
+   }
+}
+
+bool
+dxil_nir_lower_ubo_to_temp(nir_shader *nir)
+{
+   /* Convert read-only constant-buffer (nir_var_mem_constant) variables
+    * into shader_temp variables so they can be lowered to DXIL globals.
+    * Two passes: (1) find candidates whose every use is a load, flip their
+    * mode; (2) patch the deref chains that still carry the old mode.
+    * Returns true if any variable was converted. */
+   struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL);
+   bool progress = false;
+
+   /* First pass: collect all UBO accesses that could be turned into
+    * shader temp accesses.
+    */
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+            if (!nir_deref_mode_is(deref, nir_var_mem_constant) ||
+                deref->deref_type != nir_deref_type_var)
+               continue;
+
+            /* Entry data is the variable itself while it is still a
+             * candidate, NULL once disqualified. */
+            struct hash_entry *he =
+               _mesa_hash_table_search(ubo_to_temp, deref->var);
+
+            if (!he)
+               he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var);
+
+            if (!he->data)
+               continue;
+
+            ubo_to_temp_update_entry(deref, he);
+         }
+      }
+   }
+
+   hash_table_foreach(ubo_to_temp, he) {
+      nir_variable *var = he->data;
+
+      if (!var)
+         continue;
+
+      /* Change the variable mode. */
+      var->data.mode = nir_var_shader_temp;
+
+      /* Make sure the variable has a name.
+       * DXIL variables must have names.
+       */
+      if (!var->name)
+         var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables));
+
+      progress = true;
+   }
+   _mesa_hash_table_destroy(ubo_to_temp, NULL);
+
+   /* Second pass: patch all derefs that were accessing the converted UBOs
+    * variables.
+    */
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+            /* Root derefs whose variable was flipped above still have the
+             * constant mode; fix them and their children. */
+            if (nir_deref_mode_is(deref, nir_var_mem_constant) &&
+                deref->deref_type == nir_deref_type_var &&
+                deref->var->data.mode == nir_var_shader_temp)
+               ubo_to_temp_patch_deref_mode(deref);
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* Lower load_ubo to the DXIL-specific UBO load helper, preserving the
+ * destination's component count and bit size. Always returns true.
+ * Made static: it is a file-local helper like its siblings and must not
+ * leak into the global namespace. */
+static bool
+lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_ssa_def *result =
+      build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Entry point: walk every entrypoint function and replace generic NIR
+ * load/store intrinsics (deref, shared, scratch, ssbo, ubo) with their
+ * 32-bit-based DXIL counterparts. Returns true if anything was lowered. */
+bool
+dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+            case nir_intrinsic_load_deref:
+               progress |= lower_load_deref(&b, intr);
+               break;
+            case nir_intrinsic_load_shared:
+            case nir_intrinsic_load_scratch:
+               progress |= lower_32b_offset_load(&b, intr);
+               break;
+            case nir_intrinsic_load_ssbo:
+               progress |= lower_load_ssbo(&b, intr);
+               break;
+            case nir_intrinsic_load_ubo:
+               progress |= lower_load_ubo(&b, intr);
+               break;
+            case nir_intrinsic_store_shared:
+            case nir_intrinsic_store_scratch:
+               progress |= lower_32b_offset_store(&b, intr);
+               break;
+            case nir_intrinsic_store_ssbo:
+               progress |= lower_store_ssbo(&b, intr);
+               break;
+            default:
+               /* All other intrinsics are left untouched. */
+               break;
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* Lower a shared-memory atomic to its *_dxil variant, which addresses the
+ * shared i32 array by element index instead of byte offset. Always
+ * returns true (instruction replaced).
+ * Fixed: the function had no return type (implicit int is invalid since
+ * C99) even though it returns true — restore `static bool`. */
+static bool
+lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr,
+                    nir_intrinsic_op dxil_op)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+
+   assert(intr->src[0].is_ssa);
+   /* Byte offset (plus the intrinsic's base) -> i32 array index. */
+   nir_ssa_def *offset =
+      nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr)));
+   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
+
+   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op);
+   atomic->src[0] = nir_src_for_ssa(index);
+   assert(intr->src[1].is_ssa);
+   atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa);
+   /* comp_swap carries an extra compare operand. */
+   if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) {
+      assert(intr->src[2].is_ssa);
+      atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa);
+   }
+   atomic->num_components = 0;
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, intr->dest.ssa.name);
+
+   nir_builder_instr_insert(b, &atomic->instr);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(&atomic->dest.ssa));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Rewrite every shared-memory atomic intrinsic in each entrypoint into its
+ * *_dxil counterpart (see lower_shared_atomic).  Returns true if anything
+ * changed.
+ */
+bool
+dxil_nir_lower_atomics_to_dxil(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+
+/* Expands to one `case` per atomic op, mapping it to its _dxil variant. */
+#define ATOMIC(op) \
+   case nir_intrinsic_shared_atomic_##op: \
+      progress |= lower_shared_atomic(&b, intr, \
+                                      nir_intrinsic_shared_atomic_##op##_dxil); \
+      break
+
+            ATOMIC(add);
+            ATOMIC(imin);
+            ATOMIC(umin);
+            ATOMIC(imax);
+            ATOMIC(umax);
+            ATOMIC(and);
+            ATOMIC(or);
+            ATOMIC(xor);
+            ATOMIC(exchange);
+            ATOMIC(comp_swap);
+
+#undef ATOMIC
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* Turn a deref_var of an SSBO variable into a deref_cast of an immediate
+ * 64-bit pointer whose upper 32 bits encode the variable's UAV binding.
+ * deref_cast instructions are passed through unchanged (returns false).
+ *
+ * Cleanup: dropped the dead `deref = deref_cast;` assignment — the local
+ * was never read again before `return true`.
+ */
+static bool
+lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref)
+{
+   assert(nir_deref_mode_is(deref, nir_var_mem_ssbo));
+   assert(deref->deref_type == nir_deref_type_var ||
+          deref->deref_type == nir_deref_type_cast);
+   nir_variable *var = deref->var;
+
+   b->cursor = nir_before_instr(&deref->instr);
+
+   if (deref->deref_type == nir_deref_type_var) {
+      /* We turn all deref_var into deref_cast and build a pointer value based on
+       * the var binding which encodes the UAV id.
+       */
+      nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32);
+      nir_deref_instr *deref_cast =
+         nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type,
+                              glsl_get_explicit_stride(var->type));
+      nir_ssa_def_rewrite_uses(&deref->dest.ssa,
+                               nir_src_for_ssa(&deref_cast->dest.ssa));
+      nir_instr_remove(&deref->instr);
+      return true;
+   }
+   return false;
+}
+
+/* Run lower_deref_ssbo over every SSBO var/cast deref in each entrypoint.
+ * Returns true if any deref was rewritten.
+ */
+bool
+dxil_nir_lower_deref_ssbo(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+            /* Only var/cast SSBO derefs are of interest here. */
+            if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) ||
+                (deref->deref_type != nir_deref_type_var &&
+                 deref->deref_type != nir_deref_type_cast))
+               continue;
+
+            progress |= lower_deref_ssbo(&b, deref);
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* For each ALU source that is a deref rooted at a deref_cast, replace the
+ * deref with an explicit pointer value: root pointer + byte offset of the
+ * deref chain (computed with cl_type_size_align).  Returns true if any
+ * source was rewritten.
+ */
+static bool
+lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu)
+{
+   const nir_op_info *info = &nir_op_infos[alu->op];
+   bool progress = false;
+
+   b->cursor = nir_before_instr(&alu->instr);
+
+   for (unsigned i = 0; i < info->num_inputs; i++) {
+      nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src);
+
+      if (!deref)
+         continue;
+
+      /* Find the root of the deref chain. */
+      nir_deref_path path;
+      nir_deref_path_init(&path, deref, NULL);
+      nir_deref_instr *root_deref = path.path[0];
+      nir_deref_path_finish(&path);
+
+      if (root_deref->deref_type != nir_deref_type_cast)
+         continue;
+
+      /* ptr = root pointer + byte offset of this deref chain. */
+      nir_ssa_def *ptr =
+         nir_iadd(b, root_deref->parent.ssa,
+                     nir_build_deref_offset(b, deref, cl_type_size_align));
+      nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr));
+      progress = true;
+   }
+
+   return progress;
+}
+
+/* Run lower_alu_deref_srcs over every ALU instruction in each entrypoint.
+ * Returns true if any ALU source was rewritten.
+ *
+ * Fix: a second `bool progress = false;` inside the function loop shadowed
+ * the function-level flag, so the pass always returned false even when it
+ * made progress.  The shadowing declaration is removed.
+ */
+bool
+dxil_nir_opt_alu_deref_srcs(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_alu)
+               continue;
+
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            progress |= lower_alu_deref_srcs(&b, alu);
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* Load one array element through `parent` (which must be a deref_cast),
+ * after resizing `index` to the pointer's bit size.
+ */
+static nir_ssa_def *
+memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent,
+                       nir_ssa_def *index)
+{
+   assert(parent->deref_type == nir_deref_type_cast);
+
+   nir_ssa_def *sized_index =
+      nir_i2i(b, index, nir_dest_bit_size(parent->dest));
+   nir_deref_instr *elem =
+      nir_build_deref_ptr_as_array(b, parent, sized_index);
+
+   return nir_load_deref(b, elem);
+}
+
+/* Store `value` to one array element through `parent` (a deref_cast),
+ * after resizing `index` to the pointer's bit size.
+ */
+static void
+memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent,
+                        nir_ssa_def *index, nir_ssa_def *value)
+{
+   assert(parent->deref_type == nir_deref_type_cast);
+
+   nir_ssa_def *sized_index =
+      nir_i2i(b, index, nir_dest_bit_size(parent->dest));
+   nir_deref_instr *elem =
+      nir_build_deref_ptr_as_array(b, parent, sized_index);
+   nir_store_deref(b, elem, value, 1);
+}
+
+/* Replace a memcpy_deref intrinsic with an explicit byte-by-byte copy loop:
+ * both derefs are cast to uint8 pointers and a loop with a local index
+ * variable copies `num_bytes` bytes.  Always reports progress.
+ */
+static bool
+lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]);
+   nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]);
+   assert(intr->src[2].is_ssa);
+   nir_ssa_def *num_bytes = intr->src[2].ssa;
+
+   assert(dst_deref && src_deref);
+
+   b->cursor = nir_after_instr(&intr->instr);
+
+   /* View both sides as byte pointers so the loop can copy one byte at
+    * a time regardless of the original element types. */
+   dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes,
+                                    glsl_uint8_t_type(), 1);
+   src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes,
+                                    glsl_uint8_t_type(), 1);
+
+   /*
+    * We want to avoid 64b instructions, so let's assume we'll always be
+    * passed a value that fits in a 32b type and truncate the 64b value.
+    */
+   num_bytes = nir_u2u32(b, num_bytes);
+
+   /* The loop counter lives in a local variable so each iteration can
+    * load/increment/store it. */
+   nir_variable *loop_index_var =
+      nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index");
+   nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var);
+   nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1);
+
+   /* while (loop_index < num_bytes) { dst[i] = src[i]; i++; } */
+   nir_loop *loop = nir_push_loop(b);
+   nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref);
+   nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes);
+   nir_if *loop_check = nir_push_if(b, cmp);
+   nir_jump(b, nir_jump_break);
+   nir_pop_if(b, loop_check);
+   nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index);
+   memcpy_store_deref_elem(b, dst_deref, loop_index, val);
+   nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1);
+   nir_pop_loop(b, loop);
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+/* Lower every memcpy_deref intrinsic in each entrypoint to a byte-copy
+ * loop (see lower_memcpy_deref).  Returns true if anything changed.
+ */
+bool
+dxil_nir_lower_memcpy_deref(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            if (intr->intrinsic == nir_intrinsic_memcpy_deref)
+               progress |= lower_memcpy_deref(&b, intr);
+         }
+      }
+   }
+
+   return progress;
+}
+
+/* Replace `phi` with a phi of `new_bit_size`: each source is upcast (uint)
+ * right after its defining instruction, and the widened phi's result is
+ * downcast back to the original bit size after the block's phis, so all
+ * original uses keep their type.
+ */
+static void
+cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size)
+{
+   nir_phi_instr *lowered = nir_phi_instr_create(b->shader);
+   int num_components = 0;
+   int old_bit_size = phi->dest.ssa.bit_size;
+
+   nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size,
+                                             nir_type_uint | new_bit_size,
+                                             nir_rounding_mode_undef);
+   nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size,
+                                               nir_type_uint | old_bit_size,
+                                               nir_rounding_mode_undef);
+
+   nir_foreach_phi_src(src, phi) {
+      /* All sources of a phi must agree on their component count. */
+      assert(num_components == 0 || num_components == src->src.ssa->num_components);
+      num_components = src->src.ssa->num_components;
+
+      /* The cast must dominate the predecessor edge, so insert it right
+       * after the source's defining instruction. */
+      b->cursor = nir_after_instr(src->src.ssa->parent_instr);
+
+      nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL);
+
+      nir_phi_src *new_src = rzalloc(lowered, nir_phi_src);
+      new_src->pred = src->pred;
+      new_src->src = nir_src_for_ssa(cast);
+      exec_list_push_tail(&lowered->srcs, &new_src->node);
+   }
+
+   nir_ssa_dest_init(&lowered->instr, &lowered->dest,
+                     num_components, new_bit_size, NULL);
+
+   b->cursor = nir_before_instr(&phi->instr);
+   nir_builder_instr_insert(b, &lowered->instr);
+
+   /* Downcasts (non-phi instructions) must come after all phis in the
+    * block. */
+   b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor));
+   nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL);
+
+   nir_ssa_def_rewrite_uses(&phi->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&phi->instr);
+}
+
+/* Widen every integer phi narrower than min_bit_size (except 1-bit boolean
+ * phis) in `impl`.  Returns true if any phi was rewritten; preserves
+ * block-index and dominance metadata on change.
+ */
+static bool
+upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   bool progress = false;
+
+   nir_foreach_block_reverse(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_phi)
+            continue;
+
+         nir_phi_instr *phi = nir_instr_as_phi(instr);
+         assert(phi->dest.is_ssa);
+
+         /* Booleans (1-bit) and already-wide phis are left alone. */
+         if (phi->dest.ssa.bit_size == 1 ||
+             phi->dest.ssa.bit_size >= min_bit_size)
+            continue;
+
+         cast_phi(&b, phi, min_bit_size);
+         progress = true;
+      }
+   }
+
+   if (progress) {
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   }
+
+   return progress;
+}
+
+/* Widen narrow integer phis to at least min_bit_size across all function
+ * implementations in the shader.  Returns true if anything changed.
+ */
+bool
+dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size)
+{
+   bool any_changed = false;
+
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+      any_changed |= upcast_phi_impl(function->impl, min_bit_size);
+   }
+
+   return any_changed;
+}
+
+/* The following float-to-half conversion routines are based on the "half" library:
+ * https://sourceforge.net/projects/half/
+ *
+ * half - IEEE 754-based half-precision floating-point library.
+ *
+ * Copyright (c) 2012-2019 Christian Rau <rauy@users.sourceforge.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Version 2.1.0
+ */
+
+
+/* Filter callback for nir_shader_lower_instructions(): match f32->f16
+ * conversions, either as ALU f2f16 variants or as convert_alu_types
+ * intrinsics with a float16 destination.
+ *
+ * Added an explicit `default: return false;` to the op switch — behavior
+ * is unchanged (the old code fell out of the switch to the final return),
+ * but the huge nir_op enum no longer triggers -Wswitch warnings.
+ */
+static bool
+lower_fp16_casts_filter(const nir_instr *instr, const void *data)
+{
+   if (instr->type == nir_instr_type_alu) {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+      /* TODO: DXIL has instructions for f2f16_rtz. For CL, it's not precise enough
+       * due to denorm handling. If the f2f16 instruction has undef rounding mode,
+       * we could map that too, but for CL, f2f16 is implied to mean rtne.
+       */
+      switch (alu->op) {
+      case nir_op_f2f16:
+      case nir_op_f2f16_rtne:
+      case nir_op_f2f16_rtz:
+         return true;
+      default:
+         return false;
+      }
+   } else if (instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      return intrin->intrinsic == nir_intrinsic_convert_alu_types &&
+             nir_intrinsic_dest_type(intrin) == nir_type_float16;
+   }
+   return false;
+}
+
+/* Apply the rounding increment to a truncated fp16 bit pattern.
+ * `guard` is the first dropped bit, `sticky` is the OR of all lower dropped
+ * bits, and `sign` holds the original f32 sign bit in bit 31.  For rtz (and
+ * any other unhandled mode) the value is returned unchanged.
+ */
+static nir_ssa_def *
+half_rounded(nir_builder *b, nir_ssa_def *value, nir_ssa_def *guard, nir_ssa_def *sticky,
+             nir_ssa_def *sign, nir_rounding_mode mode)
+{
+   switch (mode) {
+   case nir_rounding_mode_rtne:
+      /* Round up when guard is set and either sticky or the LSB of value
+       * is set (ties-to-even). */
+      return nir_iadd(b, value, nir_iand(b, guard, nir_ior(b, sticky, value)));
+   case nir_rounding_mode_ru:
+      sign = nir_ushr(b, sign, nir_imm_int(b, 31));
+      /* Round-up only increments positive values with dropped bits. */
+      return nir_iadd(b, value, nir_iand(b, nir_inot(b, sign),
+                                            nir_ior(b, guard, sticky)));
+   case nir_rounding_mode_rd:
+      sign = nir_ushr(b, sign, nir_imm_int(b, 31));
+      /* Round-down only increments negative values with dropped bits. */
+      return nir_iadd(b, value, nir_iand(b, sign,
+                                            nir_ior(b, guard, sticky)));
+   default:
+      return value;
+   }
+}
+
+/* Build a correctly-rounded f32 -> f16 conversion on the bit pattern of
+ * `src` (a 32-bit value), following the "half" library algorithm credited
+ * above.  The result is the 16-bit half encoding as a u16.  The branchy
+ * structure (NaN/INF, overflow, normal, denormal, underflow) is stitched
+ * together with if-phis.
+ */
+static nir_ssa_def *
+float_to_half_impl(nir_builder *b, nir_ssa_def *src, nir_rounding_mode mode)
+{
+   nir_ssa_def *f32infinity = nir_imm_int(b, 255 << 23);
+   /* Smallest f32 that overflows f16 (exponent 127+16). */
+   nir_ssa_def *f16max = nir_imm_int(b, (127 + 16) << 23);
+   nir_ssa_def *denorm_magic = nir_imm_int(b, ((127 - 15) + (23 - 10) + 1) << 23);
+   nir_ssa_def *sign = nir_iand(b, src, nir_imm_int(b, 0x80000000));
+   nir_ssa_def *one = nir_imm_int(b, 1);
+
+   nir_ssa_def *abs = nir_iand(b, src, nir_imm_int(b, 0x7FFFFFFF));
+   /* NaN or INF. For rtne, overflow also becomes INF, so combine the comparisons */
+   nir_push_if(b, nir_ige(b, abs, mode == nir_rounding_mode_rtne ? f16max : f32infinity));
+   nir_ssa_def *inf_nanfp16 = nir_bcsel(b,
+                                        nir_ilt(b, f32infinity, abs),
+                                        nir_imm_int(b, 0x7E00),
+                                        nir_imm_int(b, 0x7C00));
+   nir_push_else(b, NULL);
+
+   nir_ssa_def *overflowed_fp16 = NULL;
+   if (mode != nir_rounding_mode_rtne) {
+      /* Handle overflow */
+      nir_push_if(b, nir_ige(b, abs, f16max));
+      switch (mode) {
+      case nir_rounding_mode_rtz:
+         overflowed_fp16 = nir_imm_int(b, 0x7BFF);
+         break;
+      case nir_rounding_mode_ru:
+         /* Negative becomes max float, positive becomes inf */
+         overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7BFF), nir_imm_int(b, 0x7C00));
+         break;
+      case nir_rounding_mode_rd:
+         /* Negative becomes inf, positive becomes max float */
+         overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7C00), nir_imm_int(b, 0x7BFF));
+         break;
+      default: unreachable("Should've been handled already");
+      }
+      nir_push_else(b, NULL);
+   }
+
+   /* 113 << 23 is the smallest f32 whose f16 is normal. */
+   nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 113 << 23)));
+
+   /* FP16 will be normal */
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+   /* Rebias the exponent by -112 and take the top 10 mantissa bits. */
+   nir_ssa_def *value = nir_ior(b,
+                                nir_ishl(b,
+                                         nir_isub(b,
+                                                  nir_ushr(b, abs, nir_imm_int(b, 23)),
+                                                  nir_imm_int(b, 112)),
+                                         nir_imm_int(b, 10)),
+                                nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 13)), nir_imm_int(b, 0x3FFF)));
+   /* guard = first dropped mantissa bit, sticky = OR of the rest. */
+   nir_ssa_def *guard = nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 12)), one);
+   nir_ssa_def *sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, abs, nir_imm_int(b, 0xFFF)), zero), one, zero);
+   nir_ssa_def *normal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
+
+   nir_push_else(b, NULL);
+   /* 102 << 23 is the smallest f32 that still reaches an f16 denormal. */
+   nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 102 << 23)));
+
+   /* FP16 will be denormal */
+   nir_ssa_def *i = nir_isub(b, nir_imm_int(b, 125), nir_ushr(b, abs, nir_imm_int(b, 23)));
+   /* Re-attach the implicit leading 1 before shifting down. */
+   nir_ssa_def *masked = nir_ior(b, nir_iand(b, abs, nir_imm_int(b, 0x7FFFFF)), nir_imm_int(b, 0x800000));
+   value = nir_ushr(b, masked, nir_iadd(b, i, one));
+   guard = nir_iand(b, nir_ushr(b, masked, i), one);
+   sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, masked, nir_isub(b, nir_ishl(b, one, i), one)), zero), one, zero);
+   nir_ssa_def *denormal_fp16 = half_rounded(b, value, guard, sticky, sign, mode);
+
+   nir_push_else(b, NULL);
+
+   /* Handle underflow. Nonzero values need to shift up or down for round-up or round-down */
+   nir_ssa_def *underflowed_fp16 = zero;
+   if (mode == nir_rounding_mode_ru ||
+       mode == nir_rounding_mode_rd) {
+      nir_push_if(b, nir_i2b1(b, abs));
+
+      if (mode == nir_rounding_mode_ru)
+         underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), zero, one);
+      else
+         underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), one, zero);
+
+      nir_push_else(b, NULL);
+      nir_pop_if(b, NULL);
+      underflowed_fp16 = nir_if_phi(b, underflowed_fp16, zero);
+   }
+
+   /* Merge the branches back together, innermost first. */
+   nir_pop_if(b, NULL);
+   nir_ssa_def *underflowed_or_denorm_fp16 = nir_if_phi(b, denormal_fp16, underflowed_fp16);
+
+   nir_pop_if(b, NULL);
+   nir_ssa_def *finite_fp16 = nir_if_phi(b, normal_fp16, underflowed_or_denorm_fp16);
+
+   nir_ssa_def *finite_or_overflowed_fp16 = finite_fp16;
+   if (mode != nir_rounding_mode_rtne) {
+      nir_pop_if(b, NULL);
+      finite_or_overflowed_fp16 = nir_if_phi(b, overflowed_fp16, finite_fp16);
+   }
+
+   nir_pop_if(b, NULL);
+   nir_ssa_def *fp16 = nir_if_phi(b, inf_nanfp16, finite_or_overflowed_fp16);
+
+   /* Re-apply the sign (f32 bit 31 -> f16 bit 15) and truncate to u16. */
+   return nir_u2u16(b, nir_ior(b, fp16, nir_ushr(b, sign, nir_imm_int(b, 16))));
+}
+
+/* Lowering callback paired with lower_fp16_casts_filter: expand a matched
+ * f32->f16 conversion (ALU op or convert_alu_types intrinsic) into the
+ * explicit per-component conversion built by float_to_half_impl, honoring
+ * the requested rounding mode (rtne by default).
+ *
+ * Added `default: unreachable(...)` to the ALU-op switch: the filter only
+ * passes the three f2f16 variants, and the explicit default silences
+ * -Wswitch for the large nir_op enum without changing behavior.
+ */
+static nir_ssa_def *
+lower_fp16_cast_impl(nir_builder *b, nir_instr *instr, void *data)
+{
+   nir_ssa_def *src, *dst;
+   uint8_t *swizzle = NULL;
+   nir_rounding_mode mode = nir_rounding_mode_rtne;
+
+   if (instr->type == nir_instr_type_alu) {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+      src = alu->src[0].src.ssa;
+      swizzle = alu->src[0].swizzle;
+      dst = &alu->dest.dest.ssa;
+      assert(src->bit_size == 32);
+      switch (alu->op) {
+      case nir_op_f2f16:
+      case nir_op_f2f16_rtne:
+         /* For CL, plain f2f16 implies rtne (see the filter's TODO). */
+         break;
+      case nir_op_f2f16_rtz:
+         mode = nir_rounding_mode_rtz;
+         break;
+      default:
+         unreachable("Unexpected opcode from lower_fp16_casts_filter");
+      }
+   } else {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      assert(nir_intrinsic_src_type(intrin) == nir_type_float32);
+      src = intrin->src[0].ssa;
+      dst = &intrin->dest.ssa;
+      mode = nir_intrinsic_rounding_mode(intrin);
+   }
+
+   nir_ssa_def *rets[NIR_MAX_VEC_COMPONENTS] = { NULL };
+
+   /* Convert each destination component separately, honoring the ALU
+    * source swizzle when present. */
+   for (unsigned i = 0; i < dst->num_components; i++) {
+      nir_ssa_def *comp = nir_channel(b, src, swizzle ? swizzle[i] : i);
+      rets[i] = float_to_half_impl(b, comp, mode);
+   }
+
+   return nir_vec(b, rets, dst->num_components);
+}
+
+/* Lower all f32->f16 conversions in the shader to explicit, correctly-
+ * rounded integer arithmetic.  Returns true if anything changed.
+ */
+bool
+dxil_nir_lower_fp16_casts(nir_shader *shader)
+{
+   bool progress =
+      nir_shader_lower_instructions(shader, lower_fp16_casts_filter,
+                                    lower_fp16_cast_impl, NULL);
+   return progress;
+}
bool dxil_nir_lower_16bit_conv(nir_shader *shader);
bool dxil_nir_lower_x2b(nir_shader *shader);
bool dxil_nir_lower_inot(nir_shader *shader);
+bool dxil_nir_lower_ubo_to_temp(nir_shader *shader);
+bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader);
+bool dxil_nir_lower_atomics_to_dxil(nir_shader *shader);
+bool dxil_nir_lower_deref_ssbo(nir_shader *shader);
+bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader);
+bool dxil_nir_lower_memcpy_deref(nir_shader *shader);
+bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size);
+bool dxil_nir_lower_fp16_casts(nir_shader *shader);
nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
.lower_pack_32_2x16_split = true,
.lower_unpack_64_2x32_split = true,
.lower_unpack_32_2x16_split = true,
+ .use_scoped_barrier = true,
.vertex_id_zero_based = true,
.lower_base_vertex = true,
+ .has_cs_global_id = true,
};
const nir_shader_compiler_options*
}
static bool
+emit_globals(struct ntd_context *ctx, nir_shader *s, unsigned size)
+{
+   /* Emit the UAV metadata backing kernel globals: one raw-buffer slot per
+    * SSBO variable in `s` plus `size` slots pre-reserved by the caller.
+    * Returns true on success (including the no-globals case). */
+   nir_foreach_variable_with_modes(var, s, nir_var_mem_ssbo)
+      size++;
+
+   if (!size)
+      return true;
+
+   /* Globals are modeled as an array of { i32 } raw buffers. */
+   const struct dxil_type *type = dxil_module_get_int_type(&ctx->mod, 32);
+   if (!type)
+      return false;
+
+   const struct dxil_type *struct_type =
+      dxil_module_get_struct_type(&ctx->mod, NULL, &type, 1);
+   if (!struct_type)
+      return false;
+
+   const struct dxil_type *array_type =
+      dxil_module_get_array_type(&ctx->mod, struct_type, size);
+   if (!array_type)
+      return false;
+
+   resource_array_layout layout = {0, 0, size};
+   const struct dxil_mdnode *uav_meta =
+      emit_uav_metadata(&ctx->mod, array_type,
+                        "globals", &layout,
+                        DXIL_COMP_TYPE_INVALID,
+                        DXIL_RESOURCE_KIND_RAW_BUFFER);
+   if (!uav_meta)
+      return false;
+
+   ctx->uav_metadata_nodes[ctx->num_uav_arrays++] = uav_meta;
+   /* Past 8 UAV arrays the module must advertise the 64-UAV feature. */
+   if (ctx->num_uav_arrays > 8)
+      ctx->mod.feats.use_64uavs = 1;
+   /* Handles to UAVs used for kernel globals are created on-demand */
+   ctx->num_uavs += size;
+   add_resource(ctx, DXIL_RES_UAV_RAW, &layout);
+   ctx->mod.raw_and_structured_buffers = true;
+   return true;
+}
+
+static bool
emit_uav(struct ntd_context *ctx, nir_variable *var, unsigned count)
{
assert(ctx->num_uav_arrays < ARRAY_SIZE(ctx->uav_metadata_nodes));
}
static bool
+emit_global_consts(struct ntd_context *ctx, nir_shader *s)
+{
+   /* Emit each nir_var_shader_temp variable (which must carry a constant
+    * initializer) as an i32-array DXIL global constant, and record the
+    * resulting value in ctx->consts keyed by the nir_variable.
+    * Returns false on any allocation or emission failure.
+    *
+    * Cleanups: removed the unused `ret` local, inlined the confusingly
+    * named `err` success flag, added the missing NULL checks on the
+    * const_ints allocation and on each int32 constant, and made the loop
+    * index unsigned to match num_members. */
+   nir_foreach_variable_with_modes(var, s, nir_var_shader_temp) {
+      assert(var->constant_initializer);
+
+      /* Flatten the initializer into 32-bit words. */
+      unsigned int num_members = DIV_ROUND_UP(glsl_get_cl_size(var->type), 4);
+      uint32_t *const_ints = ralloc_array(ctx->ralloc_ctx, uint32_t, num_members);
+      if (!const_ints)
+         return false;
+      if (!var_fill_const_array(ctx, var->constant_initializer, var->type,
+                                const_ints, 0))
+         return false;
+      const struct dxil_value **const_vals =
+         ralloc_array(ctx->ralloc_ctx, const struct dxil_value *, num_members);
+      if (!const_vals)
+         return false;
+      for (unsigned int i = 0; i < num_members; i++) {
+         const_vals[i] = dxil_module_get_int32_const(&ctx->mod, const_ints[i]);
+         if (!const_vals[i])
+            return false;
+      }
+
+      const struct dxil_type *elt_type = dxil_module_get_int_type(&ctx->mod, 32);
+      if (!elt_type)
+         return false;
+      const struct dxil_type *type =
+         dxil_module_get_array_type(&ctx->mod, elt_type, num_members);
+      if (!type)
+         return false;
+      const struct dxil_value *agg_vals =
+         dxil_module_get_array_const(&ctx->mod, type, const_vals);
+      if (!agg_vals)
+         return false;
+
+      const struct dxil_value *gvar = dxil_add_global_ptr_var(&ctx->mod, var->name, type,
+                                                              DXIL_AS_DEFAULT, 4,
+                                                              agg_vals);
+      if (!gvar)
+         return false;
+
+      /* Later derefs of this variable look the pointer up here. */
+      if (!_mesa_hash_table_insert(ctx->consts, var, (void *)gvar))
+         return false;
+   }
+
+   return true;
+}
+
+static bool
emit_cbv(struct ntd_context *ctx, unsigned binding,
unsigned size, char *name)
{
case nir_op_flog2: return emit_unary_intin(ctx, alu, DXIL_INTR_FLOG2, src[0]);
case nir_op_ffloor: return emit_unary_intin(ctx, alu, DXIL_INTR_ROUND_NI, src[0]);
case nir_op_ffract: return emit_unary_intin(ctx, alu, DXIL_INTR_FRC, src[0]);
+ case nir_op_fisnormal: return emit_unary_intin(ctx, alu, DXIL_INTR_ISNORMAL, src[0]);
+ case nir_op_fisfinite: return emit_unary_intin(ctx, alu, DXIL_INTR_ISFINITE, src[0]);
case nir_op_fddx:
case nir_op_fddx_coarse: return emit_unary_intin(ctx, alu, DXIL_INTR_DDX_COARSE, src[0]);
}
static bool
+emit_barrier(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   /* Translate a scoped-barrier intrinsic into a dx.op.barrier call,
+    * mapping NIR execution/memory scopes onto DXIL barrier-mode flags.
+    * Returns false for scope/mode combinations DXIL cannot express, or on
+    * emission failure. */
+   const struct dxil_value *opcode, *mode;
+   const struct dxil_func *func;
+   uint32_t flags = 0;
+
+   if (nir_intrinsic_execution_scope(intr) == NIR_SCOPE_WORKGROUP)
+      flags |= DXIL_BARRIER_MODE_SYNC_THREAD_GROUP;
+
+   nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
+   nir_scope mem_scope = nir_intrinsic_memory_scope(intr);
+
+   /* Only SSBO/global/shared memory barriers are representable. */
+   if (modes & ~(nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_shared))
+      return false;
+
+   if (mem_scope != NIR_SCOPE_DEVICE && mem_scope != NIR_SCOPE_WORKGROUP)
+      return false;
+
+   if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
+      if (mem_scope == NIR_SCOPE_DEVICE)
+         flags |= DXIL_BARRIER_MODE_UAV_FENCE_GLOBAL;
+      else
+         flags |= DXIL_BARRIER_MODE_UAV_FENCE_THREAD_GROUP;
+   }
+
+   if (modes & nir_var_mem_shared)
+      flags |= DXIL_BARRIER_MODE_UAV_FENCE_THREAD_GROUP;
+
+   func = dxil_get_function(&ctx->mod, "dx.op.barrier", DXIL_NONE);
+   if (!func)
+      return false;
+
+   opcode = dxil_module_get_int32_const(&ctx->mod, DXIL_INTR_BARRIER);
+   if (!opcode)
+      return false;
+
+   mode = dxil_module_get_int32_const(&ctx->mod, flags);
+   if (!mode)
+      return false;
+
+   const struct dxil_value *args[] = { opcode, mode };
+
+   return dxil_emit_call_void(&ctx->mod, func,
+                              args, ARRAY_SIZE(args));
+}
+
+/* Emit dx.op threadId calls for each component of load_global_invocation_id
+ * that is actually read, storing each result into the destination.
+ */
+static bool
+emit_load_global_invocation_id(struct ntd_context *ctx,
+                               nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   /* Skip components no one reads. */
+   nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa);
+
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      if (comps & (1 << i)) {
+         const struct dxil_value *idx = dxil_module_get_int32_const(&ctx->mod, i);
+         if (!idx)
+            return false;
+         const struct dxil_value *globalid = emit_threadid_call(ctx, idx);
+
+         if (!globalid)
+            return false;
+
+         store_dest_value(ctx, &intr->dest, i, globalid);
+      }
+   }
+   return true;
+}
+
+/* Emit dx.op threadIdInGroup calls for each component of
+ * load_local_invocation_id that is actually read.
+ */
+static bool
+emit_load_local_invocation_id(struct ntd_context *ctx,
+                              nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   /* Skip components no one reads. */
+   nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa);
+
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      if (comps & (1 << i)) {
+         const struct dxil_value
+            *idx = dxil_module_get_int32_const(&ctx->mod, i);
+         if (!idx)
+            return false;
+         const struct dxil_value
+            *threadidingroup = emit_threadidingroup_call(ctx, idx);
+         if (!threadidingroup)
+            return false;
+         store_dest_value(ctx, &intr->dest, i, threadidingroup);
+      }
+   }
+   return true;
+}
+
+/* Emit dx.op groupId calls for each component of the workgroup-id intrinsic
+ * that is actually read.
+ */
+static bool
+emit_load_local_work_group_id(struct ntd_context *ctx,
+                              nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   /* Skip components no one reads. */
+   nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa);
+
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      if (comps & (1 << i)) {
+         const struct dxil_value *idx = dxil_module_get_int32_const(&ctx->mod, i);
+         if (!idx)
+            return false;
+         const struct dxil_value *groupid = emit_groupid_call(ctx, idx);
+         if (!groupid)
+            return false;
+         store_dest_value(ctx, &intr->dest, i, groupid);
+      }
+   }
+   return true;
+}
+
+static bool
emit_load_primitiveid(struct ntd_context *ctx,
nir_intrinsic_instr *intr)
{
return dxil_module_get_undef(m, int32_type);
}
+/* Convert a byte offset into an element index for elements of `bit_size`
+ * bits: offset >> log2(bit_size / 8).  Returns NULL on failure.
+ */
+static const struct dxil_value *
+offset_to_index(struct dxil_module *m, const struct dxil_value *offset,
+                unsigned bit_size)
+{
+   unsigned shift_amt = util_logbase2(bit_size / 8);
+   const struct dxil_value *shift =
+      dxil_module_get_int32_const(m, shift_amt);
+   if (!shift)
+      return NULL;
+
+   return dxil_emit_binop(m, DXIL_BINOP_LSHR, offset, shift, 0);
+}
+
+/* Inverse of offset_to_index(): convert an element index into a byte
+ * offset, index << log2(bit_size / 8).  Returns NULL on failure.
+ */
+static const struct dxil_value *
+index_to_offset(struct dxil_module *m, const struct dxil_value *index,
+                unsigned bit_size)
+{
+   unsigned shift_amt = util_logbase2(bit_size / 8);
+   const struct dxil_value *shift =
+      dxil_module_get_int32_const(m, shift_amt);
+   if (!shift)
+      return NULL;
+
+   return dxil_emit_binop(m, DXIL_BINOP_SHL, index, shift, 0);
+}
+
+/* Build an inbounds GEP { global-const-ptr, 0, index } into the DXIL global
+ * previously registered for `var` in ctx->consts (see emit_global_consts).
+ * Returns NULL on failure.
+ */
+static const struct dxil_value *
+emit_gep_for_index(struct ntd_context *ctx, const nir_variable *var,
+                   const struct dxil_value *index)
+{
+   assert(var->data.mode == nir_var_shader_temp);
+
+   /* The global must have been emitted by emit_global_consts already. */
+   struct hash_entry *he = _mesa_hash_table_search(ctx->consts, var);
+   assert(he != NULL);
+   const struct dxil_value *ptr = he->data;
+
+   const struct dxil_value *zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return NULL;
+
+   const struct dxil_value *ops[] = { ptr, zero, index };
+   return dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+}
+
+/* Emit a raw-buffer load for load_ssbo: create a UAV handle from the buffer
+ * id (src[0]), issue a bufferLoad at the byte offset (src[1]), and extract
+ * each read component into the destination.
+ */
+static bool
+emit_load_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod);
+   const struct dxil_value *buffer =
+      get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   const struct dxil_value *offset =
+      get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   if (!int32_undef || !buffer || !offset)
+      return false;
+
+   assert(nir_src_bit_size(intr->src[0]) == 32);
+   assert(nir_intrinsic_dest_components(intr) <= 4);
+
+   const struct dxil_value *handle =
+      emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer,
+                             nir_src_is_const(intr->src[0]));
+   if (!handle)
+      return false;
+
+   /* Raw buffers take only the byte offset; the second coord is undef. */
+   const struct dxil_value *coord[2] = {
+      offset,
+      int32_undef
+   };
+
+   const struct dxil_value *load = emit_bufferload_call(ctx, handle, coord);
+   if (!load)
+      return false;
+
+   /* bufferLoad returns a struct; extract one element per component. */
+   for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) {
+      const struct dxil_value *val =
+         dxil_emit_extractval(&ctx->mod, load, i);
+      if (!val)
+         return false;
+      store_dest_value(ctx, &intr->dest, i, val);
+   }
+   return true;
+}
+
+/* Emit a raw-buffer store for store_ssbo: value in src[0], buffer id in
+ * src[1], byte offset in src[2].  Unused lanes are padded with undef and
+ * masked off via the write mask.
+ */
+static bool
+emit_store_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *buffer =
+      get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   const struct dxil_value *offset =
+      get_src(ctx, &intr->src[2], 0, nir_type_uint);
+   if (!buffer || !offset)
+      return false;
+
+   const struct dxil_value *handle =
+      emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer,
+                             nir_src_is_const(intr->src[1]));
+   if (!handle)
+      return false;
+
+   assert(nir_src_bit_size(intr->src[0]) == 32);
+   unsigned num_components = nir_src_num_components(intr->src[0]);
+   assert(num_components <= 4);
+   const struct dxil_value *value[4];
+   for (unsigned i = 0; i < num_components; ++i) {
+      value[i] = get_src(ctx, &intr->src[0], i, nir_type_uint);
+      if (!value[i])
+         return false;
+   }
+
+   const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod);
+   if (!int32_undef)
+      return false;
+
+   /* Raw buffers take only the byte offset; the second coord is undef. */
+   const struct dxil_value *coord[2] = {
+      offset,
+      int32_undef
+   };
+
+   /* Pad unused value lanes; they are excluded by the write mask below. */
+   for (int i = num_components; i < 4; ++i)
+      value[i] = int32_undef;
+
+   const struct dxil_value *write_mask =
+      dxil_module_get_int8_const(&ctx->mod, (1u << num_components) - 1);
+   if (!write_mask)
+      return false;
+
+   return emit_bufferstore_call(ctx, handle, coord, value, write_mask, DXIL_I32);
+}
+
+/* Emit a masked SSBO store (value src[0], mask src[1], buffer src[2],
+ * offset src[3]) as an atomic AND with `mask` followed by an atomic OR
+ * with `value` on the target dword.
+ */
+static bool
+emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *value =
+      get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   const struct dxil_value *mask =
+      get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   const struct dxil_value *buffer =
+      get_src(ctx, &intr->src[2], 0, nir_type_uint);
+   const struct dxil_value *offset =
+      get_src(ctx, &intr->src[3], 0, nir_type_uint);
+   if (!value || !mask || !buffer || !offset)
+      return false;
+
+   const struct dxil_value *handle =
+      emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer,
+                             nir_src_is_const(intr->src[2]));
+   if (!handle)
+      return false;
+
+   const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod);
+   if (!int32_undef)
+      return false;
+
+   /* Atomics take three coords; only the byte offset is meaningful here. */
+   const struct dxil_value *coord[3] = {
+      offset, int32_undef, int32_undef
+   };
+
+   return
+      emit_atomic_binop(ctx, handle, DXIL_ATOMIC_AND, coord, mask) != NULL &&
+      emit_atomic_binop(ctx, handle, DXIL_ATOMIC_OR, coord, value) != NULL;
+}
+
+/* Emit a store to the shared-memory array.  For store_shared_dxil this is a
+ * plain store; for the masked variant (mask in src[1], index in src[2]) it
+ * is an atomic AND with the mask followed by an atomic OR with the value.
+ *
+ * Fix: `value` from get_src() was used without a NULL check, unlike every
+ * other value fetched in this function and its siblings.
+ */
+static bool
+emit_store_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *zero, *index;
+   unsigned bit_size = nir_src_bit_size(intr->src[0]);
+
+   /* All shared mem accesses should have been lowered to scalar 32bit
+    * accesses.
+    */
+   assert(bit_size == 32);
+   assert(nir_src_num_components(intr->src[0]) == 1);
+
+   zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return false;
+
+   /* The plain store keeps the index in src[1]; the masked variant puts
+    * the mask there and moves the index to src[2]. */
+   if (intr->intrinsic == nir_intrinsic_store_shared_dxil)
+      index = get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   else
+      index = get_src(ctx, &intr->src[2], 0, nir_type_uint);
+   if (!index)
+      return false;
+
+   const struct dxil_value *ops[] = { ctx->sharedvars, zero, index };
+   const struct dxil_value *ptr, *value;
+
+   ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+   if (!ptr)
+      return false;
+
+   value = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   if (!value)
+      return false;
+
+   if (intr->intrinsic == nir_intrinsic_store_shared_dxil)
+      return dxil_emit_store(&ctx->mod, value, ptr, 4, false);
+
+   const struct dxil_value *mask = get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   if (!mask)
+      return false;
+
+   /* Masked store: clear the masked bits, then OR in the new value. */
+   if (!dxil_emit_atomicrmw(&ctx->mod, mask, ptr, DXIL_RMWOP_AND, false,
+                            DXIL_ATOMIC_ORDERING_ACQREL,
+                            DXIL_SYNC_SCOPE_CROSSTHREAD))
+      return false;
+
+   if (!dxil_emit_atomicrmw(&ctx->mod, value, ptr, DXIL_RMWOP_OR, false,
+                            DXIL_ATOMIC_ORDERING_ACQREL,
+                            DXIL_SYNC_SCOPE_CROSSTHREAD))
+      return false;
+
+   return true;
+}
+
+/* Emit a store to the scratch-memory array: GEP into ctx->scratchvars at
+ * the 32-bit element index in src[1] and store the value from src[0].
+ *
+ * Fix: `value` from get_src() was used without a NULL check, unlike the
+ * other values fetched in this function and its siblings.
+ */
+static bool
+emit_store_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+   const struct dxil_value *zero, *index;
+   unsigned bit_size = nir_src_bit_size(intr->src[0]);
+
+   /* All scratch mem accesses should have been lowered to scalar 32bit
+    * accesses.
+    */
+   assert(bit_size == 32);
+   assert(nir_src_num_components(intr->src[0]) == 1);
+
+   zero = dxil_module_get_int32_const(&ctx->mod, 0);
+   if (!zero)
+      return false;
+
+   index = get_src(ctx, &intr->src[1], 0, nir_type_uint);
+   if (!index)
+      return false;
+
+   const struct dxil_value *ops[] = { ctx->scratchvars, zero, index };
+   const struct dxil_value *ptr, *value;
+
+   ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+   if (!ptr)
+      return false;
+
+   value = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+   if (!value)
+      return false;
+
+   return dxil_emit_store(&ctx->mod, value, ptr, 4, false);
+}
+
static bool
emit_load_ubo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
{
}
static bool
+emit_load_ptr(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+ /* Load one 32-bit scalar through a deref'd variable: src[0] is the
+ * variable deref, src[1] the u32 element index within that variable.
+ */
+ struct nir_variable *var =
+ nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
+ const struct dxil_value *index =
+ get_src(ctx, &intr->src[1], 0, nir_type_uint);
+ /* NULL-check index and the loaded value, matching the error handling
+ * of emit_load_shared()/emit_load_scratch().
+ */
+ if (!index)
+ return false;
+
+ const struct dxil_value *ptr = emit_gep_for_index(ctx, var, index);
+ if (!ptr)
+ return false;
+
+ const struct dxil_value *retval =
+ dxil_emit_load(&ctx->mod, ptr, 4, false);
+ if (!retval)
+ return false;
+
+ store_dest(ctx, &intr->dest, 0, retval, nir_type_uint);
+ return true;
+}
+
+static bool
+emit_load_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+ /* Load one 32-bit scalar from the groupshared array (ctx->sharedvars);
+ * src[0] is the u32 element index. The GEP is { base, 0, index } since
+ * sharedvars is a pointer to an array.
+ */
+ const struct dxil_value *zero, *index;
+ unsigned bit_size = nir_dest_bit_size(intr->dest);
+ unsigned align = bit_size / 8;
+
+ /* All shared mem accesses should have been lowered to scalar 32bit
+ * accesses.
+ */
+ assert(bit_size == 32);
+ assert(nir_dest_num_components(intr->dest) == 1);
+
+ zero = dxil_module_get_int32_const(&ctx->mod, 0);
+ if (!zero)
+ return false;
+
+ index = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+ if (!index)
+ return false;
+
+ const struct dxil_value *ops[] = { ctx->sharedvars, zero, index };
+ const struct dxil_value *ptr, *retval;
+
+ ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+ if (!ptr)
+ return false;
+
+ retval = dxil_emit_load(&ctx->mod, ptr, align, false);
+ if (!retval)
+ return false;
+
+ store_dest(ctx, &intr->dest, 0, retval, nir_type_uint);
+ return true;
+}
+
+static bool
+emit_load_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr)
+{
+ /* Load one 32-bit scalar from the scratch array (ctx->scratchvars);
+ * src[0] is the u32 element index. Mirrors emit_load_shared().
+ */
+ const struct dxil_value *zero, *index;
+ unsigned bit_size = nir_dest_bit_size(intr->dest);
+ unsigned align = bit_size / 8;
+
+ /* All scratch mem accesses should have been lowered to scalar 32bit
+ * accesses.
+ */
+ assert(bit_size == 32);
+ assert(nir_dest_num_components(intr->dest) == 1);
+
+ zero = dxil_module_get_int32_const(&ctx->mod, 0);
+ if (!zero)
+ return false;
+
+ index = get_src(ctx, &intr->src[0], 0, nir_type_uint);
+ if (!index)
+ return false;
+
+ const struct dxil_value *ops[] = { ctx->scratchvars, zero, index };
+ const struct dxil_value *ptr, *retval;
+
+ ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops));
+ if (!ptr)
+ return false;
+
+ retval = dxil_emit_load(&ctx->mod, ptr, align, false);
+ if (!retval)
+ return false;
+
+ store_dest(ctx, &intr->dest, 0, retval, nir_type_uint);
+ return true;
+}
+
+static bool
emit_load_deref(struct ntd_context *ctx, nir_intrinsic_instr *intr)
{
assert(intr->src[0].is_ssa);
emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
{
switch (intr->intrinsic) {
+ case nir_intrinsic_load_global_invocation_id:
+ case nir_intrinsic_load_global_invocation_id_zero_base:
+ return emit_load_global_invocation_id(ctx, intr);
+ case nir_intrinsic_load_local_invocation_id:
+ return emit_load_local_invocation_id(ctx, intr);
+ case nir_intrinsic_load_work_group_id:
+ case nir_intrinsic_load_work_group_id_zero_base:
+ return emit_load_local_work_group_id(ctx, intr);
+ case nir_intrinsic_load_ssbo:
+ return emit_load_ssbo(ctx, intr);
+ case nir_intrinsic_store_ssbo:
+ return emit_store_ssbo(ctx, intr);
+ case nir_intrinsic_store_ssbo_masked_dxil:
+ return emit_store_ssbo_masked(ctx, intr);
case nir_intrinsic_store_deref:
return emit_store_deref(ctx, intr);
+ case nir_intrinsic_store_shared_dxil:
+ case nir_intrinsic_store_shared_masked_dxil:
+ return emit_store_shared(ctx, intr);
+ case nir_intrinsic_store_scratch_dxil:
+ return emit_store_scratch(ctx, intr);
case nir_intrinsic_load_deref:
return emit_load_deref(ctx, intr);
+ case nir_intrinsic_load_ptr_dxil:
+ return emit_load_ptr(ctx, intr);
case nir_intrinsic_load_ubo:
return emit_load_ubo(ctx, intr);
case nir_intrinsic_load_ubo_dxil:
ctx->system_value[SYSTEM_VALUE_INSTANCE_ID]);
case nir_intrinsic_load_primitive_id:
return emit_load_primitiveid(ctx, intr);
+ case nir_intrinsic_load_shared_dxil:
+ return emit_load_shared(ctx, intr);
+ case nir_intrinsic_load_scratch_dxil:
+ return emit_load_scratch(ctx, intr);
case nir_intrinsic_discard_if:
return emit_discard_if(ctx, intr);
case nir_intrinsic_discard:
return emit_emit_vertex(ctx, intr);
case nir_intrinsic_end_primitive:
return emit_end_primitive(ctx, intr);
-
+ case nir_intrinsic_scoped_barrier:
+ return emit_barrier(ctx, intr);
+ case nir_intrinsic_ssbo_atomic_add:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_ADD, nir_type_int);
+ case nir_intrinsic_ssbo_atomic_imin:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_IMIN, nir_type_int);
+ case nir_intrinsic_ssbo_atomic_umin:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_UMIN, nir_type_uint);
+ case nir_intrinsic_ssbo_atomic_imax:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_IMAX, nir_type_int);
+ case nir_intrinsic_ssbo_atomic_umax:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_UMAX, nir_type_uint);
+ case nir_intrinsic_ssbo_atomic_and:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_AND, nir_type_uint);
+ case nir_intrinsic_ssbo_atomic_or:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_OR, nir_type_uint);
+ case nir_intrinsic_ssbo_atomic_xor:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_XOR, nir_type_uint);
+ case nir_intrinsic_ssbo_atomic_exchange:
+ return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_EXCHANGE, nir_type_int);
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ return emit_ssbo_atomic_comp_swap(ctx, intr);
+ case nir_intrinsic_shared_atomic_add_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_ADD, nir_type_int);
+ case nir_intrinsic_shared_atomic_imin_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_MIN, nir_type_int);
+ case nir_intrinsic_shared_atomic_umin_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_UMIN, nir_type_uint);
+ case nir_intrinsic_shared_atomic_imax_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_MAX, nir_type_int);
+ case nir_intrinsic_shared_atomic_umax_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_UMAX, nir_type_uint);
+ case nir_intrinsic_shared_atomic_and_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_AND, nir_type_uint);
+ case nir_intrinsic_shared_atomic_or_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_OR, nir_type_uint);
+ case nir_intrinsic_shared_atomic_xor_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_XOR, nir_type_uint);
+ case nir_intrinsic_shared_atomic_exchange_dxil:
+ return emit_shared_atomic(ctx, intr, DXIL_RMWOP_XCHG, nir_type_int);
+ case nir_intrinsic_shared_atomic_comp_swap_dxil:
+ return emit_shared_atomic_comp_swap(ctx, intr);
+ case nir_intrinsic_image_store:
+ return emit_image_store(ctx, intr);
+ case nir_intrinsic_image_size:
+ return emit_image_size(ctx, intr);
+
+ case nir_intrinsic_load_num_work_groups:
+ case nir_intrinsic_load_local_group_size:
default:
NIR_INSTR_UNSUPPORTED(&intr->instr);
assert("Unimplemented intrinsic instruction");
static bool
emit_cbvs(struct ntd_context *ctx, nir_shader *s)
{
- for (int i = ctx->opts->ubo_binding_offset; i < s->info.num_ubos; ++i) {
- char name[64];
- snprintf(name, sizeof(name), "__ubo%d", i);
- if (!emit_cbv(ctx, i, 16384 /*4096 vec4's*/, name))
+ /* CL kernels carry explicit nir_var_mem_ubo variables, so emit one CBV
+ * per variable. Other shader stages only expose UBO slot counts, so
+ * emit a fixed-size CBV named __ubo<N> per binding slot as before.
+ */
+ if (s->info.stage == MESA_SHADER_KERNEL) {
+ nir_foreach_variable_with_modes(var, s, nir_var_mem_ubo) {
+ if (!emit_ubo_var(ctx, var))
+ return false;
+ }
+ } else {
+ for (int i = ctx->opts->ubo_binding_offset; i < s->info.num_ubos; ++i) {
+ char name[64];
+ snprintf(name, sizeof(name), "__ubo%d", i);
+ if (!emit_cbv(ctx, i, 16384 /*4096 vec4's*/, name))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Allocate the per-invocation scratch backing store used by
+ * load/store_scratch_dxil: a u32 array sized from s->scratch_size,
+ * created with alloca. No-op (success) when the shader uses no scratch.
+ */
+static bool
+emit_scratch(struct ntd_context *ctx, nir_shader *s)
+{
+ if (s->scratch_size) {
+ /*
+ * We always allocate an u32 array, no matter the actual variable types.
+ * According to the DXIL spec, the minimum load/store granularity is
+ * 32-bit, anything smaller requires using a read-extract/read-write-modify
+ * approach.
+ */
+ unsigned size = ALIGN_POT(s->scratch_size, sizeof(uint32_t));
+ const struct dxil_type *int32 = dxil_module_get_int_type(&ctx->mod, 32);
+ const struct dxil_value *array_length = dxil_module_get_int32_const(&ctx->mod, size / sizeof(uint32_t));
+ if (!int32 || !array_length)
+ return false;
+
+ const struct dxil_type *type = dxil_module_get_array_type(
+ &ctx->mod, int32, size / sizeof(uint32_t));
+ if (!type)
+ return false;
+
+ ctx->scratchvars = dxil_emit_alloca(&ctx->mod, type, int32, array_length, 4);
+ if (!ctx->scratchvars)
return false;
}
return true;
}
+/* The validator complains if we don't have ops that reference a global variable. */
+/* Scan every block of every function for DXIL shared-memory intrinsics so
+ * the groupshared global is only emitted when it is actually referenced.
+ * Returns true on the first matching intrinsic found.
+ */
+static bool
+shader_has_shared_ops(struct nir_shader *s)
+{
+ nir_foreach_function(func, s) {
+ if (!func->impl)
+ continue;
+ nir_foreach_block(block, func->impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_shared_dxil:
+ case nir_intrinsic_store_shared_dxil:
+ case nir_intrinsic_shared_atomic_add_dxil:
+ case nir_intrinsic_shared_atomic_and_dxil:
+ case nir_intrinsic_shared_atomic_comp_swap_dxil:
+ case nir_intrinsic_shared_atomic_exchange_dxil:
+ case nir_intrinsic_shared_atomic_imax_dxil:
+ case nir_intrinsic_shared_atomic_imin_dxil:
+ case nir_intrinsic_shared_atomic_or_dxil:
+ case nir_intrinsic_shared_atomic_umax_dxil:
+ case nir_intrinsic_shared_atomic_umin_dxil:
+ case nir_intrinsic_shared_atomic_xor_dxil:
+ return true;
+ default: break;
+ }
+ }
+ }
+ }
+ return false;
+}
+
static bool
-emit_module(struct ntd_context *ctx, nir_shader *s)
+emit_module(struct ntd_context *ctx, nir_shader *s, const struct nir_to_dxil_options *opts)
{
unsigned binding;
}
}
+ if (s->info.cs.shared_size && shader_has_shared_ops(s)) {
+ const struct dxil_type *type;
+ unsigned size;
+
+ /*
+ * We always allocate an u32 array, no matter the actual variable types.
+ * According to the DXIL spec, the minimum load/store granularity is
+ * 32-bit, anything smaller requires using a read-extract/read-write-modify
+ * approach. Non-atomic 64-bit accesses are allowed, but the
+ * GEP(cast(gvar, u64[] *), offset) and cast(GEP(gvar, offset), u64 *))
+ * sequences don't seem to be accepted by the DXIL validator when the
+ * pointer is in the groupshared address space, making the 32-bit -> 64-bit
+ * pointer cast impossible.
+ */
+ size = ALIGN_POT(s->info.cs.shared_size, sizeof(uint32_t));
+ type = dxil_module_get_array_type(&ctx->mod,
+ dxil_module_get_int_type(&ctx->mod, 32),
+ size / sizeof(uint32_t));
+ ctx->sharedvars = dxil_add_global_ptr_var(&ctx->mod, "shared", type,
+ DXIL_AS_GROUPSHARED,
+ ffs(sizeof(uint64_t)),
+ NULL);
+ }
+
+ if (!emit_scratch(ctx, s))
+ return false;
+
+ /* UAVs */
+ if (s->info.stage == MESA_SHADER_KERNEL) {
+ if (!emit_globals(ctx, s, opts->num_kernel_globals))
+ return false;
+
+ ctx->consts = _mesa_pointer_hash_table_create(ctx->ralloc_ctx);
+ if (!ctx->consts)
+ return false;
+ if (!emit_global_consts(ctx, s))
+ return false;
+ }
+
nir_foreach_variable_with_modes(var, s, nir_var_uniform) {
unsigned count = glsl_type_get_image_count(var->type);
if (var->data.mode == nir_var_uniform && count) {
return DXIL_GEOMETRY_SHADER;
case MESA_SHADER_FRAGMENT:
return DXIL_PIXEL_SHADER;
+ case MESA_SHADER_KERNEL:
case MESA_SHADER_COMPUTE:
return DXIL_COMPUTE_SHADER;
default:
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, dxil_nir_lower_x2b);
+ if (s->options->lower_int64_options)
+ NIR_PASS(progress, s, nir_lower_int64);
NIR_PASS(progress, s, nir_lower_alu);
NIR_PASS(progress, s, dxil_nir_lower_inot);
NIR_PASS(progress, s, nir_opt_constant_folding);
NIR_PASS(progress, s, nir_opt_undef);
+ NIR_PASS(progress, s, nir_lower_undef_to_zero);
NIR_PASS(progress, s, nir_opt_deref);
+ NIR_PASS(progress, s, dxil_nir_lower_upcast_phis, opts->lower_int16 ? 32 : 16);
+ NIR_PASS(progress, s, nir_lower_64bit_phis);
NIR_PASS_V(s, nir_lower_system_values);
} while (progress);
if (debug_dxil & DXIL_DEBUG_VERBOSE)
nir_print_shader(s, stderr);
- if (!emit_module(ctx, s)) {
+ if (!emit_module(ctx, s, opts)) {
debug_printf("D3D12: dxil_container_add_module failed\n");
retval = false;
goto out;
bool disable_math_refactoring;
unsigned ubo_binding_offset;
unsigned provoking_vertex;
+ unsigned num_kernel_globals;
};
bool
# IN THE SOFTWARE.
subdir('compiler')
+# The CLC (OpenCL C -> DXIL) compiler is gated on the microsoft-clc option
+# because it pulls in the clang/LLVM dependencies set up in the top-level
+# meson.build.
+if with_microsoft_clc
+ subdir('clc')
+endif
subdir('resource_state_manager')