From 477332a347799e08d884f8fcc31816291575a955 Mon Sep 17 00:00:00 2001
From: Jesse Natalie
Date: Thu, 23 Mar 2023 08:30:58 -0700
Subject: [PATCH] microsoft/compiler: Fix 8-bit loads and stores when
 supporting 16-bit DXIL

Shifts should always use 32-bit shift values, and when lowering to
masked stores, we need to use 32-bit atomics. That means we should also
treat 24-bit stores as a single masked op rather than one 16-bit
unmasked store and one 8-bit masked store.

Part-of:
---
 src/microsoft/compiler/dxil_nir.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c
index c7acafa..b671bc2 100644
--- a/src/microsoft/compiler/dxil_nir.c
+++ b/src/microsoft/compiler/dxil_nir.c
@@ -257,7 +257,7 @@ lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size
    if (nir_intrinsic_align(intr) < store_bit_size / 8) {
       nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, offset_mask)),
                                     nir_imm_int(b, 8));
-      result = nir_ushr(b, result, nir_u2uN(b, shift, store_bit_size));
+      result = nir_ushr(b, result, shift);
    }
 
    /* And now comes the pack/unpack step to match the original type. */
@@ -287,11 +287,14 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_siz
 
    unsigned src_bit_size = val->bit_size;
    unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32);
+   unsigned masked_store_bit_size = 32;
    unsigned num_components = val->num_components;
    unsigned num_bits = num_components * src_bit_size;
 
    unsigned offset_mask = store_bit_size / 8 - 1;
+   unsigned masked_store_offset_mask = masked_store_bit_size / 8 - 1;
    nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~offset_mask));
+   nir_ssa_def *masked_offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~masked_store_offset_mask));
    nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 };
    unsigned comp_idx = 0;
 
@@ -325,32 +328,41 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_siz
          ++num_src_comps_stored;
          substore_num_bits += src_bit_size;
       }
+      bool force_masked = false;
       if (substore_num_bits > store_bit_size &&
           substore_num_bits % store_bit_size != 0) {
          /* Split this into two, one unmasked store of the first bits,
           * and then the second loop iteration will handle a masked store
           * for the rest. */
          assert(num_src_comps_stored == 3);
-         --num_src_comps_stored;
-         substore_num_bits = store_bit_size;
+         if (store_bit_size == 16) {
+            assert(substore_num_bits < 32);
+            /* If we're already doing atomics to store, just do one
+             * 32bit masked store instead of a 16bit store and a masked
+             * store for the other 8 bits. */
+            force_masked = true;
+         } else {
+            --num_src_comps_stored;
+            substore_num_bits = store_bit_size;
+         }
       }
 
-      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
-      nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
-                                                 num_src_comps_stored, store_bit_size);
       nir_intrinsic_instr *store;
-      if (substore_num_bits < store_bit_size) {
-         nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, store_bit_size);
+      if (substore_num_bits < store_bit_size || force_masked) {
+         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
+                                                    num_src_comps_stored, masked_store_bit_size);
+         nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, masked_store_bit_size);
 
          /* If we have small alignments we need to place them correctly in the component.
           */
-         if (nir_intrinsic_align(intr) <= store_bit_size / 8) {
-            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, offset_mask));
-            nir_ssa_def *shift = nir_u2uN(b, nir_imul_imm(b, pos, 8), store_bit_size);
+         if (nir_intrinsic_align(intr) <= masked_store_bit_size / 8) {
+            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, masked_store_offset_mask));
+            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
             store_vec = nir_ishl(b, store_vec, shift);
             mask = nir_ishl(b, mask, shift);
          }
 
+         nir_ssa_def *local_offset = nir_iadd(b, masked_offset, nir_imm_int(b, bit_offset / 8));
          store = nir_intrinsic_instr_create(b->shader,
                                             nir_intrinsic_store_ssbo_masked_dxil);
          store->src[0] = nir_src_for_ssa(store_vec);
@@ -358,6 +370,9 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_siz
          store->src[2] = nir_src_for_ssa(buffer);
          store->src[3] = nir_src_for_ssa(local_offset);
       } else {
+         nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
+         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
+                                                    num_src_comps_stored, store_bit_size);
          store = nir_intrinsic_instr_create(b->shader,
                                             nir_intrinsic_store_ssbo);
          store->src[0] = nir_src_for_ssa(store_vec);
-- 
2.7.4
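
For readers who want to check the bit math the lowered code ends up doing, here is a
small CPU-side sketch. It is not part of the patch or of Mesa: the function and
variable names (emulate_masked_store_u8, emulate_unaligned_load_u8, buf) are invented
for illustration, and a plain C read-modify-write stands in for the 32-bit atomics
that the masked-store intrinsic is emitted as on the GPU (typically an atomic AND
with the inverted mask followed by an atomic OR).

    /* Illustrative sketch only -- not Mesa code.  Models, on the CPU, the
     * arithmetic the lowered NIR performs: an 8-bit store becomes a masked
     * merge into the containing 32-bit word, and an 8-bit load becomes a
     * 32-bit load followed by a 32-bit right shift. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* byte_offset / 4 picks the containing dword; (byte_offset & 3) * 8 is the
     * 32-bit shift amount; 0xff << shift is the write mask.  A plain
     * read-modify-write stands in for the atomic AND/OR the GPU would use. */
    static void emulate_masked_store_u8(uint32_t *buf, unsigned byte_offset, uint8_t val)
    {
       unsigned word  = byte_offset / 4;
       uint32_t shift = (byte_offset & 3) * 8;
       uint32_t mask  = 0xffu << shift;

       buf[word] = (buf[word] & ~mask) | ((uint32_t)val << shift);
    }

    /* Mirrors the lower_load_ssbo change: the shift stays a 32-bit value
     * rather than being converted to the (possibly 16-bit) load size. */
    static uint8_t emulate_unaligned_load_u8(const uint32_t *buf, unsigned byte_offset)
    {
       uint32_t shift = (byte_offset & 3) * 8;
       return (uint8_t)(buf[byte_offset / 4] >> shift);
    }

    int main(void)
    {
       uint32_t buf[2] = { 0x11223344u, 0x55667788u };

       /* Byte offset 6 lands in buf[1]; shift = (6 & 3) * 8 = 16. */
       emulate_masked_store_u8(buf, 6, 0xab);
       assert(buf[1] == 0x55ab7788u);
       assert(emulate_unaligned_load_u8(buf, 6) == 0xab);

       printf("buf[1] = 0x%08x\n", buf[1]);
       return 0;
    }

The shift is computed and applied as a 32-bit value throughout, which is what the
patch keeps instead of narrowing it to the store/load bit size, and the same merge
with a (1 << 24) - 1 mask is what a 24-bit store now becomes: one masked op rather
than a 16-bit unmasked store plus an 8-bit masked store.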