   if (nir_intrinsic_align(intr) < store_bit_size / 8) {
      nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, offset_mask)),
                                    nir_imm_int(b, 8));
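+      /* The value was loaded from an aligned offset, so shift the bytes we actually want down to bit 0. */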
-      result = nir_ushr(b, result, nir_u2uN(b, shift, store_bit_size));
+      result = nir_ushr(b, result, shift);
   }
   /* And now comes the pack/unpack step to match the original type. */
   unsigned src_bit_size = val->bit_size;
   unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32);
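+   /* Masked stores always operate on full 32bit words. */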
+   unsigned masked_store_bit_size = 32;
   unsigned num_components = val->num_components;
   unsigned num_bits = num_components * src_bit_size;
   unsigned offset_mask = store_bit_size / 8 - 1;
+   unsigned masked_store_offset_mask = masked_store_bit_size / 8 - 1;
   nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~offset_mask));
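+   /* Store offset aligned down to the 32bit word a masked store would touch. */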
+   nir_ssa_def *masked_offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~masked_store_offset_mask));
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned comp_idx = 0;
         ++num_src_comps_stored;
         substore_num_bits += src_bit_size;
      }
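+      /* Whether to emit a single 32bit masked store for this chunk instead of splitting it in two. */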
+      bool force_masked = false;
      if (substore_num_bits > store_bit_size &&
          substore_num_bits % store_bit_size != 0) {
         /* Split this into two, one unmasked store of the first bits,
          * and then the second loop iteration will handle a masked store
          * for the rest. */
         assert(num_src_comps_stored == 3);
-         --num_src_comps_stored;
-         substore_num_bits = store_bit_size;
+         if (store_bit_size == 16) {
+            assert(substore_num_bits < 32);
+            /* If we're already doing atomics to store, just do one
+             * 32bit masked store instead of a 16bit store and a masked
+             * store for the other 8 bits. */
+            force_masked = true;
+         } else {
+            --num_src_comps_stored;
+            substore_num_bits = store_bit_size;
+         }
      }
-      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
-      nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
-                                                 num_src_comps_stored, store_bit_size);
      nir_intrinsic_instr *store;
-      if (substore_num_bits < store_bit_size) {
-         nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, store_bit_size);
+      if (substore_num_bits < store_bit_size || force_masked) {
+         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
+                                                    num_src_comps_stored, masked_store_bit_size);
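+         /* Mask of the bits this chunk actually writes; the masked store preserves everything outside it. */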
+         nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, masked_store_bit_size);
         /* If we have small alignments we need to place them correctly in the component. */
-         if (nir_intrinsic_align(intr) <= store_bit_size / 8) {
-            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, offset_mask));
-            nir_ssa_def *shift = nir_u2uN(b, nir_imul_imm(b, pos, 8), store_bit_size);
+         if (nir_intrinsic_align(intr) <= masked_store_bit_size / 8) {
+            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, masked_store_offset_mask));
+            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
            store_vec = nir_ishl(b, store_vec, shift);
            mask = nir_ishl(b, mask, shift);
         }
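+         /* Where this chunk's masked store lands: the 32bit-aligned base plus the chunk's byte offset. */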
+         nir_ssa_def *local_offset = nir_iadd(b, masked_offset, nir_imm_int(b, bit_offset / 8));
         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo_masked_dxil);
         store->src[0] = nir_src_for_ssa(store_vec);
         store->src[2] = nir_src_for_ssa(buffer);
         store->src[3] = nir_src_for_ssa(local_offset);
      } else {
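+         /* The destination words are fully overwritten by the source data, so a plain store_ssbo is enough. */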
+         nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
+         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
+                                                    num_src_comps_stored, store_bit_size);
         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo);
         store->src[0] = nir_src_for_ssa(store_vec);