dwords += 4 + num_units;
}
- dwords += 4 * link->tu_const_state.num_inline_ubos;
+ dwords += 8 * link->tu_const_state.num_inline_ubos;
return dwords;
}
for (unsigned i = 0; i < link->tu_const_state.num_inline_ubos; i++) {
const struct tu_inline_ubo *ubo = &link->tu_const_state.ubos[i];
- /* With variable-count inline uniform blocks, we allocate constant space
- * based on the maximum size specified in the descriptor set layout, but
- * the actual size may be smaller. In that case we have to avoid
- * fetching out-of-bounds of the buffer that contains the descriptor
- * set to avoid faults.
- */
- unsigned num_units = MIN2(ubo->size_vec4,
- (descriptors->sets[ubo->base]->size - ubo->offset) / 16);
-
- if (num_units > 0) {
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(num_units));
+ tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), ubo->push_address ? 7 : 3);
+ tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) |
+ CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+ CP_LOAD_STATE6_0_STATE_SRC(ubo->push_address ? SS6_DIRECT : SS6_INDIRECT) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+ CP_LOAD_STATE6_0_NUM_UNIT(ubo->size_vec4));
+ if (ubo->push_address) {
+ tu_cs_emit(cs, 0);
+ tu_cs_emit(cs, 0);
+ tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset);
+ tu_cs_emit(cs, 0);
+ tu_cs_emit(cs, 0);
+ } else {
tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset);
}
}
struct tu_const_state *const_state = &shader->const_state;
unsigned base = UINT_MAX;
+ bool use_load = false;
for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
if (const_state->ubos[i].base == binding.desc_set &&
const_state->ubos[i].offset == binding_layout->offset) {
base = const_state->ubos[i].const_offset_vec4 * 4;
+ use_load = const_state->ubos[i].push_address;
break;
}
}
nir_ssa_def *offset = intrin->src[1].ssa;
b->cursor = nir_before_instr(instr);
- nir_ssa_def *uniform =
- nir_load_uniform(b, intrin->num_components,
- intrin->dest.ssa.bit_size,
- nir_ishr_imm(b, offset, 2), .base = base);
+ nir_ssa_def *val;
+
+ if (use_load) {
+ nir_ssa_def *base_addr =
+ nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
+ val = nir_load_global_ir3(b, intrin->num_components,
+ intrin->dest.ssa.bit_size,
+ base_addr, nir_ishr_imm(b, offset, 2));
+ } else {
+ val = nir_load_uniform(b, intrin->num_components,
+ intrin->dest.ssa.bit_size,
+ nir_ishr_imm(b, offset, 2), .base = base);
+ }
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, uniform);
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, val);
nir_instr_remove(instr);
return true;
}
if (binding->size == 0)
continue;
+ /* If we don't know the size at compile time due to a variable
+ * descriptor count, then with descriptor buffers we cannot know
+ * how much space the real inline uniform block occupies. In this
+ * case we fall back to pushing the address and using ldg, which is
+ * slower than setting up a descriptor. Building our own descriptor
+ * with descriptor_buffer would also be painful: it has to be done
+ * on the GPU, and it doesn't stop the UBO from being pushed anyway
+ * and faulting if an out-of-bounds access is hidden behind an if
+ * and not dynamically executed. Given the small maximum size, there
+ * shouldn't be much reason to use a variable size anyway.
+ */
+ bool push_address = desc_layout->has_variable_descriptors &&
+ b == desc_layout->binding_count - 1;
+
+ if (push_address) {
+ perf_debug(dev,
+ "falling back to ldg for variable-sized inline "
+ "uniform block");
+ }
+
assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
- unsigned size_vec4 = DIV_ROUND_UP(binding->size, 16);
+ unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) {
.base = set,
.offset = binding->offset,
.const_offset_vec4 = reserved_consts_vec4,
.size_vec4 = size_vec4,
+ .push_address = push_address,
};
reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);