From 37cde2c63483fcf99092af84c14bea7ba3adbb84 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 16 Sep 2022 16:24:17 +0200 Subject: [PATCH] tu: Rewrite inline uniform implementation Now we always push the inline uniforms, using an indirect CP_LOAD_STATE. There is just enough space to be able to always push them if we expose the minimum possible per-stage limits. This aligns us with Qualcomm and removes the need to setup a uniform descriptor which will be problematic. Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 50 ++++++++--- src/freedreno/vulkan/tu_common.h | 6 ++ src/freedreno/vulkan/tu_descriptor_set.c | 58 +++++-------- src/freedreno/vulkan/tu_device.c | 19 ++--- src/freedreno/vulkan/tu_shader.c | 137 +++++++++++++++++++++++++++---- src/freedreno/vulkan/tu_shader.h | 15 ++++ 6 files changed, 206 insertions(+), 79 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index f5f53cb..f10f4e7 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -2091,6 +2091,9 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, descriptors_state->sets[idx] = set; + if (set->layout->has_inline_uniforms) + cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; + if (!set->layout->dynamic_offset_size) continue; @@ -4452,6 +4455,8 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline, dwords += 4 + num_units; } + dwords += 4 * link->tu_const_state.num_inline_ubos; + return dwords; } @@ -4459,6 +4464,7 @@ static void tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, gl_shader_stage type, + struct tu_descriptor_state *descriptors, uint32_t *push_constants) { const struct tu_program_descriptor_linkage *link = @@ -4480,6 +4486,20 @@ tu6_emit_user_consts(struct tu_cs *cs, for (unsigned i = 0; i < num_units; i++) tu_cs_emit(cs, push_constants[i + offset]); } + + /* Emit loads of inline uniforms. These load directly from the uniform's + * storage space inside the descriptor set. 
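+    * Each load is an indirect CP_LOAD_STATE6 (SS6_INDIRECT): rather than
+    * embedding the data in the command stream, it points at the descriptor
+    * set's GPU VA plus the binding offset, and the CP reads the uniform
+    * data from the set at draw time.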
+ */ + for (unsigned i = 0; i < link->tu_const_state.num_inline_ubos; i++) { + const struct tu_inline_ubo *ubo = &link->tu_const_state.ubos[i]; + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(ubo->size_vec4)); + tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset); + } } static void @@ -4518,14 +4538,14 @@ tu6_const_size(struct tu_cmd_buffer *cmd, uint32_t dwords = 0; if (pipeline->shared_consts.dwords > 0) { - dwords = pipeline->shared_consts.dwords + 4; + dwords += pipeline->shared_consts.dwords + 4; + } + + if (compute) { + dwords += tu6_user_consts_size(pipeline, MESA_SHADER_COMPUTE); } else { - if (compute) { - dwords = tu6_user_consts_size(pipeline, MESA_SHADER_COMPUTE); - } else { - for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) - dwords += tu6_user_consts_size(pipeline, type); - } + for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) + dwords += tu6_user_consts_size(pipeline, type); } return dwords; @@ -4554,13 +4574,17 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd, &pipeline->program.link[i]; assert(!link->tu_const_state.push_consts.dwords); } + } + + if (compute) { + tu6_emit_user_consts(&cs, pipeline, MESA_SHADER_COMPUTE, + tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE), + cmd->push_constants); } else { - if (compute) { - tu6_emit_user_consts(&cs, pipeline, MESA_SHADER_COMPUTE, cmd->push_constants); - } else { - for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) - tu6_emit_user_consts(&cs, pipeline, type, cmd->push_constants); - } + struct tu_descriptor_state *descriptors = + tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); + for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) + tu6_emit_user_consts(&cs, pipeline, type, descriptors, cmd->push_constants); } return tu_cs_end_draw_state(&cmd->sub_cs, &cs); diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h index 6aad2c7..136760e 100644 --- a/src/freedreno/vulkan/tu_common.h +++ b/src/freedreno/vulkan/tu_common.h @@ -109,6 +109,12 @@ */ #define MAX_UNIFORM_BUFFER_RANGE 0x10000 +/* Use the minimum maximum to guarantee that it can always fit in the safe + * const file size, even with maximum push constant usage and driver params. 
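+ *
+ * Worst case that is MAX_INLINE_UBOS * MAX_INLINE_UBO_RANGE = 4 * 256
+ * bytes, i.e. 64 vec4s of const space per stage.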
+ */ +#define MAX_INLINE_UBO_RANGE 256 +#define MAX_INLINE_UBOS 4 + #define A6XX_TEX_CONST_DWORDS 16 #define A6XX_TEX_SAMP_DWORDS 4 diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c index 14d8b4b..5297df6 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.c +++ b/src/freedreno/vulkan/tu_descriptor_set.c @@ -66,8 +66,7 @@ descriptor_size(struct tu_device *dev, return A6XX_TEX_CONST_DWORDS * 4; } case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: - return A6XX_TEX_CONST_DWORDS * 4 + - ALIGN(binding->descriptorCount, A6XX_TEX_CONST_DWORDS * 4); + return binding->descriptorCount; default: return A6XX_TEX_CONST_DWORDS * 4; } @@ -557,8 +556,8 @@ tu_descriptor_set_create(struct tu_device *device, struct tu_descriptor_set_binding_layout *binding = &layout->binding[layout->binding_count - 1]; if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - layout_size = binding->offset + A6XX_TEX_CONST_DWORDS * 4 + - ALIGN(variable_count, A6XX_TEX_CONST_DWORDS * 4); + layout_size = binding->offset + + ALIGN(variable_count, 4 * A6XX_TEX_CONST_DWORDS); } else { uint32_t stride = binding->size; layout_size = binding->offset + variable_count * stride; @@ -635,24 +634,6 @@ tu_descriptor_set_create(struct tu_device *device, } } - if (layout->has_inline_uniforms) { - for (unsigned i = 0; i < layout->binding_count; i++) { - if (layout->binding[i].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) - continue; - - uint32_t *ptr = set->mapped_ptr + layout->binding[i].offset / 4; - uint64_t va = set->va + layout->binding[i].offset + - A6XX_TEX_CONST_DWORDS * 4; - uint32_t size = - (layout->has_variable_descriptors && i == layout->binding_count - 1) ? - variable_count : layout->binding[i].size - A6XX_TEX_CONST_DWORDS * 4; - size = ALIGN_POT(size, 16) / 16; - - ptr[0] = A6XX_UBO_0_BASE_LO(va); - ptr[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(size); - } - } - vk_descriptor_set_layout_ref(&layout->vk); list_addtail(&set->pool_link, &pool->desc_sets); @@ -705,12 +686,12 @@ tu_CreateDescriptorPool(VkDevice _device, DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO); if (inline_info) { - /* In addition to the size of the descriptors, we have to factor in the - * padding for each binding. The sizes are 4 aligned but we have to - * align to a descriptor size, and in the worst case each inline - * binding has a size of 4 bytes and we have to pad each one out. + /* We have to factor in the padding for each binding. The sizes are 4 + * aligned but we have to align to 4 * A6XX_TEX_CONST_DWORDS bytes, and in + * the worst case each inline binding has a size of 4 bytes and we have + * to pad each one out. */ - bo_size += (2 * 4 * A6XX_TEX_CONST_DWORDS - 4) * + bo_size += (4 * A6XX_TEX_CONST_DWORDS - 4) * inline_info->maxInlineUniformBlockBindings; } @@ -1086,8 +1067,8 @@ tu_update_descriptor_sets(const struct tu_device *device, * rule. * * This means we can't just do a straight memcpy, because due to - * alignment padding and the descriptor itself there are gaps between - * sequential bindings. We have to loop over each binding updated. + * alignment padding there are gaps between sequential bindings. We + * have to loop over each binding updated. 
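+ * The inline data now lives directly at the binding's offset (there is
+ * no UBO descriptor in front of it), but binding offsets are still
+ * aligned to 4 * A6XX_TEX_CONST_DWORDS bytes, hence the gaps.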
*/ const VkWriteDescriptorSetInlineUniformBlock *inline_write = vk_find_struct_const(writeset->pNext, @@ -1096,9 +1077,8 @@ tu_update_descriptor_sets(const struct tu_device *device, const uint8_t *src = inline_write->pData; uint32_t dst_offset = writeset->dstArrayElement; do { - uint8_t *dst = (uint8_t *)(ptr + A6XX_TEX_CONST_DWORDS) + dst_offset; - uint32_t binding_size = - binding_layout->size - A6XX_TEX_CONST_DWORDS * 4 - dst_offset; + uint8_t *dst = (uint8_t *)(ptr) + dst_offset; + uint32_t binding_size = binding_layout->size - dst_offset; uint32_t to_write = MIN2(remaining, binding_size); memcpy(dst, src, to_write); @@ -1188,12 +1168,12 @@ tu_update_descriptor_sets(const struct tu_device *device, uint32_t remaining = copyset->descriptorCount; uint32_t src_start = copyset->srcArrayElement; uint32_t dst_start = copyset->dstArrayElement; - uint8_t *src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS) + src_start; - uint8_t *dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS) + dst_start; + uint8_t *src = (uint8_t *)(src_ptr) + src_start; + uint8_t *dst = (uint8_t *)(dst_ptr) + dst_start; uint32_t src_remaining = - src_binding_layout->size - src_start - 4 * A6XX_TEX_CONST_DWORDS; + src_binding_layout->size - src_start; uint32_t dst_remaining = - dst_binding_layout->size - dst_start - 4 * A6XX_TEX_CONST_DWORDS; + dst_binding_layout->size - dst_start; do { uint32_t to_write = MIN3(remaining, src_remaining, dst_remaining); memcpy(dst, src, to_write); @@ -1294,7 +1274,7 @@ tu_CreateDescriptorUpdateTemplate( set_layout->binding + entry->dstBinding; uint32_t dst_start = entry->dstArrayElement; do { - uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4; + uint32_t size = binding_layout->size; uint32_t count = MIN2(remaining, size - dst_start); remaining -= count; binding_layout++; @@ -1343,8 +1323,8 @@ tu_CreateDescriptorUpdateTemplate( /* See comment in update_descriptor_sets() */ do { dst_offset = - binding_layout->offset + A6XX_TEX_CONST_DWORDS * 4 + dst_start; - uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4; + binding_layout->offset + dst_start; + uint32_t size = binding_layout->size; uint32_t count = MIN2(remaining, size - dst_start); templ->entry[j++] = (struct tu_descriptor_update_template_entry) { .descriptor_type = entry->descriptorType, diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index cb3034f..fd66bd7 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1097,19 +1097,12 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice, p->maxComputeWorkgroupSubgroups = 16; /* max_waves */ p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL; - /* Inline uniform buffers are just normal UBOs */ - p->maxInlineUniformBlockSize = MAX_UNIFORM_BUFFER_RANGE; - - /* Halve the normal limit on the number of descriptors, see below. */ - p->maxPerStageDescriptorInlineUniformBlocks = max_descriptor_set_size / 2; - p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2; - p->maxDescriptorSetInlineUniformBlocks = max_descriptor_set_size / 2; - p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2; - /* Because we halve the normal limit on the number of descriptors, in the - * worst case each descriptor takes up half the space, leaving the rest for - * the actual data. 
- */ - p->maxInlineUniformTotalSize = MAX_SET_SIZE / 2; + p->maxInlineUniformBlockSize = MAX_INLINE_UBO_RANGE; + p->maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UBOS; + p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UBOS; + p->maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UBOS; + p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UBOS; + p->maxInlineUniformTotalSize = MAX_INLINE_UBOS * MAX_INLINE_UBO_RANGE; p->integerDotProduct8BitUnsignedAccelerated = false; p->integerDotProduct8BitSignedAccelerated = false; diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c index e485f8f..0dc9219 100644 --- a/src/freedreno/vulkan/tu_shader.c +++ b/src/freedreno/vulkan/tu_shader.c @@ -178,6 +178,9 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, &set_layout->binding[binding]; nir_ssa_def *base; + if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + return; + shader->active_desc_sets |= 1u << set; switch (binding_layout->type) { @@ -205,16 +208,9 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, break; } - nir_ssa_def *shift; - - if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - /* Inline uniform blocks cannot have arrays so the stride is unused */ - shift = nir_imm_int(b, 0); - } else { - unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS); - assert(util_is_power_of_two_nonzero(stride)); - shift = nir_imm_int(b, util_logbase2(stride)); - } + unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS); + assert(util_is_power_of_two_nonzero(stride)); + nir_ssa_def *shift = nir_imm_int(b, util_logbase2(stride)); nir_ssa_def *def = nir_vec3(b, nir_imm_int(b, set), nir_iadd(b, base, @@ -608,6 +604,70 @@ lower_instr(nir_builder *b, nir_instr *instr, void *cb_data) } } +/* Since we always push inline uniforms into constant memory, lower loads of + * them to load_uniform which turns into constant memory loads. + */ +static bool +lower_inline_ubo(nir_builder *b, nir_instr *instr, void *cb_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_ubo) + return false; + + struct lower_instr_params *params = cb_data; + struct tu_shader *shader = params->shader; + const struct tu_pipeline_layout *layout = params->layout; + + nir_binding binding = nir_chase_binding(intrin->src[0]); + + if (!binding.success) + return false; + + struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout; + struct tu_descriptor_set_binding_layout *binding_layout = + &set_layout->binding[binding.binding]; + + if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + return false; + + /* lookup the const offset of the inline UBO */ + struct tu_const_state *const_state = &shader->const_state; + + unsigned base = UINT_MAX; + for (unsigned i = 0; i < const_state->num_inline_ubos; i++) { + if (const_state->ubos[i].base == binding.desc_set && + const_state->ubos[i].offset == binding_layout->offset) { + base = const_state->ubos[i].const_offset_vec4 * 4; + break; + } + } + + if (base == UINT_MAX) { + /* Assume we're loading out-of-bounds from a 0-sized inline uniform + * filtered out below. 
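+ * Such a block gets no const space reserved in tu_lower_io, so there
+ * is nothing to load; rewrite the result to an undef instead.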
+ */ + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_ssa_undef(b, intrin->num_components, + intrin->dest.ssa.bit_size)); + return true; + } + + nir_ssa_def *offset = intrin->src[1].ssa; + + b->cursor = nir_before_instr(instr); + nir_ssa_def *uniform = + nir_load_uniform(b, intrin->num_components, + intrin->dest.ssa.bit_size, + nir_ishr_imm(b, offset, 2), .base = base); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, uniform); + nir_instr_remove(instr); + return true; +} + /* Figure out the range of push constants that we're actually going to push to * the shader, and tell the backend to reserve this range when pushing UBO * constants. @@ -678,6 +738,47 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, const_state->dynamic_offset_loc = UINT32_MAX; } + /* Reserve space for inline uniforms, so we can always load them from + * constants and not setup a UBO descriptor for them. + */ + for (unsigned set = 0; set < layout->num_sets; set++) { + const struct tu_descriptor_set_layout *desc_layout = + layout->set[set].layout; + + if (!desc_layout || !desc_layout->has_inline_uniforms) + continue; + + for (unsigned b = 0; b < desc_layout->binding_count; b++) { + const struct tu_descriptor_set_binding_layout *binding = + &desc_layout->binding[b]; + + if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + continue; + if (!(binding->shader_stages & + mesa_to_vk_shader_stage(shader->info.stage))) + continue; + + /* Workaround a CTS bug by ignoring zero-sized inline uniform + * blocks that aren't being properly filtered out when creating the + * descriptor set layout, see + * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4115 + */ + if (binding->size == 0) + continue; + + assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos)); + unsigned size_vec4 = DIV_ROUND_UP(binding->size, 16); + const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) { + .base = set, + .offset = binding->offset, + .const_offset_vec4 = reserved_consts_vec4, + .size_vec4 = size_vec4, + }; + + reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit); + } + } + tu_shader->reserved_user_consts_vec4 = reserved_consts_vec4; struct lower_instr_params params = { @@ -686,10 +787,18 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, .layout = layout, }; - bool progress = nir_shader_instructions_pass(shader, - lower_instr, - nir_metadata_none, - ¶ms); + bool progress = false; + if (const_state->num_inline_ubos) { + progress |= nir_shader_instructions_pass(shader, + lower_inline_ubo, + nir_metadata_none, + ¶ms); + } + + progress |= nir_shader_instructions_pass(shader, + lower_instr, + nir_metadata_none, + ¶ms); /* Remove now-unused variables so that when we gather the shader info later * they won't be counted. diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h index 69e0a20..a4015cf 100644 --- a/src/freedreno/vulkan/tu_shader.h +++ b/src/freedreno/vulkan/tu_shader.h @@ -12,6 +12,19 @@ #include "tu_common.h" +struct tu_inline_ubo +{ + /* Push the data at BINDLESS_BASE[base] + offset */ + unsigned base; + unsigned offset; + + /* Push it to this location in the const file, in vec4s */ + unsigned const_offset_vec4; + + /* How much to push */ + unsigned size_vec4; +}; + struct tu_push_constant_range { uint32_t lo; @@ -22,6 +35,8 @@ struct tu_const_state { struct tu_push_constant_range push_consts; uint32_t dynamic_offset_loc; + unsigned num_inline_ubos; + struct tu_inline_ubo ubos[MAX_INLINE_UBOS]; }; struct tu_shader -- 2.7.4