radv,aco: use pipe_format for dynamic vertex input state
author Rhys Perry <pendingchaos02@gmail.com>
Wed, 3 Aug 2022 13:38:16 +0000 (14:38 +0100)
committer Marge Bot <emma+marge@anholt.net>
Tue, 30 Aug 2022 19:02:11 +0000 (19:02 +0000)
Also prepare for 64-bit and R8G8B8/R16G16B16 with the addition of
radv_vs_input_state::nontrivial_formats.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5021
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17894>

src/amd/compiler/aco_instruction_selection.cpp
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_shader.h

index 2547833..135e63a 100644 (file)
@@ -12221,6 +12221,9 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
       num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
    unsigned num_sgprs = 0;
 
+   const struct ac_vtx_format_info* vtx_info_table =
+      ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
+
    for (unsigned loc = 0; loc < key->num_attributes;) {
       unsigned num_descs =
          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc);
@@ -12285,9 +12288,12 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
          /* perform load */
          PhysReg cur_desc = desc.advance(i * 16);
          if ((key->misaligned_mask & (1u << loc))) {
-            unsigned dfmt = key->state.formats[loc] & 0xf;
-            unsigned nfmt = key->state.formats[loc] >> 4;
-            const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
+            const struct ac_vtx_format_info* vtx_info = &vtx_info_table[key->state.formats[loc]];
+
+            assert(vtx_info->has_hw_format & 0x1);
+            unsigned dfmt = vtx_info->hw_format[0] & 0xf;
+            unsigned nfmt = vtx_info->hw_format[0] >> 4;
+
             for (unsigned j = 0; j < vtx_info->num_channels; j++) {
                bool post_shuffle = key->state.post_shuffle & (1u << loc);
                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
@@ -12297,14 +12303,14 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
                 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
                 * care).
                 */
-               if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32)
+               if (dfmt == V_008F0C_BUF_DATA_FORMAT_32)
                   bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
                             false, true);
                else
                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
-                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u),
-                            vtx_info->chan_format, nfmt, offset, false, true);
+                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt,
+                            offset, false, true);
             }
             uint32_t one =
                nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
index d3dc79e..b4dd00f 100644 (file)
@@ -308,8 +308,6 @@ radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
 {
    list_del(&cmd_buffer->pool_link);
 
-   util_dynarray_fini(&cmd_buffer->cached_vertex_formats);
-
    list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
    {
       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
@@ -380,8 +378,6 @@ radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
    vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
                        VK_OBJECT_TYPE_DESCRIPTOR_SET);
 
-   util_dynarray_init(&cmd_buffer->cached_vertex_formats, NULL);
-
    for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
       vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
                           VK_OBJECT_TYPE_DESCRIPTOR_SET);
@@ -2964,6 +2960,7 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shad
       cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
       cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
    }
+   misaligned_mask |= state->nontrivial_formats;
 
    /* try to use a pre-compiled prolog first */
    struct radv_shader_part *prolog = NULL;
@@ -3540,39 +3537,6 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
    cmd_buffer->push_constant_stages |= dirty_stages;
 }
 
-enum radv_dst_sel {
-   DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
-                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
-   DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
-                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
-   DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
-   DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
-   DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
-   DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
-};
-
-static const uint32_t data_format_dst_sel[] = {
-   [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
-   [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
-   [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
-   [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
-   [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
-   [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
-   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
-   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
-   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
-   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
-   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
-   [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
-   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
-   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
-   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
-};
-
 void
 radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
                               const struct radv_graphics_pipeline *pipeline,
@@ -3580,6 +3544,7 @@ radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
 {
    struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
+   enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
    unsigned desc_index = 0;
    uint32_t mask = pipeline->vb_desc_usage_mask;
    uint64_t va;
@@ -3587,6 +3552,9 @@ radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
       vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
    assert(!vs_state || pipeline->use_per_attribute_vb_descs);
 
+   const struct ac_vtx_format_info *vtx_info_table =
+      vs_state ? ac_get_vtx_format_info_table(chip, family) : NULL;
+
    while (mask) {
       unsigned i = u_bit_scan(&mask);
       uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
@@ -3598,23 +3566,25 @@ radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
       unsigned num_records;
       unsigned stride;
 
-      if (vs_state) {
-         unsigned format = vs_state->formats[i];
-         unsigned dfmt = format & 0xf;
-         unsigned nfmt = (format >> 4) & 0x7;
-
-         rsrc_word3 = vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
+      if (vs_state && !(vs_state->nontrivial_formats & BITFIELD_BIT(i))) {
+         const struct ac_vtx_format_info *vtx_info = &vtx_info_table[vs_state->formats[i]];
+         unsigned hw_format = vtx_info->hw_format[vtx_info->num_channels - 1];
 
-         if (chip >= GFX10)
-            rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
-         else
-            rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
+         if (chip >= GFX10) {
+            rsrc_word3 = vtx_info->dst_sel | S_008F0C_FORMAT(hw_format);
+         } else {
+            rsrc_word3 = vtx_info->dst_sel | S_008F0C_NUM_FORMAT((hw_format >> 4) & 0x7) |
+                         S_008F0C_DATA_FORMAT(hw_format & 0xf);
+         }
       } else {
+         rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                      S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
          if (chip >= GFX10)
-            rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
+            rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
          else
-            rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
-                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+            rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
       }
 
       if (pipeline->dynamic_states & (RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
@@ -5934,6 +5904,9 @@ radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingD
    state->bindings_match_attrib = true;
 
    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
+   enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
+   const struct ac_vtx_format_info *vtx_info_table = ac_get_vtx_format_info_table(chip, family);
+
    for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
       const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
       const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
@@ -5955,50 +5928,27 @@ radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingD
       cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
       state->offsets[loc] = attrib->offset;
 
-      struct dynamic_vertex_format_cache *found = NULL;
-      util_dynarray_foreach(&cmd_buffer->cached_vertex_formats,
-                            struct dynamic_vertex_format_cache,
-                            vf) {
-         if (vf->format == attrib->format) {
-            found = vf;
-            break;
-         }
-      }
-      if (!found) {
-         unsigned nfmt, dfmt;
-         bool post_shuffle;
-         enum ac_vs_input_alpha_adjust alpha_adjust;
-         const struct util_format_description *format_desc = vk_format_description(attrib->format);
-
-         found = util_dynarray_grow(&cmd_buffer->cached_vertex_formats,
-                                    struct dynamic_vertex_format_cache, 1);
-         radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
-                                      &dfmt, &nfmt, &post_shuffle, &alpha_adjust);
-         found->format = attrib->format;
-         found->hw_fmt = dfmt | (nfmt << 4);
-         const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
-            (format_desc->block.bits / 8u - 1);
-         found->fmt_align_req_minus_1 = format_align_req_minus_1;
-         found->fmt_size = format_desc->block.bits / 8u;
-         found->post_shuffle = post_shuffle;
-         found->alpha_adjust_lo = alpha_adjust & 0x1;
-         found->alpha_adjust_hi = (alpha_adjust >> 1) & 0x1;
-      }
-
-      state->formats[loc] = found->hw_fmt;
-      state->format_align_req_minus_1[loc] = found->fmt_align_req_minus_1;
-      state->format_sizes[loc] = found->fmt_size;
-      state->alpha_adjust_lo |= found->alpha_adjust_lo << loc;
-      state->alpha_adjust_hi |= found->alpha_adjust_hi << loc;
-      if (found->post_shuffle)
-         state->post_shuffle |= 1u << loc;
+      enum pipe_format format = vk_format_to_pipe_format(attrib->format);
+      const struct ac_vtx_format_info *vtx_info = &vtx_info_table[format];
+
+      state->formats[loc] = format;
+      uint8_t align_req_minus_1 = vtx_info->chan_byte_size >= 4 ? 3 : (vtx_info->element_size - 1);
+      state->format_align_req_minus_1[loc] = align_req_minus_1;
+      state->format_sizes[loc] = vtx_info->element_size;
+      state->alpha_adjust_lo |= (vtx_info->alpha_adjust & 0x1) << loc;
+      state->alpha_adjust_hi |= (vtx_info->alpha_adjust >> 1) << loc;
+      if (G_008F0C_DST_SEL_X(vtx_info->dst_sel) == V_008F0C_SQ_SEL_Z)
+         state->post_shuffle |= BITFIELD_BIT(loc);
+
+      if (!(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)))
+         state->nontrivial_formats |= BITFIELD_BIT(loc);
 
       if ((chip == GFX6 || chip >= GFX10) &&
           cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
-         if (binding->stride & found->fmt_align_req_minus_1) {
+         if (binding->stride & align_req_minus_1) {
             cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
          } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + state->offsets[loc]) &
-                    found->fmt_align_req_minus_1) {
+                    align_req_minus_1) {
             cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
          }
       }
index 10a7986..043022b 100644 (file)
@@ -1568,16 +1568,6 @@ enum radv_cmd_buffer_status {
    RADV_CMD_BUFFER_STATUS_PENDING,
 };
 
-struct dynamic_vertex_format_cache {
-   VkFormat format;
-   uint8_t hw_fmt;
-   uint8_t fmt_align_req_minus_1;
-   uint8_t fmt_size;
-   bool post_shuffle;
-   bool alpha_adjust_lo;
-   bool alpha_adjust_hi;
-};
-
 struct radv_cmd_buffer {
    struct vk_command_buffer vk;
 
@@ -1586,7 +1576,6 @@ struct radv_cmd_buffer {
    struct radv_cmd_pool *pool;
    struct list_head pool_link;
 
-   struct util_dynarray cached_vertex_formats;
    VkCommandBufferUsageFlags usage_flags;
    enum radv_cmd_buffer_status status;
    struct radeon_cmdbuf *cs;
index 2b2d218..137048e 100644 (file)
@@ -374,6 +374,7 @@ struct radv_vs_input_state {
     */
    uint32_t alpha_adjust_lo;
    uint32_t alpha_adjust_hi;
+   uint32_t nontrivial_formats;
 
    uint8_t bindings[MAX_VERTEX_ATTRIBS];
    uint32_t divisors[MAX_VERTEX_ATTRIBS];