vbo: restructure vbo_save_vertex_list to get more cache hits
author Marek Olšák <marek.olsak@amd.com>
Sat, 23 Oct 2021 05:19:23 +0000 (01:19 -0400)
committer Marge Bot <emma+marge@anholt.net>
Fri, 29 Oct 2021 07:33:50 +0000 (07:33 +0000)
- Move more stuff into the cold structure.
- Reorder fields for better packing.
- Flatten the gallium and merged nested structures.

Since we have tens of thousands of vbo_save_vertex_list nodes, decreasing
their size improves performance by 13%.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13506>

src/mesa/main/dlist.c
src/mesa/vbo/vbo_save.h
src/mesa/vbo/vbo_save_api.c
src/mesa/vbo/vbo_save_draw.c
src/mesa/vbo/vbo_save_loopback.c

index 86b7e3f..5b6a2fc 100644 (file)
@@ -761,18 +761,18 @@ static void
 vbo_destroy_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node)
 {
    for (gl_vertex_processing_mode mode = VP_MODE_FF; mode < VP_MODE_MAX; ++mode) {
-      _mesa_reference_vao(ctx, &node->VAO[mode], NULL);
-      if (node->merged.gallium.private_refcount[mode]) {
-         assert(node->merged.gallium.private_refcount[mode] > 0);
-         p_atomic_add(&node->merged.gallium.state[mode]->reference.count,
-                      -node->merged.gallium.private_refcount[mode]);
+      _mesa_reference_vao(ctx, &node->cold->VAO[mode], NULL);
+      if (node->private_refcount[mode]) {
+         assert(node->private_refcount[mode] > 0);
+         p_atomic_add(&node->state[mode]->reference.count,
+                      -node->private_refcount[mode]);
       }
-      pipe_vertex_state_reference(&node->merged.gallium.state[mode], NULL);
+      pipe_vertex_state_reference(&node->state[mode], NULL);
    }
 
-   if (node->merged.mode) {
-      free(node->merged.mode);
-      free(node->merged.start_counts);
+   if (node->modes) {
+      free(node->modes);
+      free(node->start_counts);
    }
 
    _mesa_reference_buffer_object(ctx, &node->cold->ib.obj, NULL);
@@ -786,7 +786,7 @@ static void
 vbo_print_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node, OpCode op, FILE *f)
 {
    GLuint i;
-   struct gl_buffer_object *buffer = node->VAO[0]->BufferBinding[0].BufferObj;
+   struct gl_buffer_object *buffer = node->cold->VAO[0]->BufferBinding[0].BufferObj;
    const GLuint vertex_size = _vbo_save_get_stride(node)/sizeof(GLfloat);
    (void) ctx;
 
index 5af82f2..efe3904 100644 (file)
@@ -57,30 +57,30 @@ struct vbo_save_vertex_list {
    union gl_dlist_node header;
 
    /* Data used in vbo_save_playback_vertex_list */
-   struct gl_vertex_array_object *VAO[VP_MODE_MAX];
-
-   struct {
-      struct pipe_draw_info info;
-      unsigned char *mode;
-      union {
-         struct pipe_draw_start_count_bias *start_counts;
-         struct pipe_draw_start_count_bias start_count;
-      };
-      unsigned num_draws;
-
-      struct {
-         struct gl_context *ctx;
-         struct pipe_vertex_state *state[VP_MODE_MAX];
-         int16_t private_refcount[VP_MODE_MAX];
-         GLbitfield enabled_attribs[VP_MODE_MAX];
-         struct pipe_draw_vertex_state_info info;
-      } gallium;
-   } merged;
-
-   /* Cold: used during construction or to handle egde-cases */
+   unsigned num_draws;
+   uint8_t *modes;
+   union {
+      struct pipe_draw_start_count_bias *start_counts;
+      struct pipe_draw_start_count_bias start_count;
+   };
+   uint8_t mode;
+
+   int16_t private_refcount[VP_MODE_MAX];
+   struct gl_context *ctx;
+   struct pipe_vertex_state *state[VP_MODE_MAX];
+   GLbitfield enabled_attribs[VP_MODE_MAX];
+
+   /* Cold: used during construction or to handle edge-cases.
+    * It's not part of the structure because we want display list nodes
+    * to be tightly packed to get cache hits. Without this, performance would
+    * decrease by an order of magnitude with 10k display lists.
+    */
    struct {
+      struct gl_vertex_array_object *VAO[VP_MODE_MAX];
       struct _mesa_index_buffer ib;
 
+      struct pipe_draw_info info;
+
       /* Copy of the final vertex from node->vertex_store->bufferobj.
        * Keep this in regular (non-VBO) memory to avoid repeated
        * map/unmap of the VBO when updating GL current data.
@@ -103,7 +103,7 @@ struct vbo_save_vertex_list {
 static inline GLsizei
 _vbo_save_get_stride(const struct vbo_save_vertex_list *node)
 {
-   return node->VAO[0]->BufferBinding[0].Stride;
+   return node->cold->VAO[0]->BufferBinding[0].Stride;
 }
 
 /* Default size for the buffer holding the vertices and the indices.
index f0f5b82..6bcd3e9 100644 (file)
@@ -830,38 +830,38 @@ compile_vertex_list(struct gl_context *ctx)
    }
 
    /* Prepare for DrawGallium */
-   memset(&node->merged.info, 0, sizeof(struct pipe_draw_info));
+   memset(&node->cold->info, 0, sizeof(struct pipe_draw_info));
    /* The other info fields will be updated in vbo_save_playback_vertex_list */
-   node->merged.info.index_size = 4;
-   node->merged.info.instance_count = 1;
-   node->merged.info.index.gl_bo = node->cold->ib.obj;
+   node->cold->info.index_size = 4;
+   node->cold->info.instance_count = 1;
+   node->cold->info.index.gl_bo = node->cold->ib.obj;
    if (merged_prim_count == 1) {
-      node->merged.info.mode = merged_prims[0].mode;
-      node->merged.start_count.start = merged_prims[0].start;
-      node->merged.start_count.count = merged_prims[0].count;
-      node->merged.start_count.index_bias = 0;
-      node->merged.mode = NULL;
+      node->cold->info.mode = merged_prims[0].mode;
+      node->start_count.start = merged_prims[0].start;
+      node->start_count.count = merged_prims[0].count;
+      node->start_count.index_bias = 0;
+      node->modes = NULL;
    } else {
-      node->merged.mode = malloc(merged_prim_count * sizeof(unsigned char));
-      node->merged.start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
+      node->modes = malloc(merged_prim_count * sizeof(unsigned char));
+      node->start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
       for (unsigned i = 0; i < merged_prim_count; i++) {
-         node->merged.start_counts[i].start = merged_prims[i].start;
-         node->merged.start_counts[i].count = merged_prims[i].count;
-         node->merged.start_counts[i].index_bias = 0;
-         node->merged.mode[i] = merged_prims[i].mode;
+         node->start_counts[i].start = merged_prims[i].start;
+         node->start_counts[i].count = merged_prims[i].count;
+         node->start_counts[i].index_bias = 0;
+         node->modes[i] = merged_prims[i].mode;
       }
    }
-   node->merged.num_draws = merged_prim_count;
-   if (node->merged.num_draws > 1) {
+   node->num_draws = merged_prim_count;
+   if (node->num_draws > 1) {
       bool same_mode = true;
-      for (unsigned i = 1; i < node->merged.num_draws && same_mode; i++) {
-         same_mode = node->merged.mode[i] == node->merged.mode[0];
+      for (unsigned i = 1; i < node->num_draws && same_mode; i++) {
+         same_mode = node->modes[i] == node->modes[0];
       }
       if (same_mode) {
          /* All primitives use the same mode, so we can simplify a bit */
-         node->merged.info.mode = node->merged.mode[0];
-         free(node->merged.mode);
-         node->merged.mode = NULL;
+         node->cold->info.mode = node->modes[0];
+         free(node->modes);
+         node->modes = NULL;
       }
    }
 
@@ -897,28 +897,27 @@ end:
                  save->current_bo, buffer_offset, stride,
                  save->enabled, save->attrsz, save->attrtype, offsets);
       /* Reference the vao in the dlist */
-      node->VAO[vpm] = NULL;
-      _mesa_reference_vao(ctx, &node->VAO[vpm], save->VAO[vpm]);
+      node->cold->VAO[vpm] = NULL;
+      _mesa_reference_vao(ctx, &node->cold->VAO[vpm], save->VAO[vpm]);
    }
 
    /* Prepare for DrawGalliumVertexState */
-   if (node->merged.num_draws && ctx->Driver.DrawGalliumVertexState) {
+   if (node->num_draws && ctx->Driver.DrawGalliumVertexState) {
       for (unsigned i = 0; i < VP_MODE_MAX; i++) {
          uint32_t enabled_attribs = _vbo_get_vao_filter(i) &
-                                    node->VAO[i]->_EnabledWithMapMode;
+                                    node->cold->VAO[i]->_EnabledWithMapMode;
 
-         node->merged.gallium.state[i] =
-            ctx->Driver.CreateGalliumVertexState(ctx, node->VAO[i],
+         node->state[i] =
+            ctx->Driver.CreateGalliumVertexState(ctx, node->cold->VAO[i],
                                                  node->cold->ib.obj,
                                                  enabled_attribs);
-         node->merged.gallium.private_refcount[i] = 0;
-         node->merged.gallium.enabled_attribs[i] = enabled_attribs;
+         node->private_refcount[i] = 0;
+         node->enabled_attribs[i] = enabled_attribs;
       }
 
-      node->merged.gallium.ctx = ctx;
-      node->merged.gallium.info.mode = node->merged.info.mode;
-      node->merged.gallium.info.take_vertex_state_ownership = false;
-      assert(node->merged.info.index_size == 4);
+      node->ctx = ctx;
+      node->mode = node->cold->info.mode;
+      assert(node->cold->info.index_size == 4);
    }
 
    /* Deal with GL_COMPILE_AND_EXECUTE:
@@ -935,7 +934,7 @@ end:
        * The problem is that the VAO offset is based on current_bo's layout,
        * so we have to use a temp value.
        */
-      struct gl_vertex_array_object *vao = node->VAO[VP_MODE_SHADER];
+      struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_SHADER];
       GLintptr original = vao->BufferBinding[0].Offset;
       if (!ctx->ListState.Current.UseLoopback) {
          GLintptr new_offset = 0;
index a4e10c3..51c48c8 100644 (file)
@@ -106,10 +106,10 @@ playback_copy_to_current(struct gl_context *ctx,
    bool color0_changed = false;
 
    /* Copy conventional attribs and generics except pos */
-   copy_vao(ctx, node->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
+   copy_vao(ctx, node->cold->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
             _NEW_CURRENT_ATTRIB, GL_CURRENT_BIT, 0, &data, &color0_changed);
    /* Copy materials */
-   copy_vao(ctx, node->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
+   copy_vao(ctx, node->cold->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
             _NEW_MATERIAL, GL_LIGHTING_BIT,
             VBO_MATERIAL_SHIFT, &data, &color0_changed);
 
@@ -138,7 +138,7 @@ bind_vertex_list(struct gl_context *ctx,
                  const struct vbo_save_vertex_list *node)
 {
    const gl_vertex_processing_mode mode = ctx->VertexProgram._VPMode;
-   _mesa_set_draw_vao(ctx, node->VAO[mode], _vbo_get_vao_filter(mode));
+   _mesa_set_draw_vao(ctx, node->cold->VAO[mode], _vbo_get_vao_filter(mode));
 }
 
 
@@ -146,7 +146,7 @@ static void
 loopback_vertex_list(struct gl_context *ctx,
                      const struct vbo_save_vertex_list *list)
 {
-   struct gl_buffer_object *bo = list->VAO[0]->BufferBinding[0].BufferObj;
+   struct gl_buffer_object *bo = list->cold->VAO[0]->BufferBinding[0].BufferObj;
    void *buffer = ctx->Driver.MapBufferRange(ctx, 0, bo->Size, GL_MAP_READ_BIT, /* ? */
                                              bo, MAP_INTERNAL);
 
@@ -201,7 +201,7 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
    /* This sets which vertex arrays are enabled, which determines
     * which attribs have stride = 0 and whether edge flags are enabled.
     */
-   const GLbitfield enabled = node->merged.gallium.enabled_attribs[mode];
+   const GLbitfield enabled = node->enabled_attribs[mode];
    ctx->Array._DrawVAOEnabledAttribs = enabled;
    _mesa_set_varying_vp_inputs(ctx, enabled);
 
@@ -228,10 +228,13 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
    if (vp->info.inputs_read & ~enabled || vp->DualSlotInputs)
       return USE_SLOW_PATH;
 
-   struct pipe_vertex_state *state = node->merged.gallium.state[mode];
-   struct pipe_draw_vertex_state_info info = node->merged.gallium.info;
+   struct pipe_vertex_state *state = node->state[mode];
+   struct pipe_draw_vertex_state_info info;
 
-   if (node->merged.gallium.ctx == ctx) {
+   info.mode = node->mode;
+   info.take_vertex_state_ownership = false;
+
+   if (node->ctx == ctx) {
       /* This mechanism allows passing references to the driver without
        * using atomics to increase the reference count.
        *
@@ -248,7 +251,7 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
        * possibly turn a million atomic increments into 1 add and 1 subtract
        * atomic op over the whole lifetime of an app.
        */
-      int16_t * const private_refcount = (int16_t*)&node->merged.gallium.private_refcount[mode];
+      int16_t * const private_refcount = (int16_t*)&node->private_refcount[mode];
       assert(*private_refcount >= 0);
 
       if (unlikely(*private_refcount == 0)) {
@@ -270,15 +273,15 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
    }
 
    /* Fast path using a pre-built gallium vertex buffer state. */
-   if (node->merged.mode || node->merged.num_draws > 1) {
+   if (node->modes || node->num_draws > 1) {
       ctx->Driver.DrawGalliumVertexState(ctx, state, info,
-                                         node->merged.start_counts,
-                                         node->merged.mode,
-                                         node->merged.num_draws,
+                                         node->start_counts,
+                                         node->modes,
+                                         node->num_draws,
                                          enabled & VERT_ATTRIB_EDGEFLAG);
-   } else if (node->merged.num_draws) {
+   } else if (node->num_draws) {
       ctx->Driver.DrawGalliumVertexState(ctx, state, info,
-                                         &node->merged.start_count,
+                                         &node->start_count,
                                          NULL, 1,
                                          enabled & VERT_ATTRIB_EDGEFLAG);
    }
@@ -327,18 +330,18 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data, bool copy_to_c
 
    assert(ctx->NewState == 0);
 
-   struct pipe_draw_info *info = (struct pipe_draw_info *) &node->merged.info;
+   struct pipe_draw_info *info = (struct pipe_draw_info *) &node->cold->info;
    void *gl_bo = info->index.gl_bo;
-   if (node->merged.mode) {
+   if (node->modes) {
       ctx->Driver.DrawGalliumMultiMode(ctx, info,
-                                       node->merged.start_counts,
-                                       node->merged.mode,
-                                       node->merged.num_draws);
-   } else if (node->merged.num_draws == 1) {
-      ctx->Driver.DrawGallium(ctx, info, 0, &node->merged.start_count, 1);
-   } else if (node->merged.num_draws) {
-      ctx->Driver.DrawGallium(ctx, info, 0, node->merged.start_counts,
-                              node->merged.num_draws);
+                                       node->start_counts,
+                                       node->modes,
+                                       node->num_draws);
+   } else if (node->num_draws == 1) {
+      ctx->Driver.DrawGallium(ctx, info, 0, &node->start_count, 1);
+   } else if (node->num_draws) {
+      ctx->Driver.DrawGallium(ctx, info, 0, node->start_counts,
+                              node->num_draws);
    }
    info->index.gl_bo = gl_bo;
 
index 2bad1bd..153bf5f 100644 (file)
@@ -155,14 +155,14 @@ _vbo_loopback_vertex_list(struct gl_context *ctx,
    /* All Legacy, NV, ARB and Material attributes are routed through
     * the NV attributes entrypoints:
     */
-   const struct gl_vertex_array_object *vao = node->VAO[VP_MODE_FF];
+   const struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_FF];
    GLbitfield mask = vao->Enabled & VERT_BIT_MAT_ALL;
    while (mask) {
       const int i = u_bit_scan(&mask);
       append_attr(&nr, la, i, VBO_MATERIAL_SHIFT, vao);
    }
 
-   vao = node->VAO[VP_MODE_SHADER];
+   vao = node->cold->VAO[VP_MODE_SHADER];
    mask = vao->Enabled & ~(VERT_BIT_POS | VERT_BIT_GENERIC0);
    while (mask) {
       const int i = u_bit_scan(&mask);