- Move more stuff into the cold structure.
- Reorder fields for better packing.
- Flatten the nested "merged" and "merged.gallium" structures.
Since we have tens of thousands of these vertex list nodes, decreasing
their size improves performance by 13%.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13506>
vbo_destroy_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node)
{
for (gl_vertex_processing_mode mode = VP_MODE_FF; mode < VP_MODE_MAX; ++mode) {
- _mesa_reference_vao(ctx, &node->VAO[mode], NULL);
- if (node->merged.gallium.private_refcount[mode]) {
- assert(node->merged.gallium.private_refcount[mode] > 0);
- p_atomic_add(&node->merged.gallium.state[mode]->reference.count,
- -node->merged.gallium.private_refcount[mode]);
+ _mesa_reference_vao(ctx, &node->cold->VAO[mode], NULL);
+ if (node->private_refcount[mode]) {
+ assert(node->private_refcount[mode] > 0);
+ p_atomic_add(&node->state[mode]->reference.count,
+ -node->private_refcount[mode]);
}
- pipe_vertex_state_reference(&node->merged.gallium.state[mode], NULL);
+ pipe_vertex_state_reference(&node->state[mode], NULL);
}
- if (node->merged.mode) {
- free(node->merged.mode);
- free(node->merged.start_counts);
+ if (node->modes) {
+ free(node->modes);
+ free(node->start_counts);
}
_mesa_reference_buffer_object(ctx, &node->cold->ib.obj, NULL);
vbo_print_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node, OpCode op, FILE *f)
{
GLuint i;
- struct gl_buffer_object *buffer = node->VAO[0]->BufferBinding[0].BufferObj;
+ struct gl_buffer_object *buffer = node->cold->VAO[0]->BufferBinding[0].BufferObj;
const GLuint vertex_size = _vbo_save_get_stride(node)/sizeof(GLfloat);
(void) ctx;
union gl_dlist_node header;
/* Data used in vbo_save_playback_vertex_list */
- struct gl_vertex_array_object *VAO[VP_MODE_MAX];
-
- struct {
- struct pipe_draw_info info;
- unsigned char *mode;
- union {
- struct pipe_draw_start_count_bias *start_counts;
- struct pipe_draw_start_count_bias start_count;
- };
- unsigned num_draws;
-
- struct {
- struct gl_context *ctx;
- struct pipe_vertex_state *state[VP_MODE_MAX];
- int16_t private_refcount[VP_MODE_MAX];
- GLbitfield enabled_attribs[VP_MODE_MAX];
- struct pipe_draw_vertex_state_info info;
- } gallium;
- } merged;
-
- /* Cold: used during construction or to handle egde-cases */
+ unsigned num_draws;
+ uint8_t *modes;
+ union {
+ struct pipe_draw_start_count_bias *start_counts;
+ struct pipe_draw_start_count_bias start_count;
+ };
+ uint8_t mode;
+
+ int16_t private_refcount[VP_MODE_MAX];
+ struct gl_context *ctx;
+ struct pipe_vertex_state *state[VP_MODE_MAX];
+ GLbitfield enabled_attribs[VP_MODE_MAX];
+
+ /* Cold: used during construction or to handle edge-cases.
+ * It's not part of the structure because we want display list nodes
+ * to be tightly packed to get cache hits. Without this, performance would
+ * decrease by an order of magnitude with 10k display lists.
+ */
struct {
+ struct gl_vertex_array_object *VAO[VP_MODE_MAX];
struct _mesa_index_buffer ib;
+ struct pipe_draw_info info;
+
/* Copy of the final vertex from node->vertex_store->bufferobj.
* Keep this in regular (non-VBO) memory to avoid repeated
* map/unmap of the VBO when updating GL current data.
static inline GLsizei
_vbo_save_get_stride(const struct vbo_save_vertex_list *node)
{
- return node->VAO[0]->BufferBinding[0].Stride;
+ return node->cold->VAO[0]->BufferBinding[0].Stride;
}
/* Default size for the buffer holding the vertices and the indices.
}
/* Prepare for DrawGallium */
- memset(&node->merged.info, 0, sizeof(struct pipe_draw_info));
+ memset(&node->cold->info, 0, sizeof(struct pipe_draw_info));
/* The other info fields will be updated in vbo_save_playback_vertex_list */
- node->merged.info.index_size = 4;
- node->merged.info.instance_count = 1;
- node->merged.info.index.gl_bo = node->cold->ib.obj;
+ node->cold->info.index_size = 4;
+ node->cold->info.instance_count = 1;
+ node->cold->info.index.gl_bo = node->cold->ib.obj;
if (merged_prim_count == 1) {
- node->merged.info.mode = merged_prims[0].mode;
- node->merged.start_count.start = merged_prims[0].start;
- node->merged.start_count.count = merged_prims[0].count;
- node->merged.start_count.index_bias = 0;
- node->merged.mode = NULL;
+ node->cold->info.mode = merged_prims[0].mode;
+ node->start_count.start = merged_prims[0].start;
+ node->start_count.count = merged_prims[0].count;
+ node->start_count.index_bias = 0;
+ node->modes = NULL;
} else {
- node->merged.mode = malloc(merged_prim_count * sizeof(unsigned char));
- node->merged.start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
+ node->modes = malloc(merged_prim_count * sizeof(unsigned char));
+ node->start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
for (unsigned i = 0; i < merged_prim_count; i++) {
- node->merged.start_counts[i].start = merged_prims[i].start;
- node->merged.start_counts[i].count = merged_prims[i].count;
- node->merged.start_counts[i].index_bias = 0;
- node->merged.mode[i] = merged_prims[i].mode;
+ node->start_counts[i].start = merged_prims[i].start;
+ node->start_counts[i].count = merged_prims[i].count;
+ node->start_counts[i].index_bias = 0;
+ node->modes[i] = merged_prims[i].mode;
}
}
- node->merged.num_draws = merged_prim_count;
- if (node->merged.num_draws > 1) {
+ node->num_draws = merged_prim_count;
+ if (node->num_draws > 1) {
bool same_mode = true;
- for (unsigned i = 1; i < node->merged.num_draws && same_mode; i++) {
- same_mode = node->merged.mode[i] == node->merged.mode[0];
+ for (unsigned i = 1; i < node->num_draws && same_mode; i++) {
+ same_mode = node->modes[i] == node->modes[0];
}
if (same_mode) {
/* All primitives use the same mode, so we can simplify a bit */
- node->merged.info.mode = node->merged.mode[0];
- free(node->merged.mode);
- node->merged.mode = NULL;
+ node->cold->info.mode = node->modes[0];
+ free(node->modes);
+ node->modes = NULL;
}
}
save->current_bo, buffer_offset, stride,
save->enabled, save->attrsz, save->attrtype, offsets);
/* Reference the vao in the dlist */
- node->VAO[vpm] = NULL;
- _mesa_reference_vao(ctx, &node->VAO[vpm], save->VAO[vpm]);
+ node->cold->VAO[vpm] = NULL;
+ _mesa_reference_vao(ctx, &node->cold->VAO[vpm], save->VAO[vpm]);
}
/* Prepare for DrawGalliumVertexState */
- if (node->merged.num_draws && ctx->Driver.DrawGalliumVertexState) {
+ if (node->num_draws && ctx->Driver.DrawGalliumVertexState) {
for (unsigned i = 0; i < VP_MODE_MAX; i++) {
uint32_t enabled_attribs = _vbo_get_vao_filter(i) &
- node->VAO[i]->_EnabledWithMapMode;
+ node->cold->VAO[i]->_EnabledWithMapMode;
- node->merged.gallium.state[i] =
- ctx->Driver.CreateGalliumVertexState(ctx, node->VAO[i],
+ node->state[i] =
+ ctx->Driver.CreateGalliumVertexState(ctx, node->cold->VAO[i],
node->cold->ib.obj,
enabled_attribs);
- node->merged.gallium.private_refcount[i] = 0;
- node->merged.gallium.enabled_attribs[i] = enabled_attribs;
+ node->private_refcount[i] = 0;
+ node->enabled_attribs[i] = enabled_attribs;
}
- node->merged.gallium.ctx = ctx;
- node->merged.gallium.info.mode = node->merged.info.mode;
- node->merged.gallium.info.take_vertex_state_ownership = false;
- assert(node->merged.info.index_size == 4);
+ node->ctx = ctx;
+ node->mode = node->cold->info.mode;
+ assert(node->cold->info.index_size == 4);
}
/* Deal with GL_COMPILE_AND_EXECUTE:
* The problem is that the VAO offset is based on current_bo's layout,
* so we have to use a temp value.
*/
- struct gl_vertex_array_object *vao = node->VAO[VP_MODE_SHADER];
+ struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_SHADER];
GLintptr original = vao->BufferBinding[0].Offset;
if (!ctx->ListState.Current.UseLoopback) {
GLintptr new_offset = 0;
bool color0_changed = false;
/* Copy conventional attribs and generics except pos */
- copy_vao(ctx, node->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
+ copy_vao(ctx, node->cold->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
_NEW_CURRENT_ATTRIB, GL_CURRENT_BIT, 0, &data, &color0_changed);
/* Copy materials */
- copy_vao(ctx, node->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
+ copy_vao(ctx, node->cold->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
_NEW_MATERIAL, GL_LIGHTING_BIT,
VBO_MATERIAL_SHIFT, &data, &color0_changed);
const struct vbo_save_vertex_list *node)
{
const gl_vertex_processing_mode mode = ctx->VertexProgram._VPMode;
- _mesa_set_draw_vao(ctx, node->VAO[mode], _vbo_get_vao_filter(mode));
+ _mesa_set_draw_vao(ctx, node->cold->VAO[mode], _vbo_get_vao_filter(mode));
}
loopback_vertex_list(struct gl_context *ctx,
const struct vbo_save_vertex_list *list)
{
- struct gl_buffer_object *bo = list->VAO[0]->BufferBinding[0].BufferObj;
+ struct gl_buffer_object *bo = list->cold->VAO[0]->BufferBinding[0].BufferObj;
void *buffer = ctx->Driver.MapBufferRange(ctx, 0, bo->Size, GL_MAP_READ_BIT, /* ? */
bo, MAP_INTERNAL);
/* This sets which vertex arrays are enabled, which determines
* which attribs have stride = 0 and whether edge flags are enabled.
*/
- const GLbitfield enabled = node->merged.gallium.enabled_attribs[mode];
+ const GLbitfield enabled = node->enabled_attribs[mode];
ctx->Array._DrawVAOEnabledAttribs = enabled;
_mesa_set_varying_vp_inputs(ctx, enabled);
if (vp->info.inputs_read & ~enabled || vp->DualSlotInputs)
return USE_SLOW_PATH;
- struct pipe_vertex_state *state = node->merged.gallium.state[mode];
- struct pipe_draw_vertex_state_info info = node->merged.gallium.info;
+ struct pipe_vertex_state *state = node->state[mode];
+ struct pipe_draw_vertex_state_info info;
- if (node->merged.gallium.ctx == ctx) {
+ info.mode = node->mode;
+ info.take_vertex_state_ownership = false;
+
+ if (node->ctx == ctx) {
/* This mechanism allows passing references to the driver without
* using atomics to increase the reference count.
*
* possibly turn a million atomic increments into 1 add and 1 subtract
* atomic op over the whole lifetime of an app.
*/
- int16_t * const private_refcount = (int16_t*)&node->merged.gallium.private_refcount[mode];
+ int16_t * const private_refcount = (int16_t*)&node->private_refcount[mode];
assert(*private_refcount >= 0);
if (unlikely(*private_refcount == 0)) {
}
/* Fast path using a pre-built gallium vertex buffer state. */
- if (node->merged.mode || node->merged.num_draws > 1) {
+ if (node->modes || node->num_draws > 1) {
ctx->Driver.DrawGalliumVertexState(ctx, state, info,
- node->merged.start_counts,
- node->merged.mode,
- node->merged.num_draws,
+ node->start_counts,
+ node->modes,
+ node->num_draws,
enabled & VERT_ATTRIB_EDGEFLAG);
- } else if (node->merged.num_draws) {
+ } else if (node->num_draws) {
ctx->Driver.DrawGalliumVertexState(ctx, state, info,
- &node->merged.start_count,
+ &node->start_count,
NULL, 1,
enabled & VERT_ATTRIB_EDGEFLAG);
}
assert(ctx->NewState == 0);
- struct pipe_draw_info *info = (struct pipe_draw_info *) &node->merged.info;
+ struct pipe_draw_info *info = (struct pipe_draw_info *) &node->cold->info;
void *gl_bo = info->index.gl_bo;
- if (node->merged.mode) {
+ if (node->modes) {
ctx->Driver.DrawGalliumMultiMode(ctx, info,
- node->merged.start_counts,
- node->merged.mode,
- node->merged.num_draws);
- } else if (node->merged.num_draws == 1) {
- ctx->Driver.DrawGallium(ctx, info, 0, &node->merged.start_count, 1);
- } else if (node->merged.num_draws) {
- ctx->Driver.DrawGallium(ctx, info, 0, node->merged.start_counts,
- node->merged.num_draws);
+ node->start_counts,
+ node->modes,
+ node->num_draws);
+ } else if (node->num_draws == 1) {
+ ctx->Driver.DrawGallium(ctx, info, 0, &node->start_count, 1);
+ } else if (node->num_draws) {
+ ctx->Driver.DrawGallium(ctx, info, 0, node->start_counts,
+ node->num_draws);
}
info->index.gl_bo = gl_bo;
/* All Legacy, NV, ARB and Material attributes are routed through
* the NV attributes entrypoints:
*/
- const struct gl_vertex_array_object *vao = node->VAO[VP_MODE_FF];
+ const struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_FF];
GLbitfield mask = vao->Enabled & VERT_BIT_MAT_ALL;
while (mask) {
const int i = u_bit_scan(&mask);
append_attr(&nr, la, i, VBO_MATERIAL_SHIFT, vao);
}
- vao = node->VAO[VP_MODE_SHADER];
+ vao = node->cold->VAO[VP_MODE_SHADER];
mask = vao->Enabled & ~(VERT_BIT_POS | VERT_BIT_GENERIC0);
while (mask) {
const int i = u_bit_scan(&mask);