From 31d9caa23924a6076bd3f64b9aed7e212940437d Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Mon, 15 Apr 2019 04:08:46 +0000 Subject: [PATCH] panfrost: Fixup vertex offsets to prevent shadow copy Mali attribute buffers have to be 64-byte aligned. However, Gallium enforces no such requirement; for unaligned buffers, we were previously forced to create a shadow copy (slow!). To prevent this, we instead use the offseted buffer's address with the lower bits masked off, and then add those masked off bits to the src_offset. Proof of correctness included, possibly for the opportunity to say "QED" unironically. Signed-off-by: Alyssa Rosenzweig --- src/gallium/drivers/panfrost/pan_context.c | 104 ++++++++++++++++++----------- src/gallium/drivers/panfrost/pan_context.h | 6 +- 2 files changed, 67 insertions(+), 43 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index 1319b79..9d3d0d2 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -735,6 +735,15 @@ panfrost_emit_varying_descriptor( ctx->payload_tiler.postfix.varyings = varyings_p; } +static mali_ptr +panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i) +{ + struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i]; + struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource); + + return rsrc->bo->gpu + buf->buffer_offset; +} + /* Emits attributes and varying descriptors, which should be called every draw, * excepting some obscure circumstances */ @@ -754,42 +763,20 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx, struct panfrost_job *job struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i]; struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource); - /* Let's figure out the layout of the attributes in memory so - * we can be smart about size computation. The idea is to - * figure out the maximum src_offset, which tells us the latest - * spot a vertex could start. Meanwhile, we figure out the size - * of the attribute memory (assuming interleaved - * representation) and tack on the max src_offset for a - * reasonably good upper bound on the size. - * - * Proving correctness is left as an exercise to the reader. - */ + if (!rsrc) continue; - unsigned max_src_offset = 0; + /* Align to 64 bytes by masking off the lower bits. This + * will be adjusted back when we fixup the src_offset in + * mali_attr_meta */ - for (unsigned j = 0; j < ctx->vertex->num_elements; ++j) { - if (ctx->vertex->pipe[j].vertex_buffer_index != i) continue; - max_src_offset = MAX2(max_src_offset, ctx->vertex->pipe[j].src_offset); - } + mali_ptr addr = panfrost_vertex_buffer_address(ctx, i) & ~63; /* Offset vertex count by draw_start to make sure we upload enough */ attrs[k].stride = buf->stride; - attrs[k].size = buf->stride * (ctx->payload_vertex.draw_start + invocation_count) + max_src_offset; - - /* Vertex elements are -already- GPU-visible, at - * rsrc->gpu. However, attribute buffers must be 64 aligned. If - * it is not, for now we have to duplicate the buffer. */ + attrs[k].size = rsrc->base.width0; - mali_ptr effective_address = rsrc ? (rsrc->bo->gpu + buf->buffer_offset) : 0; - - if (effective_address & 63) { - attrs[k].elements = panfrost_upload_transient(ctx, rsrc->bo->cpu + buf->buffer_offset, attrs[i].size) | MALI_ATTR_LINEAR; - } else if (effective_address) { - panfrost_job_add_bo(job, rsrc->bo); - attrs[k].elements = effective_address | MALI_ATTR_LINEAR; - } else { - /* Leave unset? */ - } + panfrost_job_add_bo(job, rsrc->bo); + attrs[k].elements = addr | MALI_ATTR_LINEAR; ++k; } @@ -808,6 +795,53 @@ panfrost_writes_point_size(struct panfrost_context *ctx) return vs->writes_point_size && ctx->payload_tiler.prefix.draw_mode == MALI_POINTS; } +/* Stage the attribute descriptors so we can adjust src_offset + * to let BOs align nicely */ + +static void +panfrost_stage_attributes(struct panfrost_context *ctx) +{ + struct panfrost_vertex_state *so = ctx->vertex; + + size_t sz = sizeof(struct mali_attr_meta) * so->num_elements; + struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sz); + struct mali_attr_meta *target = (struct mali_attr_meta *) transfer.cpu; + + /* Copy as-is for the first pass */ + memcpy(target, so->hw, sz); + + /* Fixup offsets for the second pass. Recall that the hardware + * calculates attribute addresses as: + * + * addr = base + (stride * vtx) + src_offset; + * + * However, on Mali, base must be aligned to 64-bytes, so we + * instead let: + * + * base' = base & ~63 = base - (base & 63) + * + * To compensate when using base' (see emit_vertex_data), we have + * to adjust src_offset by the masked off piece: + * + * addr' = base' + (stride * vtx) + (src_offset + (base & 63)) + * = base - (base & 63) + (stride * vtx) + src_offset + (base & 63) + * = base + (stride * vtx) + src_offset + * = addr; + * + * QED. + */ + + for (unsigned i = 0; i < so->num_elements; ++i) { + unsigned vbi = so->pipe[i].vertex_buffer_index; + mali_ptr addr = panfrost_vertex_buffer_address(ctx, vbi); + + /* Adjust by the masked off bits of the offset */ + target[i].src_offset += (addr & 63); + } + + ctx->payload_vertex.postfix.attribute_meta = transfer.gpu; +} + /* Go through dirty flags and actualise them in the cmdstream. */ void @@ -991,9 +1025,8 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data) } } - if (ctx->dirty & PAN_DIRTY_VERTEX) { - ctx->payload_vertex.postfix.attribute_meta = ctx->vertex->descriptor_ptr; - } + /* We stage to transient, so always dirty.. */ + panfrost_stage_attributes(ctx); if (ctx->dirty & PAN_DIRTY_SAMPLERS) { /* Upload samplers back to back, no padding */ @@ -1553,16 +1586,11 @@ panfrost_create_vertex_elements_state( unsigned num_elements, const struct pipe_vertex_element *elements) { - struct panfrost_context *ctx = pan_context(pctx); struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state); so->num_elements = num_elements; memcpy(so->pipe, elements, sizeof(*elements) * num_elements); - struct panfrost_transfer transfer = panfrost_allocate_chunk(ctx, sizeof(struct mali_attr_meta) * num_elements, HEAP_DESCRIPTOR); - so->hw = (struct mali_attr_meta *) transfer.cpu; - so->descriptor_ptr = transfer.gpu; - /* Allocate memory for the descriptor state */ for (int i = 0; i < num_elements; ++i) { diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index a6a5427..d4b9c1e 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -291,11 +291,7 @@ struct panfrost_vertex_state { unsigned num_elements; struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS]; - int nr_components[PIPE_MAX_ATTRIBS]; - - /* The actual attribute meta, prebaked and GPU mapped. TODO: Free memory */ - struct mali_attr_meta *hw; - mali_ptr descriptor_ptr; + struct mali_attr_meta hw[PIPE_MAX_ATTRIBS]; }; struct panfrost_sampler_state { -- 2.7.4