glthread: use signed vertex buffer offsets when available, don't require them
authorMike Blumenkrantz <michael.blumenkrantz@gmail.com>
Tue, 27 Sep 2022 22:26:49 +0000 (18:26 -0400)
committerMarge Bot <emma+marge@anholt.net>
Thu, 29 Sep 2022 22:05:06 +0000 (22:05 +0000)
this is a great memory-saving optimization for drivers that support it,
but for drivers that can't, it ends up taking slower paths instead
of just consuming more memory

but the year is 2022. we have all the memory. so use more memory if signed
offsets aren't supported

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18856>

src/mesa/main/glthread.c
src/mesa/main/glthread.h
src/mesa/main/glthread_bufferobj.c
src/mesa/main/glthread_draw.c

index b44ff9b..b66e374 100644 (file)
@@ -132,13 +132,6 @@ _mesa_glthread_init(struct gl_context *ctx)
       ctx->Const.BufferCreateMapUnsynchronizedThreadSafe &&
       ctx->Const.AllowMappedBuffersDuringExecution;
 
-   /* If the draw start index is non-zero, glthread can upload to offset 0,
-    * which means the attrib offset has to be -(first * stride).
-    * So require signed vertex buffer offsets.
-    */
-   glthread->SupportsNonVBOUploads = glthread->SupportsBufferUploads &&
-                                     ctx->Const.VertexBufferOffsetIsInt32;
-
    ctx->CurrentClientDispatch = ctx->MarshalExec;
 
    glthread->LastDListChangeBatchIndex = -1;
index efc579a..fbce31f 100644 (file)
@@ -188,7 +188,6 @@ struct glthread_state
 
    /** Caps. */
    GLboolean SupportsBufferUploads;
-   GLboolean SupportsNonVBOUploads;
 
    /** Primitive restart state. */
    bool PrimitiveRestart;
@@ -251,7 +250,8 @@ void _mesa_glthread_finish_before(struct gl_context *ctx, const char *func);
 void _mesa_glthread_upload(struct gl_context *ctx, const void *data,
                            GLsizeiptr size, unsigned *out_offset,
                            struct gl_buffer_object **out_buffer,
-                           uint8_t **out_ptr);
+                           uint8_t **out_ptr,
+                           unsigned start_offset);
 void _mesa_glthread_reset_vao(struct glthread_vao *vao);
 void _mesa_error_glthread_safe(struct gl_context *ctx, GLenum error,
                                bool glthread, const char *format, ...);
index 5745df4..615d3b3 100644 (file)
@@ -66,7 +66,8 @@ void
 _mesa_glthread_upload(struct gl_context *ctx, const void *data,
                       GLsizeiptr size, unsigned *out_offset,
                       struct gl_buffer_object **out_buffer,
-                      uint8_t **out_ptr)
+                      uint8_t **out_ptr,
+                      unsigned start_offset)
 {
    struct glthread_state *glthread = &ctx->GLThread;
    const unsigned default_size = 1024 * 1024;
@@ -75,14 +76,14 @@ _mesa_glthread_upload(struct gl_context *ctx, const void *data,
       return;
 
    /* The alignment was chosen arbitrarily. */
-   unsigned offset = align(glthread->upload_offset, 8);
+   unsigned offset = align(glthread->upload_offset, 8) + start_offset;
 
    /* Allocate a new buffer if needed. */
    if (unlikely(!glthread->upload_buffer || offset + size > default_size)) {
       /* If the size is greater than the buffer size, allocate a separate buffer
        * just for this upload.
        */
-      if (unlikely(size > default_size)) {
+      if (unlikely(start_offset + size > default_size)) {
          uint8_t *ptr;
 
          assert(*out_buffer == NULL);
@@ -90,7 +91,8 @@ _mesa_glthread_upload(struct gl_context *ctx, const void *data,
          if (!*out_buffer)
             return;
 
-         *out_offset = 0;
+         ptr += start_offset;
+         *out_offset = start_offset;
          if (data)
             memcpy(ptr, data, size);
          else
@@ -107,7 +109,7 @@ _mesa_glthread_upload(struct gl_context *ctx, const void *data,
       glthread->upload_buffer =
          new_upload_buffer(ctx, default_size, &glthread->upload_ptr);
       glthread->upload_offset = 0;
-      offset = 0;
+      offset = start_offset;
 
       /* Since atomic operations are very very slow when 2 threads are not
        * sharing one L3 cache (which can happen on AMD Zen), prevent using
@@ -430,7 +432,7 @@ _mesa_marshal_BufferSubData_merged(GLuint target_or_name, GLintptr offset,
       unsigned upload_offset = 0;
 
       _mesa_glthread_upload(ctx, data, size, &upload_offset, &upload_buffer,
-                            NULL);
+                            NULL, 0);
 
       if (upload_buffer) {
          _mesa_marshal_InternalBufferSubDataCopyMESA((GLintptr)upload_buffer,
index 030b7be..16eab75 100644 (file)
@@ -69,7 +69,7 @@ upload_indices(struct gl_context *ctx, unsigned count, unsigned index_size,
    assert(count);
 
    _mesa_glthread_upload(ctx, *indices, index_size * count,
-                         &upload_offset, &upload_buffer, NULL);
+                         &upload_offset, &upload_buffer, NULL, 0);
    assert(upload_buffer);
    *indices = (const GLvoid*)(intptr_t)upload_offset;
 
@@ -89,7 +89,7 @@ upload_multi_indices(struct gl_context *ctx, unsigned total_count,
    assert(total_count);
 
    _mesa_glthread_upload(ctx, NULL, index_size * total_count,
-                         &upload_offset, &upload_buffer, &upload_ptr);
+                         &upload_offset, &upload_buffer, &upload_ptr, 0);
    assert(upload_buffer);
 
    for (unsigned i = 0, offset = 0; i < draw_count; i++) {
@@ -188,10 +188,14 @@ upload_vertices(struct gl_context *ctx, unsigned user_buffer_mask,
          end = end_offset[binding_index];
          assert(start < end);
 
+         /* If the draw start index is non-zero, glthread can upload to offset 0,
+         * which means the attrib offset has to be -(first * stride).
+         * So use signed vertex buffer offsets when possible to save memory.
+         */
          const void *ptr = vao->Attrib[binding_index].Pointer;
          _mesa_glthread_upload(ctx, (uint8_t*)ptr + start,
                                end - start, &upload_offset,
-                               &upload_buffer, NULL);
+                               &upload_buffer, NULL, ctx->Const.VertexBufferOffsetIsInt32 ? 0 : start);
          assert(upload_buffer);
 
          buffers[num_buffers].buffer = upload_buffer;
@@ -239,9 +243,14 @@ upload_vertices(struct gl_context *ctx, unsigned user_buffer_mask,
          size = stride * (num_vertices - 1) + element_size;
       }
 
+      /* If the draw start index is non-zero, glthread can upload to offset 0,
+       * which means the attrib offset has to be -(first * stride).
+       * So use signed vertex buffer offsets when possible to save memory.
+       */
       const void *ptr = vao->Attrib[binding_index].Pointer;
       _mesa_glthread_upload(ctx, (uint8_t*)ptr + offset,
-                            size, &upload_offset, &upload_buffer, NULL);
+                            size, &upload_offset, &upload_buffer, NULL,
+                            ctx->Const.VertexBufferOffsetIsInt32 ? 0 : offset);
       assert(upload_buffer);
 
       buffers[num_buffers].buffer = upload_buffer;
@@ -405,7 +414,7 @@ draw_arrays(GLenum mode, GLint first, GLsizei count, GLsizei instance_count,
 
    /* Upload and draw. */
    struct glthread_attrib_binding buffers[VERT_ATTRIB_MAX];
-   if (!ctx->GLThread.SupportsNonVBOUploads ||
+   if (!ctx->GLThread.SupportsBufferUploads ||
        !upload_vertices(ctx, user_buffer_mask, first, count, baseinstance,
                         instance_count, buffers)) {
       _mesa_glthread_finish_before(ctx, "DrawArrays");
@@ -517,7 +526,7 @@ _mesa_marshal_MultiDrawArrays(GLenum mode, const GLint *first,
    }
 
    /* If the draw count is too high or negative, the queue can't be used. */
-   if (!ctx->GLThread.SupportsNonVBOUploads ||
+   if (!ctx->GLThread.SupportsBufferUploads ||
        draw_count < 0 || draw_count > MARSHAL_MAX_CMD_SIZE / 16)
       goto sync;
 
@@ -805,7 +814,7 @@ draw_elements(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices,
       return;
    }
 
-   if (!ctx->GLThread.SupportsNonVBOUploads)
+   if (!ctx->GLThread.SupportsBufferUploads)
       goto sync;
 
    bool need_index_bounds = user_buffer_mask & ~vao->NonZeroDivisorMask;
@@ -1030,7 +1039,7 @@ _mesa_marshal_MultiDrawElementsBaseVertex(GLenum mode, const GLsizei *count,
     * and index bounds are not valid. We would have to map the indices
     * to compute the index bounds, and for that we would have to sync anyway.
     */
-   if (!ctx->GLThread.SupportsNonVBOUploads ||
+   if (!ctx->GLThread.SupportsBufferUploads ||
        draw_count < 0 || draw_count > MARSHAL_MAX_CMD_SIZE / 32 ||
        (need_index_bounds && !has_user_indices))
       goto sync;