From de70c0cf88bd46a5e5fc7f8c38a64733a7c30042 Mon Sep 17 00:00:00 2001
From: Mike Blumenkrantz
Date: Fri, 7 Apr 2023 11:24:36 -0400
Subject: [PATCH] lavapipe: implement inline variant caching

inlining is great, but it's less great if a new variant must be created
for every draw

to avoid this, cache inlined variants for reuse

Reviewed-by: Brian Paul
Part-of: 
---
 src/gallium/frontends/lavapipe/lvp_execute.c  | 55 ++++++++++++++++++---------
 src/gallium/frontends/lavapipe/lvp_pipeline.c | 27 ++++++++++++-
 src/gallium/frontends/lavapipe/lvp_private.h  |  7 ++++
 3 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/src/gallium/frontends/lavapipe/lvp_execute.c b/src/gallium/frontends/lavapipe/lvp_execute.c
index 690541e..bc8c7d6 100644
--- a/src/gallium/frontends/lavapipe/lvp_execute.c
+++ b/src/gallium/frontends/lavapipe/lvp_execute.c
@@ -282,18 +282,18 @@ update_pcbuf(struct rendering_state *state, enum pipe_shader_type pstage)
 static void
 update_inline_shader_state(struct rendering_state *state, enum pipe_shader_type sh, bool pcbuf_dirty, bool constbuf_dirty)
 {
-   uint32_t inline_uniforms[MAX_INLINABLE_UNIFORMS];
    unsigned stage = tgsi_processor_to_shader_stage(sh);
    state->inlines_dirty[sh] = false;
    struct lvp_shader *shader = state->shaders[stage];
    if (!shader || !shader->inlines.can_inline)
       return;
+   struct lvp_inline_variant v;
+   v.mask = shader->inlines.can_inline;
    /* these buffers have already been flushed in llvmpipe, so they're safe to read */
    nir_shader *base_nir = shader->pipeline_nir->nir;
    if (stage == MESA_SHADER_TESS_EVAL && state->tess_ccw)
       base_nir = shader->tess_ccw->nir;
-   nir_shader *nir = nir_shader_clone(shader->pipeline_nir->nir, base_nir);
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   nir_function_impl *impl = nir_shader_get_entrypoint(base_nir);
    unsigned ssa_alloc = impl->ssa_alloc;
    unsigned count = shader->inlines.count[0];
    if (count && pcbuf_dirty) {
@@ -301,20 +301,21 @@ update_inline_shader_state(struct rendering_state *state, enum pipe_shader_type
       for (unsigned i = 0; i < count; i++) {
          unsigned offset = shader->inlines.uniform_offsets[0][i];
          if (offset < push_size) {
-            memcpy(&inline_uniforms[i], &state->push_constants[offset], sizeof(uint32_t));
+            memcpy(&v.vals[0][i], &state->push_constants[offset], sizeof(uint32_t));
          } else {
             for (unsigned i = 0; i < state->uniform_blocks[sh].count; i++) {
                if (offset < push_size + state->uniform_blocks[sh].size[i]) {
                   unsigned ubo_offset = offset - push_size;
                   uint8_t *block = state->uniform_blocks[sh].block[i];
-                  memcpy(&inline_uniforms[i], &block[ubo_offset], sizeof(uint32_t));
+                  memcpy(&v.vals[0][i], &block[ubo_offset], sizeof(uint32_t));
                   break;
                }
                push_size += state->uniform_blocks[sh].size[i];
             }
          }
       }
-      NIR_PASS_V(nir, lvp_inline_uniforms, shader, inline_uniforms, 0);
+      for (unsigned i = count; i < MAX_INLINABLE_UNIFORMS; i++)
+         v.vals[0][i] = 0;
    }
    if (constbuf_dirty) {
       struct pipe_box box = {0};
@@ -331,24 +332,42 @@ update_inline_shader_state(struct rendering_state *state, enum pipe_shader_type
          uint8_t *map = state->pctx->buffer_map(state->pctx, pres, 0, PIPE_MAP_READ, &box, &xfer);
          for (unsigned i = 0; i < count; i++) {
             unsigned offset = shader->inlines.uniform_offsets[slot][i];
-            memcpy(&inline_uniforms[i], map + offset, sizeof(uint32_t));
+            memcpy(&v.vals[slot][i], map + offset, sizeof(uint32_t));
          }
          state->pctx->buffer_unmap(state->pctx, xfer);
-         NIR_PASS_V(nir, lvp_inline_uniforms, shader, inline_uniforms, slot);
+         for (unsigned i = count; i < MAX_INLINABLE_UNIFORMS; i++)
+            v.vals[slot][i] = 0;
       }
    }
-   lvp_shader_optimize(nir);
-   impl = nir_shader_get_entrypoint(nir);
+   bool found = false;
+   struct set_entry *entry = _mesa_set_search_or_add_pre_hashed(&shader->inlines.variants, v.mask, &v, &found);
    void *shader_state;
-   if (ssa_alloc - impl->ssa_alloc < ssa_alloc / 2 &&
-       !shader->inlines.must_inline) {
-      /* not enough change; don't inline further */
-      shader->inlines.can_inline = 0;
-      ralloc_free(nir);
-      shader->shader_cso = lvp_shader_compile(state->device, shader, nir_shader_clone(NULL, shader->pipeline_nir->nir));
-      shader_state = shader->shader_cso;
+   if (found) {
+      const struct lvp_inline_variant *variant = entry->key;
+      shader_state = variant->cso;
    } else {
-      shader_state = lvp_shader_compile(state->device, shader, nir);
+      nir_shader *nir = nir_shader_clone(NULL, base_nir);
+      NIR_PASS_V(nir, lvp_inline_uniforms, shader, v.vals[0], 0);
+      if (constbuf_dirty) {
+         u_foreach_bit(slot, shader->inlines.can_inline)
+            NIR_PASS_V(nir, lvp_inline_uniforms, shader, v.vals[slot], slot);
+      }
+      lvp_shader_optimize(nir);
+      impl = nir_shader_get_entrypoint(nir);
+      if (ssa_alloc - impl->ssa_alloc < ssa_alloc / 2 &&
+          !shader->inlines.must_inline) {
+         /* not enough change; don't inline further */
+         shader->inlines.can_inline = 0;
+         ralloc_free(nir);
+         shader->shader_cso = lvp_shader_compile(state->device, shader, nir_shader_clone(NULL, shader->pipeline_nir->nir));
+         _mesa_set_remove(&shader->inlines.variants, entry);
+         shader_state = shader->shader_cso;
+      } else {
+         shader_state = lvp_shader_compile(state->device, shader, nir);
+         struct lvp_inline_variant *variant = mem_dup(&v, sizeof(v));
+         variant->cso = shader_state;
+         entry->key = variant;
+      }
    }
    switch (sh) {
    case MESA_SHADER_VERTEX:
diff --git a/src/gallium/frontends/lavapipe/lvp_pipeline.c b/src/gallium/frontends/lavapipe/lvp_pipeline.c
index b86429e..ffd7124 100644
--- a/src/gallium/frontends/lavapipe/lvp_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_pipeline.c
@@ -56,6 +56,12 @@ shader_destroy(struct lvp_device *device, struct lvp_shader *shader)
       device->queue.ctx->delete_fs_state,
       device->queue.ctx->delete_compute_state,
    };
+   set_foreach(&shader->inlines.variants, entry) {
+      struct lvp_inline_variant *variant = (void*)entry->key;
+      destroy[stage](device->queue.ctx, variant->cso);
+      free(variant);
+   }
+   ralloc_free(shader->inlines.variants.table);
    if (shader->shader_cso)
       destroy[stage](device->queue.ctx, shader->shader_cso);
    if (shader->tess_ccw_cso)
@@ -439,6 +445,18 @@ compile_spirv(struct lvp_device *pdevice, const VkPipelineShaderStageCreateInfo
    return result;
 }
 
+static bool
+inline_variant_equals(const void *a, const void *b)
+{
+   const struct lvp_inline_variant *av = a, *bv = b;
+   assert(av->mask == bv->mask);
+   u_foreach_bit(slot, av->mask) {
+      if (memcmp(av->vals[slot], bv->vals[slot], sizeof(av->vals[slot])))
+         return false;
+   }
+   return true;
+}
+
 static void
 lvp_shader_lower(struct lvp_device *pdevice, nir_shader *nir, struct lvp_shader *shader, struct lvp_pipeline_layout *layout)
 {
@@ -528,6 +546,8 @@ lvp_shader_lower(struct lvp_device *pdevice, nir_shader *nir, struct lvp_shader
    if (impl->ssa_alloc > 100) //skip for small shaders
       shader->inlines.must_inline = lvp_find_inlinable_uniforms(shader, nir);
    shader->pipeline_nir = create_pipeline_nir(nir);
+   if (shader->inlines.can_inline)
+      _mesa_set_init(&shader->inlines.variants, NULL, NULL, inline_variant_equals);
 }
 
 static VkResult
@@ -782,6 +802,8 @@ copy_shader_sanitized(struct lvp_shader *dst, const struct lvp_shader *src)
    dst->tess_ccw = NULL; //this gets handled later
    assert(!dst->shader_cso);
    assert(!dst->tess_ccw_cso);
+   if (src->inlines.can_inline)
+      _mesa_set_init(&dst->inlines.variants, NULL, NULL, inline_variant_equals);
 }
 
 static VkResult
@@ -833,9 +855,10 @@ lvp_graphics_pipeline_init(struct lvp_pipeline *pipeline,
       pipeline->line_smooth = p->line_smooth;
       pipeline->disable_multisample = p->disable_multisample;
       pipeline->line_rectangular = p->line_rectangular;
-      pipeline->last_vertex = p->last_vertex;
-      for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++)
+      memcpy(pipeline->shaders, p->shaders, sizeof(struct lvp_shader) * 4);
+      for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
          copy_shader_sanitized(&pipeline->shaders[i], &p->shaders[i]);
+      }
    }
    if (p->stages & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
       pipeline->force_min_sample = p->force_min_sample;
diff --git a/src/gallium/frontends/lavapipe/lvp_private.h b/src/gallium/frontends/lavapipe/lvp_private.h
index 122d50d..613e78a 100644
--- a/src/gallium/frontends/lavapipe/lvp_private.h
+++ b/src/gallium/frontends/lavapipe/lvp_private.h
@@ -432,6 +432,12 @@ lvp_pipeline_nir_ref(struct lvp_pipeline_nir **dst, struct lvp_pipeline_nir *src
    *dst = src;
 }
 
+struct lvp_inline_variant {
+   uint32_t mask;
+   uint32_t vals[PIPE_MAX_CONSTANT_BUFFERS][MAX_INLINABLE_UNIFORMS];
+   void *cso;
+};
+
 struct lvp_shader {
    struct vk_object_base base;
    struct lvp_pipeline_layout *layout;
@@ -445,6 +451,7 @@ struct lvp_shader {
       uint8_t count[PIPE_MAX_CONSTANT_BUFFERS];
       bool must_inline;
       uint32_t can_inline; //bitmask
+      struct set variants;
    } inlines;
    struct pipe_stream_output_info stream_output;
    struct blob blob; //preserved for GetShaderBinaryDataEXT
-- 
2.7.4
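
Note (not part of the patch): the patch keys each compiled variant on the values of the uniforms that were inlined into it, using the can_inline bitmask as a precomputed hash and inline_variant_equals() to compare the per-slot values. The following is a minimal standalone C sketch of that caching idea only; it does not use Mesa's util/set API, and every name in it (variant_cache, compile_with_inlined_uniforms, get_variant) is a hypothetical illustration, not code from the tree.

/* Standalone sketch -- hypothetical names, not Mesa code. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_INLINED 8

struct variant {
   uint32_t vals[MAX_INLINED];  /* inlined uniform values: the cache key */
   void *cso;                   /* "compiled shader" built for those values */
   struct variant *next;
};

struct variant_cache {
   struct variant *head;
};

/* stand-in for an expensive compile with the constants folded in */
static void *
compile_with_inlined_uniforms(const uint32_t *vals)
{
   (void)vals;
   return malloc(1);
}

/* Return the variant for these uniform values, compiling only on a miss. */
static void *
get_variant(struct variant_cache *cache, const uint32_t *vals)
{
   for (struct variant *v = cache->head; v; v = v->next) {
      if (!memcmp(v->vals, vals, sizeof(v->vals)))
         return v->cso;   /* hit: reuse instead of recompiling every draw */
   }
   struct variant *v = calloc(1, sizeof(*v));
   memcpy(v->vals, vals, sizeof(v->vals));
   v->cso = compile_with_inlined_uniforms(vals);
   v->next = cache->head;
   cache->head = v;
   return v->cso;
}

int main(void)
{
   struct variant_cache cache = {0};
   uint32_t draw1[MAX_INLINED] = {1, 2, 3};
   uint32_t draw2[MAX_INLINED] = {1, 2, 3};  /* same values as draw1 */
   printf("reused: %s\n",
          get_variant(&cache, draw1) == get_variant(&cache, draw2) ? "yes" : "no");
   return 0;
}

The patch itself stores the variants in a hash set rather than a linear list and frees them in shader_destroy(); the list above is only to keep the illustration short.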