bool constbuf_dirty[PIPE_SHADER_TYPES];
bool pcbuf_dirty[PIPE_SHADER_TYPES];
bool has_pcbuf[PIPE_SHADER_TYPES];
+ bool inlines_dirty[PIPE_SHADER_TYPES];
bool vp_dirty;
bool scissor_dirty;
bool ib_dirty;
uint32_t num_so_targets;
struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_BUFFERS];
uint32_t so_offsets[PIPE_MAX_SO_BUFFERS];
+
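+ /* last-bound pipelines, indexed by is_compute_pipeline: [0] = graphics, [1] = compute */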
+ struct lvp_pipeline *pipeline[2];
};
ALWAYS_INLINE static void
state->pcbuf_dirty[pstage] = false;
}
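+/* Clone the stage's pipeline NIR, inline the current uniform values as
+ * constants, re-optimize, compile, and bind the result. Called at draw or
+ * dispatch time whenever uniform data changed for an inlinable stage.
+ */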
+static void
+update_inline_shader_state(struct rendering_state *state, enum pipe_shader_type sh, bool pcbuf_dirty, bool constbuf_dirty)
+{
+ bool is_compute = sh == PIPE_SHADER_COMPUTE;
+ uint32_t inline_uniforms[MAX_INLINABLE_UNIFORMS];
+ unsigned stage = tgsi_processor_to_shader_stage(sh);
+ state->inlines_dirty[sh] = false;
+ if (!state->pipeline[is_compute]->inlines[stage].can_inline)
+ return;
+ struct lvp_pipeline *pipeline = state->pipeline[is_compute];
+ /* these buffers have already been flushed in llvmpipe, so they're safe to read */
+ nir_shader *nir = nir_shader_clone(pipeline->pipeline_nir[stage], pipeline->pipeline_nir[stage]);
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
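+ /* snapshot the SSA def count so the effect of inlining can be measured after optimization */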
+ unsigned ssa_alloc = impl->ssa_alloc;
+ unsigned count = pipeline->inlines[stage].count[0];
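+ /* UBO slot 0 holds the push constants, with any uniform blocks packed after them */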
+ if (count && pcbuf_dirty) {
+ unsigned push_size = get_pcbuf_size(state, sh);
+ for (unsigned i = 0; i < count; i++) {
+ unsigned offset = pipeline->inlines[stage].uniform_offsets[0][i];
+ if (offset < push_size) {
+ memcpy(&inline_uniforms[i], &state->push_constants[offset], sizeof(uint32_t));
+ } else {
+ /* find the uniform block that contains this offset; "i" stays the
+ * index of the inlined uniform, "j" walks the blocks */
+ for (unsigned j = 0; j < state->uniform_blocks[sh].count; j++) {
+ if (offset < push_size + state->uniform_blocks[sh].size[j]) {
+ unsigned ubo_offset = offset - push_size;
+ uint8_t *block = state->uniform_blocks[sh].block[j];
+ memcpy(&inline_uniforms[i], &block[ubo_offset], sizeof(uint32_t));
+ break;
+ }
+ push_size += state->uniform_blocks[sh].size[j];
+ }
+ }
+ }
+ NIR_PASS_V(nir, lvp_inline_uniforms, pipeline, inline_uniforms, 0);
+ }
+ if (constbuf_dirty) {
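+ /* read back the current values of each inlinable UBO so they can be baked into the shader */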
+ struct pipe_box box = {0};
+ u_foreach_bit(slot, pipeline->inlines[stage].can_inline) {
+ /* slot 0 is the push constant buffer: it was already inlined above */
+ if (!slot)
+ continue;
+ unsigned count = pipeline->inlines[stage].count[slot];
+ struct pipe_constant_buffer *cbuf = &state->const_buffer[sh][slot - 1];
+ struct pipe_resource *pres = cbuf->buffer;
+ box.x = cbuf->buffer_offset;
+ box.width = cbuf->buffer_size - cbuf->buffer_offset;
+ struct pipe_transfer *xfer;
+ uint8_t *map = state->pctx->buffer_map(state->pctx, pres, 0, PIPE_MAP_READ, &box, &xfer);
+ for (unsigned i = 0; i < count; i++) {
+ unsigned offset = pipeline->inlines[stage].uniform_offsets[slot][i];
+ memcpy(&inline_uniforms[i], map + offset, sizeof(uint32_t));
+ }
+ state->pctx->buffer_unmap(state->pctx, xfer);
+ NIR_PASS_V(nir, lvp_inline_uniforms, pipeline, inline_uniforms, slot);
+ }
+ }
+ lvp_shader_optimize(nir);
+ impl = nir_shader_get_entrypoint(nir);
+ void *shader_state;
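+ /* if inlining didn't eliminate at least half of the SSA defs and isn't required, give up on this stage and fall back to the regular CSO */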
+ if (ssa_alloc - impl->ssa_alloc < ssa_alloc / 2 &&
+ !pipeline->inlines[stage].must_inline) {
+ /* not enough change; don't inline further */
+ pipeline->inlines[stage].can_inline = 0;
+ ralloc_free(nir);
+ pipeline->shader_cso[sh] = lvp_pipeline_compile(pipeline, nir_shader_clone(NULL, pipeline->pipeline_nir[stage]));
+ shader_state = pipeline->shader_cso[sh];
+ } else {
+ shader_state = lvp_pipeline_compile(pipeline, nir);
+ }
+ switch (sh) {
+ case PIPE_SHADER_VERTEX:
+ state->pctx->bind_vs_state(state->pctx, shader_state);
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ state->pctx->bind_tcs_state(state->pctx, shader_state);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ state->pctx->bind_tes_state(state->pctx, shader_state);
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ state->pctx->bind_gs_state(state->pctx, shader_state);
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ state->pctx->bind_fs_state(state->pctx, shader_state);
+ break;
+ case PIPE_SHADER_COMPUTE:
+ state->pctx->bind_compute_state(state->pctx, shader_state);
+ break;
+ default: break;
+ }
+}
+
static void emit_compute_state(struct rendering_state *state)
{
if (state->iv_dirty[PIPE_SHADER_COMPUTE]) {
state->iv_dirty[PIPE_SHADER_COMPUTE] = false;
}
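+ /* capture the dirty flags now: they are cleared as state is emitted, but update_inline_shader_state below needs to know which buffers changed */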
+ bool pcbuf_dirty = state->pcbuf_dirty[PIPE_SHADER_COMPUTE];
if (state->pcbuf_dirty[PIPE_SHADER_COMPUTE])
update_pcbuf(state, PIPE_SHADER_COMPUTE);
+ bool constbuf_dirty = state->constbuf_dirty[PIPE_SHADER_COMPUTE];
if (state->constbuf_dirty[PIPE_SHADER_COMPUTE]) {
for (unsigned i = 0; i < state->num_const_bufs[PIPE_SHADER_COMPUTE]; i++)
state->pctx->set_constant_buffer(state->pctx, PIPE_SHADER_COMPUTE,
state->constbuf_dirty[PIPE_SHADER_COMPUTE] = false;
}
+ if (state->inlines_dirty[PIPE_SHADER_COMPUTE])
+ update_inline_shader_state(state, PIPE_SHADER_COMPUTE, pcbuf_dirty, constbuf_dirty);
+
if (state->sb_dirty[PIPE_SHADER_COMPUTE]) {
state->pctx->set_shader_buffers(state->pctx, PIPE_SHADER_COMPUTE,
0, state->num_shader_buffers[PIPE_SHADER_COMPUTE],
cso_set_vertex_elements(state->cso, &state->velem);
state->ve_dirty = false;
}
-
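+ /* capture the per-stage dirty flags before they are consumed below; re-inlining needs them */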
+ bool constbuf_dirty[PIPE_SHADER_TYPES] = {false};
+ bool pcbuf_dirty[PIPE_SHADER_TYPES] = {false};
for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
+ constbuf_dirty[sh] = state->constbuf_dirty[sh];
if (state->constbuf_dirty[sh]) {
for (unsigned idx = 0; idx < state->num_const_bufs[sh]; idx++)
state->pctx->set_constant_buffer(state->pctx, sh,
}
for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
+ pcbuf_dirty[sh] = state->pcbuf_dirty[sh];
if (state->pcbuf_dirty[sh])
update_pcbuf(state, sh);
}
for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
+ if (state->inlines_dirty[sh])
+ update_inline_shader_state(state, sh, pcbuf_dirty[sh], constbuf_dirty[sh]);
+ }
+
+ for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
if (state->sb_dirty[sh]) {
state->pctx->set_shader_buffers(state->pctx, sh,
0, state->num_shader_buffers[sh],
state->dispatch_info.block[0] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0];
state->dispatch_info.block[1] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[1];
state->dispatch_info.block[2] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[2];
- state->pctx->bind_compute_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_COMPUTE]);
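+ /* when uniforms can be inlined, defer binding: update_inline_shader_state compiles and binds the CSO at dispatch time */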
+ state->inlines_dirty[PIPE_SHADER_COMPUTE] = pipeline->inlines[MESA_SHADER_COMPUTE].can_inline;
+ if (!pipeline->inlines[MESA_SHADER_COMPUTE].can_inline)
+ state->pctx->bind_compute_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_COMPUTE]);
}
static void
const VkPipelineShaderStageCreateInfo *sh = &pipeline->graphics_create_info.pStages[i];
switch (sh->stage) {
case VK_SHADER_STAGE_FRAGMENT_BIT:
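+ /* for each stage, skip binding here if the stage will be re-inlined and rebound at draw time */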
- state->pctx->bind_fs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_FRAGMENT]);
+ state->inlines_dirty[PIPE_SHADER_FRAGMENT] = pipeline->inlines[MESA_SHADER_FRAGMENT].can_inline;
+ if (!pipeline->inlines[MESA_SHADER_FRAGMENT].can_inline)
+ state->pctx->bind_fs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_FRAGMENT]);
has_stage[PIPE_SHADER_FRAGMENT] = true;
break;
case VK_SHADER_STAGE_VERTEX_BIT:
- state->pctx->bind_vs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_VERTEX]);
+ state->inlines_dirty[PIPE_SHADER_VERTEX] = pipeline->inlines[MESA_SHADER_VERTEX].can_inline;
+ if (!pipeline->inlines[MESA_SHADER_VERTEX].can_inline)
+ state->pctx->bind_vs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_VERTEX]);
has_stage[PIPE_SHADER_VERTEX] = true;
break;
case VK_SHADER_STAGE_GEOMETRY_BIT:
- state->pctx->bind_gs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_GEOMETRY]);
+ state->inlines_dirty[PIPE_SHADER_GEOMETRY] = pipeline->inlines[MESA_SHADER_GEOMETRY].can_inline;
+ if (!pipeline->inlines[MESA_SHADER_GEOMETRY].can_inline)
+ state->pctx->bind_gs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_GEOMETRY]);
state->gs_output_lines = pipeline->gs_output_lines ? GS_OUTPUT_LINES : GS_OUTPUT_NOT_LINES;
has_stage[PIPE_SHADER_GEOMETRY] = true;
break;
case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
- state->pctx->bind_tcs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]);
+ state->inlines_dirty[PIPE_SHADER_TESS_CTRL] = pipeline->inlines[MESA_SHADER_TESS_CTRL].can_inline;
+ if (!pipeline->inlines[MESA_SHADER_TESS_CTRL].can_inline)
+ state->pctx->bind_tcs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]);
has_stage[PIPE_SHADER_TESS_CTRL] = true;
break;
case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
- state->pctx->bind_tes_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]);
+ state->inlines_dirty[PIPE_SHADER_TESS_EVAL] = pipeline->inlines[MESA_SHADER_TESS_EVAL].can_inline;
+ if (!pipeline->inlines[MESA_SHADER_TESS_EVAL].can_inline)
+ state->pctx->bind_tes_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]);
has_stage[PIPE_SHADER_TESS_EVAL] = true;
break;
default:
else
handle_graphics_pipeline(cmd, state);
state->push_size[pipeline->is_compute_pipeline] = pipeline->layout->push_constant_size;
+ state->pipeline[pipeline->is_compute_pipeline] = pipeline;
}
static void handle_vertex_buffers2(struct vk_cmd_queue_entry *cmd,
assert(descriptor->uniform);
state->uniform_blocks[p_stage].block[idx] = descriptor->uniform;
state->pcbuf_dirty[p_stage] = true;
+ state->inlines_dirty[p_stage] = true;
break;
}
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
if (state->num_const_bufs[p_stage] <= idx)
state->num_const_bufs[p_stage] = idx + 1;
state->constbuf_dirty[p_stage] = true;
+ state->inlines_dirty[p_stage] = true;
break;
}
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
state->pcbuf_dirty[PIPE_SHADER_TESS_CTRL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) > 0;
state->pcbuf_dirty[PIPE_SHADER_TESS_EVAL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) > 0;
state->pcbuf_dirty[PIPE_SHADER_COMPUTE] |= (stage_flags & VK_SHADER_STAGE_COMPUTE_BIT) > 0;
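+ /* new push-constant values may change what gets inlined, so flag every stage named in stage_flags for re-inlining */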
+ state->inlines_dirty[PIPE_SHADER_VERTEX] |= (stage_flags & VK_SHADER_STAGE_VERTEX_BIT) > 0;
+ state->inlines_dirty[PIPE_SHADER_FRAGMENT] |= (stage_flags & VK_SHADER_STAGE_FRAGMENT_BIT) > 0;
+ state->inlines_dirty[PIPE_SHADER_GEOMETRY] |= (stage_flags & VK_SHADER_STAGE_GEOMETRY_BIT) > 0;
+ state->inlines_dirty[PIPE_SHADER_TESS_CTRL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) > 0;
+ state->inlines_dirty[PIPE_SHADER_TESS_EVAL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) > 0;
+ state->inlines_dirty[PIPE_SHADER_COMPUTE] |= (stage_flags & VK_SHADER_STAGE_COMPUTE_BIT) > 0;
}
static void lvp_execute_cmd_buffer(struct lvp_cmd_buffer *cmd_buffer,
--- /dev/null
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ * Copyright © 2022 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* enhanced version of nir_inline_uniforms that can inline from any uniform buffer
+ * see nir_inline_uniforms.c for more details
+ */
+
+#include "nir_builder.h"
+#include "nir_loop_analyze.h"
+#include "lvp_private.h"
+
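+/* Return true if the given source component is derived only from constants
+ * and 32-bit UBO loads at constant offsets; when the recording arrays are
+ * non-NULL, remember each such UBO offset.
+ */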
+static bool
+src_only_uses_uniforms(const nir_src *src, int component,
+ uint32_t *uni_offsets, uint8_t *num_offsets)
+{
+ if (!src->is_ssa)
+ return false;
+
+ assert(component < src->ssa->num_components);
+
+ nir_instr *instr = src->ssa->parent_instr;
+
+ switch (instr->type) {
+ case nir_instr_type_alu: {
+ nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+ /* Vector ops only need to check the corresponding component. */
+ if (nir_op_is_vec(alu->op)) {
+ nir_alu_src *alu_src = alu->src + component;
+ return src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[0],
+ uni_offsets, num_offsets);
+ }
+
+ /* Return true if all sources return true. */
+ for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
+ nir_alu_src *alu_src = alu->src + i;
+ int input_sizes = nir_op_infos[alu->op].input_sizes[i];
+
+ if (input_sizes == 0) {
+ /* For ops with no input size, each component of the dest is
+ * determined only by the same component of the srcs.
+ */
+ if (!src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[component],
+ uni_offsets, num_offsets))
+ return false;
+ } else {
+ /* For ops with an input size, all components of the dest are
+ * determined by all components of the srcs (except vec ops).
+ */
+ for (unsigned j = 0; j < input_sizes; j++) {
+ if (!src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[j],
+ uni_offsets, num_offsets))
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ /* Return true if the intrinsic loads with a constant offset. */
+ if (intr->intrinsic == nir_intrinsic_load_ubo &&
+ nir_src_is_const(intr->src[0]) &&
+ nir_src_is_const(intr->src[1]) &&
+ /* TODO: Can't handle other bit sizes for now. */
+ intr->dest.ssa.bit_size == 32) {
+ uint32_t offset = nir_src_as_uint(intr->src[1]) + component * 4;
+
+ /* Already recorded by another load */
+ uint32_t ubo = nir_src_as_uint(intr->src[0]);
+ for (int i = 0; uni_offsets && i < num_offsets[ubo]; i++) {
+ if (uni_offsets[ubo * MAX_INLINABLE_UNIFORMS + i] == offset)
+ return true;
+ }
+
+ /* Exceed uniform number limit */
+ if (num_offsets && num_offsets[ubo] == MAX_INLINABLE_UNIFORMS)
+ return false;
+
+ /* Record the uniform offset. */
+ if (uni_offsets)
+ uni_offsets[ubo * MAX_INLINABLE_UNIFORMS + num_offsets[ubo]++] = offset;
+ return true;
+ }
+ return false;
+ }
+
+ case nir_instr_type_load_const:
+ /* Always return true for constants. */
+ return true;
+
+ default:
+ return false;
+ }
+}
+
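+/* Return true if the source is one of the loop's induction variables and
+ * its init and update values are themselves derived only from uniforms.
+ */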
+static bool
+is_induction_variable(const nir_src *src, int component, nir_loop_info *info,
+ uint32_t *uni_offsets, uint8_t *num_offsets)
+{
+ if (!src->is_ssa)
+ return false;
+
+ assert(component < src->ssa->num_components);
+
+ /* Return true for an induction variable (i.e. "i" in a for loop) */
+ for (int i = 0; i < info->num_induction_vars; i++) {
+ nir_loop_induction_variable *var = info->induction_vars + i;
+ if (var->def == src->ssa) {
+ /* The induction variable should have a constant initial value
+ * (i.e. i = 0), a constant update value (i.e. i++) and a constant
+ * end condition (i.e. i < 10), so that we know the exact loop
+ * count for unrolling.
+ *
+ * Record the uniforms that need to be inlined for this induction
+ * variable's initial and update values to be constant, for example:
+ *
+ * for (i = init; i < count; i += step)
+ *
+ * We collect uniform "init" and "step" here.
+ */
+ if (var->init_src) {
+ if (!src_only_uses_uniforms(var->init_src, component,
+ uni_offsets, num_offsets))
+ return false;
+ }
+
+ if (var->update_src) {
+ nir_alu_src *alu_src = var->update_src;
+ if (!src_only_uses_uniforms(&alu_src->src,
+ alu_src->swizzle[component],
+ uni_offsets, num_offsets))
+ return false;
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
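+/* Gather the uniforms referenced by an if condition (or loop terminator)
+ * and commit their offsets only if the entire expression is inlinable.
+ */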
+static void
+add_inlinable_uniforms(const nir_src *cond, nir_loop_info *info,
+ uint32_t *uni_offsets, uint8_t *num_offsets)
+{
+ uint8_t new_num[PIPE_MAX_CONSTANT_BUFFERS];
+ memcpy(new_num, num_offsets, sizeof(new_num));
+ /* The if-condition SSA is always scalar, so the component is 0. */
+ unsigned component = 0;
+
+ /* Loop info is only non-NULL for a loop terminator, where an induction variable is allowed in the condition. */
+ if (info) {
+ nir_ssa_scalar cond_scalar = {cond->ssa, 0};
+
+ /* Limit the terminator condition to what loop unrolling supports: a
+ * simple comparison (i.e. "i < count" works, but "i + 1 < count" does not).
+ */
+ if (nir_is_supported_terminator_condition(cond_scalar)) {
+ nir_alu_instr *alu = nir_instr_as_alu(cond->ssa->parent_instr);
+
+ /* One side of the comparison is the induction variable; the other
+ * side must use only uniforms.
+ */
+ for (int i = 0; i < 2; i++) {
+ if (is_induction_variable(&alu->src[i].src, alu->src[i].swizzle[0],
+ info, uni_offsets, new_num)) {
+ cond = &alu->src[1 - i].src;
+ component = alu->src[1 - i].swizzle[0];
+ break;
+ }
+ }
+ }
+ }
+
+ /* Only update the uniform count when every uniform in the expression
+ * can be inlined; partially inlined uniforms can't eliminate the
+ * if/loop.
+ *
+ * For example, suppose at most 4 uniforms can be inlined per shader
+ * and 3 have already been added. For
+ *
+ * if (uniform0 + uniform1 == 10)
+ *
+ * only uniform0 could be inlined before hitting the limit, but the
+ * if statement can only be eliminated when both uniform0 and
+ * uniform1 are inlined.
+ *
+ * The same applies to a loop whose induction variable's init and
+ * update values also contain uniforms, like
+ *
+ * for (i = uniform0; i < uniform1; i += uniform2)
+ *
+ * where the loop can only be unrolled if uniform0, uniform1 and
+ * uniform2 are all inlined at once.
+ */
+ if (src_only_uses_uniforms(cond, component, uni_offsets, new_num))
+ memcpy(num_offsets, new_num, sizeof(new_num));
+}
+
+static bool
+is_src_uniform_load(nir_src src)
+{
+ if (nir_src_bit_size(src) != 32 || nir_src_num_components(src) != 1 || nir_src_is_const(src))
+ return false;
+ return src_only_uses_uniforms(&src, 0, NULL, NULL);
+}
+
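+/* Walk the control-flow tree, collecting inlinable uniforms from if
+ * conditions and loop terminators, and gathering stores of uniform-derived
+ * values for the use-count heuristic in lvp_find_inlinable_uniforms().
+ */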
+static void
+process_node(nir_cf_node *node, nir_loop_info *info,
+ uint32_t *uni_offsets, uint8_t *num_offsets,
+ struct set *stores)
+{
+ switch (node->type) {
+ case nir_cf_node_if: {
+ nir_if *if_node = nir_cf_node_as_if(node);
+ const nir_src *cond = &if_node->condition;
+ add_inlinable_uniforms(cond, info, uni_offsets, num_offsets);
+
+ /* Do not pass the loop info down, so induction variables are only
+ * allowed in a loop terminator "if":
+ *
+ * for (i = 0; true; i++)
+ * if (i == count)
+ * if (i == num)
+ * <no break>
+ * break
+ *
+ * Here "num" won't be inlined, because its "if" is not a
+ * terminator.
+ */
+ info = NULL;
+
+ foreach_list_typed(nir_cf_node, nested_node, node, &if_node->then_list)
+ process_node(nested_node, info, uni_offsets, num_offsets, stores);
+ foreach_list_typed(nir_cf_node, nested_node, node, &if_node->else_list)
+ process_node(nested_node, info, uni_offsets, num_offsets, stores);
+ break;
+ }
+
+ case nir_cf_node_loop: {
+ nir_loop *loop = nir_cf_node_as_loop(node);
+
+ /* Replace loop info, no nested loop info currently:
+ *
+ * for (i = 0; i < count0; i++)
+ * for (j = 0; j < count1; j++)
+ * if (i == num)
+ *
+ * so "num" won't be inlined due to "i" is an induction
+ * variable of upper loop.
+ */
+ info = loop->info;
+
+ foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) {
+ bool is_terminator = false;
+ list_for_each_entry(nir_loop_terminator, terminator,
+ &info->loop_terminator_list,
+ loop_terminator_link) {
+ if (nested_node == &terminator->nif->cf_node) {
+ is_terminator = true;
+ break;
+ }
+ }
+
+ /* Allow induction variables for terminator "if" only:
+ *
+ * for (i = 0; i < count; i++)
+ * if (i == num)
+ * <no break>
+ *
+ * so "num" won't be inlined due to the "if" is not a
+ * terminator.
+ */
+ nir_loop_info *use_info = is_terminator ? info : NULL;
+ process_node(nested_node, use_info, uni_offsets, num_offsets, stores);
+ }
+ break;
+ }
+
+ case nir_cf_node_block: {
+ nir_block *block = nir_cf_node_as_block(node);
+ nir_foreach_instr(instr, block) {
+ if (instr->type == nir_instr_type_intrinsic) {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (intr->intrinsic == nir_intrinsic_store_deref && is_src_uniform_load(intr->src[1]))
+ _mesa_set_add(stores, &intr->src[1]);
+ }
+ }
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+bool
+lvp_find_inlinable_uniforms(struct lvp_pipeline *pipeline, nir_shader *shader)
+{
+ bool ret = false;
+ struct set *stores = _mesa_set_create(shader, _mesa_hash_pointer, _mesa_key_pointer_equal);
+ nir_foreach_function(function, shader) {
+ if (function->impl) {
+ nir_metadata_require(function->impl, nir_metadata_loop_analysis, nir_var_all);
+
+ foreach_list_typed(nir_cf_node, node, node, &function->impl->body)
+ process_node(node, NULL, (uint32_t*)pipeline->inlines[shader->info.stage].uniform_offsets, pipeline->inlines[shader->info.stage].count, stores);
+ }
+ }
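+ /* heuristic: only try to inline a stored uniform-derived value if it feeds at least "threshold" uses */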
+ const unsigned threshold = 5;
+ set_foreach(stores, entry) {
+ const nir_src *src = entry->key;
+ unsigned counter = 0;
+ list_for_each_entry(nir_src, rsrc, &src->ssa->uses, use_link) {
+ counter++;
+ if (counter >= threshold)
+ break;
+ }
+ if (counter >= threshold) {
+ uint8_t new_num[PIPE_MAX_CONSTANT_BUFFERS];
+ memcpy(new_num, pipeline->inlines[shader->info.stage].count, sizeof(new_num));
+ if (src_only_uses_uniforms(src, 0, (uint32_t*)pipeline->inlines[shader->info.stage].uniform_offsets, new_num)) {
+ ret = true;
+ memcpy(pipeline->inlines[shader->info.stage].count, new_num, sizeof(new_num));
+ }
+ }
+ }
+ for (unsigned i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+ if (pipeline->inlines[shader->info.stage].count[i])
+ pipeline->inlines[shader->info.stage].can_inline |= BITFIELD_BIT(i);
+ }
+ return ret;
+}
+
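+/* Replace the load_ubo intrinsics recorded for the given UBO slot with the
+ * supplied constant values; vector loads are scalarized so that only the
+ * recorded components are replaced.
+ */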
+void
+lvp_inline_uniforms(nir_shader *shader, const struct lvp_pipeline *pipeline, const uint32_t *uniform_values, uint32_t ubo)
+{
+ if (!pipeline->inlines[shader->info.stage].can_inline)
+ return;
+
+ nir_foreach_function(function, shader) {
+ if (function->impl) {
+ nir_builder b;
+ nir_builder_init(&b, function->impl);
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+ /* Only replace loads with constant offsets. */
+ if (intr->intrinsic == nir_intrinsic_load_ubo &&
+ nir_src_is_const(intr->src[0]) &&
+ nir_src_as_uint(intr->src[0]) == ubo &&
+ nir_src_is_const(intr->src[1]) &&
+ /* TODO: Can't handle other bit sizes for now. */
+ intr->dest.ssa.bit_size == 32) {
+ int num_components = intr->dest.ssa.num_components;
+ uint32_t offset = nir_src_as_uint(intr->src[1]);
+ const unsigned num_uniforms = pipeline->inlines[shader->info.stage].count[ubo];
+ const unsigned *uniform_dw_offsets = pipeline->inlines[shader->info.stage].uniform_offsets[ubo];
+
+ if (num_components == 1) {
+ /* Just replace the uniform load with a constant load. */
+ for (unsigned i = 0; i < num_uniforms; i++) {
+ if (offset == uniform_dw_offsets[i]) {
+ b.cursor = nir_before_instr(&intr->instr);
+ nir_ssa_def *def = nir_imm_int(&b, uniform_values[i]);
+ nir_ssa_def_rewrite_uses(&intr->dest.ssa, def);
+ nir_instr_remove(&intr->instr);
+ break;
+ }
+ }
+ } else {
+ /* Lower the vector uniform load to scalars and replace each
+ * recorded component load with a constant load; offsets here
+ * are byte-based, matching how they were recorded.
+ */
+ uint32_t max_offset = offset + num_components * 4;
+ nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS] = {0};
+ bool found = false;
+
+ b.cursor = nir_before_instr(&intr->instr);
+
+ /* Find component to replace. */
+ for (unsigned i = 0; i < num_uniforms; i++) {
+ uint32_t uni_offset = uniform_dw_offsets[i];
+ if (uni_offset >= offset && uni_offset < max_offset) {
+ int index = (uni_offset - offset) / 4;
+ components[index] = nir_imm_int(&b, uniform_values[i]);
+ found = true;
+ }
+ }
+
+ if (!found)
+ continue;
+
+ /* Create per-component uniform load. */
+ for (unsigned i = 0; i < num_components; i++) {
+ if (!components[i]) {
+ uint32_t scalar_offset = offset + i * 4;
+ components[i] = nir_load_ubo(&b, 1, intr->dest.ssa.bit_size,
+ intr->src[0].ssa,
+ nir_imm_int(&b, scalar_offset));
+ nir_intrinsic_instr *load =
+ nir_instr_as_intrinsic(components[i]->parent_instr);
+ nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX, scalar_offset);
+ nir_intrinsic_set_range_base(load, scalar_offset);
+ nir_intrinsic_set_range(load, 4);
+ }
+ }
+
+ /* Replace the original uniform load. */
+ nir_ssa_def_rewrite_uses(&intr->dest.ssa,
+ nir_vec(&b, components, num_components));
+ nir_instr_remove(&intr->instr);
+ }
+ }
+ }
+ }
+
+ nir_metadata_preserve(function->impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+ }
+}