From 6d28c6e52cfd76855c1368560dd90f12493e2580 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sat, 2 Jan 2016 03:21:28 -0800
Subject: [PATCH] i965: Select ranges of UBO data to be uploaded as push
 constants.

This adds a NIR pass that decides which portions of UBOs we should
upload as push constants, rather than pull constants.

v2: Switch to uint16_t for the UBO block number, because we may
    have a lot of them in Vulkan (suggested by Jason).  Add more
    comments about bitfield trickery (requested by Matt).

v3: Skip vec4 stages for now...I haven't finished wiring up support
    in the vec4 backend, and so pushing the data but not using it
    will just be wasteful.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/intel/Makefile.sources                      |   1 +
 src/intel/compiler/brw_compiler.h               |   9 +
 src/intel/compiler/brw_nir.h                    |   4 +
 src/intel/compiler/brw_nir_analyze_ubo_ranges.c | 298 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_gs.c              |   2 +
 src/mesa/drivers/dri/i965/brw_tcs.c             |   2 +
 src/mesa/drivers/dri/i965/brw_tes.c             |   2 +
 src/mesa/drivers/dri/i965/brw_vs.c              |   2 +
 src/mesa/drivers/dri/i965/brw_wm.c              |   2 +
 9 files changed, 322 insertions(+)
 create mode 100644 src/intel/compiler/brw_nir_analyze_ubo_ranges.c

diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 30643e0..17027cf 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -74,6 +74,7 @@ COMPILER_FILES = \
 	compiler/brw_nir.h \
 	compiler/brw_nir.c \
 	compiler/brw_nir_analyze_boolean_resolves.c \
+	compiler/brw_nir_analyze_ubo_ranges.c \
 	compiler/brw_nir_attribute_workarounds.c \
 	compiler/brw_nir_intrinsics.c \
 	compiler/brw_nir_opt_peephole_ffma.c \
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index e4c22e3..bebd244 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -468,6 +468,13 @@ struct brw_image_param {
  */
 #define BRW_SHADER_TIME_STRIDE 64
 
+struct brw_ubo_range
+{
+   uint16_t block;  /**< UBO block number */
+   uint8_t start;   /**< first 32-byte unit of the range */
+   uint8_t length;  /**< number of 32-byte units in the range */
+};
+
 struct brw_stage_prog_data {
    struct {
       /** size of our binding table. */
@@ -488,6 +495,8 @@ struct brw_stage_prog_data {
       /** @} */
    } binding_table;
 
+   struct brw_ubo_range ubo_ranges[4];
+
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
    unsigned nr_image_params;
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 5d866b8..560027c 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -142,6 +142,10 @@ void brw_nir_setup_glsl_uniforms(nir_shader *shader,
 void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
                                 struct brw_stage_prog_data *stage_prog_data);
 
+void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
+                                nir_shader *nir,
+                                struct brw_ubo_range out_ranges[4]);
+
 bool brw_nir_opt_peephole_ffma(nir_shader *shader);
 
 #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
diff --git a/src/intel/compiler/brw_nir_analyze_ubo_ranges.c b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
new file mode 100644
index 0000000..097aa8e
--- /dev/null
+++ b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir.h"
+#include "util/u_dynarray.h"
+
+/**
+ * \file brw_nir_analyze_ubo_ranges.c
+ *
+ * This pass decides which portions of UBOs to upload as push constants,
+ * so shaders can access them as part of the thread payload, rather than
+ * having to issue expensive memory reads to pull the data.
+ *
+ * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
+ * buffers, in GRF (256-bit/32-byte) units.
+ *
+ * To do this, we examine NIR load_ubo intrinsics, recording the number of
+ * loads at each offset.  We track offsets at a 32-byte granularity, so even
+ * fields with a bit of padding between them tend to fall into contiguous
+ * ranges.  We build a list of these ranges, tracking their "cost" (number
+ * of registers required) and "benefit" (number of pull loads eliminated
+ * by pushing the range).  We then sort the list to obtain the four best
+ * ranges (most benefit for the least cost).
+ */
+
+struct ubo_range_entry
+{
+   struct brw_ubo_range range; /* candidate range; its length is the cost */
+   int benefit;                /* sum of load_ubo uses the range covers */
+};
+
+static int /* ranking value for a candidate range; higher sorts first */
+score(const struct ubo_range_entry *entry)
+{
+   return 2 * entry->benefit - entry->range.length; /* 2x benefit - cost */
+}
+
+/**
+ * qsort() comparator: orders entries by descending score, then block index,
+ * then start offset, so no two distinct ranges ever compare as equal.
+ * Returns <0 if va sorts first, >0 if vb sorts first.
+ */
+static int
+cmp_ubo_range_entry(const void *va, const void *vb)
+{
+   const struct ubo_range_entry *a = va;
+   const struct ubo_range_entry *b = vb;
+
+   /* Rank based on scores */
+   int delta = score(b) - score(a);
+
+   /* Then use the UBO block index as a tie-breaker */
+   if (delta == 0)
+      delta = b->range.block - a->range.block;
+
+   /* Finally use the UBO offset as a second tie-breaker */
+   if (delta == 0)
+      delta = b->range.start - a->range.start;
+
+   return delta;
+}
+
+struct ubo_block_info
+{
+   /* Each bit in the offsets bitfield represents a 32-byte section of data.
+    * If it's set to one, there is interesting UBO data at that offset.  If
+    * not, there's a "hole" - padding between data - or just nothing at all.
+    */
+   uint64_t offsets;
+   uint8_t uses[64]; /* load_ubo count per 32-byte section (wraps at 256) */
+};
+
+struct ubo_analysis_state
+{
+   struct hash_table *blocks;  /* block number + 1 -> struct ubo_block_info */
+   bool uses_regular_uniforms; /* saw a load_uniform intrinsic */
+};
+
+static struct ubo_block_info *
+get_block_info(struct ubo_analysis_state *state, int block)
+{
+   uint32_t hash = block + 1; /* +1 presumably avoids a NULL key */
+   void *key = (void *) (uintptr_t) hash;
+   /* Return the existing record for this block if there is one. */
+   struct hash_entry *entry =
+      _mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
+
+   if (entry)
+      return (struct ubo_block_info *) entry->data;
+   /* First use of this block: create a record (rzalloc) and register it. */
+   struct ubo_block_info *info =
+      rzalloc(state->blocks, struct ubo_block_info);
+   _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
+
+   return info;
+}
+
+static void
+analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      if (intrin->intrinsic == nir_intrinsic_load_uniform)
+         state->uses_regular_uniforms = true;
+
+      if (intrin->intrinsic != nir_intrinsic_load_ubo)
+         continue;
+      /* src[0] is the block index, src[1] the byte offset. */
+      nir_const_value *block_const = nir_src_as_const_value(intrin->src[0]);
+      nir_const_value *offset_const = nir_src_as_const_value(intrin->src[1]);
+      /* Only loads whose block and offset are both constants are counted. */
+      if (block_const && offset_const) {
+         const int block = block_const->u32[0];
+         const int offset = offset_const->u32[0] / 32; /* 32-byte units */
+
+         /* Won't fit in our bitfield */
+         if (offset >= 64)
+            continue;
+
+         /* TODO: should we count uses in loops as higher benefit? */
+
+         struct ubo_block_info *info = get_block_info(state, block);
+         info->offsets |= 1ull << offset;
+         info->uses[offset]++;
+      }
+   }
+}
+
+/* Debug helper: prints one range entry plus its block's offsets bitfield. */
+static void
+print_ubo_entry(FILE *file, const struct ubo_range_entry *entry,
+                struct ubo_analysis_state *state)
+{
+   struct ubo_block_info *info = get_block_info(state, entry->range.block);
+   /* offsets is a uint64_t; cast so the conversion matches on every ABI. */
+   fprintf(file, "block %2d, start %2d, length %2d, bits = %llx, "
+           "benefit %2d, cost %2d, score = %2d\n",
+           entry->range.block, entry->range.start, entry->range.length,
+           (unsigned long long) info->offsets, entry->benefit,
+           entry->range.length, score(entry));
+}
+
+void
+brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
+                           nir_shader *nir,
+                           struct brw_ubo_range out_ranges[4])
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   /* Only Haswell / gen > 7 scalar stages get ranges; others get zeros. */
+   if ((devinfo->gen <= 7 && !devinfo->is_haswell) ||
+       !compiler->scalar_stage[nir->stage]) {
+      memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range));
+      return;
+   }
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   struct ubo_analysis_state state = {
+      .uses_regular_uniforms = false,
+      .blocks =
+         _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
+   };
+
+   /* Walk the IR, recording how many times each UBO block/offset is used. */
+   nir_foreach_function(function, nir) {
+      if (function->impl) {
+         nir_foreach_block(block, function->impl) {
+            analyze_ubos_block(&state, block);
+         }
+      }
+   }
+
+   /* Find ranges: a block, starting 32-byte offset, and length. */
+   struct util_dynarray ranges;
+   util_dynarray_init(&ranges, mem_ctx);
+
+   struct hash_entry *entry;
+   hash_table_foreach(state.blocks, entry) {
+      const int b = entry->hash - 1;
+      const struct ubo_block_info *info = entry->data;
+      uint64_t offsets = info->offsets;
+
+      /* Walk through the offsets bitfield, finding contiguous regions of
+       * set bits:
+       *
+       *   0000000001111111111111000000000000111111111111110000000011111100
+       *            ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
+       *
+       * Each of these will become a UBO range.
+       */
+      while (offsets != 0) {
+         /* Find the first 1 in the offsets bitfield.  This represents the
+          * start of a range of interesting UBO data.  Make it zero-indexed.
+          */
+         int first_bit = ffsll(offsets) - 1;
+
+         /* Find the first 0 bit in offsets beyond first_bit.  To find the
+          * first zero bit, we find the first 1 bit in the complement.  In
+          * order to ignore bits before first_bit, we mask off those bits.
+          */
+         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
+
+         if (first_hole == -1) {
+            /* If we didn't find a hole, then set it to the end of the
+             * bitfield.  There are no more ranges to process.
+             */
+            first_hole = 64;
+            offsets = 0;
+         } else {
+            /* We've processed all bits before first_hole.  Mask them off. */
+            offsets &= ~((1ull << first_hole) - 1);
+         }
+
+         struct ubo_range_entry *entry =
+            util_dynarray_grow(&ranges, sizeof(struct ubo_range_entry));
+
+         entry->range.block = b;
+         entry->range.start = first_bit;
+         /* first_hole is one beyond the end, so we don't need to add 1 */
+         entry->range.length = first_hole - first_bit;
+         entry->benefit = 0;
+
+         for (int i = 0; i < entry->range.length; i++)
+            entry->benefit += info->uses[first_bit + i];
+      }
+   }
+
+   int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
+
+   if (0) { /* debugging aid: set to 1 to dump every candidate range */
+      util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
+         print_ubo_entry(stderr, entry, &state);
+      }
+   }
+
+   /* TODO: Consider combining ranges.
+    *
+    * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS.  If there are
+    * more ranges, and two are close by with only a small hole, it may be
+    * worth combining them.  The holes will waste register space, but the
+    * benefit of removing pulls may outweigh that cost.
+    */
+
+   /* Sort the list so the most beneficial ranges are at the front. */
+   qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
+         cmp_ubo_range_entry);
+
+   struct ubo_range_entry *entries = ranges.data;
+
+   /* Return the top 4 or so.  We drop by one if regular uniforms are in
+    * use, assuming one push buffer will be dedicated to those.  We may
+    * also only get 3 on Haswell if we can't write INSTPM.
+    *
+    * The backend may need to shrink these ranges to ensure that they
+    * don't exceed the maximum push constant limits.  It can simply drop
+    * the tail of the list, as that's the least valuable portion.  We
+    * unfortunately can't truncate it here, because we don't know what
+    * the backend is planning to do with regular uniforms.
+    */
+   const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
+                        state.uses_regular_uniforms;
+   nr_entries = MIN2(nr_entries, max_ubos);
+
+   for (int i = 0; i < nr_entries; i++) {
+      out_ranges[i] = entries[i].range;
+   }
+   for (int i = nr_entries; i < 4; i++) {
+      out_ranges[i].block = 0;
+      out_ranges[i].start = 0;
+      out_ranges[i].length = 0;
+   }
+
+   ralloc_free(ranges.mem_ctx); /* presumably == mem_ctx - TODO confirm */
+}
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 0c04ef0..bd8f993 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -112,6 +112,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
    brw_nir_setup_glsl_uniforms(gp->program.nir, &gp->program,
                                &prog_data.base.base,
                                compiler->scalar_stage[MESA_SHADER_GEOMETRY]);
+   brw_nir_analyze_ubo_ranges(compiler, gp->program.nir,
+                              prog_data.base.base.ubo_ranges);
 
    uint64_t outputs_written = gp->program.info.outputs_written;
 
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 72c5872..1ed622e 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -205,6 +205,8 @@ brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp,
 
       brw_nir_setup_glsl_uniforms(nir, &tcp->program, &prog_data.base.base,
                                   compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
+      brw_nir_analyze_ubo_ranges(compiler, tcp->program.nir,
+                                 prog_data.base.base.ubo_ranges);
    } else {
       /* Upload the Patch URB Header as the first two uniforms.
        * Do the annoying scrambling so the shader doesn't have to.
diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c
index 372ef51..20ce1f4 100644
--- a/src/mesa/drivers/dri/i965/brw_tes.c
+++ b/src/mesa/drivers/dri/i965/brw_tes.c
@@ -102,6 +102,8 @@ brw_codegen_tes_prog(struct brw_context *brw,
 
    brw_nir_setup_glsl_uniforms(nir, &tep->program, &prog_data.base.base,
                                compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);
+   brw_nir_analyze_ubo_ranges(compiler, tep->program.nir,
+                              prog_data.base.base.ubo_ranges);
 
    int st_index = -1;
    if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 33f2ac1..c0a0a13 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -203,6 +203,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
       brw_nir_setup_glsl_uniforms(vp->program.nir, &vp->program,
                                   &prog_data.base.base,
                                   compiler->scalar_stage[MESA_SHADER_VERTEX]);
+      brw_nir_analyze_ubo_ranges(compiler, vp->program.nir,
+                                 prog_data.base.base.ubo_ranges);
    } else {
       brw_nir_setup_arb_uniforms(vp->program.nir, &vp->program,
                                  &prog_data.base.base);
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 71118c1..3a5fcf5 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -165,6 +165,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
    if (!fp->program.is_arb_asm) {
       brw_nir_setup_glsl_uniforms(fp->program.nir, &fp->program,
                                   &prog_data.base, true);
+      brw_nir_analyze_ubo_ranges(brw->screen->compiler, fp->program.nir,
+                                 prog_data.base.ubo_ranges);
    } else {
       brw_nir_setup_arb_uniforms(fp->program.nir, &fp->program,
                                  &prog_data.base);
-- 
2.7.4