freedreno/ir3: Add pass to deal with load_uniform base offsets

author Rob Clark <robdclark@chromium.org>

Fri, 13 Nov 2020 19:48:57 +0000 (11:48 -0800)

committer Marge Bot <eric+marge@anholt.net>

Fri, 13 Nov 2020 22:44:04 +0000 (22:44 +0000)
author Rob Clark <robdclark@chromium.org>
Fri, 13 Nov 2020 19:48:57 +0000 (11:48 -0800)
committer Marge Bot <eric+marge@anholt.net>
Fri, 13 Nov 2020 22:44:04 +0000 (22:44 +0000)
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c

index dfb5f29..29ab296 100644 (file)
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -511,6 +511,13 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
         if (progress)
                 ir3_optimize_loop(s);
  
+       /* Fixup indirect load_uniform's which end up with a const base offset
+        * which is too large to encode.  Do this late(ish) so we actually
+        * can differentiate indirect vs non-indirect.
+        */
+       if (OPT(s, ir3_nir_fixup_load_uniform))
+               ir3_optimize_loop(s);
+
         /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic.  Note that it may
         * produce fnegs, and if so then we need to keep running to squash
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h

index a6ec144..d716e53 100644 (file)
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -61,6 +61,7 @@ void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
                 struct ir3_const_state *const_state);
  void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
  bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
+bool ir3_nir_fixup_load_uniform(nir_shader *nir);
  
  nir_ssa_def *
  ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift);
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c

index de62271..8e7f9aa 100644 (file)
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -457,3 +457,76 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
  
         return progress;
  }
+
+/* Filter callback for ir3_nir_fixup_load_uniform(): returns true only for
+ * load_uniform intrinsics, false for every other instruction.  The arg
+ * pointer is not used.
+ */
+static bool
+fixup_load_uniform_filter(const nir_instr *instr, const void *arg)
+{
+       if (instr->type != nir_instr_type_intrinsic)
+               return false;
+       return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_uniform;
+}
+/* Lowering callback for ir3_nir_fixup_load_uniform(): for a load_uniform
+ * whose offset source is not constant and whose base is at or above the
+ * 10 bit limit, move the excess (a multiple of 1 << 10) out of the base
+ * and add it to the offset source instead.
+ *
+ * Returns NULL when the instruction is left untouched, otherwise
+ * NIR_LOWER_INSTR_PROGRESS (the intrinsic is modified in place).  The arg
+ * pointer is not used.
+ */
+static nir_ssa_def *
+fixup_load_uniform_instr(struct nir_builder *b, nir_instr *instr, void *arg)
+{
+       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+       /* We don't need to worry about the non-indirect (constant offset)
+        * case:
+        */
+       if (nir_src_is_const(intr->src[0]))
+               return NULL;
+
+       const unsigned base_offset_limit = (1 << 10);  /* 10 bits */
+       unsigned base_offset = nir_intrinsic_base(intr);
+
+       /* Or cases where the base offset is lower than the hw limit: */
+       if (base_offset < base_offset_limit)
+               return NULL;
+
+       b->cursor = nir_before_instr(instr);
+
+       nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[0], 1);
+
+       /* We'd like to avoid a sequence like:
+        *
+        *   vec4 32 ssa_18 = intrinsic load_uniform (ssa_4) (1024, 0, 0)
+        *   vec4 32 ssa_19 = intrinsic load_uniform (ssa_4) (1072, 0, 0)
+        *   vec4 32 ssa_20 = intrinsic load_uniform (ssa_4) (1120, 0, 0)
+        *
+        * from turning into a unique offset value per load (which requires
+        * reloading a0.x for each instruction).  So instead of just adding
+        * the constant base_offset to the non-const offset, be a bit more
+        * clever and only extract the part that cannot be encoded.
+        * Afterwards CSE should turn the result into:
+        *
+        *   vec1 32 ssa_5  = load_const (1024)
+        *   vec1 32 ssa_6  = iadd ssa_4, ssa_5
+        *   vec4 32 ssa_18 = intrinsic load_uniform (ssa_6) (0, 0, 0)
+        *   vec4 32 ssa_19 = intrinsic load_uniform (ssa_6) (48, 0, 0)
+        *   vec4 32 ssa_20 = intrinsic load_uniform (ssa_6) (96, 0, 0)
+        *
+        * i.e. the base keeps only its low 10 bits and the remainder is
+        * added to the offset source.
+        */
+       unsigned new_base_offset = base_offset % base_offset_limit;
+
+       nir_intrinsic_set_base(intr, new_base_offset);
+       offset = nir_iadd_imm(b, offset, base_offset - new_base_offset);
+
+       nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(offset));
+
+       return NIR_LOWER_INSTR_PROGRESS;
+}
+
+/**
+ * For relative CONST file access, we can only encode 10b worth of fixed offset,
+ * so in cases where the base offset is larger, we need to peel it out into
+ * ALU instructions.
+ *
+ * This should run late, after constant folding has had a chance to do its
+ * thing, so we can actually know if it is an indirect uniform offset or not.
+ *
+ * The result of nir_shader_lower_instructions() is returned unchanged.
+ */
+bool
+ir3_nir_fixup_load_uniform(nir_shader *nir)
+{
+       return nir_shader_lower_instructions(nir,
+                       fixup_load_uniform_filter, fixup_load_uniform_instr,
+                       NULL);
+}
author	Rob Clark <robdclark@chromium.org>
	Fri, 13 Nov 2020 19:48:57 +0000 (11:48 -0800)
committer	Marge Bot <eric+marge@anholt.net>
	Fri, 13 Nov 2020 22:44:04 +0000 (22:44 +0000)
src/freedreno/ir3/ir3_nir.c		patch \| blob \| history
src/freedreno/ir3/ir3_nir.h		patch \| blob \| history
src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c		patch \| blob \| history