ir3: Reformat source with clang-format
author Connor Abbott <cwabbott0@gmail.com>
Fri, 9 Jul 2021 12:50:05 +0000 (14:50 +0200)
committer Marge Bot <eric+marge@anholt.net>
Mon, 12 Jul 2021 20:57:21 +0000 (20:57 +0000)
Generated using:

cd src/freedreno/ir3 && clang-format -i {**,.}/*.c {**,.}/*.h -style=file

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11801>
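The 3-space indentation, 80-column wrapping, aligned bit-field widths and aligned
macro continuations visible throughout the diff all come from the repository's
.clang-format, picked up via -style=file. As a rough illustration of the kind of
options involved (assumed here for clarity, not copied from Mesa's actual config):

    BasedOnStyle: LLVM
    IndentWidth: 3
    ColumnLimit: 80
    AlignConsecutiveBitFields: true   # lines up the ": N" widths in the packed structs
    AlignTrailingComments: true       # lines up the /* ... */ field comments
    AlignEscapedNewlines: Right       # pushes macro continuation '\' to the column limit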

52 files changed:
src/freedreno/ir3/disasm-a3xx.c
src/freedreno/ir3/instr-a3xx.h
src/freedreno/ir3/ir3.c
src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_a4xx.c
src/freedreno/ir3/ir3_a6xx.c
src/freedreno/ir3/ir3_array_to_ssa.c
src/freedreno/ir3/ir3_assembler.c
src/freedreno/ir3/ir3_assembler.h
src/freedreno/ir3/ir3_cf.c
src/freedreno/ir3/ir3_compiler.c
src/freedreno/ir3/ir3_compiler.h
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_context.c
src/freedreno/ir3/ir3_context.h
src/freedreno/ir3/ir3_cp.c
src/freedreno/ir3/ir3_cp_postsched.c
src/freedreno/ir3/ir3_cse.c
src/freedreno/ir3/ir3_dce.c
src/freedreno/ir3/ir3_delay.c
src/freedreno/ir3/ir3_disk_cache.c
src/freedreno/ir3/ir3_dominance.c
src/freedreno/ir3/ir3_image.c
src/freedreno/ir3/ir3_image.h
src/freedreno/ir3/ir3_legalize.c
src/freedreno/ir3/ir3_liveness.c
src/freedreno/ir3/ir3_lower_parallelcopy.c
src/freedreno/ir3/ir3_lower_subgroups.c
src/freedreno/ir3/ir3_merge_regs.c
src/freedreno/ir3/ir3_nir.c
src/freedreno/ir3/ir3_nir.h
src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
src/freedreno/ir3/ir3_nir_lower_io_offsets.c
src/freedreno/ir3/ir3_nir_lower_load_barycentric_at_offset.c
src/freedreno/ir3/ir3_nir_lower_load_barycentric_at_sample.c
src/freedreno/ir3/ir3_nir_lower_tess.c
src/freedreno/ir3/ir3_nir_lower_tex_prefetch.c
src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
src/freedreno/ir3/ir3_nir_move_varying_inputs.c
src/freedreno/ir3/ir3_postsched.c
src/freedreno/ir3/ir3_print.c
src/freedreno/ir3/ir3_ra.c
src/freedreno/ir3/ir3_ra.h
src/freedreno/ir3/ir3_ra_validate.c
src/freedreno/ir3/ir3_sched.c
src/freedreno/ir3/ir3_shader.c
src/freedreno/ir3/ir3_shader.h
src/freedreno/ir3/ir3_spill.c
src/freedreno/ir3/ir3_validate.c
src/freedreno/ir3/regmask.h
src/freedreno/ir3/tests/delay.c
src/freedreno/ir3/tests/disasm.c

diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
index 757d307..4a0fb40 100644
  * SOFTWARE.
  */
 
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
 #include <string.h>
-#include <assert.h>
 
-#include <util/u_debug.h>
 #include <util/log.h>
+#include <util/u_debug.h>
 
 #include "isa/isa.h"
 
 static enum debug_t debug;
 
 static const char *levels[] = {
-               "",
-               "\t",
-               "\t\t",
-               "\t\t\t",
-               "\t\t\t\t",
-               "\t\t\t\t\t",
-               "\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t\t\t",
-               "x",
-               "x",
-               "x",
-               "x",
-               "x",
-               "x",
+   "",
+   "\t",
+   "\t\t",
+   "\t\t\t",
+   "\t\t\t\t",
+   "\t\t\t\t\t",
+   "\t\t\t\t\t\t",
+   "\t\t\t\t\t\t\t",
+   "\t\t\t\t\t\t\t\t",
+   "\t\t\t\t\t\t\t\t\t",
+   "x",
+   "x",
+   "x",
+   "x",
+   "x",
+   "x",
 };
 
 struct disasm_ctx {
-       FILE *out;
-       struct isa_decode_options *options;
-       unsigned level;
-       unsigned extra_cycles;
-
-       /**
-        * nop_count/has_end used to detect the real end of shader.  Since
-        * in some cases there can be a epilogue following an `end` we look
-        * for a sequence of `nop`s following the `end`
-        */
-       int nop_count;      /* number of nop's since non-nop instruction: */
-       bool has_end;       /* have we seen end instruction */
-
-       int cur_n;          /* current instr # */
-       int cur_opc_cat;    /* current opc_cat */
-
-       int sfu_delay;
-
-       /**
-        * State accumulated decoding fields of the current instruction,
-        * handled after decoding is complete (ie. at start of next instr)
-        */
-       struct {
-               bool ss;
-               uint8_t nop;
-               uint8_t repeat;
-       } last;
-
-       /**
-        * State accumulated decoding fields of src or dst register
-        */
-       struct {
-               bool half;
-               bool r;
-               enum {
-                       FILE_GPR = 1,
-                       FILE_CONST = 2,
-               } file;
-               unsigned num;
-       } reg;
-
-       struct shader_stats *stats;
+   FILE *out;
+   struct isa_decode_options *options;
+   unsigned level;
+   unsigned extra_cycles;
+
+   /**
+    * nop_count/has_end used to detect the real end of shader.  Since
+    * in some cases there can be a epilogue following an `end` we look
+    * for a sequence of `nop`s following the `end`
+    */
+   int nop_count; /* number of nop's since non-nop instruction: */
+   bool has_end;  /* have we seen end instruction */
+
+   int cur_n;       /* current instr # */
+   int cur_opc_cat; /* current opc_cat */
+
+   int sfu_delay;
+
+   /**
+    * State accumulated decoding fields of the current instruction,
+    * handled after decoding is complete (ie. at start of next instr)
+    */
+   struct {
+      bool ss;
+      uint8_t nop;
+      uint8_t repeat;
+   } last;
+
+   /**
+    * State accumulated decoding fields of src or dst register
+    */
+   struct {
+      bool half;
+      bool r;
+      enum {
+         FILE_GPR = 1,
+         FILE_CONST = 2,
+      } file;
+      unsigned num;
+   } reg;
+
+   struct shader_stats *stats;
 };
 
-static void print_stats(struct disasm_ctx *ctx)
+static void
+print_stats(struct disasm_ctx *ctx)
 {
-       if (ctx->options->gpu_id >= 600) {
-               /* handle MERGEREGS case.. this isn't *entirely* accurate, as
-                * you can have shader stages not using merged register file,
-                * but it is good enough for a guestimate:
-                */
-               unsigned n = (ctx->stats->halfreg + 1) / 2;
-
-               ctx->stats->halfreg = 0;
-               ctx->stats->fullreg = MAX2(ctx->stats->fullreg, n);
-       }
-
-       unsigned instructions = ctx->cur_n + ctx->extra_cycles + 1;
-
-       fprintf(ctx->out, "%sStats:\n", levels[ctx->level]);
-       fprintf(ctx->out, "%s- shaderdb: %u instr, %u nops, %u non-nops, %u mov, %u cov\n",
-                       levels[ctx->level],
-                       instructions,
-                       ctx->stats->nops,
-                       instructions - ctx->stats->nops,
-                       ctx->stats->mov_count,
-                       ctx->stats->cov_count);
-
-       fprintf(ctx->out, "%s- shaderdb: %u last-baryf, %d half, %d full, %u constlen\n",
-                       levels[ctx->level],
-                       ctx->stats->last_baryf,
-                       DIV_ROUND_UP(ctx->stats->halfreg, 4),
-                       DIV_ROUND_UP(ctx->stats->fullreg, 4),
-                       DIV_ROUND_UP(ctx->stats->constlen, 4));
-
-       fprintf(ctx->out, "%s- shaderdb: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7\n",
-                       levels[ctx->level],
-                       ctx->stats->instrs_per_cat[0],
-                       ctx->stats->instrs_per_cat[1],
-                       ctx->stats->instrs_per_cat[2],
-                       ctx->stats->instrs_per_cat[3],
-                       ctx->stats->instrs_per_cat[4],
-                       ctx->stats->instrs_per_cat[5],
-                       ctx->stats->instrs_per_cat[6],
-                       ctx->stats->instrs_per_cat[7]);
-
-       fprintf(ctx->out, "%s- shaderdb: %u sstall, %u (ss), %u (sy)\n",
-                       levels[ctx->level],
-                       ctx->stats->sstall,
-                       ctx->stats->ss,
-                       ctx->stats->sy);
+   if (ctx->options->gpu_id >= 600) {
+      /* handle MERGEREGS case.. this isn't *entirely* accurate, as
+       * you can have shader stages not using merged register file,
+       * but it is good enough for a guestimate:
+       */
+      unsigned n = (ctx->stats->halfreg + 1) / 2;
+
+      ctx->stats->halfreg = 0;
+      ctx->stats->fullreg = MAX2(ctx->stats->fullreg, n);
+   }
+
+   unsigned instructions = ctx->cur_n + ctx->extra_cycles + 1;
+
+   fprintf(ctx->out, "%sStats:\n", levels[ctx->level]);
+   fprintf(ctx->out,
+           "%s- shaderdb: %u instr, %u nops, %u non-nops, %u mov, %u cov\n",
+           levels[ctx->level], instructions, ctx->stats->nops,
+           instructions - ctx->stats->nops, ctx->stats->mov_count,
+           ctx->stats->cov_count);
+
+   fprintf(ctx->out,
+           "%s- shaderdb: %u last-baryf, %d half, %d full, %u constlen\n",
+           levels[ctx->level], ctx->stats->last_baryf,
+           DIV_ROUND_UP(ctx->stats->halfreg, 4),
+           DIV_ROUND_UP(ctx->stats->fullreg, 4),
+           DIV_ROUND_UP(ctx->stats->constlen, 4));
+
+   fprintf(
+      ctx->out,
+      "%s- shaderdb: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7\n",
+      levels[ctx->level], ctx->stats->instrs_per_cat[0],
+      ctx->stats->instrs_per_cat[1], ctx->stats->instrs_per_cat[2],
+      ctx->stats->instrs_per_cat[3], ctx->stats->instrs_per_cat[4],
+      ctx->stats->instrs_per_cat[5], ctx->stats->instrs_per_cat[6],
+      ctx->stats->instrs_per_cat[7]);
+
+   fprintf(ctx->out, "%s- shaderdb: %u sstall, %u (ss), %u (sy)\n",
+           levels[ctx->level], ctx->stats->sstall, ctx->stats->ss,
+           ctx->stats->sy);
 }
 
 /* size of largest OPC field of all the instruction categories: */
 #define NOPC_BITS 6
 
 static const struct opc_info {
-       const char *name;
-} opcs[1 << (3+NOPC_BITS)] = {
-#define OPC(cat, opc, name) [(opc)] = { #name }
+   const char *name;
+} opcs[1 << (3 + NOPC_BITS)] = {
+#define OPC(cat, opc, name) [(opc)] = {#name}
    /* clang-format off */
    /* category 0: */
    OPC(0, OPC_NOP,          nop),
@@ -359,96 +354,96 @@ static const struct opc_info {
 #undef OPC
 };
 
-#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))
+#define GETINFO(instr)                                                         \
+   (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))
 
-const char *disasm_a3xx_instr_name(opc_t opc)
+const char *
+disasm_a3xx_instr_name(opc_t opc)
 {
-       if (opc_cat(opc) == -1) return "??meta??";
-       return opcs[opc].name;
+   if (opc_cat(opc) == -1)
+      return "??meta??";
+   return opcs[opc].name;
 }
 
-
 static void
 disasm_field_cb(void *d, const char *field_name, struct isa_decode_value *val)
 {
-       struct disasm_ctx *ctx = d;
-
-       if (!strcmp(field_name, "NAME")) {
-               if (!strcmp("nop", val->str)) {
-                       if (ctx->has_end) {
-                               ctx->nop_count++;
-                               if (ctx->nop_count > 3) {
-                                       ctx->options->stop = true;
-                               }
-                       }
-                       ctx->stats->nops += 1 + ctx->last.repeat;
-               } else {
-                       ctx->nop_count = 0;
-               }
-
-               if (!strcmp("end", val->str)) {
-                       ctx->has_end = true;
-                       ctx->nop_count = 0;
-               } else if (!strcmp("chsh", val->str)) {
-                       ctx->options->stop = true;
-               } else if (!strcmp("bary.f", val->str)) {
-                       ctx->stats->last_baryf = ctx->cur_n;
-               }
-       } else if (!strcmp(field_name, "REPEAT")) {
-               ctx->extra_cycles += val->num;
-               ctx->stats->instrs_per_cat[ctx->cur_opc_cat] += val->num;
-               ctx->last.repeat = val->num;
-       } else if (!strcmp(field_name, "NOP")) {
-               ctx->extra_cycles += val->num;
-               ctx->stats->instrs_per_cat[0] += val->num;
-               ctx->stats->nops += val->num;
-               ctx->last.nop = val->num;
-       } else if (!strcmp(field_name, "SY")) {
-               ctx->stats->sy += val->num;
-       } else if (!strcmp(field_name, "SS")) {
-               ctx->stats->ss += val->num;
-               ctx->last.ss = !!val->num;
-       } else if (!strcmp(field_name, "CONST")) {
-               ctx->reg.num = val->num;
-               ctx->reg.file = FILE_CONST;
-       } else if (!strcmp(field_name, "GPR")) {
-               /* don't count GPR regs r48.x (shared) or higher: */
-               if (val->num < 48) {
-                       ctx->reg.num = val->num;
-                       ctx->reg.file = FILE_GPR;
-               }
-       } else if (!strcmp(field_name, "SRC_R") ||
-                       !strcmp(field_name, "SRC1_R") ||
-                       !strcmp(field_name, "SRC2_R") ||
-                       !strcmp(field_name, "SRC3_R")) {
-               ctx->reg.r = val->num;
-       } else if (!strcmp(field_name, "DST")) {
-               /* Dest register is always repeated
-                *
-                * Note that this doesn't really properly handle instructions
-                * that write multiple components.. the old disasm didn't handle
-                * that case either.
-                */
-               ctx->reg.r = true;
-       } else if (strstr(field_name, "HALF")) {
-               ctx->reg.half = val->num;
-       } else if (!strcmp(field_name, "SWIZ")) {
-               unsigned num = (ctx->reg.num << 2) | val->num;
-               if (ctx->reg.r)
-                       num += ctx->last.repeat;
-
-               if (ctx->reg.file == FILE_CONST) {
-                       ctx->stats->constlen = MAX2(ctx->stats->constlen, num);
-               } else if (ctx->reg.file == FILE_GPR) {
-                       if (ctx->reg.half) {
-                               ctx->stats->halfreg = MAX2(ctx->stats->halfreg, num);
-                       } else {
-                               ctx->stats->fullreg = MAX2(ctx->stats->fullreg, num);
-                       }
-               }
-
-               memset(&ctx->reg, 0, sizeof(ctx->reg));
-       }
+   struct disasm_ctx *ctx = d;
+
+   if (!strcmp(field_name, "NAME")) {
+      if (!strcmp("nop", val->str)) {
+         if (ctx->has_end) {
+            ctx->nop_count++;
+            if (ctx->nop_count > 3) {
+               ctx->options->stop = true;
+            }
+         }
+         ctx->stats->nops += 1 + ctx->last.repeat;
+      } else {
+         ctx->nop_count = 0;
+      }
+
+      if (!strcmp("end", val->str)) {
+         ctx->has_end = true;
+         ctx->nop_count = 0;
+      } else if (!strcmp("chsh", val->str)) {
+         ctx->options->stop = true;
+      } else if (!strcmp("bary.f", val->str)) {
+         ctx->stats->last_baryf = ctx->cur_n;
+      }
+   } else if (!strcmp(field_name, "REPEAT")) {
+      ctx->extra_cycles += val->num;
+      ctx->stats->instrs_per_cat[ctx->cur_opc_cat] += val->num;
+      ctx->last.repeat = val->num;
+   } else if (!strcmp(field_name, "NOP")) {
+      ctx->extra_cycles += val->num;
+      ctx->stats->instrs_per_cat[0] += val->num;
+      ctx->stats->nops += val->num;
+      ctx->last.nop = val->num;
+   } else if (!strcmp(field_name, "SY")) {
+      ctx->stats->sy += val->num;
+   } else if (!strcmp(field_name, "SS")) {
+      ctx->stats->ss += val->num;
+      ctx->last.ss = !!val->num;
+   } else if (!strcmp(field_name, "CONST")) {
+      ctx->reg.num = val->num;
+      ctx->reg.file = FILE_CONST;
+   } else if (!strcmp(field_name, "GPR")) {
+      /* don't count GPR regs r48.x (shared) or higher: */
+      if (val->num < 48) {
+         ctx->reg.num = val->num;
+         ctx->reg.file = FILE_GPR;
+      }
+   } else if (!strcmp(field_name, "SRC_R") || !strcmp(field_name, "SRC1_R") ||
+              !strcmp(field_name, "SRC2_R") || !strcmp(field_name, "SRC3_R")) {
+      ctx->reg.r = val->num;
+   } else if (!strcmp(field_name, "DST")) {
+      /* Dest register is always repeated
+       *
+       * Note that this doesn't really properly handle instructions
+       * that write multiple components.. the old disasm didn't handle
+       * that case either.
+       */
+      ctx->reg.r = true;
+   } else if (strstr(field_name, "HALF")) {
+      ctx->reg.half = val->num;
+   } else if (!strcmp(field_name, "SWIZ")) {
+      unsigned num = (ctx->reg.num << 2) | val->num;
+      if (ctx->reg.r)
+         num += ctx->last.repeat;
+
+      if (ctx->reg.file == FILE_CONST) {
+         ctx->stats->constlen = MAX2(ctx->stats->constlen, num);
+      } else if (ctx->reg.file == FILE_GPR) {
+         if (ctx->reg.half) {
+            ctx->stats->halfreg = MAX2(ctx->stats->halfreg, num);
+         } else {
+            ctx->stats->fullreg = MAX2(ctx->stats->fullreg, num);
+         }
+      }
+
+      memset(&ctx->reg, 0, sizeof(ctx->reg));
+   }
 }
 
 /**
@@ -458,103 +453,105 @@ disasm_field_cb(void *d, const char *field_name, struct isa_decode_value *val)
 static void
 disasm_handle_last(struct disasm_ctx *ctx)
 {
-       if (ctx->last.ss) {
-               ctx->stats->sstall += ctx->sfu_delay;
-               ctx->sfu_delay = 0;
-       }
-
-       if (ctx->cur_opc_cat == 4) {
-               ctx->sfu_delay = 10;
-       } else {
-               int n = MIN2(ctx->sfu_delay, 1 + ctx->last.repeat + ctx->last.nop);
-               ctx->sfu_delay -= n;
-       }
-
-       memset(&ctx->last, 0, sizeof(ctx->last));
+   if (ctx->last.ss) {
+      ctx->stats->sstall += ctx->sfu_delay;
+      ctx->sfu_delay = 0;
+   }
+
+   if (ctx->cur_opc_cat == 4) {
+      ctx->sfu_delay = 10;
+   } else {
+      int n = MIN2(ctx->sfu_delay, 1 + ctx->last.repeat + ctx->last.nop);
+      ctx->sfu_delay -= n;
+   }
+
+   memset(&ctx->last, 0, sizeof(ctx->last));
 }
 
 static void
 disasm_instr_cb(void *d, unsigned n, uint64_t instr)
 {
-       struct disasm_ctx *ctx = d;
-       uint32_t *dwords = (uint32_t *)&instr;
-       unsigned opc_cat = instr >> 61;
-
-       /* There are some cases where we can get instr_cb called multiple
-        * times per instruction (like when we need an extra line for branch
-        * target labels), don't update stats in these cases:
-        */
-       if (n != ctx->cur_n) {
-               if (n > 0) {
-                       disasm_handle_last(ctx);
-               }
-               ctx->stats->instrs_per_cat[opc_cat]++;
-               ctx->cur_n = n;
-
-               /* mov vs cov stats are a bit harder to fish out of the field
-                * names, because current ir3-cat1.xml doesn't use {NAME} for
-                * this distinction.  So for now just handle this case with
-                * some hand-coded parsing:
-                */
-               if (opc_cat == 1) {
-                       unsigned opc      = (instr >> 57) & 0x3;
-                       unsigned src_type = (instr >> 50) & 0x7;
-                       unsigned dst_type = (instr >> 46) & 0x7;
-
-                       if (opc == 0) {
-                               if (src_type == dst_type) {
-                                       ctx->stats->mov_count++;
-                               } else {
-                                       ctx->stats->cov_count++;
-                               }
-                       }
-               }
-       }
-
-       ctx->cur_opc_cat = opc_cat;
-
-       if (debug & PRINT_RAW) {
-               fprintf(ctx->out, "%s:%d:%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
-                       opc_cat, n, ctx->extra_cycles + n, dwords[1], dwords[0]);
-       }
+   struct disasm_ctx *ctx = d;
+   uint32_t *dwords = (uint32_t *)&instr;
+   unsigned opc_cat = instr >> 61;
+
+   /* There are some cases where we can get instr_cb called multiple
+    * times per instruction (like when we need an extra line for branch
+    * target labels), don't update stats in these cases:
+    */
+   if (n != ctx->cur_n) {
+      if (n > 0) {
+         disasm_handle_last(ctx);
+      }
+      ctx->stats->instrs_per_cat[opc_cat]++;
+      ctx->cur_n = n;
+
+      /* mov vs cov stats are a bit harder to fish out of the field
+       * names, because current ir3-cat1.xml doesn't use {NAME} for
+       * this distinction.  So for now just handle this case with
+       * some hand-coded parsing:
+       */
+      if (opc_cat == 1) {
+         unsigned opc = (instr >> 57) & 0x3;
+         unsigned src_type = (instr >> 50) & 0x7;
+         unsigned dst_type = (instr >> 46) & 0x7;
+
+         if (opc == 0) {
+            if (src_type == dst_type) {
+               ctx->stats->mov_count++;
+            } else {
+               ctx->stats->cov_count++;
+            }
+         }
+      }
+   }
+
+   ctx->cur_opc_cat = opc_cat;
+
+   if (debug & PRINT_RAW) {
+      fprintf(ctx->out, "%s:%d:%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
+              opc_cat, n, ctx->extra_cycles + n, dwords[1], dwords[0]);
+   }
 }
 
-int disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
-               unsigned gpu_id, struct shader_stats *stats)
+int
+disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
+                 unsigned gpu_id, struct shader_stats *stats)
 {
-       struct isa_decode_options decode_options = {
-               .gpu_id = gpu_id,
-               .show_errors = true,
-               .max_errors = 5,
-               .branch_labels = true,
-               .field_cb = disasm_field_cb,
-               .instr_cb = disasm_instr_cb,
-       };
-       struct disasm_ctx ctx = {
-               .out = out,
-               .level = level,
-               .options = &decode_options,
-               .stats = stats,
-               .cur_n = -1,
-       };
-
-       memset(stats, 0, sizeof(*stats));
-
-       decode_options.cbdata = &ctx;
-
-       isa_decode(dwords, sizedwords * 4, out, &decode_options);
-
-       disasm_handle_last(&ctx);
-
-       if (debug & PRINT_STATS)
-               print_stats(&ctx);
-
-       return 0;
+   struct isa_decode_options decode_options = {
+      .gpu_id = gpu_id,
+      .show_errors = true,
+      .max_errors = 5,
+      .branch_labels = true,
+      .field_cb = disasm_field_cb,
+      .instr_cb = disasm_instr_cb,
+   };
+   struct disasm_ctx ctx = {
+      .out = out,
+      .level = level,
+      .options = &decode_options,
+      .stats = stats,
+      .cur_n = -1,
+   };
+
+   memset(stats, 0, sizeof(*stats));
+
+   decode_options.cbdata = &ctx;
+
+   isa_decode(dwords, sizedwords * 4, out, &decode_options);
+
+   disasm_handle_last(&ctx);
+
+   if (debug & PRINT_STATS)
+      print_stats(&ctx);
+
+   return 0;
 }
 
-void disasm_a3xx_set_debug(enum debug_t d)
+void
+disasm_a3xx_set_debug(enum debug_t d)
 {
-       debug = d;
+   debug = d;
 }
 
 #include <setjmp.h>
@@ -564,34 +561,38 @@ static jmp_buf jmp_env;
 
 void
 ir3_assert_handler(const char *expr, const char *file, int line,
-               const char *func)
+                   const char *func)
 {
-       mesa_loge("%s:%u: %s: Assertion `%s' failed.", file, line, func, expr);
-       if (jmp_env_valid)
-               longjmp(jmp_env, 1);
-       abort();
+   mesa_loge("%s:%u: %s: Assertion `%s' failed.", file, line, func, expr);
+   if (jmp_env_valid)
+      longjmp(jmp_env, 1);
+   abort();
 }
 
-#define TRY(x) do { \
-               assert(!jmp_env_valid); \
-               if (setjmp(jmp_env) == 0) { \
-                       jmp_env_valid = true; \
-                       x; \
-               } \
-               jmp_env_valid = false; \
-       } while (0)
-
-
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id)
+#define TRY(x)                                                                 \
+   do {                                                                        \
+      assert(!jmp_env_valid);                                                  \
+      if (setjmp(jmp_env) == 0) {                                              \
+         jmp_env_valid = true;                                                 \
+         x;                                                                    \
+      }                                                                        \
+      jmp_env_valid = false;                                                   \
+   } while (0)
+
+int
+disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out,
+            unsigned gpu_id)
 {
-       struct shader_stats stats;
-       return disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats);
+   struct shader_stats stats;
+   return disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats);
 }
 
-int try_disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id)
+int
+try_disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out,
+                unsigned gpu_id)
 {
-       struct shader_stats stats;
-       int ret = -1;
-       TRY(ret = disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats));
-       return ret;
+   struct shader_stats stats;
+   int ret = -1;
+   TRY(ret = disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats));
+   return ret;
 }
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
index 71b03e6..8eba64c 100644
 
 #define PACKED __attribute__((__packed__))
 
+#include <assert.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
-#include <stdbool.h>
-#include <assert.h>
 
 /* clang-format off */
 void ir3_assert_handler(const char *expr, const char *file, int line,
@@ -41,18 +41,19 @@ void ir3_assert_handler(const char *expr, const char *file, int line,
  * attempt to disassemble memory that might not actually be valid
  * instructions.
  */
-#define ir3_assert(expr) do { \
-               if (!(expr)) { \
-                       if (ir3_assert_handler) { \
-                               ir3_assert_handler(#expr, __FILE__, __LINE__, __func__); \
-                       } \
-                       assert(expr); \
-               } \
-       } while (0)
+#define ir3_assert(expr)                                                       \
+   do {                                                                        \
+      if (!(expr)) {                                                           \
+         if (ir3_assert_handler) {                                             \
+            ir3_assert_handler(#expr, __FILE__, __LINE__, __func__);           \
+         }                                                                     \
+         assert(expr);                                                         \
+      }                                                                        \
+   } while (0)
 /* size of largest OPC field of all the instruction categories: */
 #define NOPC_BITS 6
 
-#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | opc)
+#define _OPC(cat, opc) (((cat) << NOPC_BITS) | opc)
 
 /* clang-format off */
 typedef enum {
@@ -347,76 +348,80 @@ typedef enum {
 const char *disasm_a3xx_instr_name(opc_t opc);
 
 typedef enum {
-       TYPE_F16 = 0,
-       TYPE_F32 = 1,
-       TYPE_U16 = 2,
-       TYPE_U32 = 3,
-       TYPE_S16 = 4,
-       TYPE_S32 = 5,
-       TYPE_U8  = 6,
-       TYPE_S8  = 7,  // XXX I assume?
+   TYPE_F16 = 0,
+   TYPE_F32 = 1,
+   TYPE_U16 = 2,
+   TYPE_U32 = 3,
+   TYPE_S16 = 4,
+   TYPE_S32 = 5,
+   TYPE_U8 = 6,
+   TYPE_S8 = 7, // XXX I assume?
 } type_t;
 
-static inline uint32_t type_size(type_t type)
+static inline uint32_t
+type_size(type_t type)
 {
-       switch (type) {
-       case TYPE_F32:
-       case TYPE_U32:
-       case TYPE_S32:
-               return 32;
-       case TYPE_F16:
-       case TYPE_U16:
-       case TYPE_S16:
-               return 16;
-       case TYPE_U8:
-       case TYPE_S8:
-               return 8;
-       default:
-               ir3_assert(0); /* invalid type */
-               return 0;
-       }
+   switch (type) {
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      return 32;
+   case TYPE_F16:
+   case TYPE_U16:
+   case TYPE_S16:
+      return 16;
+   case TYPE_U8:
+   case TYPE_S8:
+      return 8;
+   default:
+      ir3_assert(0); /* invalid type */
+      return 0;
+   }
 }
 
-static inline int type_float(type_t type)
+static inline int
+type_float(type_t type)
 {
-       return (type == TYPE_F32) || (type == TYPE_F16);
+   return (type == TYPE_F32) || (type == TYPE_F16);
 }
 
-static inline int type_uint(type_t type)
+static inline int
+type_uint(type_t type)
 {
-       return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
+   return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
 }
 
-static inline int type_sint(type_t type)
+static inline int
+type_sint(type_t type)
 {
-       return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
+   return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
 }
 
 typedef enum {
-       ROUND_ZERO = 0,
-       ROUND_EVEN = 1,
-       ROUND_POS_INF = 2,
-       ROUND_NEG_INF = 3,
+   ROUND_ZERO = 0,
+   ROUND_EVEN = 1,
+   ROUND_POS_INF = 2,
+   ROUND_NEG_INF = 3,
 } round_t;
 
 typedef union PACKED {
-       /* normal gpr or const src register: */
-       struct PACKED {
-               uint32_t comp  : 2;
-               uint32_t num   : 10;
-       };
-       /* for immediate val: */
-       int32_t  iim_val   : 11;
-       /* to make compiler happy: */
-       uint32_t dummy32;
-       uint32_t dummy10   : 10;
-       int32_t  idummy10  : 10;
-       uint32_t dummy11   : 11;
-       uint32_t dummy12   : 12;
-       uint32_t dummy13   : 13;
-       uint32_t dummy8    : 8;
-       int32_t  idummy13  : 13;
-       int32_t  idummy8   : 8;
+   /* normal gpr or const src register: */
+   struct PACKED {
+      uint32_t comp : 2;
+      uint32_t num  : 10;
+   };
+   /* for immediate val: */
+   int32_t iim_val : 11;
+   /* to make compiler happy: */
+   uint32_t dummy32;
+   uint32_t dummy10 : 10;
+   int32_t idummy10 : 10;
+   uint32_t dummy11 : 11;
+   uint32_t dummy12 : 12;
+   uint32_t dummy13 : 13;
+   uint32_t dummy8  : 8;
+   int32_t idummy13 : 13;
+   int32_t idummy8  : 8;
 } reg_t;
 
 /* comp:
@@ -425,293 +430,296 @@ typedef union PACKED {
  *   2 - z
  *   3 - w
  */
-static inline uint32_t regid(int num, int comp)
+static inline uint32_t
+regid(int num, int comp)
 {
-       return (num << 2) | (comp & 0x3);
+   return (num << 2) | (comp & 0x3);
 }
 
-#define INVALID_REG      regid(63, 0)
-#define VALIDREG(r)      ((r) != INVALID_REG)
-#define CONDREG(r, val)  COND(VALIDREG(r), (val))
+#define INVALID_REG     regid(63, 0)
+#define VALIDREG(r)     ((r) != INVALID_REG)
+#define CONDREG(r, val) COND(VALIDREG(r), (val))
 
 /* special registers: */
-#define REG_A0 61       /* address register */
-#define REG_P0 62       /* predicate register */
+#define REG_A0 61 /* address register */
+#define REG_P0 62 /* predicate register */
 
-static inline int reg_special(reg_t reg)
+static inline int
+reg_special(reg_t reg)
 {
-       return (reg.num == REG_A0) || (reg.num == REG_P0);
+   return (reg.num == REG_A0) || (reg.num == REG_P0);
 }
 
 typedef enum {
-       BRANCH_PLAIN = 0,   /* br */
-       BRANCH_OR    = 1,   /* brao */
-       BRANCH_AND   = 2,   /* braa */
-       BRANCH_CONST = 3,   /* brac */
-       BRANCH_ANY   = 4,   /* bany */
-       BRANCH_ALL   = 5,   /* ball */
-       BRANCH_X     = 6,   /* brax ??? */
+   BRANCH_PLAIN = 0, /* br */
+   BRANCH_OR = 1,    /* brao */
+   BRANCH_AND = 2,   /* braa */
+   BRANCH_CONST = 3, /* brac */
+   BRANCH_ANY = 4,   /* bany */
+   BRANCH_ALL = 5,   /* ball */
+   BRANCH_X = 6,     /* brax ??? */
 } brtype_t;
 
 typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       int16_t  immed    : 16;
-                       uint32_t dummy1   : 16;
-               } a3xx;
-               struct PACKED {
-                       int32_t  immed    : 20;
-                       uint32_t dummy1   : 12;
-               } a4xx;
-               struct PACKED {
-                       int32_t immed     : 32;
-               } a5xx;
-       };
-
-       /* dword1: */
-       uint32_t idx      : 5;  /* brac.N index */
-       uint32_t brtype   : 3;  /* branch type, see brtype_t */
-       uint32_t repeat   : 3;
-       uint32_t dummy3   : 1;
-       uint32_t ss       : 1;
-       uint32_t inv2     : 1;
-       uint32_t comp2    : 2;
-       uint32_t eq       : 1;
-       uint32_t opc_hi   : 1;  /* at least one bit */
-       uint32_t dummy4   : 2;
-       uint32_t inv1     : 1;
-       uint32_t comp1    : 2;  /* component for first src */
-       uint32_t opc      : 4;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
+   /* dword0: */
+   union PACKED {
+      struct PACKED {
+         int16_t immed   : 16;
+         uint32_t dummy1 : 16;
+      } a3xx;
+      struct PACKED {
+         int32_t immed   : 20;
+         uint32_t dummy1 : 12;
+      } a4xx;
+      struct PACKED {
+         int32_t immed : 32;
+      } a5xx;
+   };
+
+   /* dword1: */
+   uint32_t idx     : 5; /* brac.N index */
+   uint32_t brtype  : 3; /* branch type, see brtype_t */
+   uint32_t repeat  : 3;
+   uint32_t dummy3  : 1;
+   uint32_t ss      : 1;
+   uint32_t inv2    : 1;
+   uint32_t comp2   : 2;
+   uint32_t eq      : 1;
+   uint32_t opc_hi  : 1; /* at least one bit */
+   uint32_t dummy4  : 2;
+   uint32_t inv1    : 1;
+   uint32_t comp1   : 2; /* component for first src */
+   uint32_t opc     : 4;
+   uint32_t jmp_tgt : 1;
+   uint32_t sync    : 1;
+   uint32_t opc_cat : 3;
 } instr_cat0_t;
 
 typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               /* for normal src register: */
-               struct PACKED {
-                       uint32_t src : 11;
-                       /* at least low bit of pad must be zero or it will
-                        * look like a address relative src
-                        */
-                       uint32_t pad : 21;
-               };
-               /* for address relative: */
-               struct PACKED {
-                       int32_t  off : 10;
-                       uint32_t src_rel_c : 1;
-                       uint32_t src_rel : 1;
-                       uint32_t unknown : 20;
-               };
-               /* for immediate: */
-               int32_t  iim_val;
-               uint32_t uim_val;
-               float    fim_val;
-       };
-
-       /* dword1: */
-       uint32_t dst        : 8;
-       uint32_t repeat     : 3;
-       uint32_t src_r      : 1;
-       uint32_t ss         : 1;
-       uint32_t ul         : 1;
-       uint32_t dst_type   : 3;
-       uint32_t dst_rel    : 1;
-       uint32_t src_type   : 3;
-       uint32_t src_c      : 1;
-       uint32_t src_im     : 1;
-       uint32_t even       : 1;
-       uint32_t pos_inf    : 1;
-       uint32_t opc        : 2;
-       uint32_t jmp_tgt    : 1;
-       uint32_t sync       : 1;
-       uint32_t opc_cat    : 3;
+   /* dword0: */
+   union PACKED {
+      /* for normal src register: */
+      struct PACKED {
+         uint32_t src : 11;
+         /* at least low bit of pad must be zero or it will
+          * look like a address relative src
+          */
+         uint32_t pad : 21;
+      };
+      /* for address relative: */
+      struct PACKED {
+         int32_t off        : 10;
+         uint32_t src_rel_c : 1;
+         uint32_t src_rel   : 1;
+         uint32_t unknown   : 20;
+      };
+      /* for immediate: */
+      int32_t iim_val;
+      uint32_t uim_val;
+      float fim_val;
+   };
+
+   /* dword1: */
+   uint32_t dst      : 8;
+   uint32_t repeat   : 3;
+   uint32_t src_r    : 1;
+   uint32_t ss       : 1;
+   uint32_t ul       : 1;
+   uint32_t dst_type : 3;
+   uint32_t dst_rel  : 1;
+   uint32_t src_type : 3;
+   uint32_t src_c    : 1;
+   uint32_t src_im   : 1;
+   uint32_t even     : 1;
+   uint32_t pos_inf  : 1;
+   uint32_t opc      : 2;
+   uint32_t jmp_tgt  : 1;
+   uint32_t sync     : 1;
+   uint32_t opc_cat  : 3;
 } instr_cat1_t;
 
 typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src1         : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src1_im      : 1;   /* immediate */
-                       uint32_t src1_neg     : 1;   /* negate */
-                       uint32_t src1_abs     : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src1         : 10;
-                       uint32_t src1_c       : 1;   /* relative-const */
-                       uint32_t src1_rel     : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel1;
-               struct PACKED {
-                       uint32_t src1         : 12;
-                       uint32_t src1_c       : 1;   /* const */
-                       int32_t dummy        : 3;
-               } c1;
-       };
-
-       union PACKED {
-               struct PACKED {
-                       uint32_t src2         : 11;
-                       uint32_t must_be_zero2: 2;
-                       uint32_t src2_im      : 1;   /* immediate */
-                       uint32_t src2_neg     : 1;   /* negate */
-                       uint32_t src2_abs     : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src2         : 10;
-                       uint32_t src2_c       : 1;   /* relative-const */
-                       uint32_t src2_rel     : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel2;
-               struct PACKED {
-                       uint32_t src2         : 12;
-                       uint32_t src2_c       : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c2;
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 2;
-       uint32_t sat      : 1;
-       uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;   /* dunno */
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t ei       : 1;
-       uint32_t cond     : 3;
-       uint32_t src2_r   : 1;   /* doubles as nop1 if repeat==0 */
-       uint32_t full     : 1;   /* not half */
-       uint32_t opc      : 6;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
+   /* dword0: */
+   union PACKED {
+      struct PACKED {
+         uint32_t src1          : 11;
+         uint32_t must_be_zero1 : 2;
+         uint32_t src1_im       : 1; /* immediate */
+         uint32_t src1_neg      : 1; /* negate */
+         uint32_t src1_abs      : 1; /* absolute value */
+      };
+      struct PACKED {
+         uint32_t src1         : 10;
+         uint32_t src1_c       : 1; /* relative-const */
+         uint32_t src1_rel     : 1; /* relative address */
+         uint32_t must_be_zero : 1;
+         uint32_t dummy        : 3;
+      } rel1;
+      struct PACKED {
+         uint32_t src1   : 12;
+         uint32_t src1_c : 1; /* const */
+         int32_t dummy   : 3;
+      } c1;
+   };
+
+   union PACKED {
+      struct PACKED {
+         uint32_t src2          : 11;
+         uint32_t must_be_zero2 : 2;
+         uint32_t src2_im       : 1; /* immediate */
+         uint32_t src2_neg      : 1; /* negate */
+         uint32_t src2_abs      : 1; /* absolute value */
+      };
+      struct PACKED {
+         uint32_t src2         : 10;
+         uint32_t src2_c       : 1; /* relative-const */
+         uint32_t src2_rel     : 1; /* relative address */
+         uint32_t must_be_zero : 1;
+         uint32_t dummy        : 3;
+      } rel2;
+      struct PACKED {
+         uint32_t src2   : 12;
+         uint32_t src2_c : 1; /* const */
+         uint32_t dummy  : 3;
+      } c2;
+   };
+
+   /* dword1: */
+   uint32_t dst      : 8;
+   uint32_t repeat   : 2;
+   uint32_t sat      : 1;
+   uint32_t src1_r   : 1; /* doubles as nop0 if repeat==0 */
+   uint32_t ss       : 1;
+   uint32_t ul       : 1; /* dunno */
+   uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+   uint32_t ei       : 1;
+   uint32_t cond     : 3;
+   uint32_t src2_r   : 1; /* doubles as nop1 if repeat==0 */
+   uint32_t full     : 1; /* not half */
+   uint32_t opc      : 6;
+   uint32_t jmp_tgt  : 1;
+   uint32_t sync     : 1;
+   uint32_t opc_cat  : 3;
 } instr_cat2_t;
 
 typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src1         : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src2_c       : 1;
-                       uint32_t src1_neg     : 1;
-                       uint32_t src2_r       : 1;  /* doubles as nop1 if repeat==0 */
-               };
-               struct PACKED {
-                       uint32_t src1         : 10;
-                       uint32_t src1_c       : 1;
-                       uint32_t src1_rel     : 1;
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel1;
-               struct PACKED {
-                       uint32_t src1         : 12;
-                       uint32_t src1_c       : 1;
-                       uint32_t dummy        : 3;
-               } c1;
-       };
-
-       union PACKED {
-               struct PACKED {
-                       uint32_t src3         : 11;
-                       uint32_t must_be_zero2: 2;
-                       uint32_t src3_r       : 1;
-                       uint32_t src2_neg     : 1;
-                       uint32_t src3_neg     : 1;
-               };
-               struct PACKED {
-                       uint32_t src3         : 10;
-                       uint32_t src3_c       : 1;
-                       uint32_t src3_rel     : 1;
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel2;
-               struct PACKED {
-                       uint32_t src3         : 12;
-                       uint32_t src3_c       : 1;
-                       uint32_t dummy        : 3;
-               } c2;
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 2;
-       uint32_t sat      : 1;
-       uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t src2     : 8;
-       uint32_t opc      : 4;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
+   /* dword0: */
+   union PACKED {
+      struct PACKED {
+         uint32_t src1          : 11;
+         uint32_t must_be_zero1 : 2;
+         uint32_t src2_c        : 1;
+         uint32_t src1_neg      : 1;
+         uint32_t src2_r        : 1; /* doubles as nop1 if repeat==0 */
+      };
+      struct PACKED {
+         uint32_t src1         : 10;
+         uint32_t src1_c       : 1;
+         uint32_t src1_rel     : 1;
+         uint32_t must_be_zero : 1;
+         uint32_t dummy        : 3;
+      } rel1;
+      struct PACKED {
+         uint32_t src1   : 12;
+         uint32_t src1_c : 1;
+         uint32_t dummy  : 3;
+      } c1;
+   };
+
+   union PACKED {
+      struct PACKED {
+         uint32_t src3          : 11;
+         uint32_t must_be_zero2 : 2;
+         uint32_t src3_r        : 1;
+         uint32_t src2_neg      : 1;
+         uint32_t src3_neg      : 1;
+      };
+      struct PACKED {
+         uint32_t src3         : 10;
+         uint32_t src3_c       : 1;
+         uint32_t src3_rel     : 1;
+         uint32_t must_be_zero : 1;
+         uint32_t dummy        : 3;
+      } rel2;
+      struct PACKED {
+         uint32_t src3   : 12;
+         uint32_t src3_c : 1;
+         uint32_t dummy  : 3;
+      } c2;
+   };
+
+   /* dword1: */
+   uint32_t dst      : 8;
+   uint32_t repeat   : 2;
+   uint32_t sat      : 1;
+   uint32_t src1_r   : 1; /* doubles as nop0 if repeat==0 */
+   uint32_t ss       : 1;
+   uint32_t ul       : 1;
+   uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+   uint32_t src2     : 8;
+   uint32_t opc      : 4;
+   uint32_t jmp_tgt  : 1;
+   uint32_t sync     : 1;
+   uint32_t opc_cat  : 3;
 } instr_cat3_t;
 
-static inline bool instr_cat3_full(instr_cat3_t *cat3)
+static inline bool
+instr_cat3_full(instr_cat3_t *cat3)
 {
-       switch (_OPC(3, cat3->opc)) {
-       case OPC_MAD_F16:
-       case OPC_MAD_U16:
-       case OPC_MAD_S16:
-       case OPC_SEL_B16:
-       case OPC_SEL_S16:
-       case OPC_SEL_F16:
-       case OPC_SAD_S16:
-       case OPC_SAD_S32:  // really??
-               return false;
-       default:
-               return true;
-       }
+   switch (_OPC(3, cat3->opc)) {
+   case OPC_MAD_F16:
+   case OPC_MAD_U16:
+   case OPC_MAD_S16:
+   case OPC_SEL_B16:
+   case OPC_SEL_S16:
+   case OPC_SEL_F16:
+   case OPC_SAD_S16:
+   case OPC_SAD_S32: // really??
+      return false;
+   default:
+      return true;
+   }
 }
 
 typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src          : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src_im       : 1;   /* immediate */
-                       uint32_t src_neg      : 1;   /* negate */
-                       uint32_t src_abs      : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src          : 10;
-                       uint32_t src_c        : 1;   /* relative-const */
-                       uint32_t src_rel      : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel;
-               struct PACKED {
-                       uint32_t src          : 12;
-                       uint32_t src_c        : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c;
-       };
-       uint32_t dummy1   : 16;  /* seem to be ignored */
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 2;
-       uint32_t sat      : 1;
-       uint32_t src_r    : 1;
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t dummy2   : 5;   /* seem to be ignored */
-       uint32_t full     : 1;   /* not half */
-       uint32_t opc      : 6;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
+   /* dword0: */
+   union PACKED {
+      struct PACKED {
+         uint32_t src           : 11;
+         uint32_t must_be_zero1 : 2;
+         uint32_t src_im        : 1; /* immediate */
+         uint32_t src_neg       : 1; /* negate */
+         uint32_t src_abs       : 1; /* absolute value */
+      };
+      struct PACKED {
+         uint32_t src          : 10;
+         uint32_t src_c        : 1; /* relative-const */
+         uint32_t src_rel      : 1; /* relative address */
+         uint32_t must_be_zero : 1;
+         uint32_t dummy        : 3;
+      } rel;
+      struct PACKED {
+         uint32_t src   : 12;
+         uint32_t src_c : 1; /* const */
+         uint32_t dummy : 3;
+      } c;
+   };
+   uint32_t dummy1 : 16; /* seem to be ignored */
+
+   /* dword1: */
+   uint32_t dst      : 8;
+   uint32_t repeat   : 2;
+   uint32_t sat      : 1;
+   uint32_t src_r    : 1;
+   uint32_t ss       : 1;
+   uint32_t ul       : 1;
+   uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+   uint32_t dummy2   : 5; /* seem to be ignored */
+   uint32_t full     : 1; /* not half */
+   uint32_t opc      : 6;
+   uint32_t jmp_tgt  : 1;
+   uint32_t sync     : 1;
+   uint32_t opc_cat  : 3;
 } instr_cat4_t;
 
 /* With is_bindless_s2en = 1, this determines whether bindless is enabled and
@@ -720,153 +728,153 @@ typedef struct PACKED {
  * for the texture.
  */
 typedef enum {
-       /* Use traditional GL binding model, get texture and sampler index
-        * from src3 which is not presumed to be uniform. This is
-        * backwards-compatible with earlier generations, where this field was
-        * always 0 and nonuniform-indexed sampling always worked.
-        */
-       CAT5_NONUNIFORM = 0,
-
-       /* The sampler base comes from the low 3 bits of a1.x, and the sampler
-        * and texture index come from src3 which is presumed to be uniform.
-        */
-       CAT5_BINDLESS_A1_UNIFORM = 1,
-
-       /* The texture and sampler share the same base, and the sampler and
-        * texture index come from src3 which is *not* presumed to be uniform.
-        */
-       CAT5_BINDLESS_NONUNIFORM = 2,
-
-       /* The sampler base comes from the low 3 bits of a1.x, and the sampler
-        * and texture index come from src3 which is *not* presumed to be
-        * uniform.
-        */
-       CAT5_BINDLESS_A1_NONUNIFORM = 3,
-
-       /* Use traditional GL binding model, get texture and sampler index
-        * from src3 which is presumed to be uniform.
-        */
-       CAT5_UNIFORM = 4,
-
-       /* The texture and sampler share the same base, and the sampler and
-        * texture index come from src3 which is presumed to be uniform.
-        */
-       CAT5_BINDLESS_UNIFORM = 5,
-
-       /* The texture and sampler share the same base, get sampler index from low
-        * 4 bits of src3 and texture index from high 4 bits.
-        */
-       CAT5_BINDLESS_IMM = 6,
-
-       /* The sampler base comes from the low 3 bits of a1.x, and the texture
-        * index comes from the next 8 bits of a1.x. The sampler index is an
-        * immediate in src3.
-        */
-       CAT5_BINDLESS_A1_IMM = 7,
+   /* Use traditional GL binding model, get texture and sampler index
+    * from src3 which is not presumed to be uniform. This is
+    * backwards-compatible with earlier generations, where this field was
+    * always 0 and nonuniform-indexed sampling always worked.
+    */
+   CAT5_NONUNIFORM = 0,
+
+   /* The sampler base comes from the low 3 bits of a1.x, and the sampler
+    * and texture index come from src3 which is presumed to be uniform.
+    */
+   CAT5_BINDLESS_A1_UNIFORM = 1,
+
+   /* The texture and sampler share the same base, and the sampler and
+    * texture index come from src3 which is *not* presumed to be uniform.
+    */
+   CAT5_BINDLESS_NONUNIFORM = 2,
+
+   /* The sampler base comes from the low 3 bits of a1.x, and the sampler
+    * and texture index come from src3 which is *not* presumed to be
+    * uniform.
+    */
+   CAT5_BINDLESS_A1_NONUNIFORM = 3,
+
+   /* Use traditional GL binding model, get texture and sampler index
+    * from src3 which is presumed to be uniform.
+    */
+   CAT5_UNIFORM = 4,
+
+   /* The texture and sampler share the same base, and the sampler and
+    * texture index come from src3 which is presumed to be uniform.
+    */
+   CAT5_BINDLESS_UNIFORM = 5,
+
+   /* The texture and sampler share the same base, get sampler index from low
+    * 4 bits of src3 and texture index from high 4 bits.
+    */
+   CAT5_BINDLESS_IMM = 6,
+
+   /* The sampler base comes from the low 3 bits of a1.x, and the texture
+    * index comes from the next 8 bits of a1.x. The sampler index is an
+    * immediate in src3.
+    */
+   CAT5_BINDLESS_A1_IMM = 7,
 } cat5_desc_mode_t;
 
 typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               /* normal case: */
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t src2     : 8;
-                       uint32_t dummy1   : 4;   /* seem to be ignored */
-                       uint32_t samp     : 4;
-                       uint32_t tex      : 7;
-               } norm;
-               /* s2en case: */
-               struct PACKED {
-                       uint32_t full         : 1;   /* not half */
-                       uint32_t src1         : 8;
-                       uint32_t src2         : 8;
-                       uint32_t dummy1       : 2;
-                       uint32_t base_hi      : 2;
-                       uint32_t src3         : 8;
-                       uint32_t desc_mode    : 3;
-               } s2en_bindless;
-               /* same in either case: */
-               // XXX I think, confirm this
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t src2     : 8;
-                       uint32_t pad      : 15;
-               };
-       };
-
-       /* dword1: */
-       uint32_t dst              : 8;
-       uint32_t wrmask           : 4;   /* write-mask */
-       uint32_t type             : 3;
-       uint32_t base_lo          : 1;   /* used with bindless */
-       uint32_t is_3d            : 1;
-
-       uint32_t is_a             : 1;
-       uint32_t is_s             : 1;
-       uint32_t is_s2en_bindless : 1;
-       uint32_t is_o             : 1;
-       uint32_t is_p             : 1;
-
-       uint32_t opc              : 5;
-       uint32_t jmp_tgt          : 1;
-       uint32_t sync             : 1;
-       uint32_t opc_cat          : 3;
+   /* dword0: */
+   union PACKED {
+      /* normal case: */
+      struct PACKED {
+         uint32_t full   : 1; /* not half */
+         uint32_t src1   : 8;
+         uint32_t src2   : 8;
+         uint32_t dummy1 : 4; /* seem to be ignored */
+         uint32_t samp   : 4;
+         uint32_t tex    : 7;
+      } norm;
+      /* s2en case: */
+      struct PACKED {
+         uint32_t full      : 1; /* not half */
+         uint32_t src1      : 8;
+         uint32_t src2      : 8;
+         uint32_t dummy1    : 2;
+         uint32_t base_hi   : 2;
+         uint32_t src3      : 8;
+         uint32_t desc_mode : 3;
+      } s2en_bindless;
+      /* same in either case: */
+      // XXX I think, confirm this
+      struct PACKED {
+         uint32_t full : 1; /* not half */
+         uint32_t src1 : 8;
+         uint32_t src2 : 8;
+         uint32_t pad  : 15;
+      };
+   };
+
+   /* dword1: */
+   uint32_t dst     : 8;
+   uint32_t wrmask  : 4; /* write-mask */
+   uint32_t type    : 3;
+   uint32_t base_lo : 1; /* used with bindless */
+   uint32_t is_3d   : 1;
+
+   uint32_t is_a             : 1;
+   uint32_t is_s             : 1;
+   uint32_t is_s2en_bindless : 1;
+   uint32_t is_o             : 1;
+   uint32_t is_p             : 1;
+
+   uint32_t opc     : 5;
+   uint32_t jmp_tgt : 1;
+   uint32_t sync    : 1;
+   uint32_t opc_cat : 3;
 } instr_cat5_t;
 
 /* dword0 encoding for src_off: [src1 + off], src3: */
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t mustbe1  : 1;
-       int32_t  off      : 13;   /* src2 */
-       uint32_t src1     : 8;
-       uint32_t src1_im  : 1;
-       uint32_t src3_im  : 1;
-       uint32_t src3     : 8;
-
-       /* dword1: */
-       uint32_t dword1;
+   /* dword0: */
+   uint32_t mustbe1 : 1;
+   int32_t off      : 13; /* src2 */
+   uint32_t src1    : 8;
+   uint32_t src1_im : 1;
+   uint32_t src3_im : 1;
+   uint32_t src3    : 8;
+
+   /* dword1: */
+   uint32_t dword1;
 } instr_cat6a_t;
 
 /* dword0 encoding for !src_off: [src1], src2 */
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t mustbe0  : 1;
-       uint32_t src1     : 8;
-       uint32_t pad      : 5;
-       uint32_t ignore0  : 8;
-       uint32_t src1_im  : 1;
-       uint32_t src2_im  : 1;
-       uint32_t src2     : 8;
-
-       /* dword1: */
-       uint32_t dword1;
+   /* dword0: */
+   uint32_t mustbe0 : 1;
+   uint32_t src1    : 8;
+   uint32_t pad     : 5;
+   uint32_t ignore0 : 8;
+   uint32_t src1_im : 1;
+   uint32_t src2_im : 1;
+   uint32_t src2    : 8;
+
+   /* dword1: */
+   uint32_t dword1;
 } instr_cat6b_t;
 
 /* dword1 encoding for dst_off: */
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t dw0_pad1 : 9;
-       int32_t off_high : 5;
-       uint32_t dw0_pad2 : 18;
-
-       uint32_t off      : 8;
-       uint32_t mustbe1  : 1;
-       uint32_t dst      : 8;
-       uint32_t pad1     : 15;
+   /* dword0: */
+   uint32_t dw0_pad1 : 9;
+   int32_t off_high  : 5;
+   uint32_t dw0_pad2 : 18;
+
+   uint32_t off     : 8;
+   uint32_t mustbe1 : 1;
+   uint32_t dst     : 8;
+   uint32_t pad1    : 15;
 } instr_cat6c_t;
 
 /* dword1 encoding for !dst_off: */
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t dword0;
+   /* dword0: */
+   uint32_t dword0;
 
-       uint32_t dst      : 8;
-       uint32_t mustbe0  : 1;
-       uint32_t idx      : 8;
-       uint32_t pad0     : 15;
+   uint32_t dst     : 8;
+   uint32_t mustbe0 : 1;
+   uint32_t idx     : 8;
+   uint32_t pad0    : 15;
 } instr_cat6d_t;
 
 /* ldgb and atomics..
@@ -876,99 +884,99 @@ typedef struct PACKED {
  *        .l: pad0=1, pad3=0
  */
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t pad0     : 1;
-       uint32_t src3     : 8;
-       uint32_t d        : 2;
-       uint32_t typed    : 1;
-       uint32_t type_size : 2;
-       uint32_t src1     : 8;
-       uint32_t src1_im  : 1;
-       uint32_t src2_im  : 1;
-       uint32_t src2     : 8;
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t mustbe0  : 1;
-       uint32_t src_ssbo : 8;
-       uint32_t pad2     : 3;  // type
-       uint32_t g        : 1;
-       uint32_t src_ssbo_im : 1;
-       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+   /* dword0: */
+   uint32_t pad0      : 1;
+   uint32_t src3      : 8;
+   uint32_t d         : 2;
+   uint32_t typed     : 1;
+   uint32_t type_size : 2;
+   uint32_t src1      : 8;
+   uint32_t src1_im   : 1;
+   uint32_t src2_im   : 1;
+   uint32_t src2      : 8;
+
+   /* dword1: */
+   uint32_t dst         : 8;
+   uint32_t mustbe0     : 1;
+   uint32_t src_ssbo    : 8;
+   uint32_t pad2        : 3; // type
+   uint32_t g           : 1;
+   uint32_t src_ssbo_im : 1;
+   uint32_t pad4        : 10; // opc/jmp_tgt/sync/opc_cat
 } instr_cat6ldgb_t;
 
 /* stgb, pad0=0, pad3=2
  */
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t mustbe1  : 1;  // ???
-       uint32_t src1     : 8;
-       uint32_t d        : 2;
-       uint32_t typed    : 1;
-       uint32_t type_size : 2;
-       uint32_t pad0     : 9;
-       uint32_t src2_im  : 1;
-       uint32_t src2     : 8;
-
-       /* dword1: */
-       uint32_t src3     : 8;
-       uint32_t src3_im  : 1;
-       uint32_t dst_ssbo : 8;
-       uint32_t pad2     : 3;  // type
-       uint32_t pad3     : 2;
-       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+   /* dword0: */
+   uint32_t mustbe1   : 1; // ???
+   uint32_t src1      : 8;
+   uint32_t d         : 2;
+   uint32_t typed     : 1;
+   uint32_t type_size : 2;
+   uint32_t pad0      : 9;
+   uint32_t src2_im   : 1;
+   uint32_t src2      : 8;
+
+   /* dword1: */
+   uint32_t src3     : 8;
+   uint32_t src3_im  : 1;
+   uint32_t dst_ssbo : 8;
+   uint32_t pad2     : 3; // type
+   uint32_t pad3     : 2;
+   uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
 } instr_cat6stgb_t;
 
 typedef union PACKED {
-       instr_cat6a_t a;
-       instr_cat6b_t b;
-       instr_cat6c_t c;
-       instr_cat6d_t d;
-       instr_cat6ldgb_t ldgb;
-       instr_cat6stgb_t stgb;
-       struct PACKED {
-               /* dword0: */
-               uint32_t src_off  : 1;
-               uint32_t pad1     : 31;
-
-               /* dword1: */
-               uint32_t pad2     : 8;
-               uint32_t dst_off  : 1;
-               uint32_t pad3     : 8;
-               uint32_t type     : 3;
-               uint32_t g        : 1;  /* or in some cases it means dst immed */
-               uint32_t pad4     : 1;
-               uint32_t opc      : 5;
-               uint32_t jmp_tgt  : 1;
-               uint32_t sync     : 1;
-               uint32_t opc_cat  : 3;
-       };
+   instr_cat6a_t a;
+   instr_cat6b_t b;
+   instr_cat6c_t c;
+   instr_cat6d_t d;
+   instr_cat6ldgb_t ldgb;
+   instr_cat6stgb_t stgb;
+   struct PACKED {
+      /* dword0: */
+      uint32_t src_off : 1;
+      uint32_t pad1    : 31;
+
+      /* dword1: */
+      uint32_t pad2    : 8;
+      uint32_t dst_off : 1;
+      uint32_t pad3    : 8;
+      uint32_t type    : 3;
+      uint32_t g       : 1; /* or in some cases it means dst immed */
+      uint32_t pad4    : 1;
+      uint32_t opc     : 5;
+      uint32_t jmp_tgt : 1;
+      uint32_t sync    : 1;
+      uint32_t opc_cat : 3;
+   };
 } instr_cat6_t;
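
(Aside, not part of this diff.) The anonymous struct at the end of instr_cat6_t carries the bits common to all the legacy cat6 layouts, and src_off/dst_off decide which overlapping view applies. A minimal sketch of how a decoder might select the right sub-encoding; the helper names are invented for illustration:

   static inline uint32_t
   cat6_src1(const instr_cat6_t *cat6)
   {
      /* cat6a is the [src1 + off] form, cat6b the plain [src1] form */
      return cat6->src_off ? cat6->a.src1 : cat6->b.src1;
   }

   static inline uint32_t
   cat6_dst(const instr_cat6_t *cat6)
   {
      /* dst_off likewise selects between the cat6c and cat6d dword1 layouts */
      return cat6->dst_off ? cat6->c.dst : cat6->d.dst;
   }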
 
 /* Similar to cat5_desc_mode_t, describes how the descriptor is loaded.
  */
 typedef enum {
-       /* Use old GL binding model with an immediate index. */
-       CAT6_IMM = 0,
+   /* Use old GL binding model with an immediate index. */
+   CAT6_IMM = 0,
 
-       CAT6_UNIFORM = 1,
+   CAT6_UNIFORM = 1,
 
-       CAT6_NONUNIFORM = 2,
+   CAT6_NONUNIFORM = 2,
 
-       /* Use the bindless model, with an immediate index.
-        */
-       CAT6_BINDLESS_IMM = 4,
+   /* Use the bindless model, with an immediate index.
+    */
+   CAT6_BINDLESS_IMM = 4,
 
-       /* Use the bindless model, with a uniform register index.
-        */
-       CAT6_BINDLESS_UNIFORM = 5,
+   /* Use the bindless model, with a uniform register index.
+    */
+   CAT6_BINDLESS_UNIFORM = 5,
 
-       /* Use the bindless model, with a register index that isn't guaranteed
-        * to be uniform. This presumably checks if the indices are equal and
-        * splits up the load/store, because it works the way you would
-        * expect.
-        */
-       CAT6_BINDLESS_NONUNIFORM = 6,
+   /* Use the bindless model, with a register index that isn't guaranteed
+    * to be uniform. This presumably checks if the indices are equal and
+    * splits up the load/store, because it works the way you would
+    * expect.
+    */
+   CAT6_BINDLESS_NONUNIFORM = 6,
 } cat6_desc_mode_t;
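
(Aside, not part of this diff.) Roughly, CAT6_IMM/CAT6_UNIFORM/CAT6_NONUNIFORM cover the classic binding-table path while the CAT6_BINDLESS_* values cover the bindless path; a hypothetical helper choosing a bindless mode from what is known about the index could look like the sketch below (names invented for illustration):

   static inline cat6_desc_mode_t
   pick_bindless_desc_mode(bool index_is_imm, bool index_is_uniform)
   {
      if (index_is_imm)
         return CAT6_BINDLESS_IMM;
      return index_is_uniform ? CAT6_BINDLESS_UNIFORM
                              : CAT6_BINDLESS_NONUNIFORM;
   }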
 
 /**
@@ -995,266 +1003,294 @@ typedef enum {
  * pad2 and pad5 are only observed to be 0.
  */
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t pad1     : 1;
-       uint32_t base     : 3;
-       uint32_t pad2     : 2;
-       uint32_t desc_mode : 3;
-       uint32_t d        : 2;
-       uint32_t typed    : 1;
-       uint32_t type_size : 2;
-       uint32_t opc      : 6;
-       uint32_t pad3     : 4;
-       uint32_t src1     : 8;  /* coordinate/offset */
-
-       /* dword1: */
-       uint32_t src2     : 8;  /* or the dst for load instructions */
-       uint32_t pad4     : 1;  //mustbe0 ??
-       uint32_t ssbo     : 8;  /* ssbo/image binding point */
-       uint32_t type     : 3;
-       uint32_t pad5     : 7;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
+   /* dword0: */
+   uint32_t pad1      : 1;
+   uint32_t base      : 3;
+   uint32_t pad2      : 2;
+   uint32_t desc_mode : 3;
+   uint32_t d         : 2;
+   uint32_t typed     : 1;
+   uint32_t type_size : 2;
+   uint32_t opc       : 6;
+   uint32_t pad3      : 4;
+   uint32_t src1      : 8; /* coordinate/offset */
+
+   /* dword1: */
+   uint32_t src2    : 8; /* or the dst for load instructions */
+   uint32_t pad4    : 1; // mustbe0 ??
+   uint32_t ssbo    : 8; /* ssbo/image binding point */
+   uint32_t type    : 3;
+   uint32_t pad5    : 7;
+   uint32_t jmp_tgt : 1;
+   uint32_t sync    : 1;
+   uint32_t opc_cat : 3;
 } instr_cat6_a6xx_t;
 
 typedef struct PACKED {
-       /* dword0: */
-       uint32_t pad1     : 32;
-
-       /* dword1: */
-       uint32_t pad2     : 12;
-       uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
-       uint32_t pad3     : 6;
-       uint32_t w        : 1;  /* write */
-       uint32_t r        : 1;  /* read */
-       uint32_t l        : 1;  /* local */
-       uint32_t g        : 1;  /* global */
-       uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
-       uint32_t jmp_tgt  : 1;  /* (jp) */
-       uint32_t sync     : 1;  /* (sy) */
-       uint32_t opc_cat  : 3;
+   /* dword0: */
+   uint32_t pad1 : 32;
+
+   /* dword1: */
+   uint32_t pad2    : 12;
+   uint32_t ss      : 1; /* maybe in the encoding, but blob only uses (sy) */
+   uint32_t pad3    : 6;
+   uint32_t w       : 1; /* write */
+   uint32_t r       : 1; /* read */
+   uint32_t l       : 1; /* local */
+   uint32_t g       : 1; /* global */
+   uint32_t opc     : 4; /* presumed, but only a couple known OPCs */
+   uint32_t jmp_tgt : 1; /* (jp) */
+   uint32_t sync    : 1; /* (sy) */
+   uint32_t opc_cat : 3;
 } instr_cat7_t;
 
 typedef union PACKED {
-       instr_cat0_t cat0;
-       instr_cat1_t cat1;
-       instr_cat2_t cat2;
-       instr_cat3_t cat3;
-       instr_cat4_t cat4;
-       instr_cat5_t cat5;
-       instr_cat6_t cat6;
-       instr_cat6_a6xx_t cat6_a6xx;
-       instr_cat7_t cat7;
-       struct PACKED {
-               /* dword0: */
-               uint32_t pad1     : 32;
-
-               /* dword1: */
-               uint32_t pad2     : 12;
-               uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
-               uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
-               uint32_t pad3     : 13;
-               uint32_t jmp_tgt  : 1;
-               uint32_t sync     : 1;
-               uint32_t opc_cat  : 3;
-
-       };
+   instr_cat0_t cat0;
+   instr_cat1_t cat1;
+   instr_cat2_t cat2;
+   instr_cat3_t cat3;
+   instr_cat4_t cat4;
+   instr_cat5_t cat5;
+   instr_cat6_t cat6;
+   instr_cat6_a6xx_t cat6_a6xx;
+   instr_cat7_t cat7;
+   struct PACKED {
+      /* dword0: */
+      uint32_t pad1 : 32;
+
+      /* dword1: */
+      uint32_t pad2 : 12;
+      uint32_t ss   : 1; /* cat1-cat4 (cat0??) and cat7 (?) */
+      uint32_t ul   : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+      uint32_t pad3 : 13;
+      uint32_t jmp_tgt : 1;
+      uint32_t sync    : 1;
+      uint32_t opc_cat : 3;
+   };
 } instr_t;
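
(Aside, not part of this diff.) The helpers below all take a pointer to this packed union; a minimal sketch of overlaying two raw instruction dwords onto it, assuming the little-endian bitfield layout the rest of this header relies on:

   static inline instr_t
   overlay_instr(uint32_t dword0, uint32_t dword1)
   {
      uint32_t dwords[2] = {dword0, dword1};
      instr_t instr;
      memcpy(&instr, dwords, sizeof(instr)); /* memcpy from <string.h> */
      return instr;
   }

   /* e.g. instr_opc(&instr, gpu_id) and instr_repeat(&instr) then decode it */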
 
-static inline uint32_t instr_repeat(instr_t *instr)
+static inline uint32_t
+instr_repeat(instr_t *instr)
 {
-       switch (instr->opc_cat) {
-       case 0:  return instr->cat0.repeat;
-       case 1:  return instr->cat1.repeat;
-       case 2:  return instr->cat2.repeat;
-       case 3:  return instr->cat3.repeat;
-       case 4:  return instr->cat4.repeat;
-       default: return 0;
-       }
+   switch (instr->opc_cat) {
+   case 0:
+      return instr->cat0.repeat;
+   case 1:
+      return instr->cat1.repeat;
+   case 2:
+      return instr->cat2.repeat;
+   case 3:
+      return instr->cat3.repeat;
+   case 4:
+      return instr->cat4.repeat;
+   default:
+      return 0;
+   }
 }
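
(Aside, not part of this diff.) The repeat field is the (rptN) count, i.e. the instruction issues N+1 times; that is why the statistics code later in this diff (ir3_collect_info) counts 1 + instr->repeat per instruction. For example, a (rpt3) cat3 op contributes 4 to instrs_per_cat[3].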
 
-static inline bool instr_sat(instr_t *instr)
+static inline bool
+instr_sat(instr_t *instr)
 {
-       switch (instr->opc_cat) {
-       case 2:  return instr->cat2.sat;
-       case 3:  return instr->cat3.sat;
-       case 4:  return instr->cat4.sat;
-       default: return false;
-       }
+   switch (instr->opc_cat) {
+   case 2:
+      return instr->cat2.sat;
+   case 3:
+      return instr->cat3.sat;
+   case 4:
+      return instr->cat4.sat;
+   default:
+      return false;
+   }
 }
 
-static inline bool is_sat_compatible(opc_t opc)
+static inline bool
+is_sat_compatible(opc_t opc)
 {
-       /* On a6xx saturation doesn't work on cat4 */
-       if (opc_cat(opc) != 2 && opc_cat(opc) != 3)
-               return false;
-
-       switch (opc) {
-       /* On a3xx and a6xx saturation doesn't work on bary.f */
-       case OPC_BARY_F:
-       /* On a6xx saturation doesn't work on sel.* */
-       case OPC_SEL_B16:
-       case OPC_SEL_B32:
-       case OPC_SEL_S16:
-       case OPC_SEL_S32:
-       case OPC_SEL_F16:
-       case OPC_SEL_F32:
-               return false;
-       default:
-               return true;
-       }
+   /* On a6xx saturation doesn't work on cat4 */
+   if (opc_cat(opc) != 2 && opc_cat(opc) != 3)
+      return false;
+
+   switch (opc) {
+   /* On a3xx and a6xx saturation doesn't work on bary.f */
+   case OPC_BARY_F:
+   /* On a6xx saturation doesn't work on sel.* */
+   case OPC_SEL_B16:
+   case OPC_SEL_B32:
+   case OPC_SEL_S16:
+   case OPC_SEL_S32:
+   case OPC_SEL_F16:
+   case OPC_SEL_F32:
+      return false;
+   default:
+      return true;
+   }
 }
 
 /* We can probably drop the gpu_id arg, but keeping it for now so we can
  * assert if we see something we think should be new encoding on an older
  * gpu.
  */
-static inline bool is_cat6_legacy(instr_t *instr, unsigned gpu_id)
+static inline bool
+is_cat6_legacy(instr_t *instr, unsigned gpu_id)
 {
-       instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
-
-       if (gpu_id < 600)
-               return true;
-
-       /* At least one of these two bits is pad in all the possible
-        * "legacy" cat6 encodings, and a analysis of all the pre-a6xx
-        * cmdstream traces I have indicates that the pad bit is zero
-        * in all cases.  So we can use this to detect new encoding:
-        */
-       if ((cat6->pad3 & 0x4) && (cat6->pad5 & 0x2)) {
-               ir3_assert(instr->cat6.opc == 0);
-               return false;
-       }
-
-       return true;
+   instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
+
+   if (gpu_id < 600)
+      return true;
+
+   /* At least one of these two bits is pad in all the possible
+    * "legacy" cat6 encodings, and a analysis of all the pre-a6xx
+    * cmdstream traces I have indicates that the pad bit is zero
+    * in all cases.  So we can use this to detect new encoding:
+    */
+   if ((cat6->pad3 & 0x4) && (cat6->pad5 & 0x2)) {
+      ir3_assert(instr->cat6.opc == 0);
+      return false;
+   }
+
+   return true;
 }
 
-static inline uint32_t instr_opc(instr_t *instr, unsigned gpu_id)
+static inline uint32_t
+instr_opc(instr_t *instr, unsigned gpu_id)
 {
-       switch (instr->opc_cat) {
-       case 0:  return instr->cat0.opc | instr->cat0.opc_hi << 4;
-       case 1:  return instr->cat1.opc;
-       case 2:  return instr->cat2.opc;
-       case 3:  return instr->cat3.opc;
-       case 4:  return instr->cat4.opc;
-       case 5:  return instr->cat5.opc;
-       case 6:
-               if (!is_cat6_legacy(instr, gpu_id))
-                       return instr->cat6_a6xx.opc;
-               return instr->cat6.opc;
-       case 7:  return instr->cat7.opc;
-       default: return 0;
-       }
+   switch (instr->opc_cat) {
+   case 0:
+      return instr->cat0.opc | instr->cat0.opc_hi << 4;
+   case 1:
+      return instr->cat1.opc;
+   case 2:
+      return instr->cat2.opc;
+   case 3:
+      return instr->cat3.opc;
+   case 4:
+      return instr->cat4.opc;
+   case 5:
+      return instr->cat5.opc;
+   case 6:
+      if (!is_cat6_legacy(instr, gpu_id))
+         return instr->cat6_a6xx.opc;
+      return instr->cat6.opc;
+   case 7:
+      return instr->cat7.opc;
+   default:
+      return 0;
+   }
 }
 
-static inline bool is_mad(opc_t opc)
+static inline bool
+is_mad(opc_t opc)
 {
-       switch (opc) {
-       case OPC_MAD_U16:
-       case OPC_MAD_S16:
-       case OPC_MAD_U24:
-       case OPC_MAD_S24:
-       case OPC_MAD_F16:
-       case OPC_MAD_F32:
-               return true;
-       default:
-               return false;
-       }
+   switch (opc) {
+   case OPC_MAD_U16:
+   case OPC_MAD_S16:
+   case OPC_MAD_U24:
+   case OPC_MAD_S24:
+   case OPC_MAD_F16:
+   case OPC_MAD_F32:
+      return true;
+   default:
+      return false;
+   }
 }
 
-static inline bool is_madsh(opc_t opc)
+static inline bool
+is_madsh(opc_t opc)
 {
-       switch (opc) {
-       case OPC_MADSH_U16:
-       case OPC_MADSH_M16:
-               return true;
-       default:
-               return false;
-       }
+   switch (opc) {
+   case OPC_MADSH_U16:
+   case OPC_MADSH_M16:
+      return true;
+   default:
+      return false;
+   }
 }
 
-static inline bool is_atomic(opc_t opc)
+static inline bool
+is_atomic(opc_t opc)
 {
-       switch (opc) {
-       case OPC_ATOMIC_ADD:
-       case OPC_ATOMIC_SUB:
-       case OPC_ATOMIC_XCHG:
-       case OPC_ATOMIC_INC:
-       case OPC_ATOMIC_DEC:
-       case OPC_ATOMIC_CMPXCHG:
-       case OPC_ATOMIC_MIN:
-       case OPC_ATOMIC_MAX:
-       case OPC_ATOMIC_AND:
-       case OPC_ATOMIC_OR:
-       case OPC_ATOMIC_XOR:
-               return true;
-       default:
-               return false;
-       }
+   switch (opc) {
+   case OPC_ATOMIC_ADD:
+   case OPC_ATOMIC_SUB:
+   case OPC_ATOMIC_XCHG:
+   case OPC_ATOMIC_INC:
+   case OPC_ATOMIC_DEC:
+   case OPC_ATOMIC_CMPXCHG:
+   case OPC_ATOMIC_MIN:
+   case OPC_ATOMIC_MAX:
+   case OPC_ATOMIC_AND:
+   case OPC_ATOMIC_OR:
+   case OPC_ATOMIC_XOR:
+      return true;
+   default:
+      return false;
+   }
 }
 
-static inline bool is_ssbo(opc_t opc)
+static inline bool
+is_ssbo(opc_t opc)
 {
-       switch (opc) {
-       case OPC_RESFMT:
-       case OPC_RESINFO:
-       case OPC_LDGB:
-       case OPC_STGB:
-       case OPC_STIB:
-               return true;
-       default:
-               return false;
-       }
+   switch (opc) {
+   case OPC_RESFMT:
+   case OPC_RESINFO:
+   case OPC_LDGB:
+   case OPC_STGB:
+   case OPC_STIB:
+      return true;
+   default:
+      return false;
+   }
 }
 
-static inline bool is_isam(opc_t opc)
+static inline bool
+is_isam(opc_t opc)
 {
-       switch (opc) {
-       case OPC_ISAM:
-       case OPC_ISAML:
-       case OPC_ISAMM:
-               return true;
-       default:
-               return false;
-       }
+   switch (opc) {
+   case OPC_ISAM:
+   case OPC_ISAML:
+   case OPC_ISAMM:
+      return true;
+   default:
+      return false;
+   }
 }
 
-
-static inline bool is_cat2_float(opc_t opc)
+static inline bool
+is_cat2_float(opc_t opc)
 {
-       switch (opc) {
-       case OPC_ADD_F:
-       case OPC_MIN_F:
-       case OPC_MAX_F:
-       case OPC_MUL_F:
-       case OPC_SIGN_F:
-       case OPC_CMPS_F:
-       case OPC_ABSNEG_F:
-       case OPC_CMPV_F:
-       case OPC_FLOOR_F:
-       case OPC_CEIL_F:
-       case OPC_RNDNE_F:
-       case OPC_RNDAZ_F:
-       case OPC_TRUNC_F:
-               return true;
-
-       default:
-               return false;
-       }
+   switch (opc) {
+   case OPC_ADD_F:
+   case OPC_MIN_F:
+   case OPC_MAX_F:
+   case OPC_MUL_F:
+   case OPC_SIGN_F:
+   case OPC_CMPS_F:
+   case OPC_ABSNEG_F:
+   case OPC_CMPV_F:
+   case OPC_FLOOR_F:
+   case OPC_CEIL_F:
+   case OPC_RNDNE_F:
+   case OPC_RNDAZ_F:
+   case OPC_TRUNC_F:
+      return true;
+
+   default:
+      return false;
+   }
 }
 
-static inline bool is_cat3_float(opc_t opc)
+static inline bool
+is_cat3_float(opc_t opc)
 {
-       switch (opc) {
-       case OPC_MAD_F16:
-       case OPC_MAD_F32:
-       case OPC_SEL_F16:
-       case OPC_SEL_F32:
-               return true;
-       default:
-               return false;
-       }
+   switch (opc) {
+   case OPC_MAD_F16:
+   case OPC_MAD_F32:
+   case OPC_SEL_F16:
+   case OPC_SEL_F32:
+      return true;
+   default:
+      return false;
+   }
 }
 
 #endif /* INSTR_A3XX_H_ */
index e47f1df..909228d 100644 (file)
 
 #include "ir3.h"
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <assert.h>
-#include <stdbool.h>
 #include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "util/bitscan.h"
 #include "util/ralloc.h"
 /* simple allocator to carve allocations out of an up-front allocated heap,
  * so that we can free everything easily in one shot.
  */
-void * ir3_alloc(struct ir3 *shader, int sz)
+void *
+ir3_alloc(struct ir3 *shader, int sz)
 {
-       return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
+   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
 }
 
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *v)
+struct ir3 *
+ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
 {
-       struct ir3 *shader = rzalloc(v, struct ir3);
+   struct ir3 *shader = rzalloc(v, struct ir3);
 
-       shader->compiler = compiler;
-       shader->type = v->type;
+   shader->compiler = compiler;
+   shader->type = v->type;
 
-       list_inithead(&shader->block_list);
-       list_inithead(&shader->array_list);
+   list_inithead(&shader->block_list);
+   list_inithead(&shader->array_list);
 
-       return shader;
+   return shader;
 }
 
-void ir3_destroy(struct ir3 *shader)
+void
+ir3_destroy(struct ir3 *shader)
 {
-       ralloc_free(shader);
+   ralloc_free(shader);
 }
 
 static void
 collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
-               struct ir3_info *info)
-{
-       struct ir3_shader_variant *v = info->data;
-       unsigned repeat = instr->repeat;
-
-       if (reg->flags & IR3_REG_IMMED) {
-               /* nothing to do */
-               return;
-       }
-
-       if (!(reg->flags & IR3_REG_R)) {
-               repeat = 0;
-       }
-
-       unsigned components;
-       int16_t max;
-
-       if (reg->flags & IR3_REG_RELATIV) {
-               components = reg->size;
-               max = (reg->array.base + components - 1);
-       } else {
-               components = util_last_bit(reg->wrmask);
-               max = (reg->num + repeat + components - 1);
-       }
-
-       if (reg->flags & IR3_REG_CONST) {
-               info->max_const = MAX2(info->max_const, max >> 2);
-       } else if (max < regid(48, 0)) {
-               if (reg->flags & IR3_REG_HALF) {
-                       if (v->mergedregs) {
-                               /* starting w/ a6xx, half regs conflict with full regs: */
-                               info->max_reg = MAX2(info->max_reg, max >> 3);
-                       } else {
-                               info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
-                       }
-               } else {
-                       info->max_reg = MAX2(info->max_reg, max >> 2);
-               }
-       }
+                 struct ir3_info *info)
+{
+   struct ir3_shader_variant *v = info->data;
+   unsigned repeat = instr->repeat;
+
+   if (reg->flags & IR3_REG_IMMED) {
+      /* nothing to do */
+      return;
+   }
+
+   if (!(reg->flags & IR3_REG_R)) {
+      repeat = 0;
+   }
+
+   unsigned components;
+   int16_t max;
+
+   if (reg->flags & IR3_REG_RELATIV) {
+      components = reg->size;
+      max = (reg->array.base + components - 1);
+   } else {
+      components = util_last_bit(reg->wrmask);
+      max = (reg->num + repeat + components - 1);
+   }
+
+   if (reg->flags & IR3_REG_CONST) {
+      info->max_const = MAX2(info->max_const, max >> 2);
+   } else if (max < regid(48, 0)) {
+      if (reg->flags & IR3_REG_HALF) {
+         if (v->mergedregs) {
+            /* starting w/ a6xx, half regs conflict with full regs: */
+            info->max_reg = MAX2(info->max_reg, max >> 3);
+         } else {
+            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
+         }
+      } else {
+         info->max_reg = MAX2(info->max_reg, max >> 2);
+      }
+   }
 }
 
 bool
-ir3_should_double_threadsize(struct ir3_shader_variant *v,
-                                                        unsigned regs_count)
-{
-       const struct ir3_compiler *compiler = v->shader->compiler;
-
-       /* We can't support more than compiler->branchstack_size diverging threads
-        * in a wave. Thus, doubling the threadsize is only possible if we don't
-        * exceed the branchstack size limit.
-        */
-       if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
-                       compiler->branchstack_size) {
-               return false;
-       }
-
-       switch (v->type) {
-       case MESA_SHADER_COMPUTE: {
-               unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
-
-               /* For a5xx, if the workgroup size is greater than the maximum number
-                * of threads per core with 32 threads per wave (512) then we have to
-                * use the doubled threadsize because otherwise the workgroup wouldn't
-                * fit. For smaller workgroup sizes, we follow the blob and use the
-                * smaller threadsize.
-                */
-               if (compiler->gpu_id < 600) {
-                       return v->local_size_variable || threads_per_wg >
-                               compiler->threadsize_base * compiler->max_waves;
-               }
-
-               /* On a6xx, we prefer the larger threadsize unless the workgroup is
-                * small enough that it would be useless. Note that because
-                * threadsize_base is bumped to 64, we don't have to worry about the
-                * workgroup fitting, unlike the a5xx case.
-                */
-               if (!v->local_size_variable) {
-                       if (threads_per_wg <= compiler->threadsize_base)
-                               return false;
-               }
-       }
-       FALLTHROUGH;
-       case MESA_SHADER_FRAGMENT: {
-               /* Check that doubling the threadsize wouldn't exceed the regfile size */
-               return regs_count * 2 <= compiler->reg_size_vec4;
-       }
-
-       default:
-               /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
-                * stages - the bit doesn't exist. The blob never used it for the VS
-                * on earlier gens anyway.
-                */
-               return false;
-       }
+ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
+{
+   const struct ir3_compiler *compiler = v->shader->compiler;
+
+   /* We can't support more than compiler->branchstack_size diverging threads
+    * in a wave. Thus, doubling the threadsize is only possible if we don't
+    * exceed the branchstack size limit.
+    */
+   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
+       compiler->branchstack_size) {
+      return false;
+   }
+
+   switch (v->type) {
+   case MESA_SHADER_COMPUTE: {
+      unsigned threads_per_wg =
+         v->local_size[0] * v->local_size[1] * v->local_size[2];
+
+      /* For a5xx, if the workgroup size is greater than the maximum number
+       * of threads per core with 32 threads per wave (512) then we have to
+       * use the doubled threadsize because otherwise the workgroup wouldn't
+       * fit. For smaller workgroup sizes, we follow the blob and use the
+       * smaller threadsize.
+       */
+      if (compiler->gpu_id < 600) {
+         return v->local_size_variable ||
+                threads_per_wg >
+                   compiler->threadsize_base * compiler->max_waves;
+      }
+
+      /* On a6xx, we prefer the larger threadsize unless the workgroup is
+       * small enough that it would be useless. Note that because
+       * threadsize_base is bumped to 64, we don't have to worry about the
+       * workgroup fitting, unlike the a5xx case.
+       */
+      if (!v->local_size_variable) {
+         if (threads_per_wg <= compiler->threadsize_base)
+            return false;
+      }
+   }
+      FALLTHROUGH;
+   case MESA_SHADER_FRAGMENT: {
+      /* Check that doubling the threadsize wouldn't exceed the regfile size */
+      return regs_count * 2 <= compiler->reg_size_vec4;
+   }
+
+   default:
+      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
+       * stages - the bit doesn't exist. The blob never used it for the VS
+       * on earlier gens anyway.
+       */
+      return false;
+   }
 }
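
(Aside, not part of this diff.) A worked example using the numbers quoted in the comments above (32 threads/wave and a 512-thread per-core limit on a5xx, threadsize_base of 64 on a6xx): a fixed 16x16x1 workgroup has 256 threads, so on a5xx it keeps the 32-wide threadsize (256 <= 512), while on a6xx it falls through to the register check and uses the doubled 128-wide threadsize whenever regs_count * 2 still fits in reg_size_vec4.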
 
 /* Get the maximum number of waves that could be used even if this shader
  * didn't use any registers.
  */
 unsigned
-ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v, bool double_threadsize)
-{
-       const struct ir3_compiler *compiler = v->shader->compiler;
-       unsigned max_waves = compiler->max_waves;
-
-       /* If this is a compute shader, compute the limit based on shared size */
-       if (v->type == MESA_SHADER_COMPUTE) {
-               /* Shared is allocated in chunks of 1k */
-               unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
-               if (shared_per_wg > 0 && !v->local_size_variable) {
-                       unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
-                       unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
-                       unsigned waves_per_wg =
-                               DIV_ROUND_UP(threads_per_wg,
-                                       compiler->threadsize_base *
-                                       (double_threadsize ? 2 : 1) * compiler->wave_granularity);
-                       max_waves =
-                               MIN2(max_waves, waves_per_wg * wgs_per_core * compiler->wave_granularity);
-               }
-       }
-
-       /* Compute the limit based on branchstack */
-       if (v->branchstack > 0) {
-               unsigned branchstack_max_waves =
-                       compiler->branchstack_size / v->branchstack *
-                       compiler->wave_granularity;
-               max_waves = MIN2(max_waves, branchstack_max_waves);
-       }
-
-       return max_waves;
+ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
+                                  bool double_threadsize)
+{
+   const struct ir3_compiler *compiler = v->shader->compiler;
+   unsigned max_waves = compiler->max_waves;
+
+   /* If this is a compute shader, compute the limit based on shared size */
+   if (v->type == MESA_SHADER_COMPUTE) {
+      /* Shared is allocated in chunks of 1k */
+      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
+      if (shared_per_wg > 0 && !v->local_size_variable) {
+         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
+         unsigned threads_per_wg =
+            v->local_size[0] * v->local_size[1] * v->local_size[2];
+         unsigned waves_per_wg =
+            DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
+                                            (double_threadsize ? 2 : 1) *
+                                            compiler->wave_granularity);
+         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
+                                        compiler->wave_granularity);
+      }
+   }
+
+   /* Compute the limit based on branchstack */
+   if (v->branchstack > 0) {
+      unsigned branchstack_max_waves = compiler->branchstack_size /
+                                       v->branchstack *
+                                       compiler->wave_granularity;
+      max_waves = MIN2(max_waves, branchstack_max_waves);
+   }
+
+   return max_waves;
 }
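
(Aside, not part of this diff.) Worked example with made-up numbers: a compute shader using 8 KiB of shared memory on a GPU with 32 KiB of local memory gives wgs_per_core = 4; with a 256-thread workgroup, threadsize_base = 64, the single threadsize and wave_granularity = 2, waves_per_wg = DIV_ROUND_UP(256, 64 * 2) = 2, so max_waves is capped at 2 * 4 * 2 = 16.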
 
 /* Get the maximum number of waves that could be launched limited by reg size.
  */
 unsigned
 ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
-                                                               unsigned reg_count, bool double_threadsize)
+                                unsigned reg_count, bool double_threadsize)
 {
-       return reg_count ?
-               (compiler->reg_size_vec4 / (reg_count * (double_threadsize ? 2 : 1)) *
-                compiler->wave_granularity) :
-               compiler->max_waves;
+   return reg_count ? (compiler->reg_size_vec4 /
+                       (reg_count * (double_threadsize ? 2 : 1)) *
+                       compiler->wave_granularity)
+                    : compiler->max_waves;
 }
 
 void
 ir3_collect_info(struct ir3_shader_variant *v)
 {
-       struct ir3_info *info = &v->info;
-       struct ir3 *shader = v->ir;
-       const struct ir3_compiler *compiler = v->shader->compiler;
-
-       memset(info, 0, sizeof(*info));
-       info->data          = v;
-       info->max_reg       = -1;
-       info->max_half_reg  = -1;
-       info->max_const     = -1;
-       info->multi_dword_ldp_stp = false;
-
-       uint32_t instr_count = 0;
-       foreach_block (block, &shader->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       instr_count++;
-               }
-       }
-
-       v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);
-
-       /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
-        * doesn't try to decode the following data as instructions (such as the
-        * next stage's shader in turnip)
-        */
-       info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
-       info->sizedwords = info->size / 4;
-
-       foreach_block (block, &shader->block_list) {
-               int sfu_delay = 0;
-
-               foreach_instr (instr, &block->instr_list) {
-
-                       foreach_src (reg, instr) {
-                               collect_reg_info(instr, reg, info);
-                       }
-
-                       foreach_dst (reg, instr) {
-                               if (is_dest_gpr(reg)) {
-                                       collect_reg_info(instr, reg, info);
-                               }
-                       }
-
-                       if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
-                               struct ir3_register *base = (instr->opc == OPC_STP) ?
-                                               instr->srcs[2] : instr->srcs[1];
-                               if (base->iim_val * type_size(instr->cat6.type) > 32) {
-                                       info->multi_dword_ldp_stp = true;
-                               }
-                       }
-
-                       if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI))
-                               info->last_baryf = info->instrs_count;
-
-                       unsigned instrs_count = 1 + instr->repeat + instr->nop;
-                       unsigned nops_count = instr->nop;
-
-                       if (instr->opc == OPC_NOP) {
-                               nops_count = 1 + instr->repeat;
-                               info->instrs_per_cat[0] += nops_count;
-                       } else {
-                               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
-                               info->instrs_per_cat[0] += nops_count;
-                       }
-
-                       if (instr->opc == OPC_MOV) {
-                               if (instr->cat1.src_type == instr->cat1.dst_type) {
-                                       info->mov_count += 1 + instr->repeat;
-                               } else {
-                                       info->cov_count += 1 + instr->repeat;
-                               }
-                       }
-
-                       info->instrs_count += instrs_count;
-                       info->nops_count += nops_count;
-
-                       if (instr->flags & IR3_INSTR_SS) {
-                               info->ss++;
-                               info->sstall += sfu_delay;
-                               sfu_delay = 0;
-                       }
-
-                       if (instr->flags & IR3_INSTR_SY)
-                               info->sy++;
-
-                       if (is_sfu(instr)) {
-                               sfu_delay = 10;
-                       } else {
-                               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
-                               sfu_delay -= n;
-                       }
-               }
-       }
-
-       /* TODO: for a5xx and below, is there a separate regfile for
-        * half-registers?
-        */
-       unsigned regs_count =
-               info->max_reg + 1 + (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);
-
-       info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
-       unsigned reg_independent_max_waves =
-               ir3_get_reg_independent_max_waves(v, info->double_threadsize);
-       unsigned reg_dependent_max_waves =
-               ir3_get_reg_dependent_max_waves(compiler, regs_count, info->double_threadsize);
-       info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
-       assert(info->max_waves <= v->shader->compiler->max_waves);
-}
-
-static struct ir3_register * reg_create(struct ir3 *shader,
-               int num, int flags)
-{
-       struct ir3_register *reg =
-                       ir3_alloc(shader, sizeof(struct ir3_register));
-       reg->wrmask = 1;
-       reg->flags = flags;
-       reg->num = num;
-       return reg;
+   struct ir3_info *info = &v->info;
+   struct ir3 *shader = v->ir;
+   const struct ir3_compiler *compiler = v->shader->compiler;
+
+   memset(info, 0, sizeof(*info));
+   info->data = v;
+   info->max_reg = -1;
+   info->max_half_reg = -1;
+   info->max_const = -1;
+   info->multi_dword_ldp_stp = false;
+
+   uint32_t instr_count = 0;
+   foreach_block (block, &shader->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         instr_count++;
+      }
+   }
+
+   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);
+
+   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
+    * doesn't try to decode the following data as instructions (such as the
+    * next stage's shader in turnip)
+    */
+   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
+   info->sizedwords = info->size / 4;
+
+   foreach_block (block, &shader->block_list) {
+      int sfu_delay = 0;
+
+      foreach_instr (instr, &block->instr_list) {
+
+         foreach_src (reg, instr) {
+            collect_reg_info(instr, reg, info);
+         }
+
+         foreach_dst (reg, instr) {
+            if (is_dest_gpr(reg)) {
+               collect_reg_info(instr, reg, info);
+            }
+         }
+
+         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
+            struct ir3_register *base =
+               (instr->opc == OPC_STP) ? instr->srcs[2] : instr->srcs[1];
+            if (base->iim_val * type_size(instr->cat6.type) > 32) {
+               info->multi_dword_ldp_stp = true;
+            }
+         }
+
+         if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI))
+            info->last_baryf = info->instrs_count;
+
+         unsigned instrs_count = 1 + instr->repeat + instr->nop;
+         unsigned nops_count = instr->nop;
+
+         if (instr->opc == OPC_NOP) {
+            nops_count = 1 + instr->repeat;
+            info->instrs_per_cat[0] += nops_count;
+         } else {
+            info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
+            info->instrs_per_cat[0] += nops_count;
+         }
+
+         if (instr->opc == OPC_MOV) {
+            if (instr->cat1.src_type == instr->cat1.dst_type) {
+               info->mov_count += 1 + instr->repeat;
+            } else {
+               info->cov_count += 1 + instr->repeat;
+            }
+         }
+
+         info->instrs_count += instrs_count;
+         info->nops_count += nops_count;
+
+         if (instr->flags & IR3_INSTR_SS) {
+            info->ss++;
+            info->sstall += sfu_delay;
+            sfu_delay = 0;
+         }
+
+         if (instr->flags & IR3_INSTR_SY)
+            info->sy++;
+
+         if (is_sfu(instr)) {
+            sfu_delay = 10;
+         } else {
+            int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
+            sfu_delay -= n;
+         }
+      }
+   }
+
+   /* TODO: for a5xx and below, is there a separate regfile for
+    * half-registers?
+    */
+   unsigned regs_count =
+      info->max_reg + 1 +
+      (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);
+
+   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
+   unsigned reg_independent_max_waves =
+      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
+   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
+      compiler, regs_count, info->double_threadsize);
+   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
+   assert(info->max_waves <= v->shader->compiler->max_waves);
+}
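
(Aside, not part of this diff.) The sfu_delay bookkeeping above is a rough sstall estimate: an SFU op sets the counter to 10, every following instruction subtracts its issue slots, and an (ss)-flagged instruction consumes whatever is left. For example, an SFU op followed by two single-slot ALU ops and then an (ss)-flagged instruction adds 10 - 2 = 8 to sstall.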
+
+static struct ir3_register *
+reg_create(struct ir3 *shader, int num, int flags)
+{
+   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
+   reg->wrmask = 1;
+   reg->flags = flags;
+   reg->num = num;
+   return reg;
 }
 
-static void insert_instr(struct ir3_block *block,
-               struct ir3_instruction *instr)
-{
-       struct ir3 *shader = block->shader;
-
-       instr->serialno = ++shader->instr_count;
-
-       list_addtail(&instr->node, &block->instr_list);
-
-       if (is_input(instr))
-               array_insert(shader, shader->baryfs, instr);
-}
-
-struct ir3_block * ir3_block_create(struct ir3 *shader)
-{
-       struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+static void
+insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
+{
+   struct ir3 *shader = block->shader;
+
+   instr->serialno = ++shader->instr_count;
+
+   list_addtail(&instr->node, &block->instr_list);
+
+   if (is_input(instr))
+      array_insert(shader, shader->baryfs, instr);
+}
+
+struct ir3_block *
+ir3_block_create(struct ir3 *shader)
+{
+   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
 #ifdef DEBUG
-       block->serialno = ++shader->block_count;
+   block->serialno = ++shader->block_count;
 #endif
-       block->shader = shader;
-       list_inithead(&block->node);
-       list_inithead(&block->instr_list);
-       return block;
+   block->shader = shader;
+   list_inithead(&block->node);
+   list_inithead(&block->instr_list);
+   return block;
 }
 
-
-void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
+void
+ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
 {
-       array_insert(block, block->predecessors, pred);
+   array_insert(block, block->predecessors, pred);
 }
 
-void ir3_block_add_physical_predecessor(struct ir3_block *block, struct ir3_block *pred)
+void
+ir3_block_add_physical_predecessor(struct ir3_block *block,
+                                   struct ir3_block *pred)
 {
-       array_insert(block, block->physical_predecessors, pred);
+   array_insert(block, block->physical_predecessors, pred);
 }
 
-void ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
+void
+ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
 {
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               if (block->predecessors[i] == pred) {
-                       if (i < block->predecessors_count - 1) {
-                               block->predecessors[i] =
-                                       block->predecessors[block->predecessors_count - 1];
-                       }
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      if (block->predecessors[i] == pred) {
+         if (i < block->predecessors_count - 1) {
+            block->predecessors[i] =
+               block->predecessors[block->predecessors_count - 1];
+         }
 
-                       block->predecessors_count--;
-                       return;
-               }
-       }
+         block->predecessors_count--;
+         return;
+      }
+   }
 }
 
-unsigned ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
+unsigned
+ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
 {
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               if (block->predecessors[i] == pred) {
-                       return i;
-               }
-       }
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      if (block->predecessors[i] == pred) {
+         return i;
+      }
+   }
 
-       unreachable("ir3_block_get_pred_index() invalid predecessor");
+   unreachable("ir3_block_get_pred_index() invalid predecessor");
 }
 
-static struct ir3_instruction *instr_create(struct ir3_block *block,
-               opc_t opc, int ndst, int nsrc)
+static struct ir3_instruction *
+instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
 {
-       /* Add extra sources for array destinations and the address reg */
-       if (1 <= opc_cat(opc))
-               nsrc += 2;
-       struct ir3_instruction *instr;
-       unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
-               (nsrc * sizeof(instr->srcs[0]));
-       char *ptr = ir3_alloc(block->shader, sz);
+   /* Add extra sources for array destinations and the address reg */
+   if (1 <= opc_cat(opc))
+      nsrc += 2;
+   struct ir3_instruction *instr;
+   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
+                 (nsrc * sizeof(instr->srcs[0]));
+   char *ptr = ir3_alloc(block->shader, sz);
 
-       instr = (struct ir3_instruction *)ptr;
-       ptr  += sizeof(*instr);
-       instr->dsts = (struct ir3_register **)ptr;
-       instr->srcs = instr->dsts + ndst;
+   instr = (struct ir3_instruction *)ptr;
+   ptr += sizeof(*instr);
+   instr->dsts = (struct ir3_register **)ptr;
+   instr->srcs = instr->dsts + ndst;
 
 #ifdef DEBUG
-       instr->dsts_max = ndst;
-       instr->srcs_max = nsrc;
+   instr->dsts_max = ndst;
+   instr->srcs_max = nsrc;
 #endif
 
-       return instr;
+   return instr;
 }
 
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
-               opc_t opc, int ndst, int nsrc)
+struct ir3_instruction *
+ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
 {
-       struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
-       instr->block = block;
-       instr->opc = opc;
-       insert_instr(block, instr);
-       return instr;
+   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
+   instr->block = block;
+   instr->opc = opc;
+   insert_instr(block, instr);
+   return instr;
 }
 
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
+struct ir3_instruction *
+ir3_instr_clone(struct ir3_instruction *instr)
 {
-       struct ir3_instruction *new_instr = instr_create(instr->block, instr->opc,
-                       instr->dsts_count, instr->srcs_count);
-       struct ir3_register **dsts, **srcs;
+   struct ir3_instruction *new_instr = instr_create(
+      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
+   struct ir3_register **dsts, **srcs;
 
-       dsts = new_instr->dsts;
-       srcs = new_instr->srcs;
-       *new_instr = *instr;
-       new_instr->dsts = dsts;
-       new_instr->srcs = srcs;
+   dsts = new_instr->dsts;
+   srcs = new_instr->srcs;
+   *new_instr = *instr;
+   new_instr->dsts = dsts;
+   new_instr->srcs = srcs;
 
-       insert_instr(instr->block, new_instr);
+   insert_instr(instr->block, new_instr);
 
-       /* clone registers: */
-       new_instr->dsts_count = 0;
-       new_instr->srcs_count = 0;
-       foreach_dst (reg, instr) {
-               struct ir3_register *new_reg = ir3_dst_create(new_instr, reg->num, reg->flags);
-               *new_reg = *reg;
-               if (new_reg->instr)
-                       new_reg->instr = new_instr;
-       }
-       foreach_src (reg, instr) {
-               struct ir3_register *new_reg = ir3_src_create(new_instr, reg->num, reg->flags);
-               *new_reg = *reg;
-       }
+   /* clone registers: */
+   new_instr->dsts_count = 0;
+   new_instr->srcs_count = 0;
+   foreach_dst (reg, instr) {
+      struct ir3_register *new_reg =
+         ir3_dst_create(new_instr, reg->num, reg->flags);
+      *new_reg = *reg;
+      if (new_reg->instr)
+         new_reg->instr = new_instr;
+   }
+   foreach_src (reg, instr) {
+      struct ir3_register *new_reg =
+         ir3_src_create(new_instr, reg->num, reg->flags);
+      *new_reg = *reg;
+   }
 
-       return new_instr;
+   return new_instr;
 }
 
 /* Add a false dependency to instruction, to ensure it is scheduled first: */
-void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
+void
+ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
 {
-       for (unsigned i = 0; i < instr->deps_count; i++) {
-               if (instr->deps[i] == dep)
-                       return;
-       }
+   for (unsigned i = 0; i < instr->deps_count; i++) {
+      if (instr->deps[i] == dep)
+         return;
+   }
 
-       array_insert(instr, instr->deps, dep);
+   array_insert(instr, instr->deps, dep);
 }
 
-struct ir3_register * ir3_src_create(struct ir3_instruction *instr,
-               int num, int flags)
+struct ir3_register *
+ir3_src_create(struct ir3_instruction *instr, int num, int flags)
 {
-       struct ir3 *shader = instr->block->shader;
+   struct ir3 *shader = instr->block->shader;
 #ifdef DEBUG
-       debug_assert(instr->srcs_count < instr->srcs_max);
+   debug_assert(instr->srcs_count < instr->srcs_max);
 #endif
-       struct ir3_register *reg = reg_create(shader, num, flags);
-       instr->srcs[instr->srcs_count++] = reg;
-       return reg;
+   struct ir3_register *reg = reg_create(shader, num, flags);
+   instr->srcs[instr->srcs_count++] = reg;
+   return reg;
 }
 
-struct ir3_register * ir3_dst_create(struct ir3_instruction *instr,
-               int num, int flags)
+struct ir3_register *
+ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
 {
-       struct ir3 *shader = instr->block->shader;
+   struct ir3 *shader = instr->block->shader;
 #ifdef DEBUG
-       debug_assert(instr->dsts_count < instr->dsts_max);
+   debug_assert(instr->dsts_count < instr->dsts_max);
 #endif
-       struct ir3_register *reg = reg_create(shader, num, flags);
-       instr->dsts[instr->dsts_count++] = reg;
-       return reg;
+   struct ir3_register *reg = reg_create(shader, num, flags);
+   instr->dsts[instr->dsts_count++] = reg;
+   return reg;
 }
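
(Aside, not part of this diff.) A minimal sketch of how these builders fit together; the opcode, register numbers and flags are made up for illustration:

   static void
   example_build_add(struct ir3_block *block)
   {
      /* one dst, two srcs */
      struct ir3_instruction *add = ir3_instr_create(block, OPC_ADD_F, 1, 2);
      ir3_dst_create(add, regid(0, 0), 0); /* r0.x */
      ir3_src_create(add, regid(0, 1), 0); /* r0.y */
      ir3_src_create(add, regid(0, 2), 0); /* r0.z */
   }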
 
-struct ir3_register * ir3_reg_clone(struct ir3 *shader,
-               struct ir3_register *reg)
+struct ir3_register *
+ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
 {
-       struct ir3_register *new_reg = reg_create(shader, 0, 0);
-       *new_reg = *reg;
-       return new_reg;
+   struct ir3_register *new_reg = reg_create(shader, 0, 0);
+   *new_reg = *reg;
+   return new_reg;
 }
 
-
-void ir3_reg_set_last_array(struct ir3_instruction *instr,
-                                                       struct ir3_register *reg,
-                                                       struct ir3_register *last_write)
+void
+ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
+                       struct ir3_register *last_write)
 {
-       assert(reg->flags & IR3_REG_ARRAY);
-       struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
-       *new_reg = *reg;
-       new_reg->def = last_write;
-       ir3_reg_tie(reg, new_reg);
+   assert(reg->flags & IR3_REG_ARRAY);
+   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
+   *new_reg = *reg;
+   new_reg->def = last_write;
+   ir3_reg_tie(reg, new_reg);
 }
 
 void
 ir3_instr_set_address(struct ir3_instruction *instr,
-               struct ir3_instruction *addr)
+                      struct ir3_instruction *addr)
 {
-       if (!instr->address) {
-               struct ir3 *ir = instr->block->shader;
+   if (!instr->address) {
+      struct ir3 *ir = instr->block->shader;
 
-               debug_assert(instr->block == addr->block);
+      debug_assert(instr->block == addr->block);
 
-               instr->address = ir3_src_create(instr, addr->dsts[0]->num,
-                                                                               addr->dsts[0]->flags);
-               instr->address->def = addr->dsts[0];
-               debug_assert(reg_num(addr->dsts[0]) == REG_A0);
-               unsigned comp = reg_comp(addr->dsts[0]);
-               if (comp == 0) {
-                       array_insert(ir, ir->a0_users, instr);
-               } else {
-                       debug_assert(comp == 1);
-                       array_insert(ir, ir->a1_users, instr);
-               }
-       } else {
-               debug_assert(instr->address->def->instr == addr);
-       }
+      instr->address =
+         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
+      instr->address->def = addr->dsts[0];
+      debug_assert(reg_num(addr->dsts[0]) == REG_A0);
+      unsigned comp = reg_comp(addr->dsts[0]);
+      if (comp == 0) {
+         array_insert(ir, ir->a0_users, instr);
+      } else {
+         debug_assert(comp == 1);
+         array_insert(ir, ir->a1_users, instr);
+      }
+   } else {
+      debug_assert(instr->address->def->instr == addr);
+   }
 }
 
 void
 ir3_block_clear_mark(struct ir3_block *block)
 {
-       foreach_instr (instr, &block->instr_list)
-               instr->flags &= ~IR3_INSTR_MARK;
+   foreach_instr (instr, &block->instr_list)
+      instr->flags &= ~IR3_INSTR_MARK;
 }
 
 void
 ir3_clear_mark(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list) {
-               ir3_block_clear_mark(block);
-       }
+   foreach_block (block, &ir->block_list) {
+      ir3_block_clear_mark(block);
+   }
 }
 
 unsigned
 ir3_count_instructions(struct ir3 *ir)
 {
-       unsigned cnt = 1;
-       foreach_block (block, &ir->block_list) {
-               block->start_ip = cnt;
-               foreach_instr (instr, &block->instr_list) {
-                       instr->ip = cnt++;
-               }
-               block->end_ip = cnt;
-       }
-       return cnt;
+   unsigned cnt = 1;
+   foreach_block (block, &ir->block_list) {
+      block->start_ip = cnt;
+      foreach_instr (instr, &block->instr_list) {
+         instr->ip = cnt++;
+      }
+      block->end_ip = cnt;
+   }
+   return cnt;
 }
 
 /* When counting instructions for RA, we insert extra fake instructions at the
@@ -580,48 +593,48 @@ ir3_count_instructions(struct ir3 *ir)
 unsigned
 ir3_count_instructions_ra(struct ir3 *ir)
 {
-       unsigned cnt = 1;
-       foreach_block (block, &ir->block_list) {
-               block->start_ip = cnt++;
-               foreach_instr (instr, &block->instr_list) {
-                       instr->ip = cnt++;
-               }
-               block->end_ip = cnt++;
-       }
-       return cnt;
+   unsigned cnt = 1;
+   foreach_block (block, &ir->block_list) {
+      block->start_ip = cnt++;
+      foreach_instr (instr, &block->instr_list) {
+         instr->ip = cnt++;
+      }
+      block->end_ip = cnt++;
+   }
+   return cnt;
 }
 
 struct ir3_array *
 ir3_lookup_array(struct ir3 *ir, unsigned id)
 {
-       foreach_array (arr, &ir->array_list)
-               if (arr->id == id)
-                       return arr;
-       return NULL;
+   foreach_array (arr, &ir->array_list)
+      if (arr->id == id)
+         return arr;
+   return NULL;
 }
 
 void
 ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
 {
-       /* We could do this in a single pass if we can assume instructions
-        * are always sorted.  Which currently might not always be true.
-        * (In particular after ir3_group pass, but maybe other places.)
-        */
-       foreach_block (block, &ir->block_list)
-               foreach_instr (instr, &block->instr_list)
-                       instr->uses = NULL;
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       foreach_ssa_src_n (src, n, instr) {
-                               if (__is_false_dep(instr, n) && !falsedeps)
-                                       continue;
-                               if (!src->uses)
-                                       src->uses = _mesa_pointer_set_create(mem_ctx);
-                               _mesa_set_add(src->uses, instr);
-                       }
-               }
-       }
+   /* We could do this in a single pass if we can assume instructions
+    * are always sorted.  Which currently might not always be true.
+    * (In particular after ir3_group pass, but maybe other places.)
+    */
+   foreach_block (block, &ir->block_list)
+      foreach_instr (instr, &block->instr_list)
+         instr->uses = NULL;
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         foreach_ssa_src_n (src, n, instr) {
+            if (__is_false_dep(instr, n) && !falsedeps)
+               continue;
+            if (!src->uses)
+               src->uses = _mesa_pointer_set_create(mem_ctx);
+            _mesa_set_add(src->uses, instr);
+         }
+      }
+   }
 }
 
 /**
@@ -632,35 +645,35 @@ ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
 void
 ir3_set_dst_type(struct ir3_instruction *instr, bool half)
 {
-       if (half) {
-               instr->dsts[0]->flags |= IR3_REG_HALF;
-       } else {
-               instr->dsts[0]->flags &= ~IR3_REG_HALF;
-       }
-
-       switch (opc_cat(instr->opc)) {
-       case 1: /* move instructions */
-               if (half) {
-                       instr->cat1.dst_type = half_type(instr->cat1.dst_type);
-               } else {
-                       instr->cat1.dst_type = full_type(instr->cat1.dst_type);
-               }
-               break;
-       case 4:
-               if (half) {
-                       instr->opc = cat4_half_opc(instr->opc);
-               } else {
-                       instr->opc = cat4_full_opc(instr->opc);
-               }
-               break;
-       case 5:
-               if (half) {
-                       instr->cat5.type = half_type(instr->cat5.type);
-               } else {
-                       instr->cat5.type = full_type(instr->cat5.type);
-               }
-               break;
-       }
+   if (half) {
+      instr->dsts[0]->flags |= IR3_REG_HALF;
+   } else {
+      instr->dsts[0]->flags &= ~IR3_REG_HALF;
+   }
+
+   switch (opc_cat(instr->opc)) {
+   case 1: /* move instructions */
+      if (half) {
+         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+      } else {
+         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
+      }
+      break;
+   case 4:
+      if (half) {
+         instr->opc = cat4_half_opc(instr->opc);
+      } else {
+         instr->opc = cat4_full_opc(instr->opc);
+      }
+      break;
+   case 5:
+      if (half) {
+         instr->cat5.type = half_type(instr->cat5.type);
+      } else {
+         instr->cat5.type = full_type(instr->cat5.type);
+      }
+      break;
+   }
 }
 
 /**
@@ -670,236 +683,231 @@ ir3_set_dst_type(struct ir3_instruction *instr, bool half)
 void
 ir3_fixup_src_type(struct ir3_instruction *instr)
 {
-       switch (opc_cat(instr->opc)) {
-       case 1: /* move instructions */
-               if (instr->srcs[0]->flags & IR3_REG_HALF) {
-                       instr->cat1.src_type = half_type(instr->cat1.src_type);
-               } else {
-                       instr->cat1.src_type = full_type(instr->cat1.src_type);
-               }
-               break;
-       case 3:
-               if (instr->srcs[0]->flags & IR3_REG_HALF) {
-                       instr->opc = cat3_half_opc(instr->opc);
-               } else {
-                       instr->opc = cat3_full_opc(instr->opc);
-               }
-               break;
-       }
+   switch (opc_cat(instr->opc)) {
+   case 1: /* move instructions */
+      if (instr->srcs[0]->flags & IR3_REG_HALF) {
+         instr->cat1.src_type = half_type(instr->cat1.src_type);
+      } else {
+         instr->cat1.src_type = full_type(instr->cat1.src_type);
+      }
+      break;
+   case 3:
+      if (instr->srcs[0]->flags & IR3_REG_HALF) {
+         instr->opc = cat3_half_opc(instr->opc);
+      } else {
+         instr->opc = cat3_full_opc(instr->opc);
+      }
+      break;
+   }
 }
 
 static unsigned
 cp_flags(unsigned flags)
 {
-       /* only considering these flags (at least for now): */
-       flags &= (IR3_REG_CONST | IR3_REG_IMMED |
-                       IR3_REG_FNEG | IR3_REG_FABS |
-                       IR3_REG_SNEG | IR3_REG_SABS |
-                       IR3_REG_BNOT | IR3_REG_RELATIV |
-                       IR3_REG_SHARED);
-       return flags;
+   /* only considering these flags (at least for now): */
+   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
+             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
+             IR3_REG_SHARED);
+   return flags;
 }
 
 bool
-ir3_valid_flags(struct ir3_instruction *instr, unsigned n,
-               unsigned flags)
-{
-       struct ir3_compiler *compiler = instr->block->shader->compiler;
-       unsigned valid_flags;
-
-       if ((flags & IR3_REG_SHARED) &&
-                       opc_cat(instr->opc) > 3)
-               return false;
-
-       flags = cp_flags(flags);
-
-       /* If destination is indirect, then source cannot be.. at least
-        * I don't think so..
-        */
-       if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
-                       (flags & IR3_REG_RELATIV))
-               return false;
-
-       if (flags & IR3_REG_RELATIV) {
-               /* TODO need to test on earlier gens.. pretty sure the earlier
-                * problem was just that we didn't check that the src was from
-                * same block (since we can't propagate address register values
-                * across blocks currently)
-                */
-               if (compiler->gpu_id < 600)
-                       return false;
-
-               /* NOTE in the special try_swap_mad_two_srcs() case we can be
-                * called on a src that has already had an indirect load folded
-                * in, in which case ssa() returns NULL
-                */
-               if (instr->srcs[n]->flags & IR3_REG_SSA) {
-                       struct ir3_instruction *src = ssa(instr->srcs[n]);
-                       if (src->address->def->instr->block != instr->block)
-                               return false;
-               }
-       }
-
-       if (is_meta(instr)) {
-               /* collect and phi nodes support const/immed sources, which will be
-                * turned into move instructions, but not anything else.
-                */
-               if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
-                       return false;
-
-               if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
-                       return false;
-
-               return true;
-       }
-
-       switch (opc_cat(instr->opc)) {
-       case 0: /* end, chmask */
-               return flags == 0;
-       case 1:
-               switch (instr->opc) {
-                       case OPC_MOVMSK:
-                       case OPC_SWZ:
-                       case OPC_SCT:
-                       case OPC_GAT:
-                               valid_flags = IR3_REG_SHARED;
-                               break;
-                       default:
-                               valid_flags =
-                                       IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV |
-                                       IR3_REG_SHARED;
-               }
-               if (flags & ~valid_flags)
-                       return false;
-               break;
-       case 2:
-               valid_flags = ir3_cat2_absneg(instr->opc) |
-                               IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
-
-               if (ir3_cat2_int(instr->opc))
-                       valid_flags |= IR3_REG_IMMED;
-
-               if (flags & ~valid_flags)
-                       return false;
-
-               if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
-                       unsigned m = n ^ 1;
-                       /* cannot deal w/ const or shared in both srcs:
-                        * (note that some cat2 actually only have a single src)
-                        */
-                       if (m < instr->srcs_count) {
-                               struct ir3_register *reg = instr->srcs[m];
-                               if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
-                                       (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
-                                       return false;
-                               if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
-                                       return false;
-                       }
-               }
-               break;
-       case 3:
-               valid_flags = ir3_cat3_absneg(instr->opc) |
-                               IR3_REG_RELATIV | IR3_REG_SHARED;
-
-               if (instr->opc == OPC_SHLG_B16) {
-                       valid_flags |= IR3_REG_IMMED;
-                       /* shlg.b16 can be RELATIV+CONST but not CONST: */
-                       if (flags & IR3_REG_RELATIV)
-                               valid_flags |= IR3_REG_CONST;
-               } else {
-                       valid_flags |= IR3_REG_CONST;
-               }
-
-               if (flags & ~valid_flags)
-                       return false;
-
-               if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
-                       /* cannot deal w/ const/shared/relativ in 2nd src: */
-                       if (n == 1)
-                               return false;
-               }
-
-               break;
-       case 4:
-               /* seems like blob compiler avoids const as src.. */
-               /* TODO double check if this is still the case on a4xx */
-               if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
-                       return false;
-               if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
-                       return false;
-               break;
-       case 5:
-               /* no flags allowed */
-               if (flags)
-                       return false;
-               break;
-       case 6:
-               valid_flags = IR3_REG_IMMED;
-               if (flags & ~valid_flags)
-                       return false;
-
-               if (flags & IR3_REG_IMMED) {
-                       /* doesn't seem like we can have immediate src for store
-                        * instructions:
-                        *
-                        * TODO this restriction could also apply to load instructions,
-                        * but for load instructions this arg is the address (and not
-                        * really sure any good way to test a hard-coded immed addr src)
-                        */
-                       if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
-                               return false;
-
-                       if ((instr->opc == OPC_LDL) && (n == 0))
-                               return false;
-
-                       if ((instr->opc == OPC_STL) && (n != 2))
-                               return false;
-
-                       if ((instr->opc == OPC_LDP) && (n == 0))
-                               return false;
-
-                       if ((instr->opc == OPC_STP) && (n != 2))
-                               return false;
-
-                       if (instr->opc == OPC_STLW && n == 0)
-                               return false;
-
-                       if (instr->opc == OPC_LDLW && n == 0)
-                               return false;
-
-                       /* disallow immediates in anything but the SSBO slot argument for
-                        * cat6 instructions:
-                        */
-                       if (is_atomic(instr->opc) && (n != 0))
-                               return false;
-
-                       if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
-                               return false;
-
-                       if (instr->opc == OPC_STG && (n == 2))
-                               return false;
-
-                       if (instr->opc == OPC_STG_A && (n == 4))
-                               return false;
-
-                       /* as with atomics, these cat6 instrs can only have an immediate
-                        * for SSBO/IBO slot argument
-                        */
-                       switch (instr->opc) {
-                       case OPC_LDIB:
-                       case OPC_STIB:
-                       case OPC_LDC:
-                       case OPC_RESINFO:
-                               if (n != 0)
-                                       return false;
-                               break;
-                       default:
-                               break;
-                       }
-               }
-
-               break;
-       }
-
-       return true;
+ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
+{
+   struct ir3_compiler *compiler = instr->block->shader->compiler;
+   unsigned valid_flags;
+
+   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
+      return false;
+
+   flags = cp_flags(flags);
+
+   /* If destination is indirect, then source cannot be.. at least
+    * I don't think so..
+    */
+   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
+       (flags & IR3_REG_RELATIV))
+      return false;
+
+   if (flags & IR3_REG_RELATIV) {
+      /* TODO need to test on earlier gens.. pretty sure the earlier
+       * problem was just that we didn't check that the src was from
+       * same block (since we can't propagate address register values
+       * across blocks currently)
+       */
+      if (compiler->gpu_id < 600)
+         return false;
+
+      /* NOTE in the special try_swap_mad_two_srcs() case we can be
+       * called on a src that has already had an indirect load folded
+       * in, in which case ssa() returns NULL
+       */
+      if (instr->srcs[n]->flags & IR3_REG_SSA) {
+         struct ir3_instruction *src = ssa(instr->srcs[n]);
+         if (src->address->def->instr->block != instr->block)
+            return false;
+      }
+   }
+
+   if (is_meta(instr)) {
+      /* collect and phi nodes support const/immed sources, which will be
+       * turned into move instructions, but not anything else.
+       */
+      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
+         return false;
+
+      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
+         return false;
+
+      return true;
+   }
+
+   switch (opc_cat(instr->opc)) {
+   case 0: /* end, chmask */
+      return flags == 0;
+   case 1:
+      switch (instr->opc) {
+      case OPC_MOVMSK:
+      case OPC_SWZ:
+      case OPC_SCT:
+      case OPC_GAT:
+         valid_flags = IR3_REG_SHARED;
+         break;
+      default:
+         valid_flags =
+            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
+      }
+      if (flags & ~valid_flags)
+         return false;
+      break;
+   case 2:
+      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
+                    IR3_REG_RELATIV | IR3_REG_SHARED;
+
+      if (ir3_cat2_int(instr->opc))
+         valid_flags |= IR3_REG_IMMED;
+
+      if (flags & ~valid_flags)
+         return false;
+
+      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
+         unsigned m = n ^ 1;
+         /* cannot deal w/ const or shared in both srcs:
+          * (note that some cat2 actually only have a single src)
+          */
+         if (m < instr->srcs_count) {
+            struct ir3_register *reg = instr->srcs[m];
+            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
+                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
+               return false;
+            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
+               return false;
+         }
+      }
+      break;
+   case 3:
+      valid_flags =
+         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;
+
+      if (instr->opc == OPC_SHLG_B16) {
+         valid_flags |= IR3_REG_IMMED;
+         /* shlg.b16 can be RELATIV+CONST but not CONST: */
+         if (flags & IR3_REG_RELATIV)
+            valid_flags |= IR3_REG_CONST;
+      } else {
+         valid_flags |= IR3_REG_CONST;
+      }
+
+      if (flags & ~valid_flags)
+         return false;
+
+      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
+         /* cannot deal w/ const/shared/relativ in 2nd src: */
+         if (n == 1)
+            return false;
+      }
+
+      break;
+   case 4:
+      /* seems like blob compiler avoids const as src.. */
+      /* TODO double check if this is still the case on a4xx */
+      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
+         return false;
+      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
+         return false;
+      break;
+   case 5:
+      /* no flags allowed */
+      if (flags)
+         return false;
+      break;
+   case 6:
+      valid_flags = IR3_REG_IMMED;
+      if (flags & ~valid_flags)
+         return false;
+
+      if (flags & IR3_REG_IMMED) {
+         /* doesn't seem like we can have immediate src for store
+          * instructions:
+          *
+          * TODO this restriction could also apply to load instructions,
+          * but for load instructions this arg is the address (and not
+          * really sure any good way to test a hard-coded immed addr src)
+          */
+         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
+            return false;
+
+         if ((instr->opc == OPC_LDL) && (n == 0))
+            return false;
+
+         if ((instr->opc == OPC_STL) && (n != 2))
+            return false;
+
+         if ((instr->opc == OPC_LDP) && (n == 0))
+            return false;
+
+         if ((instr->opc == OPC_STP) && (n != 2))
+            return false;
+
+         if (instr->opc == OPC_STLW && n == 0)
+            return false;
+
+         if (instr->opc == OPC_LDLW && n == 0)
+            return false;
+
+         /* disallow immediates in anything but the SSBO slot argument for
+          * cat6 instructions:
+          */
+         if (is_atomic(instr->opc) && (n != 0))
+            return false;
+
+         if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
+            return false;
+
+         if (instr->opc == OPC_STG && (n == 2))
+            return false;
+
+         if (instr->opc == OPC_STG_A && (n == 4))
+            return false;
+
+         /* as with atomics, these cat6 instrs can only have an immediate
+          * for SSBO/IBO slot argument
+          */
+         switch (instr->opc) {
+         case OPC_LDIB:
+         case OPC_STIB:
+         case OPC_LDC:
+         case OPC_RESINFO:
+            if (n != 0)
+               return false;
+            break;
+         default:
+            break;
+         }
+      }
+
+      break;
+   }
+
+   return true;
 }
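
As an illustrative sketch (not from the tree): a copy-propagation style pass
would typically ask ir3_valid_flags() whether a source could legally be
replaced before actually rewriting it.  The helper name below is
hypothetical; only ir3_valid_flags() and the register flags are real:

   static bool
   can_fold_immed(struct ir3_instruction *instr, unsigned n)
   {
      /* drop the SSA-ness of the current n'th src and check whether an
       * immediate would be accepted in its place:
       */
      unsigned new_flags = instr->srcs[n]->flags & ~IR3_REG_SSA;
      new_flags |= IR3_REG_IMMED;

      return ir3_valid_flags(instr, n, new_flags);
   }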
index eae3d36..48f8192 100644 (file)
@@ -24,8 +24,8 @@
 #ifndef IR3_H_
 #define IR3_H_
 
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 
 #include "compiler/shader_enums.h"
 
@@ -44,539 +44,540 @@ struct ir3_instruction;
 struct ir3_block;
 
 struct ir3_info {
-       void *data;              /* used internally in ir3 assembler */
-       /* Size in bytes of the shader binary, including NIR constants and
-        * padding
-        */
-       uint32_t size;
-       /* byte offset from start of the shader to the NIR constant data. */
-       uint32_t constant_data_offset;
-       /* Size in dwords of the instructions. */
-       uint16_t sizedwords;
-       uint16_t instrs_count;   /* expanded to account for rpt's */
-       uint16_t nops_count;     /* # of nop instructions, including nopN */
-       uint16_t mov_count;
-       uint16_t cov_count;
-       /* NOTE: max_reg, etc, does not include registers not touched
-        * by the shader (ie. vertex fetched via VFD_DECODE but not
-        * touched by shader)
-        */
-       int8_t   max_reg;   /* highest GPR # used by shader */
-       int8_t   max_half_reg;
-       int16_t  max_const;
-       /* This is the maximum # of waves that can executed at once in one core,
-        * assuming that they are all executing this shader.
-        */
-       int8_t   max_waves;
-       bool     double_threadsize;
-       bool     multi_dword_ldp_stp;
-
-       /* number of sync bits: */
-       uint16_t ss, sy;
-
-       /* estimate of number of cycles stalled on (ss) */
-       uint16_t sstall;
-
-       uint16_t last_baryf;     /* instruction # of last varying fetch */
-
-       /* Number of instructions of a given category: */
-       uint16_t instrs_per_cat[8];
+   void *data; /* used internally in ir3 assembler */
+   /* Size in bytes of the shader binary, including NIR constants and
+    * padding
+    */
+   uint32_t size;
+   /* byte offset from start of the shader to the NIR constant data. */
+   uint32_t constant_data_offset;
+   /* Size in dwords of the instructions. */
+   uint16_t sizedwords;
+   uint16_t instrs_count; /* expanded to account for rpt's */
+   uint16_t nops_count;   /* # of nop instructions, including nopN */
+   uint16_t mov_count;
+   uint16_t cov_count;
+   /* NOTE: max_reg, etc, does not include registers not touched
+    * by the shader (ie. vertex fetched via VFD_DECODE but not
+    * touched by shader)
+    */
+   int8_t max_reg; /* highest GPR # used by shader */
+   int8_t max_half_reg;
+   int16_t max_const;
+   /* This is the maximum # of waves that can be executed at once in one core,
+    * assuming that they are all executing this shader.
+    */
+   int8_t max_waves;
+   bool double_threadsize;
+   bool multi_dword_ldp_stp;
+
+   /* number of sync bits: */
+   uint16_t ss, sy;
+
+   /* estimate of number of cycles stalled on (ss) */
+   uint16_t sstall;
+
+   uint16_t last_baryf; /* instruction # of last varying fetch */
+
+   /* Number of instructions of a given category: */
+   uint16_t instrs_per_cat[8];
 };
 
 struct ir3_merge_set {
-       uint16_t preferred_reg;
-       uint16_t size;
-       uint16_t alignment;
+   uint16_t preferred_reg;
+   uint16_t size;
+   uint16_t alignment;
 
-       unsigned interval_start;
+   unsigned interval_start;
 
-       unsigned regs_count;
-       struct ir3_register **regs;
+   unsigned regs_count;
+   struct ir3_register **regs;
 };
 
 struct ir3_register {
-       enum {
-               IR3_REG_CONST  = 0x001,
-               IR3_REG_IMMED  = 0x002,
-               IR3_REG_HALF   = 0x004,
-               /* Shared registers have the same value for all threads when read.
-                * They can only be written when one thread is active (that is, inside
-                * a "getone" block).
-                */
-               IR3_REG_SHARED = 0x008,
-               IR3_REG_RELATIV= 0x010,
-               IR3_REG_R      = 0x020,
-               /* Most instructions, it seems, can do float abs/neg but not
-                * integer.  The CP pass needs to know what is intended (int or
-                * float) in order to do the right thing.  For this reason the
-                * abs/neg flags are split out into float and int variants.  In
-                * addition, .b (bitwise) operations, the negate is actually a
-                * bitwise not, so split that out into a new flag to make it
-                * more clear.
-                */
-               IR3_REG_FNEG   = 0x040,
-               IR3_REG_FABS   = 0x080,
-               IR3_REG_SNEG   = 0x100,
-               IR3_REG_SABS   = 0x200,
-               IR3_REG_BNOT   = 0x400,
-               /* (ei) flag, end-input?  Set on last bary, presumably to signal
-                * that the shader needs no more input:
-                */
-               IR3_REG_EI     = 0x2000,
-               /* meta-flags, for intermediate stages of IR, ie.
-                * before register assignment is done:
-                */
-               IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
-               IR3_REG_ARRAY  = 0x8000,
-
-               IR3_REG_KILL = 0x10000,
-               IR3_REG_FIRST_KILL = 0x20000,
-               IR3_REG_UNUSED = 0x40000,
-       } flags;
-
-       /* used for cat5 instructions, but also for internal/IR level
-        * tracking of what registers are read/written by an instruction.
-        * wrmask may be a bad name since it is used to represent both
-        * src and dst that touch multiple adjacent registers.
-        */
-       unsigned wrmask : 16;  /* up to vec16 */
-
-       /* for relative addressing, 32bits for array size is too small,
-        * but otoh we don't need to deal with disjoint sets, so instead
-        * use a simple size field (number of scalar components).
-        *
-        * Note the size field isn't important for relative const (since
-        * we don't have to do register allocation for constants).
-        */
-       unsigned size : 16;
-
-       /* normal registers:
-        * the component is in the low two bits of the reg #, so
-        * rN.x becomes: (N << 2) | x
-        */
-       uint16_t num;
-       uint16_t name;
-       union {
-               /* immediate: */
-               int32_t  iim_val;
-               uint32_t uim_val;
-               float    fim_val;
-               /* relative: */
-               struct {
-                       uint16_t id;
-                       int16_t offset;
-                       uint16_t base;
-               } array;
-       };
-
-
-       /* For IR3_REG_DEST, pointer back to the instruction containing this
-        * register.
-        */
-       struct ir3_instruction *instr;
-
-       /* For IR3_REG_SSA, src registers contain ptr back to assigning
-        * instruction.
-        *
-        * For IR3_REG_ARRAY, the pointer is back to the last dependent
-        * array access (although the net effect is the same, it points
-        * back to a previous instruction that we depend on).
-        */
-       struct ir3_register *def;
-
-       /* Pointer to another register in the instruction that must share the same
-        * physical register. Each destination can be tied with one source, and
-        * they must have "tied" pointing to each other.
-        */
-       struct ir3_register *tied;
-
-       unsigned merge_set_offset;
-       struct ir3_merge_set *merge_set;
-       unsigned interval_start, interval_end;
+   enum {
+      IR3_REG_CONST = 0x001,
+      IR3_REG_IMMED = 0x002,
+      IR3_REG_HALF = 0x004,
+      /* Shared registers have the same value for all threads when read.
+       * They can only be written when one thread is active (that is, inside
+       * a "getone" block).
+       */
+      IR3_REG_SHARED = 0x008,
+      IR3_REG_RELATIV = 0x010,
+      IR3_REG_R = 0x020,
+      /* Most instructions, it seems, can do float abs/neg but not
+       * integer.  The CP pass needs to know what is intended (int or
+       * float) in order to do the right thing.  For this reason the
+       * abs/neg flags are split out into float and int variants.  In
+       * addition, for .b (bitwise) operations the negate is actually a
+       * bitwise not, so that is split out into a new flag to make it
+       * clearer.
+       */
+      IR3_REG_FNEG = 0x040,
+      IR3_REG_FABS = 0x080,
+      IR3_REG_SNEG = 0x100,
+      IR3_REG_SABS = 0x200,
+      IR3_REG_BNOT = 0x400,
+      /* (ei) flag, end-input?  Set on last bary, presumably to signal
+       * that the shader needs no more input:
+       */
+      IR3_REG_EI = 0x2000,
+      /* meta-flags, for intermediate stages of IR, ie.
+       * before register assignment is done:
+       */
+      IR3_REG_SSA = 0x4000, /* 'instr' is ptr to assigning instr */
+      IR3_REG_ARRAY = 0x8000,
+
+      IR3_REG_KILL = 0x10000,
+      IR3_REG_FIRST_KILL = 0x20000,
+      IR3_REG_UNUSED = 0x40000,
+   } flags;
+
+   /* used for cat5 instructions, but also for internal/IR level
+    * tracking of what registers are read/written by an instruction.
+    * wrmask may be a bad name since it is used to represent both
+    * src and dst that touch multiple adjacent registers.
+    */
+   unsigned wrmask : 16; /* up to vec16 */
+
+   /* for relative addressing, 32bits for array size is too small,
+    * but otoh we don't need to deal with disjoint sets, so instead
+    * use a simple size field (number of scalar components).
+    *
+    * Note the size field isn't important for relative const (since
+    * we don't have to do register allocation for constants).
+    */
+   unsigned size : 16;
+
+   /* normal registers:
+    * the component is in the low two bits of the reg #, so
+    * rN.x becomes: (N << 2) | x
+    */
+   uint16_t num;
+   uint16_t name;
+   union {
+      /* immediate: */
+      int32_t iim_val;
+      uint32_t uim_val;
+      float fim_val;
+      /* relative: */
+      struct {
+         uint16_t id;
+         int16_t offset;
+         uint16_t base;
+      } array;
+   };
+
+   /* For IR3_REG_DEST, pointer back to the instruction containing this
+    * register.
+    */
+   struct ir3_instruction *instr;
+
+   /* For IR3_REG_SSA, src registers contain ptr back to assigning
+    * instruction.
+    *
+    * For IR3_REG_ARRAY, the pointer is back to the last dependent
+    * array access (although the net effect is the same, it points
+    * back to a previous instruction that we depend on).
+    */
+   struct ir3_register *def;
+
+   /* Pointer to another register in the instruction that must share the same
+    * physical register. Each destination can be tied with one source, and
+    * they must have "tied" pointing to each other.
+    */
+   struct ir3_register *tied;
+
+   unsigned merge_set_offset;
+   struct ir3_merge_set *merge_set;
+   unsigned interval_start, interval_end;
 };
 
 /*
  * Stupid/simple growable array implementation:
  */
-#define DECLARE_ARRAY(type, name) \
-       unsigned name ## _count, name ## _sz; \
-       type * name;
-
-#define array_insert(ctx, arr, ...) do { \
-               if (arr ## _count == arr ## _sz) { \
-                       arr ## _sz = MAX2(2 * arr ## _sz, 16); \
-                       arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
-               } \
-               arr[arr ##_count++] = __VA_ARGS__; \
-       } while (0)
+#define DECLARE_ARRAY(type, name)                                              \
+   unsigned name##_count, name##_sz;                                           \
+   type *name;
+
+#define array_insert(ctx, arr, ...)                                            \
+   do {                                                                        \
+      if (arr##_count == arr##_sz) {                                           \
+         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
+         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
+      }                                                                        \
+      arr[arr##_count++] = __VA_ARGS__;                                        \
+   } while (0)
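
As an illustrative sketch (not from the tree), the two macros are meant to be
used together: DECLARE_ARRAY() drops the <name>_count/<name>_sz bookkeeping
plus the pointer into a struct, and array_insert() grows the ralloc'd backing
store on demand.  The pass_state struct below is hypothetical:

   struct pass_state {
      DECLARE_ARRAY(struct ir3_instruction *, worklist);
   };

   static void
   push_work(struct pass_state *state, struct ir3_instruction *instr)
   {
      /* 'state' doubles as the ralloc context owning the array storage, so
       * it must itself come from ralloc (e.g. rzalloc(), which also
       * zero-initializes the _count/_sz fields):
       */
      array_insert(state, state->worklist, instr);
   }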
 
 struct ir3_instruction {
-       struct ir3_block *block;
-       opc_t opc;
-       enum {
-               /* (sy) flag is set on first instruction, and after sample
-                * instructions (probably just on RAW hazard).
-                */
-               IR3_INSTR_SY    = 0x001,
-               /* (ss) flag is set on first instruction, and first instruction
-                * to depend on the result of "long" instructions (RAW hazard):
-                *
-                *   rcp, rsq, log2, exp2, sin, cos, sqrt
-                *
-                * It seems to synchronize until all in-flight instructions are
-                * completed, for example:
-                *
-                *   rsq hr1.w, hr1.w
-                *   add.f hr2.z, (neg)hr2.z, hc0.y
-                *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
-                *   rsq hr2.x, hr2.x
-                *   (rpt1)nop
-                *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
-                *   nop
-                *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
-                *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
-                *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
-                *
-                * The last mul.f does not have (ss) set, presumably because the
-                * (ss) on the previous instruction does the job.
-                *
-                * The blob driver also seems to set it on WAR hazards, although
-                * not really clear if this is needed or just blob compiler being
-                * sloppy.  So far I haven't found a case where removing the (ss)
-                * causes problems for WAR hazard, but I could just be getting
-                * lucky:
-                *
-                *   rcp r1.y, r3.y
-                *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
-                *
-                */
-               IR3_INSTR_SS    = 0x002,
-               /* (jp) flag is set on jump targets:
-                */
-               IR3_INSTR_JP    = 0x004,
-               IR3_INSTR_UL    = 0x008,
-               IR3_INSTR_3D    = 0x010,
-               IR3_INSTR_A     = 0x020,
-               IR3_INSTR_O     = 0x040,
-               IR3_INSTR_P     = 0x080,
-               IR3_INSTR_S     = 0x100,
-               IR3_INSTR_S2EN  = 0x200,
-               IR3_INSTR_G     = 0x400,
-               IR3_INSTR_SAT   = 0x800,
-               /* (cat5/cat6) Bindless */
-               IR3_INSTR_B     = 0x1000,
-               /* (cat5/cat6) nonuniform */
-               IR3_INSTR_NONUNIF    = 0x02000,
-               /* (cat5-only) Get some parts of the encoding from a1.x */
-               IR3_INSTR_A1EN       = 0x04000,
-               /* meta-flags, for intermediate stages of IR, ie.
-                * before register assignment is done:
-                */
-               IR3_INSTR_MARK       = 0x08000,
-               IR3_INSTR_UNUSED     = 0x10000,
-       } flags;
-       uint8_t repeat;
-       uint8_t nop;
+   struct ir3_block *block;
+   opc_t opc;
+   enum {
+      /* (sy) flag is set on first instruction, and after sample
+       * instructions (probably just on RAW hazard).
+       */
+      IR3_INSTR_SY = 0x001,
+      /* (ss) flag is set on first instruction, and first instruction
+       * to depend on the result of "long" instructions (RAW hazard):
+       *
+       *   rcp, rsq, log2, exp2, sin, cos, sqrt
+       *
+       * It seems to synchronize until all in-flight instructions are
+       * completed, for example:
+       *
+       *   rsq hr1.w, hr1.w
+       *   add.f hr2.z, (neg)hr2.z, hc0.y
+       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
+       *   rsq hr2.x, hr2.x
+       *   (rpt1)nop
+       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
+       *   nop
+       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
+       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
+       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
+       *
+       * The last mul.f does not have (ss) set, presumably because the
+       * (ss) on the previous instruction does the job.
+       *
+       * The blob driver also seems to set it on WAR hazards, although
+       * not really clear if this is needed or just blob compiler being
+       * sloppy.  So far I haven't found a case where removing the (ss)
+       * causes problems for WAR hazard, but I could just be getting
+       * lucky:
+       *
+       *   rcp r1.y, r3.y
+       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
+       *
+       */
+      IR3_INSTR_SS = 0x002,
+      /* (jp) flag is set on jump targets:
+       */
+      IR3_INSTR_JP = 0x004,
+      IR3_INSTR_UL = 0x008,
+      IR3_INSTR_3D = 0x010,
+      IR3_INSTR_A = 0x020,
+      IR3_INSTR_O = 0x040,
+      IR3_INSTR_P = 0x080,
+      IR3_INSTR_S = 0x100,
+      IR3_INSTR_S2EN = 0x200,
+      IR3_INSTR_G = 0x400,
+      IR3_INSTR_SAT = 0x800,
+      /* (cat5/cat6) Bindless */
+      IR3_INSTR_B = 0x1000,
+      /* (cat5/cat6) nonuniform */
+      IR3_INSTR_NONUNIF = 0x02000,
+      /* (cat5-only) Get some parts of the encoding from a1.x */
+      IR3_INSTR_A1EN = 0x04000,
+      /* meta-flags, for intermediate stages of IR, ie.
+       * before register assignment is done:
+       */
+      IR3_INSTR_MARK = 0x08000,
+      IR3_INSTR_UNUSED = 0x10000,
+   } flags;
+   uint8_t repeat;
+   uint8_t nop;
 #ifdef DEBUG
-       unsigned srcs_max, dsts_max;
+   unsigned srcs_max, dsts_max;
 #endif
-       unsigned srcs_count, dsts_count;
-       struct ir3_register **dsts;
-       struct ir3_register **srcs;
-       union {
-               struct {
-                       char inv1, inv2;
-                       char comp1, comp2;
-                       int  immed;
-                       struct ir3_block *target;
-                       const char *target_label;
-                       brtype_t brtype;
-                       unsigned idx;  /* for brac.N */
-               } cat0;
-               struct {
-                       type_t src_type, dst_type;
-                       round_t round;
-               } cat1;
-               struct {
-                       enum {
-                               IR3_COND_LT = 0,
-                               IR3_COND_LE = 1,
-                               IR3_COND_GT = 2,
-                               IR3_COND_GE = 3,
-                               IR3_COND_EQ = 4,
-                               IR3_COND_NE = 5,
-                       } condition;
-               } cat2;
-               struct {
-                       unsigned samp, tex;
-                       unsigned tex_base : 3;
-                       type_t type;
-               } cat5;
-               struct {
-                       type_t type;
-                       /* TODO remove dst_offset and handle as a ir3_register
-                        * which might be IMMED, similar to how src_offset is
-                        * handled.
-                        */
-                       int dst_offset;
-                       int iim_val : 3;      /* for ldgb/stgb, # of components */
-                       unsigned d : 3;       /* for ldc, component offset */
-                       bool typed : 1;
-                       unsigned base : 3;
-               } cat6;
-               struct {
-                       unsigned w : 1;       /* write */
-                       unsigned r : 1;       /* read */
-                       unsigned l : 1;       /* local */
-                       unsigned g : 1;       /* global */
-               } cat7;
-               /* for meta-instructions, just used to hold extra data
-                * before instruction scheduling, etc
-                */
-               struct {
-                       int off;              /* component/offset */
-               } split;
-               struct {
-                       /* Per-source index back to the entry in the
-                        * ir3_shader_variant::outputs table.
-                        */
-                       unsigned *outidxs;
-               } end;
-               struct {
-                       /* used to temporarily hold reference to nir_phi_instr
-                        * until we resolve the phi srcs
-                        */
-                       void *nphi;
-               } phi;
-               struct {
-                       unsigned samp, tex;
-                       unsigned input_offset;
-                       unsigned samp_base : 3;
-                       unsigned tex_base : 3;
-               } prefetch;
-               struct {
-                       /* maps back to entry in ir3_shader_variant::inputs table: */
-                       int inidx;
-                       /* for sysvals, identifies the sysval type.  Mostly so we can
-                        * identify the special cases where a sysval should not be DCE'd
-                        * (currently, just pre-fs texture fetch)
-                        */
-                       gl_system_value sysval;
-               } input;
-       };
-
-       /* When we get to the RA stage, we need instruction's position/name: */
-       uint16_t ip;
-       uint16_t name;
-
-       /* used for per-pass extra instruction data.
-        *
-        * TODO we should remove the per-pass data like this and 'use_count'
-        * and do something similar to what RA does w/ ir3_ra_instr_data..
-        * ie. use the ir3_count_instructions pass, and then use instr->ip
-        * to index into a table of pass-private data.
-        */
-       void *data;
-
-       /**
-        * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
-        */
-       struct set *uses;
-
-       int use_count;      /* currently just updated/used by cp */
-
-       /* an instruction can reference at most one address register amongst
-        * it's src/dst registers.  Beyond that, you need to insert mov's.
-        *
-        * NOTE: do not write this directly, use ir3_instr_set_address()
-        */
-       struct ir3_register *address;
-
-       /* Tracking for additional dependent instructions.  Used to handle
-        * barriers, WAR hazards for arrays/SSBOs/etc.
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, deps);
-
-       /*
-        * From PoV of instruction scheduling, not execution (ie. ignores global/
-        * local distinction):
-        *                            shared  image  atomic  SSBO  everything
-        *   barrier()/            -   R/W     R/W    R/W     R/W       X
-        *     groupMemoryBarrier()
-        *     memoryBarrier()
-        *     (but only images declared coherent?)
-        *   memoryBarrierAtomic() -                  R/W
-        *   memoryBarrierBuffer() -                          R/W
-        *   memoryBarrierImage()  -           R/W
-        *   memoryBarrierShared() -   R/W
-        *
-        * TODO I think for SSBO/image/shared, in cases where we can determine
-        * which variable is accessed, we don't need to care about accesses to
-        * different variables (unless declared coherent??)
-        */
-       enum {
-               IR3_BARRIER_EVERYTHING = 1 << 0,
-               IR3_BARRIER_SHARED_R   = 1 << 1,
-               IR3_BARRIER_SHARED_W   = 1 << 2,
-               IR3_BARRIER_IMAGE_R    = 1 << 3,
-               IR3_BARRIER_IMAGE_W    = 1 << 4,
-               IR3_BARRIER_BUFFER_R   = 1 << 5,
-               IR3_BARRIER_BUFFER_W   = 1 << 6,
-               IR3_BARRIER_ARRAY_R    = 1 << 7,
-               IR3_BARRIER_ARRAY_W    = 1 << 8,
-               IR3_BARRIER_PRIVATE_R  = 1 << 9,
-               IR3_BARRIER_PRIVATE_W  = 1 << 10,
-       } barrier_class, barrier_conflict;
-
-       /* Entry in ir3_block's instruction list: */
-       struct list_head node;
-
-       uint32_t serialno;
-
-       // TODO only computerator/assembler:
-       int line;
+   unsigned srcs_count, dsts_count;
+   struct ir3_register **dsts;
+   struct ir3_register **srcs;
+   union {
+      struct {
+         char inv1, inv2;
+         char comp1, comp2;
+         int immed;
+         struct ir3_block *target;
+         const char *target_label;
+         brtype_t brtype;
+         unsigned idx; /* for brac.N */
+      } cat0;
+      struct {
+         type_t src_type, dst_type;
+         round_t round;
+      } cat1;
+      struct {
+         enum {
+            IR3_COND_LT = 0,
+            IR3_COND_LE = 1,
+            IR3_COND_GT = 2,
+            IR3_COND_GE = 3,
+            IR3_COND_EQ = 4,
+            IR3_COND_NE = 5,
+         } condition;
+      } cat2;
+      struct {
+         unsigned samp, tex;
+         unsigned tex_base : 3;
+         type_t type;
+      } cat5;
+      struct {
+         type_t type;
+         /* TODO remove dst_offset and handle as a ir3_register
+          * which might be IMMED, similar to how src_offset is
+          * handled.
+          */
+         int dst_offset;
+         int iim_val   : 3; /* for ldgb/stgb, # of components */
+         unsigned d    : 3; /* for ldc, component offset */
+         bool typed    : 1;
+         unsigned base : 3;
+      } cat6;
+      struct {
+         unsigned w : 1; /* write */
+         unsigned r : 1; /* read */
+         unsigned l : 1; /* local */
+         unsigned g : 1; /* global */
+      } cat7;
+      /* for meta-instructions, just used to hold extra data
+       * before instruction scheduling, etc
+       */
+      struct {
+         int off; /* component/offset */
+      } split;
+      struct {
+         /* Per-source index back to the entry in the
+          * ir3_shader_variant::outputs table.
+          */
+         unsigned *outidxs;
+      } end;
+      struct {
+         /* used to temporarily hold reference to nir_phi_instr
+          * until we resolve the phi srcs
+          */
+         void *nphi;
+      } phi;
+      struct {
+         unsigned samp, tex;
+         unsigned input_offset;
+         unsigned samp_base : 3;
+         unsigned tex_base  : 3;
+      } prefetch;
+      struct {
+         /* maps back to entry in ir3_shader_variant::inputs table: */
+         int inidx;
+         /* for sysvals, identifies the sysval type.  Mostly so we can
+          * identify the special cases where a sysval should not be DCE'd
+          * (currently, just pre-fs texture fetch)
+          */
+         gl_system_value sysval;
+      } input;
+   };
+
+   /* When we get to the RA stage, we need instruction's position/name: */
+   uint16_t ip;
+   uint16_t name;
+
+   /* used for per-pass extra instruction data.
+    *
+    * TODO we should remove the per-pass data like this and 'use_count'
+    * and do something similar to what RA does w/ ir3_ra_instr_data..
+    * ie. use the ir3_count_instructions pass, and then use instr->ip
+    * to index into a table of pass-private data.
+    */
+   void *data;
+
+   /**
+    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
+    */
+   struct set *uses;
+
+   int use_count; /* currently just updated/used by cp */
+
+   /* an instruction can reference at most one address register amongst
+    * its src/dst registers.  Beyond that, you need to insert mov's.
+    *
+    * NOTE: do not write this directly, use ir3_instr_set_address()
+    */
+   struct ir3_register *address;
+
+   /* Tracking for additional dependent instructions.  Used to handle
+    * barriers, WAR hazards for arrays/SSBOs/etc.
+    */
+   DECLARE_ARRAY(struct ir3_instruction *, deps);
+
+   /*
+    * From PoV of instruction scheduling, not execution (ie. ignores global/
+    * local distinction):
+    *                            shared  image  atomic  SSBO  everything
+    *   barrier()/            -   R/W     R/W    R/W     R/W       X
+    *     groupMemoryBarrier()
+    *     memoryBarrier()
+    *     (but only images declared coherent?)
+    *   memoryBarrierAtomic() -                  R/W
+    *   memoryBarrierBuffer() -                          R/W
+    *   memoryBarrierImage()  -           R/W
+    *   memoryBarrierShared() -   R/W
+    *
+    * TODO I think for SSBO/image/shared, in cases where we can determine
+    * which variable is accessed, we don't need to care about accesses to
+    * different variables (unless declared coherent??)
+    */
+   enum {
+      IR3_BARRIER_EVERYTHING = 1 << 0,
+      IR3_BARRIER_SHARED_R = 1 << 1,
+      IR3_BARRIER_SHARED_W = 1 << 2,
+      IR3_BARRIER_IMAGE_R = 1 << 3,
+      IR3_BARRIER_IMAGE_W = 1 << 4,
+      IR3_BARRIER_BUFFER_R = 1 << 5,
+      IR3_BARRIER_BUFFER_W = 1 << 6,
+      IR3_BARRIER_ARRAY_R = 1 << 7,
+      IR3_BARRIER_ARRAY_W = 1 << 8,
+      IR3_BARRIER_PRIVATE_R = 1 << 9,
+      IR3_BARRIER_PRIVATE_W = 1 << 10,
+   } barrier_class,
+      barrier_conflict;
+
+   /* Entry in ir3_block's instruction list: */
+   struct list_head node;
+
+   uint32_t serialno;
+
+   // TODO only computerator/assembler:
+   int line;
 };
 
 struct ir3 {
-       struct ir3_compiler *compiler;
-       gl_shader_stage type;
-
-       DECLARE_ARRAY(struct ir3_instruction *, inputs);
-
-       /* Track bary.f (and ldlv) instructions.. this is needed in
-        * scheduling to ensure that all varying fetches happen before
-        * any potential kill instructions.  The hw gets grumpy if all
-        * threads in a group are killed before the last bary.f gets
-        * a chance to signal end of input (ei).
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, baryfs);
-
-       /* Track all indirect instructions (read and write).  To avoid
-        * deadlock scenario where an address register gets scheduled,
-        * but other dependent src instructions cannot be scheduled due
-        * to dependency on a *different* address register value, the
-        * scheduler needs to ensure that all dependencies other than
-        * the instruction other than the address register are scheduled
-        * before the one that writes the address register.  Having a
-        * convenient list of instructions that reference some address
-        * register simplifies this.
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, a0_users);
-
-       /* same for a1.x: */
-       DECLARE_ARRAY(struct ir3_instruction *, a1_users);
-
-       /* and same for instructions that consume predicate register: */
-       DECLARE_ARRAY(struct ir3_instruction *, predicates);
-
-       /* Track texture sample instructions which need texture state
-        * patched in (for astc-srgb workaround):
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
-
-       /* List of blocks: */
-       struct list_head block_list;
-
-       /* List of ir3_array's: */
-       struct list_head array_list;
+   struct ir3_compiler *compiler;
+   gl_shader_stage type;
+
+   DECLARE_ARRAY(struct ir3_instruction *, inputs);
+
+   /* Track bary.f (and ldlv) instructions.. this is needed in
+    * scheduling to ensure that all varying fetches happen before
+    * any potential kill instructions.  The hw gets grumpy if all
+    * threads in a group are killed before the last bary.f gets
+    * a chance to signal end of input (ei).
+    */
+   DECLARE_ARRAY(struct ir3_instruction *, baryfs);
+
+   /* Track all indirect instructions (read and write).  To avoid a
+    * deadlock scenario where an address register gets scheduled,
+    * but other dependent src instructions cannot be scheduled due
+    * to dependency on a *different* address register value, the
+    * scheduler needs to ensure that all of an instruction's
+    * dependencies other than the address register are scheduled
+    * before the one that writes the address register.  Having a
+    * convenient list of instructions that reference some address
+    * register simplifies this.
+    */
+   DECLARE_ARRAY(struct ir3_instruction *, a0_users);
+
+   /* same for a1.x: */
+   DECLARE_ARRAY(struct ir3_instruction *, a1_users);
+
+   /* and same for instructions that consume predicate register: */
+   DECLARE_ARRAY(struct ir3_instruction *, predicates);
+
+   /* Track texture sample instructions which need texture state
+    * patched in (for astc-srgb workaround):
+    */
+   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
+
+   /* List of blocks: */
+   struct list_head block_list;
+
+   /* List of ir3_array's: */
+   struct list_head array_list;
 
 #ifdef DEBUG
-       unsigned block_count;
+   unsigned block_count;
 #endif
-       unsigned instr_count;
+   unsigned instr_count;
 };
 
 struct ir3_array {
-       struct list_head node;
-       unsigned length;
-       unsigned id;
+   struct list_head node;
+   unsigned length;
+   unsigned id;
 
-       struct nir_register *r;
+   struct nir_register *r;
 
-       /* To avoid array write's from getting DCE'd, keep track of the
-        * most recent write.  Any array access depends on the most
-        * recent write.  This way, nothing depends on writes after the
-        * last read.  But all the writes that happen before that have
-        * something depending on them
-        */
-       struct ir3_register *last_write;
+   /* To avoid array writes from getting DCE'd, keep track of the
+    * most recent write.  Any array access depends on the most
+    * recent write.  This way, nothing depends on writes after the
+    * last read.  But all the writes that happen before that have
+    * something depending on them
+    */
+   struct ir3_register *last_write;
 
-       /* extra stuff used in RA pass: */
-       unsigned base;      /* base vreg name */
-       unsigned reg;       /* base physical reg */
-       uint16_t start_ip, end_ip;
+   /* extra stuff used in RA pass: */
+   unsigned base; /* base vreg name */
+   unsigned reg;  /* base physical reg */
+   uint16_t start_ip, end_ip;
 
-       /* Indicates if half-precision */
-       bool half;
+   /* Indicates if half-precision */
+   bool half;
 
-       bool unused;
+   bool unused;
 };
 
-struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
+struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
 
 enum ir3_branch_type {
-       IR3_BRANCH_COND, /* condition */
-       IR3_BRANCH_ANY, /* subgroupAny(condition) */
-       IR3_BRANCH_ALL, /* subgroupAll(condition) */
-       IR3_BRANCH_GETONE, /* subgroupElect() */
+   IR3_BRANCH_COND,   /* condition */
+   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
+   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
+   IR3_BRANCH_GETONE, /* subgroupElect() */
 };
 
 struct ir3_block {
-       struct list_head node;
-       struct ir3 *shader;
+   struct list_head node;
+   struct ir3 *shader;
 
-       const struct nir_block *nblock;
+   const struct nir_block *nblock;
 
-       struct list_head instr_list;  /* list of ir3_instruction */
+   struct list_head instr_list; /* list of ir3_instruction */
 
-       /* The actual branch condition, if there are two successors */
-       enum ir3_branch_type brtype;
+   /* The actual branch condition, if there are two successors */
+   enum ir3_branch_type brtype;
 
-       /* each block has either one or two successors.. in case of two
-        * successors, 'condition' decides which one to follow.  A block preceding
-        * an if/else has two successors.
-        *
-        * In some cases the path that the machine actually takes through the
-        * program may not match the per-thread view of the CFG. In particular
-        * this is the case for if/else, where the machine jumps from the end of
-        * the if to the beginning of the else and switches active lanes. While
-        * most things only care about the per-thread view, we need to use the
-        * "physical" view when allocating shared registers. "successors" contains
-        * the per-thread successors, and "physical_successors" contains the
-        * physical successors which includes the fallthrough edge from the if to
-        * the else.
-        */
-       struct ir3_instruction *condition;
-       struct ir3_block *successors[2];
-       struct ir3_block *physical_successors[2];
+   /* each block has either one or two successors.. in case of two
+    * successors, 'condition' decides which one to follow.  A block preceding
+    * an if/else has two successors.
+    *
+    * In some cases the path that the machine actually takes through the
+    * program may not match the per-thread view of the CFG. In particular
+    * this is the case for if/else, where the machine jumps from the end of
+    * the if to the beginning of the else and switches active lanes. While
+    * most things only care about the per-thread view, we need to use the
+    * "physical" view when allocating shared registers. "successors" contains
+    * the per-thread successors, and "physical_successors" contains the
+    * physical successors which includes the fallthrough edge from the if to
+    * the else.
+    */
+   struct ir3_instruction *condition;
+   struct ir3_block *successors[2];
+   struct ir3_block *physical_successors[2];
 
-       DECLARE_ARRAY(struct ir3_block *, predecessors);
-       DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
+   DECLARE_ARRAY(struct ir3_block *, predecessors);
+   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
 
-       uint16_t start_ip, end_ip;
+   uint16_t start_ip, end_ip;
 
-       /* Track instructions which do not write a register but other-
-        * wise must not be discarded (such as kill, stg, etc)
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, keeps);
+   /* Track instructions which do not write a register but
+    * otherwise must not be discarded (such as kill, stg, etc.)
+    */
+   DECLARE_ARRAY(struct ir3_instruction *, keeps);
 
-       /* used for per-pass extra block data.  Mainly used right
-        * now in RA step to track livein/liveout.
-        */
-       void *data;
+   /* used for per-pass extra block data.  Mainly used right
+    * now in RA step to track livein/liveout.
+    */
+   void *data;
 
-       uint32_t index;
+   uint32_t index;
 
-       struct ir3_block *imm_dom;
-       DECLARE_ARRAY(struct ir3_block *, dom_children);
+   struct ir3_block *imm_dom;
+   DECLARE_ARRAY(struct ir3_block *, dom_children);
 
-       uint32_t dom_pre_index;
-       uint32_t dom_post_index;
+   uint32_t dom_pre_index;
+   uint32_t dom_post_index;
 
-       uint32_t loop_id;
+   uint32_t loop_id;
 
 #ifdef DEBUG
-       uint32_t serialno;
+   uint32_t serialno;
 #endif
 };
 
@@ -584,78 +585,86 @@ static inline uint32_t
 block_id(struct ir3_block *block)
 {
 #ifdef DEBUG
-       return block->serialno;
+   return block->serialno;
 #else
-       return (uint32_t)(unsigned long)block;
+   return (uint32_t)(unsigned long)block;
 #endif
 }
 
 static inline struct ir3_block *
 ir3_start_block(struct ir3 *ir)
 {
-       return list_first_entry(&ir->block_list, struct ir3_block, node);
+   return list_first_entry(&ir->block_list, struct ir3_block, node);
 }
 
 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
-void ir3_block_add_physical_predecessor(struct ir3_block *block, struct ir3_block *pred);
-void ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred);
-unsigned ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred);
+void ir3_block_add_physical_predecessor(struct ir3_block *block,
+                                        struct ir3_block *pred);
+void ir3_block_remove_predecessor(struct ir3_block *block,
+                                  struct ir3_block *pred);
+unsigned ir3_block_get_pred_index(struct ir3_block *block,
+                                  struct ir3_block *pred);
 
 void ir3_calc_dominance(struct ir3 *ir);
 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
 
 struct ir3_shader_variant;
 
-struct ir3 * ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v);
+struct ir3 *ir3_create(struct ir3_compiler *compiler,
+                       struct ir3_shader_variant *v);
 void ir3_destroy(struct ir3 *shader);
 
 void ir3_collect_info(struct ir3_shader_variant *v);
-void * ir3_alloc(struct ir3 *shader, int sz);
+void *ir3_alloc(struct ir3 *shader, int sz);
 
 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
-                                                                                unsigned reg_count, bool double_threadsize);
+                                         unsigned reg_count,
+                                         bool double_threadsize);
 
 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
-                                                                                  bool double_threadsize);
+                                           bool double_threadsize);
 
 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
-                                                                 unsigned regs_count);
+                                  unsigned regs_count);
 
-struct ir3_block * ir3_block_create(struct ir3 *shader);
+struct ir3_block *ir3_block_create(struct ir3 *shader);
 
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
-               opc_t opc, int ndst, int nsrc);
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
-void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
+struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
+                                         int ndst, int nsrc);
+struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
+void ir3_instr_add_dep(struct ir3_instruction *instr,
+                       struct ir3_instruction *dep);
 const char *ir3_instr_name(struct ir3_instruction *instr);
 
-struct ir3_register * ir3_src_create(struct ir3_instruction *instr,
-               int num, int flags);
-struct ir3_register * ir3_dst_create(struct ir3_instruction *instr,
-               int num, int flags);
-struct ir3_register * ir3_reg_clone(struct ir3 *shader,
-               struct ir3_register *reg);
+struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
+                                    int flags);
+struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
+                                    int flags);
+struct ir3_register *ir3_reg_clone(struct ir3 *shader,
+                                   struct ir3_register *reg);
 
-static inline void ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
+static inline void
+ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
 {
-       assert(!dst->tied && !src->tied);
-       dst->tied = src;
-       src->tied = dst;
+   assert(!dst->tied && !src->tied);
+   dst->tied = src;
+   src->tied = dst;
 }
 
 void ir3_reg_set_last_array(struct ir3_instruction *instr,
-                                                       struct ir3_register *reg,
-                                                       struct ir3_register *last_write);
+                            struct ir3_register *reg,
+                            struct ir3_register *last_write);
 
 void ir3_instr_set_address(struct ir3_instruction *instr,
-               struct ir3_instruction *addr);
+                           struct ir3_instruction *addr);
 
-static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
+static inline bool
+ir3_instr_check_mark(struct ir3_instruction *instr)
 {
-       if (instr->flags & IR3_INSTR_MARK)
-               return true;  /* already visited */
-       instr->flags |= IR3_INSTR_MARK;
-       return false;
+   if (instr->flags & IR3_INSTR_MARK)
+      return true; /* already visited */
+   instr->flags |= IR3_INSTR_MARK;
+   return false;
 }
 
 void ir3_block_clear_mark(struct ir3_block *block);
@@ -669,10 +678,10 @@ unsigned ir3_count_instructions_ra(struct ir3 *ir);
  */
 static inline void
 ir3_instr_move_before(struct ir3_instruction *instr,
-               struct ir3_instruction *after)
+                      struct ir3_instruction *after)
 {
-       list_delinit(&instr->node);
-       list_addtail(&instr->node, &after->node);
+   list_delinit(&instr->node);
+   list_addtail(&instr->node, &after->node);
 }
 
 /**
@@ -680,10 +689,10 @@ ir3_instr_move_before(struct ir3_instruction *instr,
  */
 static inline void
 ir3_instr_move_after(struct ir3_instruction *instr,
-               struct ir3_instruction *before)
+                     struct ir3_instruction *before)
 {
-       list_delinit(&instr->node);
-       list_add(&instr->node, &before->node);
+   list_delinit(&instr->node);
+   list_add(&instr->node, &before->node);
 }
 
 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
@@ -694,358 +703,400 @@ void ir3_fixup_src_type(struct ir3_instruction *instr);
 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
 
 #include "util/set.h"
-#define foreach_ssa_use(__use, __instr) \
-       for (struct ir3_instruction *__use = (void *)~0; \
-            __use && (__instr)->uses; __use = NULL) \
-               set_foreach ((__instr)->uses, __entry) \
-                       if ((__use = (void *)__entry->key))
+#define foreach_ssa_use(__use, __instr)                                        \
+   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
+        __use = NULL)                                                          \
+      set_foreach ((__instr)->uses, __entry)                                   \
+         if ((__use = (void *)__entry->key))
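A hypothetical usage sketch for the iterator above, assuming instr->uses was already populated by ir3_find_ssa_uses(); count_mov_uses() is an invented name for illustration.

static inline unsigned
count_mov_uses(struct ir3_instruction *instr)
{
   unsigned n = 0;
   foreach_ssa_use (use, instr) {
      if (use->opc == OPC_MOV)
         n++;
   }
   return n;
}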
 
-static inline uint32_t reg_num(const struct ir3_register *reg)
+static inline uint32_t
+reg_num(const struct ir3_register *reg)
 {
-       return reg->num >> 2;
+   return reg->num >> 2;
 }
 
-static inline uint32_t reg_comp(const struct ir3_register *reg)
+static inline uint32_t
+reg_comp(const struct ir3_register *reg)
 {
-       return reg->num & 0x3;
+   return reg->num & 0x3;
 }
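Illustrative only, assuming regid() is the matching packing helper used elsewhere in this tree: reg->num carries the register number in the upper bits and the x/y/z/w component in the low two bits, which is what the two helpers above decode.

static inline void
reg_num_example(void)
{
   struct ir3_register reg = {.num = regid(2, 2)}; /* r2.z */

   assert(reg_num(&reg) == 2);  /* register number */
   assert(reg_comp(&reg) == 2); /* component: 0=x, 1=y, 2=z, 3=w */
}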
 
-static inline bool is_flow(struct ir3_instruction *instr)
+static inline bool
+is_flow(struct ir3_instruction *instr)
 {
-       return (opc_cat(instr->opc) == 0);
+   return (opc_cat(instr->opc) == 0);
 }
 
-static inline bool is_kill_or_demote(struct ir3_instruction *instr)
+static inline bool
+is_kill_or_demote(struct ir3_instruction *instr)
 {
-       return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
+   return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
 }
 
-static inline bool is_nop(struct ir3_instruction *instr)
+static inline bool
+is_nop(struct ir3_instruction *instr)
 {
-       return instr->opc == OPC_NOP;
+   return instr->opc == OPC_NOP;
 }
 
-static inline bool is_same_type_reg(struct ir3_register *dst,
-               struct ir3_register *src)
+static inline bool
+is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
 {
-       unsigned dst_type = (dst->flags & IR3_REG_HALF);
-       unsigned src_type = (src->flags & IR3_REG_HALF);
+   unsigned dst_type = (dst->flags & IR3_REG_HALF);
+   unsigned src_type = (src->flags & IR3_REG_HALF);
 
-       /* Treat shared->normal copies as same-type, because they can generally be
-        * folded, but not normal->shared copies.
-        */
-       if (dst_type != src_type ||
-               ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
-               return false;
-       else
-               return true;
+   /* Treat shared->normal copies as same-type, because they can generally be
+    * folded, but not normal->shared copies.
+    */
+   if (dst_type != src_type ||
+       ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
+      return false;
+   else
+      return true;
 }
 
 /* Is it a non-transformative (ie. not type changing) mov?  This can
  * also include absneg.s/absneg.f, which for the most part can be
  * treated as a mov (single src argument).
  */
-static inline bool is_same_type_mov(struct ir3_instruction *instr)
-{
-       struct ir3_register *dst;
-
-       switch (instr->opc) {
-       case OPC_MOV:
-               if (instr->cat1.src_type != instr->cat1.dst_type)
-                       return false;
-               /* If the type of dest reg and src reg are different,
-                * it shouldn't be considered as same type mov
-                */
-               if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
-                       return false;
-               break;
-       case OPC_ABSNEG_F:
-       case OPC_ABSNEG_S:
-               if (instr->flags & IR3_INSTR_SAT)
-                       return false;
-               /* If the type of dest reg and src reg are different,
-                * it shouldn't be considered as same type mov
-                */
-               if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
-                       return false;
-               break;
-       default:
-               return false;
-       }
-
-       dst = instr->dsts[0];
-
-       /* mov's that write to a0 or p0.x are special: */
-       if (dst->num == regid(REG_P0, 0))
-               return false;
-       if (reg_num(dst) == REG_A0)
-               return false;
-
-       if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
-               return false;
-
-       return true;
+static inline bool
+is_same_type_mov(struct ir3_instruction *instr)
+{
+   struct ir3_register *dst;
+
+   switch (instr->opc) {
+   case OPC_MOV:
+      if (instr->cat1.src_type != instr->cat1.dst_type)
+         return false;
+      /* If the type of dest reg and src reg are different,
+       * it shouldn't be considered as same type mov
+       */
+      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
+         return false;
+      break;
+   case OPC_ABSNEG_F:
+   case OPC_ABSNEG_S:
+      if (instr->flags & IR3_INSTR_SAT)
+         return false;
+      /* If the type of dest reg and src reg are different,
+       * it shouldn't be considered as same type mov
+       */
+      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
+         return false;
+      break;
+   default:
+      return false;
+   }
+
+   dst = instr->dsts[0];
+
+   /* mov's that write to a0 or p0.x are special: */
+   if (dst->num == regid(REG_P0, 0))
+      return false;
+   if (reg_num(dst) == REG_A0)
+      return false;
+
+   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
+      return false;
+
+   return true;
 }
 
 /* A move from const, which changes size but not type, can also be
  * folded into dest instruction in some cases.
  */
-static inline bool is_const_mov(struct ir3_instruction *instr)
+static inline bool
+is_const_mov(struct ir3_instruction *instr)
 {
-       if (instr->opc != OPC_MOV)
-               return false;
+   if (instr->opc != OPC_MOV)
+      return false;
 
-       if (!(instr->srcs[0]->flags & IR3_REG_CONST))
-               return false;
+   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
+      return false;
 
-       type_t src_type = instr->cat1.src_type;
-       type_t dst_type = instr->cat1.dst_type;
+   type_t src_type = instr->cat1.src_type;
+   type_t dst_type = instr->cat1.dst_type;
 
-       return (type_float(src_type) && type_float(dst_type)) ||
-               (type_uint(src_type) && type_uint(dst_type)) ||
-               (type_sint(src_type) && type_sint(dst_type));
+   return (type_float(src_type) && type_float(dst_type)) ||
+          (type_uint(src_type) && type_uint(dst_type)) ||
+          (type_sint(src_type) && type_sint(dst_type));
 }
 
-static inline bool is_alu(struct ir3_instruction *instr)
+static inline bool
+is_alu(struct ir3_instruction *instr)
 {
-       return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
+   return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
 }
 
-static inline bool is_sfu(struct ir3_instruction *instr)
+static inline bool
+is_sfu(struct ir3_instruction *instr)
 {
-       return (opc_cat(instr->opc) == 4);
+   return (opc_cat(instr->opc) == 4);
 }
 
-static inline bool is_tex(struct ir3_instruction *instr)
+static inline bool
+is_tex(struct ir3_instruction *instr)
 {
-       return (opc_cat(instr->opc) == 5);
+   return (opc_cat(instr->opc) == 5);
 }
 
-static inline bool is_tex_or_prefetch(struct ir3_instruction *instr)
+static inline bool
+is_tex_or_prefetch(struct ir3_instruction *instr)
 {
-       return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
+   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
 }
 
-static inline bool is_mem(struct ir3_instruction *instr)
+static inline bool
+is_mem(struct ir3_instruction *instr)
 {
-       return (opc_cat(instr->opc) == 6);
+   return (opc_cat(instr->opc) == 6);
 }
 
-static inline bool is_barrier(struct ir3_instruction *instr)
+static inline bool
+is_barrier(struct ir3_instruction *instr)
 {
-       return (opc_cat(instr->opc) == 7);
+   return (opc_cat(instr->opc) == 7);
 }
 
 static inline bool
 is_half(struct ir3_instruction *instr)
 {
-       return !!(instr->dsts[0]->flags & IR3_REG_HALF);
+   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
 }
 
 static inline bool
 is_shared(struct ir3_instruction *instr)
 {
-       return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
+   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
 }
 
 static inline bool
 is_store(struct ir3_instruction *instr)
 {
-       /* these instructions, the "destination" register is
-        * actually a source, the address to store to.
-        */
-       switch (instr->opc) {
-       case OPC_STG:
-       case OPC_STG_A:
-       case OPC_STGB:
-       case OPC_STIB:
-       case OPC_STP:
-       case OPC_STL:
-       case OPC_STLW:
-       case OPC_L2G:
-       case OPC_G2L:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_load(struct ir3_instruction *instr)
-{
-       switch (instr->opc) {
-       case OPC_LDG:
-       case OPC_LDG_A:
-       case OPC_LDGB:
-       case OPC_LDIB:
-       case OPC_LDL:
-       case OPC_LDP:
-       case OPC_L2G:
-       case OPC_LDLW:
-       case OPC_LDC:
-       case OPC_LDLV:
-               /* probably some others too.. */
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_input(struct ir3_instruction *instr)
-{
-       /* in some cases, ldlv is used to fetch varying without
-        * interpolation.. fortunately inloc is the first src
-        * register in either case
-        */
-       switch (instr->opc) {
-       case OPC_LDLV:
-       case OPC_BARY_F:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_bool(struct ir3_instruction *instr)
-{
-       switch (instr->opc) {
-       case OPC_CMPS_F:
-       case OPC_CMPS_S:
-       case OPC_CMPS_U:
-               return true;
-       default:
-               return false;
-       }
+   /* these instructions, the "destination" register is
+    * actually a source, the address to store to.
+    */
+   switch (instr->opc) {
+   case OPC_STG:
+   case OPC_STG_A:
+   case OPC_STGB:
+   case OPC_STIB:
+   case OPC_STP:
+   case OPC_STL:
+   case OPC_STLW:
+   case OPC_L2G:
+   case OPC_G2L:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static inline bool
+is_load(struct ir3_instruction *instr)
+{
+   switch (instr->opc) {
+   case OPC_LDG:
+   case OPC_LDG_A:
+   case OPC_LDGB:
+   case OPC_LDIB:
+   case OPC_LDL:
+   case OPC_LDP:
+   case OPC_L2G:
+   case OPC_LDLW:
+   case OPC_LDC:
+   case OPC_LDLV:
+      /* probably some others too.. */
+      return true;
+   default:
+      return false;
+   }
+}
+
+static inline bool
+is_input(struct ir3_instruction *instr)
+{
+   /* in some cases, ldlv is used to fetch varying without
+    * interpolation.. fortunately inloc is the first src
+    * register in either case
+    */
+   switch (instr->opc) {
+   case OPC_LDLV:
+   case OPC_BARY_F:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static inline bool
+is_bool(struct ir3_instruction *instr)
+{
+   switch (instr->opc) {
+   case OPC_CMPS_F:
+   case OPC_CMPS_S:
+   case OPC_CMPS_U:
+      return true;
+   default:
+      return false;
+   }
 }
 
 static inline opc_t
 cat3_half_opc(opc_t opc)
 {
-       switch (opc) {
-       case OPC_MAD_F32: return OPC_MAD_F16;
-       case OPC_SEL_B32: return OPC_SEL_B16;
-       case OPC_SEL_S32: return OPC_SEL_S16;
-       case OPC_SEL_F32: return OPC_SEL_F16;
-       case OPC_SAD_S32: return OPC_SAD_S16;
-       default:          return opc;
-       }
+   switch (opc) {
+   case OPC_MAD_F32:
+      return OPC_MAD_F16;
+   case OPC_SEL_B32:
+      return OPC_SEL_B16;
+   case OPC_SEL_S32:
+      return OPC_SEL_S16;
+   case OPC_SEL_F32:
+      return OPC_SEL_F16;
+   case OPC_SAD_S32:
+      return OPC_SAD_S16;
+   default:
+      return opc;
+   }
 }
 
 static inline opc_t
 cat3_full_opc(opc_t opc)
 {
-       switch (opc) {
-       case OPC_MAD_F16: return OPC_MAD_F32;
-       case OPC_SEL_B16: return OPC_SEL_B32;
-       case OPC_SEL_S16: return OPC_SEL_S32;
-       case OPC_SEL_F16: return OPC_SEL_F32;
-       case OPC_SAD_S16: return OPC_SAD_S32;
-       default:          return opc;
-       }
+   switch (opc) {
+   case OPC_MAD_F16:
+      return OPC_MAD_F32;
+   case OPC_SEL_B16:
+      return OPC_SEL_B32;
+   case OPC_SEL_S16:
+      return OPC_SEL_S32;
+   case OPC_SEL_F16:
+      return OPC_SEL_F32;
+   case OPC_SAD_S16:
+      return OPC_SAD_S32;
+   default:
+      return opc;
+   }
 }
 
 static inline opc_t
 cat4_half_opc(opc_t opc)
 {
-       switch (opc) {
-       case OPC_RSQ:  return OPC_HRSQ;
-       case OPC_LOG2: return OPC_HLOG2;
-       case OPC_EXP2: return OPC_HEXP2;
-       default:       return opc;
-       }
+   switch (opc) {
+   case OPC_RSQ:
+      return OPC_HRSQ;
+   case OPC_LOG2:
+      return OPC_HLOG2;
+   case OPC_EXP2:
+      return OPC_HEXP2;
+   default:
+      return opc;
+   }
 }
 
 static inline opc_t
 cat4_full_opc(opc_t opc)
 {
-       switch (opc) {
-       case OPC_HRSQ:  return OPC_RSQ;
-       case OPC_HLOG2: return OPC_LOG2;
-       case OPC_HEXP2: return OPC_EXP2;
-       default:        return opc;
-       }
+   switch (opc) {
+   case OPC_HRSQ:
+      return OPC_RSQ;
+   case OPC_HLOG2:
+      return OPC_LOG2;
+   case OPC_HEXP2:
+      return OPC_EXP2;
+   default:
+      return opc;
+   }
 }
 
-static inline bool is_meta(struct ir3_instruction *instr)
+static inline bool
+is_meta(struct ir3_instruction *instr)
 {
-       return (opc_cat(instr->opc) == -1);
+   return (opc_cat(instr->opc) == -1);
 }
 
-static inline unsigned reg_elems(const struct ir3_register *reg)
+static inline unsigned
+reg_elems(const struct ir3_register *reg)
 {
-       if (reg->flags & IR3_REG_ARRAY)
-               return reg->size;
-       else
-               return util_last_bit(reg->wrmask);
+   if (reg->flags & IR3_REG_ARRAY)
+      return reg->size;
+   else
+      return util_last_bit(reg->wrmask);
 }
 
 static inline unsigned
 reg_elem_size(const struct ir3_register *reg)
 {
-       return (reg->flags & IR3_REG_HALF) ? 1 : 2;
+   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
 }
 
 static inline unsigned
 reg_size(const struct ir3_register *reg)
 {
-       return reg_elems(reg) * reg_elem_size(reg);
+   return reg_elems(reg) * reg_elem_size(reg);
 }
 
-static inline unsigned dest_regs(struct ir3_instruction *instr)
+static inline unsigned
+dest_regs(struct ir3_instruction *instr)
 {
-       if (instr->dsts_count == 0)
-               return 0;
+   if (instr->dsts_count == 0)
+      return 0;
 
-       debug_assert(instr->dsts_count == 1);
-       return util_last_bit(instr->dsts[0]->wrmask);
+   debug_assert(instr->dsts_count == 1);
+   return util_last_bit(instr->dsts[0]->wrmask);
 }
 
 /* is dst a normal temp register: */
-static inline bool is_dest_gpr(struct ir3_register *dst)
+static inline bool
+is_dest_gpr(struct ir3_register *dst)
 {
-       if (dst->wrmask == 0)
-               return false;
-       if ((reg_num(dst) == REG_A0) ||
-                       (dst->num == regid(REG_P0, 0)))
-               return false;
-       return true;
+   if (dst->wrmask == 0)
+      return false;
+   if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
+      return false;
+   return true;
 }
 
 static inline bool
 writes_gpr(struct ir3_instruction *instr)
 {
-       if (dest_regs(instr) == 0)
-               return false;
-       return is_dest_gpr(instr->dsts[0]);
+   if (dest_regs(instr) == 0)
+      return false;
+   return is_dest_gpr(instr->dsts[0]);
 }
 
-static inline bool writes_addr0(struct ir3_instruction *instr)
+static inline bool
+writes_addr0(struct ir3_instruction *instr)
 {
-       /* Note: only the first dest can write to a0.x */
-       if (instr->dsts_count > 0) {
-               struct ir3_register *dst = instr->dsts[0];
-               return dst->num == regid(REG_A0, 0);
-       }
-       return false;
+   /* Note: only the first dest can write to a0.x */
+   if (instr->dsts_count > 0) {
+      struct ir3_register *dst = instr->dsts[0];
+      return dst->num == regid(REG_A0, 0);
+   }
+   return false;
 }
 
-static inline bool writes_addr1(struct ir3_instruction *instr)
+static inline bool
+writes_addr1(struct ir3_instruction *instr)
 {
-       /* Note: only the first dest can write to a1.x */
-       if (instr->dsts_count > 0) {
-               struct ir3_register *dst = instr->dsts[0];
-               return dst->num == regid(REG_A0, 1);
-       }
-       return false;
+   /* Note: only the first dest can write to a1.x */
+   if (instr->dsts_count > 0) {
+      struct ir3_register *dst = instr->dsts[0];
+      return dst->num == regid(REG_A0, 1);
+   }
+   return false;
 }
 
-static inline bool writes_pred(struct ir3_instruction *instr)
+static inline bool
+writes_pred(struct ir3_instruction *instr)
 {
-       /* Note: only the first dest can write to p0.x */
-       if (instr->dsts_count > 0) {
-               struct ir3_register *dst = instr->dsts[0];
-               return reg_num(dst) == REG_P0;
-       }
-       return false;
+   /* Note: only the first dest can write to p0.x */
+   if (instr->dsts_count > 0) {
+      struct ir3_register *dst = instr->dsts[0];
+      return reg_num(dst) == REG_P0;
+   }
+   return false;
 }
 
 /* Is it something other than a normal register. Shared regs, p0, and a0/a1
@@ -1053,201 +1104,215 @@ static inline bool writes_pred(struct ir3_instruction *instr)
  * size and never alias normal registers, even though a naive calculation
  * would sometimes make it seem like e.g. r30.z aliases a0.x.
  */
-static inline bool is_reg_special(const struct ir3_register *reg)
+static inline bool
+is_reg_special(const struct ir3_register *reg)
 {
-       return (reg->flags & IR3_REG_SHARED) ||
-               (reg_num(reg) == REG_A0) || (reg_num(reg) == REG_P0);
+   return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
+          (reg_num(reg) == REG_P0);
 }
 
 /* returns defining instruction for reg */
 /* TODO better name */
-static inline struct ir3_instruction *ssa(struct ir3_register *reg)
+static inline struct ir3_instruction *
+ssa(struct ir3_register *reg)
 {
-       if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
-               return reg->def->instr;
-       return NULL;
+   if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
+      return reg->def->instr;
+   return NULL;
 }
 
-static inline bool conflicts(struct ir3_register *a,
-               struct ir3_register *b)
+static inline bool
+conflicts(struct ir3_register *a, struct ir3_register *b)
 {
-       return (a && b) && (a->def != b->def);
+   return (a && b) && (a->def != b->def);
 }
 
-static inline bool reg_gpr(struct ir3_register *r)
+static inline bool
+reg_gpr(struct ir3_register *r)
 {
-       if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-               return false;
-       if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
-               return false;
-       return true;
+   if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+      return false;
+   if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
+      return false;
+   return true;
 }
 
-static inline type_t half_type(type_t type)
-{
-       switch (type) {
-       case TYPE_F32: return TYPE_F16;
-       case TYPE_U32: return TYPE_U16;
-       case TYPE_S32: return TYPE_S16;
-       case TYPE_F16:
-       case TYPE_U16:
-       case TYPE_S16:
-               return type;
-       default:
-               assert(0);
-               return ~0;
-       }
+static inline type_t
+half_type(type_t type)
+{
+   switch (type) {
+   case TYPE_F32:
+      return TYPE_F16;
+   case TYPE_U32:
+      return TYPE_U16;
+   case TYPE_S32:
+      return TYPE_S16;
+   case TYPE_F16:
+   case TYPE_U16:
+   case TYPE_S16:
+      return type;
+   default:
+      assert(0);
+      return ~0;
+   }
 }
 
-static inline type_t full_type(type_t type)
-{
-       switch (type) {
-       case TYPE_F16: return TYPE_F32;
-       case TYPE_U16: return TYPE_U32;
-       case TYPE_S16: return TYPE_S32;
-       case TYPE_F32:
-       case TYPE_U32:
-       case TYPE_S32:
-               return type;
-       default:
-               assert(0);
-               return ~0;
-       }
+static inline type_t
+full_type(type_t type)
+{
+   switch (type) {
+   case TYPE_F16:
+      return TYPE_F32;
+   case TYPE_U16:
+      return TYPE_U32;
+   case TYPE_S16:
+      return TYPE_S32;
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      return type;
+   default:
+      assert(0);
+      return ~0;
+   }
 }
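Illustrative only: the two helpers above undo each other across the 16/32-bit pairs and leave a type that is already the requested width unchanged.

static inline void
type_width_example(void)
{
   assert(half_type(TYPE_F32) == TYPE_F16);
   assert(full_type(TYPE_S16) == TYPE_S32);
   assert(half_type(TYPE_U16) == TYPE_U16); /* already half: unchanged */
}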
 
 /* some cat2 instructions (ie. those which are not float) can embed an
  * immediate:
  */
-static inline bool ir3_cat2_int(opc_t opc)
-{
-       switch (opc) {
-       case OPC_ADD_U:
-       case OPC_ADD_S:
-       case OPC_SUB_U:
-       case OPC_SUB_S:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-       case OPC_MIN_U:
-       case OPC_MIN_S:
-       case OPC_MAX_U:
-       case OPC_MAX_S:
-       case OPC_CMPV_U:
-       case OPC_CMPV_S:
-       case OPC_MUL_U24:
-       case OPC_MUL_S24:
-       case OPC_MULL_U:
-       case OPC_CLZ_S:
-       case OPC_ABSNEG_S:
-       case OPC_AND_B:
-       case OPC_OR_B:
-       case OPC_NOT_B:
-       case OPC_XOR_B:
-       case OPC_BFREV_B:
-       case OPC_CLZ_B:
-       case OPC_SHL_B:
-       case OPC_SHR_B:
-       case OPC_ASHR_B:
-       case OPC_MGEN_B:
-       case OPC_GETBIT_B:
-       case OPC_CBITS_B:
-       case OPC_BARY_F:
-               return true;
-
-       default:
-               return false;
-       }
+static inline bool
+ir3_cat2_int(opc_t opc)
+{
+   switch (opc) {
+   case OPC_ADD_U:
+   case OPC_ADD_S:
+   case OPC_SUB_U:
+   case OPC_SUB_S:
+   case OPC_CMPS_U:
+   case OPC_CMPS_S:
+   case OPC_MIN_U:
+   case OPC_MIN_S:
+   case OPC_MAX_U:
+   case OPC_MAX_S:
+   case OPC_CMPV_U:
+   case OPC_CMPV_S:
+   case OPC_MUL_U24:
+   case OPC_MUL_S24:
+   case OPC_MULL_U:
+   case OPC_CLZ_S:
+   case OPC_ABSNEG_S:
+   case OPC_AND_B:
+   case OPC_OR_B:
+   case OPC_NOT_B:
+   case OPC_XOR_B:
+   case OPC_BFREV_B:
+   case OPC_CLZ_B:
+   case OPC_SHL_B:
+   case OPC_SHR_B:
+   case OPC_ASHR_B:
+   case OPC_MGEN_B:
+   case OPC_GETBIT_B:
+   case OPC_CBITS_B:
+   case OPC_BARY_F:
+      return true;
+
+   default:
+      return false;
+   }
 }
 
 /* map cat2 instruction to valid abs/neg flags: */
-static inline unsigned ir3_cat2_absneg(opc_t opc)
-{
-       switch (opc) {
-       case OPC_ADD_F:
-       case OPC_MIN_F:
-       case OPC_MAX_F:
-       case OPC_MUL_F:
-       case OPC_SIGN_F:
-       case OPC_CMPS_F:
-       case OPC_ABSNEG_F:
-       case OPC_CMPV_F:
-       case OPC_FLOOR_F:
-       case OPC_CEIL_F:
-       case OPC_RNDNE_F:
-       case OPC_RNDAZ_F:
-       case OPC_TRUNC_F:
-       case OPC_BARY_F:
-               return IR3_REG_FABS | IR3_REG_FNEG;
-
-       case OPC_ADD_U:
-       case OPC_ADD_S:
-       case OPC_SUB_U:
-       case OPC_SUB_S:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-       case OPC_MIN_U:
-       case OPC_MIN_S:
-       case OPC_MAX_U:
-       case OPC_MAX_S:
-       case OPC_CMPV_U:
-       case OPC_CMPV_S:
-       case OPC_MUL_U24:
-       case OPC_MUL_S24:
-       case OPC_MULL_U:
-       case OPC_CLZ_S:
-               return 0;
-
-       case OPC_ABSNEG_S:
-               return IR3_REG_SABS | IR3_REG_SNEG;
-
-       case OPC_AND_B:
-       case OPC_OR_B:
-       case OPC_NOT_B:
-       case OPC_XOR_B:
-       case OPC_BFREV_B:
-       case OPC_CLZ_B:
-       case OPC_SHL_B:
-       case OPC_SHR_B:
-       case OPC_ASHR_B:
-       case OPC_MGEN_B:
-       case OPC_GETBIT_B:
-       case OPC_CBITS_B:
-               return IR3_REG_BNOT;
-
-       default:
-               return 0;
-       }
+static inline unsigned
+ir3_cat2_absneg(opc_t opc)
+{
+   switch (opc) {
+   case OPC_ADD_F:
+   case OPC_MIN_F:
+   case OPC_MAX_F:
+   case OPC_MUL_F:
+   case OPC_SIGN_F:
+   case OPC_CMPS_F:
+   case OPC_ABSNEG_F:
+   case OPC_CMPV_F:
+   case OPC_FLOOR_F:
+   case OPC_CEIL_F:
+   case OPC_RNDNE_F:
+   case OPC_RNDAZ_F:
+   case OPC_TRUNC_F:
+   case OPC_BARY_F:
+      return IR3_REG_FABS | IR3_REG_FNEG;
+
+   case OPC_ADD_U:
+   case OPC_ADD_S:
+   case OPC_SUB_U:
+   case OPC_SUB_S:
+   case OPC_CMPS_U:
+   case OPC_CMPS_S:
+   case OPC_MIN_U:
+   case OPC_MIN_S:
+   case OPC_MAX_U:
+   case OPC_MAX_S:
+   case OPC_CMPV_U:
+   case OPC_CMPV_S:
+   case OPC_MUL_U24:
+   case OPC_MUL_S24:
+   case OPC_MULL_U:
+   case OPC_CLZ_S:
+      return 0;
+
+   case OPC_ABSNEG_S:
+      return IR3_REG_SABS | IR3_REG_SNEG;
+
+   case OPC_AND_B:
+   case OPC_OR_B:
+   case OPC_NOT_B:
+   case OPC_XOR_B:
+   case OPC_BFREV_B:
+   case OPC_CLZ_B:
+   case OPC_SHL_B:
+   case OPC_SHR_B:
+   case OPC_ASHR_B:
+   case OPC_MGEN_B:
+   case OPC_GETBIT_B:
+   case OPC_CBITS_B:
+      return IR3_REG_BNOT;
+
+   default:
+      return 0;
+   }
 }
 
 /* map cat3 instructions to valid abs/neg flags: */
-static inline unsigned ir3_cat3_absneg(opc_t opc)
+static inline unsigned
+ir3_cat3_absneg(opc_t opc)
 {
-       switch (opc) {
-       case OPC_MAD_F16:
-       case OPC_MAD_F32:
-       case OPC_SEL_F16:
-       case OPC_SEL_F32:
-               return IR3_REG_FNEG;
+   switch (opc) {
+   case OPC_MAD_F16:
+   case OPC_MAD_F32:
+   case OPC_SEL_F16:
+   case OPC_SEL_F32:
+      return IR3_REG_FNEG;
 
-       case OPC_MAD_U16:
-       case OPC_MADSH_U16:
-       case OPC_MAD_S16:
-       case OPC_MADSH_M16:
-       case OPC_MAD_U24:
-       case OPC_MAD_S24:
-       case OPC_SEL_S16:
-       case OPC_SEL_S32:
-       case OPC_SAD_S16:
-       case OPC_SAD_S32:
-               /* neg *may* work on 3rd src.. */
+   case OPC_MAD_U16:
+   case OPC_MADSH_U16:
+   case OPC_MAD_S16:
+   case OPC_MADSH_M16:
+   case OPC_MAD_U24:
+   case OPC_MAD_S24:
+   case OPC_SEL_S16:
+   case OPC_SEL_S32:
+   case OPC_SAD_S16:
+   case OPC_SAD_S32:
+      /* neg *may* work on 3rd src.. */
 
-       case OPC_SEL_B16:
-       case OPC_SEL_B32:
+   case OPC_SEL_B16:
+   case OPC_SEL_B32:
 
-       case OPC_SHLG_B16:
+   case OPC_SHLG_B16:
 
-       default:
-               return 0;
-       }
+   default:
+      return 0;
+   }
 }
 
 /* Return the type (float, int, or uint) the op uses when converting from the
@@ -1260,55 +1325,55 @@ static inline unsigned ir3_cat3_absneg(opc_t opc)
 static inline type_t
 ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
 {
-       *can_fold = true;
-       switch (instr->opc) {
-       case OPC_ADD_F:
-       case OPC_MUL_F:
-       case OPC_BARY_F:
-       case OPC_MAD_F32:
-       case OPC_MAD_F16:
-               return TYPE_F32;
-
-       case OPC_ADD_U:
-       case OPC_SUB_U:
-       case OPC_MIN_U:
-       case OPC_MAX_U:
-       case OPC_AND_B:
-       case OPC_OR_B:
-       case OPC_NOT_B:
-       case OPC_XOR_B:
-       case OPC_MUL_U24:
-       case OPC_MULL_U:
-       case OPC_SHL_B:
-       case OPC_SHR_B:
-       case OPC_ASHR_B:
-       case OPC_MAD_U24:
-       /* Comparison ops zero-extend/truncate their results, so consider them as
-        * unsigned here.
-        */
-       case OPC_CMPS_F:
-       case OPC_CMPV_F:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-               return TYPE_U32;
-
-       case OPC_ADD_S:
-       case OPC_SUB_S:
-       case OPC_MIN_S:
-       case OPC_MAX_S:
-       case OPC_ABSNEG_S:
-       case OPC_MUL_S24:
-       case OPC_MAD_S24:
-               return TYPE_S32;
-
-       /* We assume that any move->move folding that could be done was done by
-        * NIR.
-        */
-       case OPC_MOV:
-       default:
-               *can_fold = false;
-               return TYPE_U32;
-       }
+   *can_fold = true;
+   switch (instr->opc) {
+   case OPC_ADD_F:
+   case OPC_MUL_F:
+   case OPC_BARY_F:
+   case OPC_MAD_F32:
+   case OPC_MAD_F16:
+      return TYPE_F32;
+
+   case OPC_ADD_U:
+   case OPC_SUB_U:
+   case OPC_MIN_U:
+   case OPC_MAX_U:
+   case OPC_AND_B:
+   case OPC_OR_B:
+   case OPC_NOT_B:
+   case OPC_XOR_B:
+   case OPC_MUL_U24:
+   case OPC_MULL_U:
+   case OPC_SHL_B:
+   case OPC_SHR_B:
+   case OPC_ASHR_B:
+   case OPC_MAD_U24:
+   /* Comparison ops zero-extend/truncate their results, so consider them as
+    * unsigned here.
+    */
+   case OPC_CMPS_F:
+   case OPC_CMPV_F:
+   case OPC_CMPS_U:
+   case OPC_CMPS_S:
+      return TYPE_U32;
+
+   case OPC_ADD_S:
+   case OPC_SUB_S:
+   case OPC_MIN_S:
+   case OPC_MAX_S:
+   case OPC_ABSNEG_S:
+   case OPC_MUL_S24:
+   case OPC_MAD_S24:
+      return TYPE_S32;
+
+   /* We assume that any move->move folding that could be done was done by
+    * NIR.
+    */
+   case OPC_MOV:
+   default:
+      *can_fold = false;
+      return TYPE_U32;
+   }
 }
 
 /* Return the src and dst types for the conversion which is already folded
@@ -1319,39 +1384,39 @@ ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
 static inline type_t
 ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
 {
-       switch (instr->opc) {
-       case OPC_CMPS_F:
-       case OPC_CMPV_F:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-               /* Comparisons only return 0/1 and the size of the comparison sources
-                * is irrelevant, never consider them as having an output conversion
-                * by returning a type with the dest size here:
-                */
-               return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type) :
-                       full_type(base_type);
-
-       case OPC_BARY_F:
-               /* bary.f doesn't have an explicit source, but we can assume here that
-                * the varying data it reads is in fp32.
-                *
-                * This may be fp16 on older gen's depending on some register
-                * settings, but it's probably not worth plumbing that through for a
-                * small improvement that NIR would hopefully handle for us anyway.
-                */
-               return TYPE_F32;
-
-       default:
-               return (instr->dsts[1]->flags & IR3_REG_HALF) ? half_type(base_type) :
-                       full_type(base_type);
-       }
+   switch (instr->opc) {
+   case OPC_CMPS_F:
+   case OPC_CMPV_F:
+   case OPC_CMPS_U:
+   case OPC_CMPS_S:
+      /* Comparisons only return 0/1 and the size of the comparison sources
+       * is irrelevant, never consider them as having an output conversion
+       * by returning a type with the dest size here:
+       */
+      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
+                                                    : full_type(base_type);
+
+   case OPC_BARY_F:
+      /* bary.f doesn't have an explicit source, but we can assume here that
+       * the varying data it reads is in fp32.
+       *
+       * This may be fp16 on older gen's depending on some register
+       * settings, but it's probably not worth plumbing that through for a
+       * small improvement that NIR would hopefully handle for us anyway.
+       */
+      return TYPE_F32;
+
+   default:
+      return (instr->dsts[1]->flags & IR3_REG_HALF) ? half_type(base_type)
+                                                    : full_type(base_type);
+   }
 }
 
 static inline type_t
 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
 {
-       return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type) :
-               full_type(base_type);
+   return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
+                                                 : full_type(base_type);
 }
 
 /* Some instructions have signed/unsigned variants which are identical except
@@ -1362,129 +1427,137 @@ ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
 static inline opc_t
 ir3_try_swap_signedness(opc_t opc, bool *can_swap)
 {
-       switch (opc) {
-#define PAIR(u, s)             \
-       case OPC_##u:           \
-               return OPC_##s; \
-       case OPC_##s:           \
-               return OPC_##u;
-       PAIR(ADD_U, ADD_S)
-       PAIR(SUB_U, SUB_S)
-       /* Note: these are only identical when the sources are half, but that's
-        * the only case we call this function for anyway.
-        */
-       PAIR(MUL_U24, MUL_S24)
-
-       default:
-               *can_swap = false;
-               return opc;
-       }
+   switch (opc) {
+#define PAIR(u, s)                                                             \
+   case OPC_##u:                                                               \
+      return OPC_##s;                                                          \
+   case OPC_##s:                                                               \
+      return OPC_##u;
+      PAIR(ADD_U, ADD_S)
+      PAIR(SUB_U, SUB_S)
+      /* Note: these are only identical when the sources are half, but that's
+       * the only case we call this function for anyway.
+       */
+      PAIR(MUL_U24, MUL_S24)
+
+   default:
+      *can_swap = false;
+      return opc;
+   }
 }
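A hedged usage sketch, not taken from the tree: the helper above only ever clears *can_swap, so callers are expected to initialize it to true.

static inline opc_t
swap_to_other_signedness(opc_t opc)
{
   bool can_swap = true; /* the helper only ever clears this flag */
   opc_t swapped = ir3_try_swap_signedness(opc, &can_swap);

   /* when can_swap was cleared, swapped == opc, so either return is fine: */
   return can_swap ? swapped : opc;
}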
 
 #define MASK(n) ((1 << (n)) - 1)
 
 /* iterator for an instructions's sources (reg), also returns src #: */
-#define foreach_src_n(__srcreg, __n, __instr) \
-       if ((__instr)->srcs_count) \
-               for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
-                       for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt; __n++) \
-                               if ((__srcreg = (__instr)->srcs[__n]))
+#define foreach_src_n(__srcreg, __n, __instr)                                  \
+   if ((__instr)->srcs_count)                                                  \
+      for (struct ir3_register *__srcreg = (void *)~0; __srcreg;               \
+           __srcreg = NULL)                                                    \
+         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
+              __n++)                                                           \
+            if ((__srcreg = (__instr)->srcs[__n]))
 
 /* iterator for an instructions's sources (reg): */
-#define foreach_src(__srcreg, __instr) \
-       foreach_src_n(__srcreg, __i, __instr)
+#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)
 
 /* iterator for an instructions's destinations (reg), also returns dst #: */
-#define foreach_dst_n(__dstreg, __n, __instr) \
-       if ((__instr)->dsts_count) \
-               for (struct ir3_register *__dstreg = (void *)~0; __dstreg; __dstreg = NULL) \
-                       for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt; __n++) \
-                               if ((__dstreg = (__instr)->dsts[__n]))
+#define foreach_dst_n(__dstreg, __n, __instr)                                  \
+   if ((__instr)->dsts_count)                                                  \
+      for (struct ir3_register *__dstreg = (void *)~0; __dstreg;               \
+           __dstreg = NULL)                                                    \
+         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
+              __n++)                                                           \
+            if ((__dstreg = (__instr)->dsts[__n]))
 
 /* iterator for an instructions's destinations (reg): */
-#define foreach_dst(__dstreg, __instr) \
-       foreach_dst_n(__dstreg, __i, __instr)
+#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
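A hypothetical usage sketch for the source iterator above; count_immed_srcs() is invented for illustration.

static inline unsigned
count_immed_srcs(struct ir3_instruction *instr)
{
   unsigned n = 0;
   foreach_src (src, instr) {
      if (src->flags & IR3_REG_IMMED)
         n++;
   }
   return n;
}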
 
-static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
+static inline unsigned
+__ssa_src_cnt(struct ir3_instruction *instr)
 {
-       return instr->srcs_count + instr->deps_count;
+   return instr->srcs_count + instr->deps_count;
 }
 
-static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
+static inline bool
+__is_false_dep(struct ir3_instruction *instr, unsigned n)
 {
-       if (n >= instr->srcs_count)
-               return true;
-       return false;
+   if (n >= instr->srcs_count)
+      return true;
+   return false;
 }
 
 static inline struct ir3_instruction **
 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
 {
-       if (__is_false_dep(instr, n))
-               return &instr->deps[n - instr->srcs_count];
-       if (ssa(instr->srcs[n]))
-               return &instr->srcs[n]->def->instr;
-       return NULL;
+   if (__is_false_dep(instr, n))
+      return &instr->deps[n - instr->srcs_count];
+   if (ssa(instr->srcs[n]))
+      return &instr->srcs[n]->def->instr;
+   return NULL;
 }
 
-#define foreach_ssa_srcp_n(__srcp, __n, __instr) \
-       for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL) \
-               for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
-                       if ((__srcp = __ssa_srcp_n(__instr, __n)))
+#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
+   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
+      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
+           __n++)                                                              \
+         if ((__srcp = __ssa_srcp_n(__instr, __n)))
 
-#define foreach_ssa_srcp(__srcp, __instr) \
-       foreach_ssa_srcp_n(__srcp, __i, __instr)
+#define foreach_ssa_srcp(__srcp, __instr)                                      \
+   foreach_ssa_srcp_n (__srcp, __i, __instr)
 
 /* iterator for an instruction's SSA sources (instr), also returns src #: */
-#define foreach_ssa_src_n(__srcinst, __n, __instr) \
-       for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst; __srcinst = NULL) \
-               foreach_ssa_srcp_n(__srcp, __n, __instr) \
-                       if ((__srcinst = *__srcp))
+#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
+   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
+        __srcinst = NULL)                                                      \
+      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
+         if ((__srcinst = *__srcp))
 
 /* iterator for an instruction's SSA sources (instr): */
-#define foreach_ssa_src(__srcinst, __instr) \
-       foreach_ssa_src_n(__srcinst, __i, __instr)
+#define foreach_ssa_src(__srcinst, __instr)                                    \
+   foreach_ssa_src_n (__srcinst, __i, __instr)
 
 /* iterators for shader inputs: */
-#define foreach_input_n(__ininstr, __cnt, __ir) \
-       for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr; __ininstr = NULL) \
-               for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
-                       if ((__ininstr = (__ir)->inputs[__cnt]))
-#define foreach_input(__ininstr, __ir) \
-       foreach_input_n(__ininstr, __i, __ir)
+#define foreach_input_n(__ininstr, __cnt, __ir)                                \
+   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
+        __ininstr = NULL)                                                      \
+      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
+         if ((__ininstr = (__ir)->inputs[__cnt]))
+#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
 
 /* iterators for instructions: */
-#define foreach_instr(__instr, __list) \
-       list_for_each_entry(struct ir3_instruction, __instr, __list, node)
-#define foreach_instr_rev(__instr, __list) \
-       list_for_each_entry_rev(struct ir3_instruction, __instr, __list, node)
-#define foreach_instr_safe(__instr, __list) \
-       list_for_each_entry_safe(struct ir3_instruction, __instr, __list, node)
-#define foreach_instr_from_safe(__instr, __start, __list) \
-       list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start, __list, node)
+#define foreach_instr(__instr, __list)                                         \
+   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
+#define foreach_instr_rev(__instr, __list)                                     \
+   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
+#define foreach_instr_safe(__instr, __list)                                    \
+   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
+#define foreach_instr_from_safe(__instr, __start, __list)                      \
+   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
+                                 __list, node)
 
 /* iterators for blocks: */
-#define foreach_block(__block, __list) \
-       list_for_each_entry(struct ir3_block, __block, __list, node)
-#define foreach_block_safe(__block, __list) \
-       list_for_each_entry_safe(struct ir3_block, __block, __list, node)
-#define foreach_block_rev(__block, __list) \
-       list_for_each_entry_rev(struct ir3_block, __block, __list, node)
+#define foreach_block(__block, __list)                                         \
+   list_for_each_entry (struct ir3_block, __block, __list, node)
+#define foreach_block_safe(__block, __list)                                    \
+   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
+#define foreach_block_rev(__block, __list)                                     \
+   list_for_each_entry_rev (struct ir3_block, __block, __list, node)
 
 /* iterators for arrays: */
-#define foreach_array(__array, __list) \
-       list_for_each_entry(struct ir3_array, __array, __list, node)
-#define foreach_array_safe(__array, __list) \
-       list_for_each_entry_safe(struct ir3_array, __array, __list, node)
-
-#define IR3_PASS(ir, pass, ...) ({ \
-               bool progress = pass(ir, ##__VA_ARGS__); \
-               if (progress) { \
-                       ir3_debug_print(ir, "AFTER: " #pass); \
-                       ir3_validate(ir); \
-               } \
-               progress; \
-       })
+#define foreach_array(__array, __list)                                         \
+   list_for_each_entry (struct ir3_array, __array, __list, node)
+#define foreach_array_safe(__array, __list)                                    \
+   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
+
+#define IR3_PASS(ir, pass, ...)                                                \
+   ({                                                                          \
+      bool progress = pass(ir, ##__VA_ARGS__);                                 \
+      if (progress) {                                                          \
+         ir3_debug_print(ir, "AFTER: " #pass);                                 \
+         ir3_validate(ir);                                                     \
+      }                                                                        \
+      progress;                                                                \
+   })
 
 /* validate: */
 void ir3_validate(struct ir3 *ir);
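A hypothetical driver sketch for the IR3_PASS wrapper above; example_pass()/run_example_pass() are invented placeholders, not real ir3 passes. The point is that debug printing and validation only run when the pass reports progress.

static bool
example_pass(struct ir3 *ir)
{
   (void)ir;
   return false; /* made no progress */
}

static void
run_example_pass(struct ir3 *ir)
{
   bool changed = IR3_PASS(ir, example_pass);
   /* when changed, ir3_debug_print() and ir3_validate() already ran inside
    * the macro:
    */
   (void)changed;
}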
@@ -1495,12 +1568,14 @@ void ir3_print_instr(struct ir3_instruction *instr);
 
 /* delay calculation: */
 int ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n, bool soft);
-unsigned ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr);
-unsigned ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
-               bool soft, bool mergedregs);
-unsigned ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
-               bool mergedregs);
+                   struct ir3_instruction *consumer, unsigned n, bool soft);
+unsigned ir3_delay_calc_prera(struct ir3_block *block,
+                              struct ir3_instruction *instr);
+unsigned ir3_delay_calc_postra(struct ir3_block *block,
+                               struct ir3_instruction *instr, bool soft,
+                               bool mergedregs);
+unsigned ir3_delay_calc_exact(struct ir3_block *block,
+                              struct ir3_instruction *instr, bool mergedregs);
 void ir3_remove_nops(struct ir3 *ir);
 
 /* dead code elimination: */
@@ -1539,183 +1614,187 @@ bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
 static inline bool
 ir3_has_latency_to_hide(struct ir3 *ir)
 {
-       /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
-        * know the nature of the fragment shader.  Just assume it will have
-        * latency to hide:
-        */
-       if (ir->type != MESA_SHADER_FRAGMENT)
-               return true;
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (is_tex_or_prefetch(instr))
-                               return true;
-
-                       if (is_load(instr)) {
-                               switch (instr->opc) {
-                               case OPC_LDLV:
-                               case OPC_LDL:
-                               case OPC_LDLW:
-                                       break;
-                               default:
-                                       return true;
-                               }
-                       }
-               }
-       }
-
-       return false;
+   /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
+    * know the nature of the fragment shader.  Just assume it will have
+    * latency to hide:
+    */
+   if (ir->type != MESA_SHADER_FRAGMENT)
+      return true;
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (is_tex_or_prefetch(instr))
+            return true;
+
+         if (is_load(instr)) {
+            switch (instr->opc) {
+            case OPC_LDLV:
+            case OPC_LDL:
+            case OPC_LDLW:
+               break;
+            default:
+               return true;
+            }
+         }
+      }
+   }
+
+   return false;
 }
 
 /* ************************************************************************* */
 /* instruction helpers */
 
 /* creates SSA src of correct type (ie. half vs full precision) */
-static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
-               struct ir3_instruction *src, unsigned flags)
+static inline struct ir3_register *
+__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
+          unsigned flags)
 {
-       struct ir3_register *reg;
-       if (src->dsts[0]->flags & IR3_REG_HALF)
-               flags |= IR3_REG_HALF;
-       reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
-       reg->def = src->dsts[0];
-       reg->wrmask = src->dsts[0]->wrmask;
-       return reg;
+   struct ir3_register *reg;
+   if (src->dsts[0]->flags & IR3_REG_HALF)
+      flags |= IR3_REG_HALF;
+   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
+   reg->def = src->dsts[0];
+   reg->wrmask = src->dsts[0]->wrmask;
+   return reg;
 }
 
-static inline struct ir3_register * __ssa_dst(struct ir3_instruction *instr)
+static inline struct ir3_register *
+__ssa_dst(struct ir3_instruction *instr)
 {
-       struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
-       reg->instr = instr;
-       return reg;
+   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
+   reg->instr = instr;
+   return reg;
 }
 
 static inline struct ir3_instruction *
 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
 {
-       struct ir3_instruction *mov;
-       unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
+   struct ir3_instruction *mov;
+   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
 
-       mov = ir3_instr_create(block, OPC_MOV, 1, 1);
-       mov->cat1.src_type = type;
-       mov->cat1.dst_type = type;
-       __ssa_dst(mov)->flags |= flags;
-       ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
+   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   mov->cat1.src_type = type;
+   mov->cat1.dst_type = type;
+   __ssa_dst(mov)->flags |= flags;
+   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
 
-       return mov;
+   return mov;
 }
 
 static inline struct ir3_instruction *
 create_immed(struct ir3_block *block, uint32_t val)
 {
-       return create_immed_typed(block, val, TYPE_U32);
+   return create_immed_typed(block, val, TYPE_U32);
 }
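Illustrative only: building immediates at both precisions with the helpers above; immed_example() is invented, and the block is assumed to come from the caller's compile context.

static inline void
immed_example(struct ir3_block *block)
{
   /* 32-bit immediate (create_immed defaults to TYPE_U32): */
   struct ir3_instruction *full = create_immed(block, 0xdeadbeef);

   /* 16-bit immediate: type_size(TYPE_U16) < 32, so dst/src get IR3_REG_HALF: */
   struct ir3_instruction *imm16 = create_immed_typed(block, 0x1234, TYPE_U16);

   (void)full;
   (void)imm16;
}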
 
 static inline struct ir3_instruction *
 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
 {
-       struct ir3_instruction *mov;
-       unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
+   struct ir3_instruction *mov;
+   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
 
-       mov = ir3_instr_create(block, OPC_MOV, 1, 1);
-       mov->cat1.src_type = type;
-       mov->cat1.dst_type = type;
-       __ssa_dst(mov)->flags |= flags;
-       ir3_src_create(mov, n, IR3_REG_CONST | flags);
+   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   mov->cat1.src_type = type;
+   mov->cat1.dst_type = type;
+   __ssa_dst(mov)->flags |= flags;
+   ir3_src_create(mov, n, IR3_REG_CONST | flags);
 
-       return mov;
+   return mov;
 }
 
 static inline struct ir3_instruction *
 create_uniform(struct ir3_block *block, unsigned n)
 {
-       return create_uniform_typed(block, n, TYPE_F32);
+   return create_uniform_typed(block, n, TYPE_F32);
 }
 
 static inline struct ir3_instruction *
 create_uniform_indirect(struct ir3_block *block, int n, type_t type,
-               struct ir3_instruction *address)
+                        struct ir3_instruction *address)
 {
-       struct ir3_instruction *mov;
+   struct ir3_instruction *mov;
 
-       mov = ir3_instr_create(block, OPC_MOV, 1, 1);
-       mov->cat1.src_type = type;
-       mov->cat1.dst_type = type;
-       __ssa_dst(mov);
-       ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
+   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   mov->cat1.src_type = type;
+   mov->cat1.dst_type = type;
+   __ssa_dst(mov);
+   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
 
-       ir3_instr_set_address(mov, address);
+   ir3_instr_set_address(mov, address);
 
-       return mov;
+   return mov;
 }
 
 static inline struct ir3_instruction *
 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
 {
-       struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
-       unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
+   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
+   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
 
-       __ssa_dst(instr)->flags |= flags;
-       if (src->dsts[0]->flags & IR3_REG_ARRAY) {
-               struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
-               src_reg->array = src->dsts[0]->array;
-       } else {
-               __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
-       }
-       debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
-       instr->cat1.src_type = type;
-       instr->cat1.dst_type = type;
-       return instr;
+   __ssa_dst(instr)->flags |= flags;
+   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
+      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
+      src_reg->array = src->dsts[0]->array;
+   } else {
+      __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
+   }
+   debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
+   instr->cat1.src_type = type;
+   instr->cat1.dst_type = type;
+   return instr;
 }
 
 static inline struct ir3_instruction *
-ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
-               type_t src_type, type_t dst_type)
+ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
+        type_t dst_type)
 {
-       struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
-       unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
-       unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
+   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
+   unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
+   unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
 
-       debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
+   debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
 
-       __ssa_dst(instr)->flags |= dst_flags;
-       __ssa_src(instr, src, 0);
-       instr->cat1.src_type = src_type;
-       instr->cat1.dst_type = dst_type;
-       debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
-       return instr;
+   __ssa_dst(instr)->flags |= dst_flags;
+   __ssa_src(instr, src, 0);
+   instr->cat1.src_type = src_type;
+   instr->cat1.dst_type = dst_type;
+   debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
+   return instr;
 }
 
 static inline struct ir3_instruction *
 ir3_MOVMSK(struct ir3_block *block, unsigned components)
 {
-       struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
+   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
 
-       struct ir3_register *dst = __ssa_dst(instr);
-       dst->flags |= IR3_REG_SHARED;
-       dst->wrmask = (1 << components) - 1;
-       instr->repeat = components - 1;
-       return instr;
+   struct ir3_register *dst = __ssa_dst(instr);
+   dst->flags |= IR3_REG_SHARED;
+   dst->wrmask = (1 << components) - 1;
+   instr->repeat = components - 1;
+   return instr;
 }
 
 static inline struct ir3_instruction *
-ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src, unsigned components)
+ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
+                 unsigned components)
 {
-       struct ir3_instruction *instr = ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
+   struct ir3_instruction *instr =
+      ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
 
-       struct ir3_register *dst = __ssa_dst(instr);
-       dst->flags |= IR3_REG_SHARED;
-       dst->wrmask = (1 << components) - 1;
+   struct ir3_register *dst = __ssa_dst(instr);
+   dst->flags |= IR3_REG_SHARED;
+   dst->wrmask = (1 << components) - 1;
 
-       __ssa_src(instr, src, 0);
+   __ssa_src(instr, src, 0);
 
-       return instr;
+   return instr;
 }
 
 static inline struct ir3_instruction *
 ir3_NOP(struct ir3_block *block)
 {
-       return ir3_instr_create(block, OPC_NOP, 0, 0);
+   return ir3_instr_create(block, OPC_NOP, 0, 0);
 }
 
 #define IR3_INSTR_0 0
@@ -1882,10 +1961,10 @@ INSTR2(READ_COND_MACRO)
 static inline struct ir3_instruction *
 ir3_ELECT_MACRO(struct ir3_block *block)
 {
-       struct ir3_instruction *instr =
-               ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
-       __ssa_dst(instr);
-       return instr;
+   struct ir3_instruction *instr =
+      ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
+   __ssa_dst(instr);
+   return instr;
 }
 
 /* cat2 instructions, most 2 src but some 1 src: */
@@ -1977,38 +2056,38 @@ INSTR1F(3D, DSY)
 INSTR1(RGETPOS)
 
 static inline struct ir3_instruction *
-ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
-               unsigned wrmask, unsigned flags, struct ir3_instruction *samp_tex,
-               struct ir3_instruction *src0, struct ir3_instruction *src1)
-{
-       struct ir3_instruction *sam;
-       unsigned nreg = 0;
-
-       if (flags & IR3_INSTR_S2EN) {
-               nreg++;
-       }
-       if (src0) {
-               nreg++;
-       }
-       if (src1) {
-               nreg++;
-       }
-
-       sam = ir3_instr_create(block, opc, 1, nreg);
-       sam->flags |= flags;
-       __ssa_dst(sam)->wrmask = wrmask;
-       if (flags & IR3_INSTR_S2EN) {
-               __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
-       }
-       if (src0) {
-               __ssa_src(sam, src0, 0);
-       }
-       if (src1) {
-               __ssa_src(sam, src1, 0);
-       }
-       sam->cat5.type  = type;
-
-       return sam;
+ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
+        unsigned flags, struct ir3_instruction *samp_tex,
+        struct ir3_instruction *src0, struct ir3_instruction *src1)
+{
+   struct ir3_instruction *sam;
+   unsigned nreg = 0;
+
+   if (flags & IR3_INSTR_S2EN) {
+      nreg++;
+   }
+   if (src0) {
+      nreg++;
+   }
+   if (src1) {
+      nreg++;
+   }
+
+   sam = ir3_instr_create(block, opc, 1, nreg);
+   sam->flags |= flags;
+   __ssa_dst(sam)->wrmask = wrmask;
+   if (flags & IR3_INSTR_S2EN) {
+      __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
+   }
+   if (src0) {
+      __ssa_src(sam, src0, 0);
+   }
+   if (src1) {
+      __ssa_src(sam, src1, 0);
+   }
+   sam->cat5.type = type;
+
+   return sam;
 }
 
 /* cat6 instructions: */
@@ -2075,34 +2154,35 @@ INSTR0(FENCE)
 /* ************************************************************************* */
 #include "regmask.h"
 
-static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
-{
-       bool half = reg->flags & IR3_REG_HALF;
-       if (reg->flags & IR3_REG_RELATIV) {
-               for (unsigned i = 0; i < reg->size; i++)
-                       __regmask_set(regmask, half, reg->array.base + i);
-       } else {
-               for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
-                       if (mask & 1)
-                               __regmask_set(regmask, half, n);
-       }
-}
-
-static inline bool regmask_get(regmask_t *regmask,
-               struct ir3_register *reg)
-{
-       bool half = reg->flags & IR3_REG_HALF;
-       if (reg->flags & IR3_REG_RELATIV) {
-               for (unsigned i = 0; i < reg->size; i++)
-                       if (__regmask_get(regmask, half, reg->array.base + i))
-                               return true;
-       } else {
-               for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
-                       if (mask & 1)
-                               if (__regmask_get(regmask, half, n))
-                                       return true;
-       }
-       return false;
+static inline void
+regmask_set(regmask_t *regmask, struct ir3_register *reg)
+{
+   bool half = reg->flags & IR3_REG_HALF;
+   if (reg->flags & IR3_REG_RELATIV) {
+      for (unsigned i = 0; i < reg->size; i++)
+         __regmask_set(regmask, half, reg->array.base + i);
+   } else {
+      for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
+         if (mask & 1)
+            __regmask_set(regmask, half, n);
+   }
+}
+
+static inline bool
+regmask_get(regmask_t *regmask, struct ir3_register *reg)
+{
+   bool half = reg->flags & IR3_REG_HALF;
+   if (reg->flags & IR3_REG_RELATIV) {
+      for (unsigned i = 0; i < reg->size; i++)
+         if (__regmask_get(regmask, half, reg->array.base + i))
+            return true;
+   } else {
+      for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
+         if (mask & 1)
+            if (__regmask_get(regmask, half, n))
+               return true;
+   }
+   return false;
 }
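
The write-mask walk in regmask_set()/regmask_get() is the usual ir3 idiom: the mask is consumed bit by bit while the register number advances in step. Below is a small standalone sketch of the same loop, assuming the conventional ir3 encoding of a register number as (register << 2) | component (an assumption made for illustration, not something this diff states):

#include <stdio.h>

/* Visit every register covered by a write-mask starting at 'base', using the
 * same mask >>= 1, n++ iteration as regmask_set()/regmask_get() above. */
static void
visit_wrmask(unsigned wrmask, unsigned base)
{
   for (unsigned mask = wrmask, n = base; mask; mask >>= 1, n++)
      if (mask & 1)
         printf("r%u.%c\n", n >> 2, "xyzw"[n & 0x3]);
}

/* e.g. visit_wrmask(0x5, 8) prints r2.x and r2.z */
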
 /* ************************************************************************* */
 
index 2a0cfa3..ea93285 100644 (file)
  * Handlers for instructions changed/added in a4xx:
  */
 
-
 /* src[] = { buffer_index, offset }. No const_index */
 static void
 emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                         struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset;
 
-       struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
+   struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
 
-       byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
-       offset = ir3_get_src(ctx, &intr->src[2])[0];
+   byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
+   offset = ir3_get_src(ctx, &intr->src[2])[0];
 
-       /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
-       src0 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
-       src1 = offset;
+   /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
+   src0 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
+   src1 = offset;
 
-       ldgb = ir3_LDGB(b, ssbo, 0,
-                       src0, 0, src1, 0);
-       ldgb->dsts[0]->wrmask = MASK(intr->num_components);
-       ldgb->cat6.iim_val = intr->num_components;
-       ldgb->cat6.d = 4;
-       ldgb->cat6.type = TYPE_U32;
-       ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
-       ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
+   ldgb = ir3_LDGB(b, ssbo, 0, src0, 0, src1, 0);
+   ldgb->dsts[0]->wrmask = MASK(intr->num_components);
+   ldgb->cat6.iim_val = intr->num_components;
+   ldgb->cat6.d = 4;
+   ldgb->cat6.type = TYPE_U32;
+   ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
+   ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
 
-       ir3_split_dest(b, dst, ldgb, 0, intr->num_components);
+   ir3_split_dest(b, dst, ldgb, 0, intr->num_components);
 }
 
 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
 static void
 emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
-       unsigned wrmask = nir_intrinsic_write_mask(intr);
-       unsigned ncomp = ffs(~wrmask) - 1;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
+   unsigned wrmask = nir_intrinsic_write_mask(intr);
+   unsigned ncomp = ffs(~wrmask) - 1;
 
-       assert(wrmask == BITFIELD_MASK(intr->num_components));
+   assert(wrmask == BITFIELD_MASK(intr->num_components));
 
-       struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]);
+   struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]);
 
-       byte_offset = ir3_get_src(ctx, &intr->src[2])[0];
-       offset = ir3_get_src(ctx, &intr->src[3])[0];
+   byte_offset = ir3_get_src(ctx, &intr->src[2])[0];
+   offset = ir3_get_src(ctx, &intr->src[3])[0];
 
-       /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
-        * nir already *= 4:
-        */
-       src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
-       src1 = offset;
-       src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
+   /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
+    * nir already *= 4:
+    */
+   src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
+   src1 = offset;
+   src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
 
-       stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-       stgb->cat6.iim_val = ncomp;
-       stgb->cat6.d = 4;
-       stgb->cat6.type = TYPE_U32;
-       stgb->barrier_class = IR3_BARRIER_BUFFER_W;
-       stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+   stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+   stgb->cat6.iim_val = ncomp;
+   stgb->cat6.d = 4;
+   stgb->cat6.type = TYPE_U32;
+   stgb->barrier_class = IR3_BARRIER_BUFFER_W;
+   stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
 
-       array_insert(b, b->keeps, stgb);
+   array_insert(b, b->keeps, stgb);
 }
 
 /*
@@ -116,229 +114,228 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 static struct ir3_instruction *
 emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *byte_offset,
-               *offset;
-       type_t type = TYPE_U32;
-
-       ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
-
-       byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
-       offset = ir3_get_src(ctx, &intr->src[3])[0];
-
-       /* src0 is data (or uvec2(data, compare))
-        * src1 is offset
-        * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
-        *
-        * Note that nir already multiplies the offset by four
-        */
-       src0 = ir3_get_src(ctx, &intr->src[2])[0];
-       src1 = offset;
-       src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_ssbo_atomic_add_ir3:
-               atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_imin_ir3:
-               atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_ssbo_atomic_umin_ir3:
-               atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_imax_ir3:
-               atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_ssbo_atomic_umax_ir3:
-               atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_and_ir3:
-               atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_or_ir3:
-               atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_xor_ir3:
-               atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_exchange_ir3:
-               atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
-               /* for cmpxchg, src0 is [ui]vec2(data, compare): */
-               src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[3])[0], src0);
-               src1 = ir3_get_src(ctx, &intr->src[4])[0];
-               atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = 4;
-       atomic->cat6.type = type;
-       atomic->barrier_class = IR3_BARRIER_BUFFER_W;
-       atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-
-       /* even if nothing consume the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       return atomic;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *byte_offset,
+      *offset;
+   type_t type = TYPE_U32;
+
+   ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
+
+   byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
+   offset = ir3_get_src(ctx, &intr->src[3])[0];
+
+   /* src0 is data (or uvec2(data, compare))
+    * src1 is offset
+    * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
+    *
+    * Note that nir already multiplies the offset by four
+    */
+   src0 = ir3_get_src(ctx, &intr->src[2])[0];
+   src1 = offset;
+   src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_ssbo_atomic_add_ir3:
+      atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_imin_ir3:
+      atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      type = TYPE_S32;
+      break;
+   case nir_intrinsic_ssbo_atomic_umin_ir3:
+      atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_imax_ir3:
+      atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      type = TYPE_S32;
+      break;
+   case nir_intrinsic_ssbo_atomic_umax_ir3:
+      atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_and_ir3:
+      atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_or_ir3:
+      atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_xor_ir3:
+      atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_exchange_ir3:
+      atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
+      /* for cmpxchg, src0 is [ui]vec2(data, compare): */
+      src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[3])[0], src0);
+      src1 = ir3_get_src(ctx, &intr->src[4])[0];
+      atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   default:
+      unreachable("boo");
+   }
+
+   atomic->cat6.iim_val = 1;
+   atomic->cat6.d = 4;
+   atomic->cat6.type = type;
+   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
+   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+
+   /* even if nothing consumes the result, we can't DCE the instruction: */
+   array_insert(b, b->keeps, atomic);
+
+   return atomic;
 }
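
The src2 collected above, uvec2(byte_offset, 0), is just the 32-bit byte offset widened to the 64-bit offset that the comment notes these _G instructions appear to take; the high dword is always the immediate zero. A trivial standalone sketch of that widening (illustrative only, not driver code):

#include <stdint.h>

/* collect(byte_offset, immed 0) models a plain zero-extension: */
static inline uint64_t
ssbo_offset64(uint32_t byte_offset)
{
   return (uint64_t)byte_offset;
}
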
 
 static struct ir3_instruction *
 get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr,
-               struct ir3_instruction * const *coords, bool byteoff)
+                 struct ir3_instruction *const *coords, bool byteoff)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *offset;
-       unsigned index = nir_src_as_uint(instr->src[0]);
-       unsigned ncoords = ir3_get_image_coords(instr, NULL);
-
-       /* to calculate the byte offset (yes, uggg) we need (up to) three
-        * const values to know the bytes per pixel, and y and z stride:
-        */
-       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-       unsigned cb = regid(const_state->offsets.image_dims, 0) +
-               const_state->image_dims.off[index];
-
-       debug_assert(const_state->image_dims.mask & (1 << index));
-
-       /* offset = coords.x * bytes_per_pixel: */
-       offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0);
-       if (ncoords > 1) {
-               /* offset += coords.y * y_pitch: */
-               offset = ir3_MAD_S24(b, create_uniform(b, cb + 1), 0,
-                               coords[1], 0, offset, 0);
-       }
-       if (ncoords > 2) {
-               /* offset += coords.z * z_pitch: */
-               offset = ir3_MAD_S24(b, create_uniform(b, cb + 2), 0,
-                               coords[2], 0, offset, 0);
-       }
-
-       if (!byteoff) {
-               /* Some cases, like atomics, seem to use dword offset instead
-                * of byte offsets.. blob just puts an extra shr.b in there
-                * in those cases:
-                */
-               offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-       }
-
-       return ir3_collect(ctx, offset, create_immed(b, 0));
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *offset;
+   unsigned index = nir_src_as_uint(instr->src[0]);
+   unsigned ncoords = ir3_get_image_coords(instr, NULL);
+
+   /* to calculate the byte offset (yes, uggg) we need (up to) three
+    * const values to know the bytes per pixel, and y and z stride:
+    */
+   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+   unsigned cb = regid(const_state->offsets.image_dims, 0) +
+                 const_state->image_dims.off[index];
+
+   debug_assert(const_state->image_dims.mask & (1 << index));
+
+   /* offset = coords.x * bytes_per_pixel: */
+   offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0);
+   if (ncoords > 1) {
+      /* offset += coords.y * y_pitch: */
+      offset =
+         ir3_MAD_S24(b, create_uniform(b, cb + 1), 0, coords[1], 0, offset, 0);
+   }
+   if (ncoords > 2) {
+      /* offset += coords.z * z_pitch: */
+      offset =
+         ir3_MAD_S24(b, create_uniform(b, cb + 2), 0, coords[2], 0, offset, 0);
+   }
+
+   if (!byteoff) {
+      /* Some cases, like atomics, seem to use dword offset instead
+       * of byte offsets.. blob just puts an extra shr.b in there
+       * in those cases:
+       */
+      offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+   }
+
+   return ir3_collect(ctx, offset, create_immed(b, 0));
 }
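
Written as plain arithmetic, the mul.s24/mad.s24 chain that get_image_offset() emits is just offset = x * bytes_per_pixel + y * y_pitch + z * z_pitch, with an extra >> 2 when a dword rather than byte offset is wanted. Below is a standalone CPU-side sketch of the same computation (the stride parameters are illustrative; the real values come from the image_dims consts):

#include <stdbool.h>
#include <stdint.h>

static uint32_t
image_offset(const uint32_t coords[3], unsigned ncoords,
             uint32_t bytes_per_pixel, uint32_t y_pitch, uint32_t z_pitch,
             bool byteoff)
{
   uint32_t offset = coords[0] * bytes_per_pixel;   /* x * bpp       */
   if (ncoords > 1)
      offset += coords[1] * y_pitch;                /* + y * y_pitch */
   if (ncoords > 2)
      offset += coords[2] * z_pitch;                /* + z * z_pitch */
   return byteoff ? offset : offset >> 2;           /* shr.b by 2    */
}
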
 
 /* src[] = { index, coord, sample_index, value }. const_index[] = {} */
 static void
 emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stib, *offset;
-       struct ir3_instruction * const *value = ir3_get_src(ctx, &intr->src[3]);
-       struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
-       struct ir3_instruction * ibo = ir3_image_to_ibo(ctx, intr->src[0]);
-       unsigned ncoords = ir3_get_image_coords(intr, NULL);
-       unsigned ncomp = ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));
-
-       /* src0 is value
-        * src1 is coords
-        * src2 is 64b byte offset
-        */
-
-       offset = get_image_offset(ctx, intr, coords, true);
-
-       /* NOTE: stib seems to take byte offset, but stgb.typed can be used
-        * too and takes a dword offset.. not quite sure yet why blob uses
-        * one over the other in various cases.
-        */
-
-       stib = ir3_STIB(b, ibo, 0,
-                       ir3_create_collect(ctx, value, ncomp), 0,
-                       ir3_create_collect(ctx, coords, ncoords), 0,
-                       offset, 0);
-       stib->cat6.iim_val = ncomp;
-       stib->cat6.d = ncoords;
-       stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
-       stib->cat6.typed = true;
-       stib->barrier_class = IR3_BARRIER_IMAGE_W;
-       stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-
-       array_insert(b, b->keeps, stib);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *stib, *offset;
+   struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
+   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
+   struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
+   unsigned ncoords = ir3_get_image_coords(intr, NULL);
+   unsigned ncomp =
+      ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));
+
+   /* src0 is value
+    * src1 is coords
+    * src2 is 64b byte offset
+    */
+
+   offset = get_image_offset(ctx, intr, coords, true);
+
+   /* NOTE: stib seems to take byte offset, but stgb.typed can be used
+    * too and takes a dword offset.. not quite sure yet why blob uses
+    * one over the other in various cases.
+    */
+
+   stib = ir3_STIB(b, ibo, 0, ir3_create_collect(ctx, value, ncomp), 0,
+                   ir3_create_collect(ctx, coords, ncoords), 0, offset, 0);
+   stib->cat6.iim_val = ncomp;
+   stib->cat6.d = ncoords;
+   stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
+   stib->cat6.typed = true;
+   stib->barrier_class = IR3_BARRIER_IMAGE_W;
+   stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+   array_insert(b, b->keeps, stib);
 }
 
 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
 static struct ir3_instruction *
 emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *atomic, *src0, *src1, *src2;
-       struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
-       struct ir3_instruction * image = ir3_image_to_ibo(ctx, intr->src[0]);
-       unsigned ncoords = ir3_get_image_coords(intr, NULL);
-
-       /* src0 is value (or uvec2(value, compare))
-        * src1 is coords
-        * src2 is 64b byte offset
-        */
-       src0 = ir3_get_src(ctx, &intr->src[3])[0];
-       src1 = ir3_create_collect(ctx, coords, ncoords);
-       src2 = get_image_offset(ctx, intr, coords, false);
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_image_atomic_add:
-               atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_atomic_imin:
-       case nir_intrinsic_image_atomic_umin:
-               atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_atomic_imax:
-       case nir_intrinsic_image_atomic_umax:
-               atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_atomic_and:
-               atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_atomic_or:
-               atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_atomic_xor:
-               atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_atomic_exchange:
-               atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_atomic_comp_swap:
-               /* for cmpxchg, src0 is [ui]vec2(data, compare): */
-               src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[4])[0], src0);
-               atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = ncoords;
-       atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
-       atomic->cat6.typed = true;
-       atomic->barrier_class = IR3_BARRIER_IMAGE_W;
-       atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-
-       /* even if nothing consume the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       return atomic;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *atomic, *src0, *src1, *src2;
+   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
+   struct ir3_instruction *image = ir3_image_to_ibo(ctx, intr->src[0]);
+   unsigned ncoords = ir3_get_image_coords(intr, NULL);
+
+   /* src0 is value (or uvec2(value, compare))
+    * src1 is coords
+    * src2 is 64b byte offset
+    */
+   src0 = ir3_get_src(ctx, &intr->src[3])[0];
+   src1 = ir3_create_collect(ctx, coords, ncoords);
+   src2 = get_image_offset(ctx, intr, coords, false);
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_image_atomic_add:
+      atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_image_atomic_imin:
+   case nir_intrinsic_image_atomic_umin:
+      atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_image_atomic_imax:
+   case nir_intrinsic_image_atomic_umax:
+      atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_image_atomic_and:
+      atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_image_atomic_or:
+      atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_image_atomic_xor:
+      atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_image_atomic_exchange:
+      atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   case nir_intrinsic_image_atomic_comp_swap:
+      /* for cmpxchg, src0 is [ui]vec2(data, compare): */
+      src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[4])[0], src0);
+      atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+      break;
+   default:
+      unreachable("boo");
+   }
+
+   atomic->cat6.iim_val = 1;
+   atomic->cat6.d = ncoords;
+   atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
+   atomic->cat6.typed = true;
+   atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+   atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+   /* even if nothing consumes the result, we can't DCE the instruction: */
+   array_insert(b, b->keeps, atomic);
+
+   return atomic;
 }
 
 const struct ir3_context_funcs ir3_a4xx_funcs = {
-               .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
-               .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
-               .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
-               .emit_intrinsic_store_image = emit_intrinsic_store_image,
-               .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
-               .emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
-               .emit_intrinsic_load_global_ir3 = NULL,
-               .emit_intrinsic_store_global_ir3 = NULL,
+   .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
+   .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
+   .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
+   .emit_intrinsic_store_image = emit_intrinsic_store_image,
+   .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
+   .emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
+   .emit_intrinsic_load_global_ir3 = NULL,
+   .emit_intrinsic_store_global_ir3 = NULL,
 };
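
This table is the per-generation dispatch point: a4xx fills in the LDGB/STGB-style handlers above and leaves the global load/store hooks NULL, while the a6xx table below provides its own set. The self-contained sketch below shows only the dispatch pattern itself (hypothetical names, not the real ir3_context wiring):

#include <stdio.h>

struct gen_funcs {
   void (*load_ssbo)(void);
   void (*load_global)(void);   /* NULL when a generation has no handler */
};

static void a4xx_load_ssbo(void) { puts("ldgb path"); }

static const struct gen_funcs a4xx_funcs_demo = {
   .load_ssbo = a4xx_load_ssbo,
   .load_global = NULL,
};

int main(void)
{
   a4xx_funcs_demo.load_ssbo();       /* dispatch through the table */
   if (a4xx_funcs_demo.load_global)   /* guard the optional hook    */
      a4xx_funcs_demo.load_global();
   return 0;
}
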
index 99bc386..aca14fb 100644 (file)
 /* src[] = { buffer_index, offset }. No const_index */
 static void
 emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                         struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *offset;
-       struct ir3_instruction *ldib;
-
-       offset = ir3_get_src(ctx, &intr->src[2])[0];
-
-       ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0);
-       ldib->dsts[0]->wrmask = MASK(intr->num_components);
-       ldib->cat6.iim_val = intr->num_components;
-       ldib->cat6.d = 1;
-       ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32;
-       ldib->barrier_class = IR3_BARRIER_BUFFER_R;
-       ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
-       ir3_handle_bindless_cat6(ldib, intr->src[0]);
-       ir3_handle_nonuniform(ldib, intr);
-
-       ir3_split_dest(b, dst, ldib, 0, intr->num_components);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *offset;
+   struct ir3_instruction *ldib;
+
+   offset = ir3_get_src(ctx, &intr->src[2])[0];
+
+   ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0);
+   ldib->dsts[0]->wrmask = MASK(intr->num_components);
+   ldib->cat6.iim_val = intr->num_components;
+   ldib->cat6.d = 1;
+   ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32;
+   ldib->barrier_class = IR3_BARRIER_BUFFER_R;
+   ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
+   ir3_handle_bindless_cat6(ldib, intr->src[0]);
+   ir3_handle_nonuniform(ldib, intr);
+
+   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
 }
 
 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
 static void
 emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stib, *val, *offset;
-       unsigned wrmask = nir_intrinsic_write_mask(intr);
-       unsigned ncomp = ffs(~wrmask) - 1;
-
-       assert(wrmask == BITFIELD_MASK(intr->num_components));
-
-       /* src0 is offset, src1 is value:
-        */
-       val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
-       offset = ir3_get_src(ctx, &intr->src[3])[0];
-
-       stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0);
-       stib->cat6.iim_val = ncomp;
-       stib->cat6.d = 1;
-       stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
-       stib->barrier_class = IR3_BARRIER_BUFFER_W;
-       stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-       ir3_handle_bindless_cat6(stib, intr->src[1]);
-       ir3_handle_nonuniform(stib, intr);
-
-       array_insert(b, b->keeps, stib);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *stib, *val, *offset;
+   unsigned wrmask = nir_intrinsic_write_mask(intr);
+   unsigned ncomp = ffs(~wrmask) - 1;
+
+   assert(wrmask == BITFIELD_MASK(intr->num_components));
+
+   /* src0 is offset, src1 is value:
+    */
+   val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
+   offset = ir3_get_src(ctx, &intr->src[3])[0];
+
+   stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0);
+   stib->cat6.iim_val = ncomp;
+   stib->cat6.d = 1;
+   stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
+   stib->barrier_class = IR3_BARRIER_BUFFER_W;
+   stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+   ir3_handle_bindless_cat6(stib, intr->src[1]);
+   ir3_handle_nonuniform(stib, intr);
+
+   array_insert(b, b->keeps, stib);
 }
 
 /*
@@ -109,329 +109,321 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 static struct ir3_instruction *
 emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
-       type_t type = TYPE_U32;
-
-       ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
-
-       data   = ir3_get_src(ctx, &intr->src[2])[0];
-
-       /* So this gets a bit creative:
-        *
-        *    src0    - vecN offset/coords
-        *    src1.x  - is actually destination register
-        *    src1.y  - is 'data' except for cmpxchg where src2.y is 'compare'
-        *    src1.z  - is 'data' for cmpxchg
-        *
-        * The combining src and dest kinda doesn't work out so well with how
-        * scheduling and RA work. So we create a dummy src2 which is tied to the
-        * destination in RA (i.e. must be allocated to the same vec2/vec3
-        * register) and then immediately extract the first component.
-        *
-        * Note that nir already multiplies the offset by four
-        */
-       dummy = create_immed(b, 0);
-
-       if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {
-               src0 = ir3_get_src(ctx, &intr->src[4])[0];
-               struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
-               src1 = ir3_collect(ctx, dummy, compare, data);
-       } else {
-               src0 = ir3_get_src(ctx, &intr->src[3])[0];
-               src1 = ir3_collect(ctx, dummy, data);
-       }
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_ssbo_atomic_add_ir3:
-               atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_imin_ir3:
-               atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_ssbo_atomic_umin_ir3:
-               atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_imax_ir3:
-               atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_ssbo_atomic_umax_ir3:
-               atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_and_ir3:
-               atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_or_ir3:
-               atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_xor_ir3:
-               atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_exchange_ir3:
-               atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
-               atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = 1;
-       atomic->cat6.type = type;
-       atomic->barrier_class = IR3_BARRIER_BUFFER_W;
-       atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-       ir3_handle_bindless_cat6(atomic, intr->src[0]);
-
-       /* even if nothing consume the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
-       ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
-       struct ir3_instruction *split;
-       ir3_split_dest(b, &split, atomic, 0, 1);
-       return split;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
+   type_t type = TYPE_U32;
+
+   ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
+
+   data = ir3_get_src(ctx, &intr->src[2])[0];
+
+   /* So this gets a bit creative:
+    *
+    *    src0    - vecN offset/coords
+    *    src1.x  - is actually destination register
+    *    src1.y  - is 'data' except for cmpxchg where src2.y is 'compare'
+    *    src1.z  - is 'data' for cmpxchg
+    *
+    * The combining src and dest kinda doesn't work out so well with how
+    * scheduling and RA work. So we create a dummy src2 which is tied to the
+    * destination in RA (i.e. must be allocated to the same vec2/vec3
+    * register) and then immediately extract the first component.
+    *
+    * Note that nir already multiplies the offset by four
+    */
+   dummy = create_immed(b, 0);
+
+   if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {
+      src0 = ir3_get_src(ctx, &intr->src[4])[0];
+      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
+      src1 = ir3_collect(ctx, dummy, compare, data);
+   } else {
+      src0 = ir3_get_src(ctx, &intr->src[3])[0];
+      src1 = ir3_collect(ctx, dummy, data);
+   }
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_ssbo_atomic_add_ir3:
+      atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_imin_ir3:
+      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
+      type = TYPE_S32;
+      break;
+   case nir_intrinsic_ssbo_atomic_umin_ir3:
+      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_imax_ir3:
+      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
+      type = TYPE_S32;
+      break;
+   case nir_intrinsic_ssbo_atomic_umax_ir3:
+      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_and_ir3:
+      atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_or_ir3:
+      atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_xor_ir3:
+      atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_exchange_ir3:
+      atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
+      atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   default:
+      unreachable("boo");
+   }
+
+   atomic->cat6.iim_val = 1;
+   atomic->cat6.d = 1;
+   atomic->cat6.type = type;
+   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
+   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+   ir3_handle_bindless_cat6(atomic, intr->src[0]);
+
+   /* even if nothing consumes the result, we can't DCE the instruction: */
+   array_insert(b, b->keeps, atomic);
+
+   atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
+   ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
+   struct ir3_instruction *split;
+   ir3_split_dest(b, &split, atomic, 0, 1);
+   return split;
 }
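
The tail of this helper is where the dummy-src trick from the comment pays off: the destination write-mask is widened to match the collected vector, the destination is tied to that source so RA co-locates them, and only component 0 (the value the atomic actually returns) is split back out. A small CPU-side model of the resulting source layout (purely illustrative, not driver code):

#include <stdbool.h>
#include <stdint.h>

/* Component 0 is the dummy that ends up holding the old value; the rest is
 * data (and compare, for cmpxchg). */
struct atomic_src {
   uint32_t comps[3];
   unsigned ncomp;
};

static struct atomic_src
make_atomic_src(uint32_t data, bool cmpxchg, uint32_t compare)
{
   struct atomic_src v = { .comps = { 0 }, .ncomp = 2 };
   if (cmpxchg) {
      v.comps[1] = compare;
      v.comps[2] = data;
      v.ncomp = 3;
   } else {
      v.comps[1] = data;
   }
   return v;
}
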
 
 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
 static void
 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                          struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *ldib;
-       struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
-       unsigned ncoords = ir3_get_image_coords(intr, NULL);
-
-       ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
-                                       ir3_create_collect(ctx, coords, ncoords), 0);
-       ldib->dsts[0]->wrmask = MASK(intr->num_components);
-       ldib->cat6.iim_val = intr->num_components;
-       ldib->cat6.d = ncoords;
-       ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
-       ldib->cat6.typed = true;
-       ldib->barrier_class = IR3_BARRIER_IMAGE_R;
-       ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
-       ir3_handle_bindless_cat6(ldib, intr->src[0]);
-       ir3_handle_nonuniform(ldib, intr);
-
-       ir3_split_dest(b, dst, ldib, 0, intr->num_components);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *ldib;
+   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
+   unsigned ncoords = ir3_get_image_coords(intr, NULL);
+
+   ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
+                   ir3_create_collect(ctx, coords, ncoords), 0);
+   ldib->dsts[0]->wrmask = MASK(intr->num_components);
+   ldib->cat6.iim_val = intr->num_components;
+   ldib->cat6.d = ncoords;
+   ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
+   ldib->cat6.typed = true;
+   ldib->barrier_class = IR3_BARRIER_IMAGE_R;
+   ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
+   ir3_handle_bindless_cat6(ldib, intr->src[0]);
+   ir3_handle_nonuniform(ldib, intr);
+
+   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
 }
 
 /* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
 static void
 emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stib;
-       struct ir3_instruction * const *value = ir3_get_src(ctx, &intr->src[3]);
-       struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
-       unsigned ncoords = ir3_get_image_coords(intr, NULL);
-       enum pipe_format format = nir_intrinsic_format(intr);
-       unsigned ncomp = ir3_get_num_components_for_image_format(format);
-
-       /* src0 is offset, src1 is value:
-        */
-       stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
-                       ir3_create_collect(ctx, coords, ncoords), 0,
-                       ir3_create_collect(ctx, value, ncomp), 0);
-       stib->cat6.iim_val = ncomp;
-       stib->cat6.d = ncoords;
-       stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
-       stib->cat6.typed = true;
-       stib->barrier_class = IR3_BARRIER_IMAGE_W;
-       stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-       ir3_handle_bindless_cat6(stib, intr->src[0]);
-       ir3_handle_nonuniform(stib, intr);
-
-       array_insert(b, b->keeps, stib);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *stib;
+   struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
+   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
+   unsigned ncoords = ir3_get_image_coords(intr, NULL);
+   enum pipe_format format = nir_intrinsic_format(intr);
+   unsigned ncomp = ir3_get_num_components_for_image_format(format);
+
+   /* src0 is offset, src1 is value:
+    */
+   stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
+                   ir3_create_collect(ctx, coords, ncoords), 0,
+                   ir3_create_collect(ctx, value, ncomp), 0);
+   stib->cat6.iim_val = ncomp;
+   stib->cat6.d = ncoords;
+   stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
+   stib->cat6.typed = true;
+   stib->barrier_class = IR3_BARRIER_IMAGE_W;
+   stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+   ir3_handle_bindless_cat6(stib, intr->src[0]);
+   ir3_handle_nonuniform(stib, intr);
+
+   array_insert(b, b->keeps, stib);
 }
 
 /* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
 static struct ir3_instruction *
 emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
-       struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
-       struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
-       unsigned ncoords = ir3_get_image_coords(intr, NULL);
-
-       ibo = ir3_image_to_ibo(ctx, intr->src[0]);
-
-       /* So this gets a bit creative:
-        *
-        *    src0    - vecN offset/coords
-        *    src1.x  - is actually destination register
-        *    src1.y  - is 'value' except for cmpxchg where src2.y is 'compare'
-        *    src1.z  - is 'value' for cmpxchg
-        *
-        * The combining src and dest kinda doesn't work out so well with how
-        * scheduling and RA work. So we create a dummy src2 which is tied to the
-        * destination in RA (i.e. must be allocated to the same vec2/vec3
-        * register) and then immediately extract the first component.
-        */
-       dummy = create_immed(b, 0);
-       src0 = ir3_create_collect(ctx, coords, ncoords);
-
-       if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||
-               intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {
-               struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
-               src1 = ir3_collect(ctx, dummy, compare, value);
-       } else {
-               src1 = ir3_collect(ctx, dummy, value);
-       }
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_image_atomic_add:
-       case nir_intrinsic_bindless_image_atomic_add:
-               atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_image_atomic_imin:
-       case nir_intrinsic_image_atomic_umin:
-       case nir_intrinsic_bindless_image_atomic_imin:
-       case nir_intrinsic_bindless_image_atomic_umin:
-               atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_image_atomic_imax:
-       case nir_intrinsic_image_atomic_umax:
-       case nir_intrinsic_bindless_image_atomic_imax:
-       case nir_intrinsic_bindless_image_atomic_umax:
-               atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_image_atomic_and:
-       case nir_intrinsic_bindless_image_atomic_and:
-               atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_image_atomic_or:
-       case nir_intrinsic_bindless_image_atomic_or:
-               atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_image_atomic_xor:
-       case nir_intrinsic_bindless_image_atomic_xor:
-               atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_image_atomic_exchange:
-       case nir_intrinsic_bindless_image_atomic_exchange:
-               atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_image_atomic_comp_swap:
-       case nir_intrinsic_bindless_image_atomic_comp_swap:
-               atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = ncoords;
-       atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
-       atomic->cat6.typed = true;
-       atomic->barrier_class = IR3_BARRIER_IMAGE_W;
-       atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-       ir3_handle_bindless_cat6(atomic, intr->src[0]);
-
-       /* even if nothing consume the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
-       ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
-       struct ir3_instruction *split;
-       ir3_split_dest(b, &split, atomic, 0, 1);
-       return split;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
+   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
+   struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
+   unsigned ncoords = ir3_get_image_coords(intr, NULL);
+
+   ibo = ir3_image_to_ibo(ctx, intr->src[0]);
+
+   /* So this gets a bit creative:
+    *
+    *    src0    - vecN offset/coords
+    *    src1.x  - is actually destination register
+    *    src1.y  - is 'value' except for cmpxchg where src2.y is 'compare'
+    *    src1.z  - is 'value' for cmpxchg
+    *
+    * The combining src and dest kinda doesn't work out so well with how
+    * scheduling and RA work. So we create a dummy src2 which is tied to the
+    * destination in RA (i.e. must be allocated to the same vec2/vec3
+    * register) and then immediately extract the first component.
+    */
+   dummy = create_immed(b, 0);
+   src0 = ir3_create_collect(ctx, coords, ncoords);
+
+   if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||
+       intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {
+      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
+      src1 = ir3_collect(ctx, dummy, compare, value);
+   } else {
+      src1 = ir3_collect(ctx, dummy, value);
+   }
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_bindless_image_atomic_add:
+      atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_image_atomic_imin:
+   case nir_intrinsic_image_atomic_umin:
+   case nir_intrinsic_bindless_image_atomic_imin:
+   case nir_intrinsic_bindless_image_atomic_umin:
+      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_image_atomic_imax:
+   case nir_intrinsic_image_atomic_umax:
+   case nir_intrinsic_bindless_image_atomic_imax:
+   case nir_intrinsic_bindless_image_atomic_umax:
+      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_bindless_image_atomic_and:
+      atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_bindless_image_atomic_or:
+      atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_bindless_image_atomic_xor:
+      atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_bindless_image_atomic_exchange:
+      atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_image_atomic_comp_swap:
+   case nir_intrinsic_bindless_image_atomic_comp_swap:
+      atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
+      break;
+   default:
+      unreachable("boo");
+   }
+
+   atomic->cat6.iim_val = 1;
+   atomic->cat6.d = ncoords;
+   atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
+   atomic->cat6.typed = true;
+   atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+   atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+   ir3_handle_bindless_cat6(atomic, intr->src[0]);
+
+   /* even if nothing consumes the result, we can't DCE the instruction: */
+   array_insert(b, b->keeps, atomic);
+
+   atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
+   ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
+   struct ir3_instruction *split;
+   ir3_split_dest(b, &split, atomic, 0, 1);
+   return split;
 }
 
 static void
 emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                          struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
-       struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
-       resinfo->cat6.iim_val = 1;
-       resinfo->cat6.d = intr->num_components;
-       resinfo->cat6.type = TYPE_U32;
-       resinfo->cat6.typed = false;
-       /* resinfo has no writemask and always writes out 3 components: */
-       compile_assert(ctx, intr->num_components <= 3);
-       resinfo->dsts[0]->wrmask = MASK(3);
-       ir3_handle_bindless_cat6(resinfo, intr->src[0]);
-
-       ir3_split_dest(b, dst, resinfo, 0, intr->num_components);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
+   struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
+   resinfo->cat6.iim_val = 1;
+   resinfo->cat6.d = intr->num_components;
+   resinfo->cat6.type = TYPE_U32;
+   resinfo->cat6.typed = false;
+   /* resinfo has no writemask and always writes out 3 components: */
+   compile_assert(ctx, intr->num_components <= 3);
+   resinfo->dsts[0]->wrmask = MASK(3);
+   ir3_handle_bindless_cat6(resinfo, intr->src[0]);
+
+   ir3_split_dest(b, dst, resinfo, 0, intr->num_components);
 }
 
 static void
-emit_intrinsic_load_global_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+emit_intrinsic_load_global_ir3(struct ir3_context *ctx,
+                               nir_intrinsic_instr *intr,
+                               struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       unsigned dest_components = nir_intrinsic_dest_components(intr);
-       struct ir3_instruction *addr, *offset;
+   struct ir3_block *b = ctx->block;
+   unsigned dest_components = nir_intrinsic_dest_components(intr);
+   struct ir3_instruction *addr, *offset;
 
-       addr = ir3_collect(ctx,
-                       ir3_get_src(ctx, &intr->src[0])[0],
-                       ir3_get_src(ctx, &intr->src[0])[1]);
+   addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[0])[0],
+                      ir3_get_src(ctx, &intr->src[0])[1]);
 
-       offset = ir3_get_src(ctx, &intr->src[1])[0];
+   offset = ir3_get_src(ctx, &intr->src[1])[0];
 
-       struct ir3_instruction *load =
-               ir3_LDG_A(b, addr, 0, offset, 0,
-                               create_immed(b, 0), 0,
-                               create_immed(b, 0), 0,
-                               create_immed(b, dest_components), 0);
-       load->cat6.type = TYPE_U32;
-       load->dsts[0]->wrmask = MASK(dest_components);
+   struct ir3_instruction *load =
+      ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
+                create_immed(b, 0), 0, create_immed(b, dest_components), 0);
+   load->cat6.type = TYPE_U32;
+   load->dsts[0]->wrmask = MASK(dest_components);
 
-       load->barrier_class = IR3_BARRIER_BUFFER_R;
-       load->barrier_conflict = IR3_BARRIER_BUFFER_W;
+   load->barrier_class = IR3_BARRIER_BUFFER_R;
+   load->barrier_conflict = IR3_BARRIER_BUFFER_W;
 
-       ir3_split_dest(b, dst, load, 0, dest_components);
+   ir3_split_dest(b, dst, load, 0, dest_components);
 }
 
 static void
-emit_intrinsic_store_global_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_global_ir3(struct ir3_context *ctx,
+                                nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *value, *addr, *offset;
-       unsigned ncomp = nir_intrinsic_src_components(intr, 0);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *value, *addr, *offset;
+   unsigned ncomp = nir_intrinsic_src_components(intr, 0);
 
-       addr = ir3_collect(ctx,
-                       ir3_get_src(ctx, &intr->src[1])[0],
-                       ir3_get_src(ctx, &intr->src[1])[1]);
+   addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[1])[0],
+                      ir3_get_src(ctx, &intr->src[1])[1]);
 
-       offset = ir3_get_src(ctx, &intr->src[2])[0];
+   offset = ir3_get_src(ctx, &intr->src[2])[0];
 
-       value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
+   value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
 
-       struct ir3_instruction *stg =
-               ir3_STG_A(b,
-                                       addr, 0,
-                                       offset, 0,
-                                       create_immed(b, 0), 0,
-                                       create_immed(b, 0), 0,
-                                       value, 0,
-                                       create_immed(b, ncomp), 0);
-       stg->cat6.type = TYPE_U32;
-       stg->cat6.iim_val = 1;
+   struct ir3_instruction *stg =
+      ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
+                create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0);
+   stg->cat6.type = TYPE_U32;
+   stg->cat6.iim_val = 1;
 
-       array_insert(b, b->keeps, stg);
+   array_insert(b, b->keeps, stg);
 
-       stg->barrier_class = IR3_BARRIER_BUFFER_W;
-       stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+   stg->barrier_class = IR3_BARRIER_BUFFER_W;
+   stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
 }
 
 const struct ir3_context_funcs ir3_a6xx_funcs = {
-               .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
-               .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
-               .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
-               .emit_intrinsic_load_image = emit_intrinsic_load_image,
-               .emit_intrinsic_store_image = emit_intrinsic_store_image,
-               .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
-               .emit_intrinsic_image_size = emit_intrinsic_image_size,
-               .emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
-               .emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
+   .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
+   .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
+   .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
+   .emit_intrinsic_load_image = emit_intrinsic_load_image,
+   .emit_intrinsic_store_image = emit_intrinsic_store_image,
+   .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
+   .emit_intrinsic_image_size = emit_intrinsic_image_size,
+   .emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
+   .emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
 };
-
index 6fdf0ee..7f7be38 100644 (file)
  * so that we don't have to rewrite (and keep track of) users.
  */
 
-#include "ir3.h"
 #include <stdlib.h>
+#include "ir3.h"
 
 struct array_state {
-       struct ir3_register *live_in_definition;
-       struct ir3_register *live_out_definition;
-       bool constructed;
-       bool optimized;
+   struct ir3_register *live_in_definition;
+   struct ir3_register *live_out_definition;
+   bool constructed;
+   bool optimized;
 };
 
 struct array_ctx {
-       struct array_state *states;
-       struct ir3 *ir;
-       unsigned array_count;
+   struct array_state *states;
+   struct ir3 *ir;
+   unsigned array_count;
 };
 
 static struct array_state *
 get_state(struct array_ctx *ctx, struct ir3_block *block, unsigned id)
 {
-       return &ctx->states[ctx->array_count * block->index + id];
+   return &ctx->states[ctx->array_count * block->index + id];
 }
 
-static struct ir3_register *
-read_value_beginning(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr);
+static struct ir3_register *read_value_beginning(struct array_ctx *ctx,
+                                                 struct ir3_block *block,
+                                                 struct ir3_array *arr);
 
 static struct ir3_register *
-read_value_end(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr)
+read_value_end(struct array_ctx *ctx, struct ir3_block *block,
+               struct ir3_array *arr)
 {
-       struct array_state *state = get_state(ctx, block, arr->id);
-       if (state->live_out_definition)
-               return state->live_out_definition;
+   struct array_state *state = get_state(ctx, block, arr->id);
+   if (state->live_out_definition)
+      return state->live_out_definition;
 
-       state->live_out_definition = read_value_beginning(ctx, block, arr);
-       return state->live_out_definition;
+   state->live_out_definition = read_value_beginning(ctx, block, arr);
+   return state->live_out_definition;
 }
 
 /* Roughly equivalent to readValueRecursive from the paper: */
 static struct ir3_register *
-read_value_beginning(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr)
+read_value_beginning(struct array_ctx *ctx, struct ir3_block *block,
+                     struct ir3_array *arr)
 {
-       struct array_state *state = get_state(ctx, block, arr->id);
-
-       if (state->constructed)
-               return state->live_in_definition;
-
-       if (block->predecessors_count == 0) {
-               state->constructed = true;
-               return NULL;
-       }
-
-       if (block->predecessors_count == 1) {
-               state->live_in_definition = read_value_end(ctx, block->predecessors[0], arr);
-               state->constructed = true;
-               return state->live_in_definition;
-       }
-
-       unsigned flags = IR3_REG_ARRAY | (arr->half ? IR3_REG_HALF : 0);
-       struct ir3_instruction *phi =
-               ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
-       list_del(&phi->node);
-       list_add(&phi->node, &block->instr_list);
-
-       struct ir3_register *dst = __ssa_dst(phi);
-       dst->flags |= flags;
-       dst->array.id = arr->id;
-       dst->size = arr->length;
-
-       state->live_in_definition = phi->dsts[0];
-       state->constructed = true;
-
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               struct ir3_register *src = read_value_end(ctx, block->predecessors[i], arr);
-               struct ir3_register *src_reg;
-               if (src) {
-                       src_reg = __ssa_src(phi, src->instr, flags);
-               } else {
-                       src_reg = ir3_src_create(phi, INVALID_REG, flags | IR3_REG_SSA);
-               }
-               src_reg->array.id = arr->id;
-               src_reg->size = arr->length;
-       }
-       return phi->dsts[0];
+   struct array_state *state = get_state(ctx, block, arr->id);
+
+   if (state->constructed)
+      return state->live_in_definition;
+
+   if (block->predecessors_count == 0) {
+      state->constructed = true;
+      return NULL;
+   }
+
+   if (block->predecessors_count == 1) {
+      state->live_in_definition =
+         read_value_end(ctx, block->predecessors[0], arr);
+      state->constructed = true;
+      return state->live_in_definition;
+   }
+
+   unsigned flags = IR3_REG_ARRAY | (arr->half ? IR3_REG_HALF : 0);
+   struct ir3_instruction *phi =
+      ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
+   list_del(&phi->node);
+   list_add(&phi->node, &block->instr_list);
+
+   struct ir3_register *dst = __ssa_dst(phi);
+   dst->flags |= flags;
+   dst->array.id = arr->id;
+   dst->size = arr->length;
+
+   state->live_in_definition = phi->dsts[0];
+   state->constructed = true;
+
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_register *src =
+         read_value_end(ctx, block->predecessors[i], arr);
+      struct ir3_register *src_reg;
+      if (src) {
+         src_reg = __ssa_src(phi, src->instr, flags);
+      } else {
+         src_reg = ir3_src_create(phi, INVALID_REG, flags | IR3_REG_SSA);
+      }
+      src_reg->array.id = arr->id;
+      src_reg->size = arr->length;
+   }
+   return phi->dsts[0];
 }
 
 static struct ir3_register *
 remove_trivial_phi(struct ir3_instruction *phi)
 {
-       /* Break cycles */
-       if (phi->data)
-               return phi->data;
-       
-       phi->data = phi->dsts[0];
-
-       struct ir3_register *unique_def = NULL;
-       bool unique = true;
-       for (unsigned i = 0; i < phi->block->predecessors_count; i++) {
-               struct ir3_register *src = phi->srcs[i];
-
-               /* If there are any undef sources, then the remaining sources may not
-                * dominate the phi node, even if they are all equal. So we need to
-                * bail out in this case.
-                *
-                * This seems to be a bug in the original paper.
-                */
-               if (!src->def) {
-                       unique = false;
-                       break;
-               }
-
-               struct ir3_instruction *src_instr = src->def->instr;
-               
-               /* phi sources which point to the phi itself don't count for
-                * figuring out if the phi is trivial
-                */
-               if (src_instr == phi)
-                       continue;
-
-               if (src_instr->opc == OPC_META_PHI) {
-                       src->def = remove_trivial_phi(src->def->instr);
-               }
-
-               if (unique_def) {
-                       if (unique_def != src->def) {
-                               unique = false;
-                               break;
-                       }
-               } else {
-                       unique_def = src->def;
-               }
-       }
-
-       if (unique) {
-               phi->data = unique_def;
-               return unique_def;
-       } else {
-               return phi->dsts[0];
-       }
+   /* Break cycles */
+   if (phi->data)
+      return phi->data;
+
+   phi->data = phi->dsts[0];
+
+   struct ir3_register *unique_def = NULL;
+   bool unique = true;
+   for (unsigned i = 0; i < phi->block->predecessors_count; i++) {
+      struct ir3_register *src = phi->srcs[i];
+
+      /* If there are any undef sources, then the remaining sources may not
+       * dominate the phi node, even if they are all equal. So we need to
+       * bail out in this case.
+       *
+       * This seems to be a bug in the original paper.
+       */
+      if (!src->def) {
+         unique = false;
+         break;
+      }
+
+      struct ir3_instruction *src_instr = src->def->instr;
+
+      /* phi sources which point to the phi itself don't count for
+       * figuring out if the phi is trivial
+       */
+      if (src_instr == phi)
+         continue;
+
+      if (src_instr->opc == OPC_META_PHI) {
+         src->def = remove_trivial_phi(src->def->instr);
+      }
+
+      if (unique_def) {
+         if (unique_def != src->def) {
+            unique = false;
+            break;
+         }
+      } else {
+         unique_def = src->def;
+      }
+   }
+
+   if (unique) {
+      phi->data = unique_def;
+      return unique_def;
+   } else {
+      return phi->dsts[0];
+   }
 }
 
 static struct ir3_register *
 lookup_value(struct ir3_register *reg)
 {
-       if (reg->instr->opc == OPC_META_PHI)
-               return reg->instr->data;
-       return reg;
+   if (reg->instr->opc == OPC_META_PHI)
+      return reg->instr->data;
+   return reg;
 }
 
 static struct ir3_register *
 lookup_live_in(struct array_ctx *ctx, struct ir3_block *block, unsigned id)
 {
-       struct array_state *state = get_state(ctx, block, id);
-       if (state->live_in_definition)
-               return lookup_value(state->live_in_definition);
+   struct array_state *state = get_state(ctx, block, id);
+   if (state->live_in_definition)
+      return lookup_value(state->live_in_definition);
 
-       return NULL;
+   return NULL;
 }
 
 bool
 ir3_array_to_ssa(struct ir3 *ir)
 {
-       struct array_ctx ctx = {};
-
-       foreach_array (array, &ir->array_list) {
-               ctx.array_count = MAX2(ctx.array_count, array->id + 1);
-       }
-
-       if (ctx.array_count == 0)
-               return false;
-
-       unsigned i = 0;
-       foreach_block (block, &ir->block_list) {
-               block->index = i++;
-       }
-
-       ctx.ir = ir;
-       ctx.states = calloc(ctx.array_count * i, sizeof(struct array_state));
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       foreach_dst (dst, instr) {
-                               if (dst->flags & IR3_REG_ARRAY) {
-                                       struct array_state *state =
-                                               get_state(&ctx, block, dst->array.id);
-                                       state->live_out_definition = dst;
-                               }
-                       }
-               }
-       }
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (instr->opc == OPC_META_PHI)
-                               continue;
-
-                       foreach_dst (reg, instr) {
-                               if ((reg->flags & IR3_REG_ARRAY) && !reg->tied) {
-                                       struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
-
-                                       /* Construct any phi nodes necessary to read this value */
-                                       read_value_beginning(&ctx, block, arr);
-                               }
-                       }
-                       foreach_src (reg, instr) {
-                               if ((reg->flags & IR3_REG_ARRAY) && !reg->def) {
-                                       struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
-
-                                       /* Construct any phi nodes necessary to read this value */
-                                       read_value_beginning(&ctx, block, arr);
-                               }
-                       }
-               }
-       }
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr_safe (instr, &block->instr_list) {
-                       if (instr->opc == OPC_META_PHI)
-                               remove_trivial_phi(instr);
-                       else
-                               break;
-               }
-       }
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr_safe (instr, &block->instr_list) {
-                       if (instr->opc == OPC_META_PHI) {
-                               if (!(instr->flags & IR3_REG_ARRAY))
-                                       continue;
-                               if (instr->data != instr->dsts[0]) {
-                                       list_del(&instr->node);
-                                       continue;
-                               }
-                               for (unsigned i = 0; i < instr->srcs_count; i++) {
-                                       instr->srcs[i] = lookup_value(instr->srcs[i]);
-                               }
-                       } else {
-                               foreach_dst (reg, instr) {
-                                       if ((reg->flags & IR3_REG_ARRAY)) {
-                                               if (!reg->tied) {
-                                                       struct ir3_register *def =
-                                                               lookup_live_in(&ctx, block, reg->array.id);
-                                                       if (def)
-                                                               ir3_reg_set_last_array(instr, reg, def);
-                                               }
-                                               reg->flags |= IR3_REG_SSA;
-                                       }
-                               }
-                               foreach_src (reg, instr) {
-                                       if ((reg->flags & IR3_REG_ARRAY)) {
-                                               /* It is assumed that before calling
-                                                * ir3_array_to_ssa(), reg->def was set to the
-                                                * previous writer of the array within the current
-                                                * block or NULL if none.
-                                                */
-                                               if (!reg->def) {
-                                                       reg->def = lookup_live_in(&ctx, block, reg->array.id);
-                                               }
-                                               reg->flags |= IR3_REG_SSA;
-                                       }
-                               }
-                       }
-               }
-       }
-
-       free(ctx.states);
-       return true;
+   struct array_ctx ctx = {};
+
+   foreach_array (array, &ir->array_list) {
+      ctx.array_count = MAX2(ctx.array_count, array->id + 1);
+   }
+
+   if (ctx.array_count == 0)
+      return false;
+
+   unsigned i = 0;
+   foreach_block (block, &ir->block_list) {
+      block->index = i++;
+   }
+
+   ctx.ir = ir;
+   ctx.states = calloc(ctx.array_count * i, sizeof(struct array_state));
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         foreach_dst (dst, instr) {
+            if (dst->flags & IR3_REG_ARRAY) {
+               struct array_state *state =
+                  get_state(&ctx, block, dst->array.id);
+               state->live_out_definition = dst;
+            }
+         }
+      }
+   }
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc == OPC_META_PHI)
+            continue;
+
+         foreach_dst (reg, instr) {
+            if ((reg->flags & IR3_REG_ARRAY) && !reg->tied) {
+               struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
+
+               /* Construct any phi nodes necessary to read this value */
+               read_value_beginning(&ctx, block, arr);
+            }
+         }
+         foreach_src (reg, instr) {
+            if ((reg->flags & IR3_REG_ARRAY) && !reg->def) {
+               struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
+
+               /* Construct any phi nodes necessary to read this value */
+               read_value_beginning(&ctx, block, arr);
+            }
+         }
+      }
+   }
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->opc == OPC_META_PHI)
+            remove_trivial_phi(instr);
+         else
+            break;
+      }
+   }
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->opc == OPC_META_PHI) {
+            if (!(instr->flags & IR3_REG_ARRAY))
+               continue;
+            if (instr->data != instr->dsts[0]) {
+               list_del(&instr->node);
+               continue;
+            }
+            for (unsigned i = 0; i < instr->srcs_count; i++) {
+               instr->srcs[i] = lookup_value(instr->srcs[i]);
+            }
+         } else {
+            foreach_dst (reg, instr) {
+               if ((reg->flags & IR3_REG_ARRAY)) {
+                  if (!reg->tied) {
+                     struct ir3_register *def =
+                        lookup_live_in(&ctx, block, reg->array.id);
+                     if (def)
+                        ir3_reg_set_last_array(instr, reg, def);
+                  }
+                  reg->flags |= IR3_REG_SSA;
+               }
+            }
+            foreach_src (reg, instr) {
+               if ((reg->flags & IR3_REG_ARRAY)) {
+                  /* It is assumed that before calling
+                   * ir3_array_to_ssa(), reg->def was set to the
+                   * previous writer of the array within the current
+                   * block or NULL if none.
+                   */
+                  if (!reg->def) {
+                     reg->def = lookup_live_in(&ctx, block, reg->array.id);
+                  }
+                  reg->flags |= IR3_REG_SSA;
+               }
+            }
+         }
+      }
+   }
+
+   free(ctx.states);
+   return true;
 }
-
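
The pass above builds phis for array accesses on demand and then deletes the "trivial" ones, following the SSA-construction scheme the readValueRecursive comment refers to. As a minimal sketch of the triviality rule only (hypothetical stand-in types, not ir3's ir3_instruction/ir3_register, and without the recursion through nested phis or the cycle-breaking cache the real pass needs):

#include <stddef.h>

/* Hypothetical definition node; a phi has nsrcs > 0 incoming values. */
struct toy_node {
   unsigned nsrcs;
   struct toy_node *srcs[4]; /* incoming values; NULL stands for undef */
};

/* Return the unique non-self definition if 'phi' is trivial, else NULL. */
static struct toy_node *
trivial_phi_value(struct toy_node *phi)
{
   struct toy_node *unique = NULL;

   for (unsigned i = 0; i < phi->nsrcs; i++) {
      struct toy_node *src = phi->srcs[i];

      /* An undef source means the remaining sources may not dominate the
       * phi even if they are all equal, so bail (the same bail-out as in
       * remove_trivial_phi() above).
       */
      if (!src)
         return NULL;

      /* Sources pointing back at the phi itself don't count. */
      if (src == phi)
         continue;

      if (unique && unique != src)
         return NULL; /* two distinct reaching definitions: keep the phi */

      unique = src;
   }

   return unique;
}

When this returns non-NULL, every read of the phi can simply be redirected to that definition, which is what lookup_value() does via instr->data in the pass above.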
index e71d4a3..dd46f88 100644 (file)
@@ -22,8 +22,8 @@
  */
 
 #include "ir3_assembler.h"
-#include "ir3_shader.h"
 #include "ir3_parser.h"
+#include "ir3_shader.h"
 
 /**
  * A helper to go from ir3 assembly to assembled shader.  The shader has a
 struct ir3_shader *
 ir3_parse_asm(struct ir3_compiler *c, struct ir3_kernel_info *info, FILE *in)
 {
-       struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
-       shader->compiler = c;
-       shader->type = MESA_SHADER_COMPUTE;
-       mtx_init(&shader->variants_lock, mtx_plain);
+   struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
+   shader->compiler = c;
+   shader->type = MESA_SHADER_COMPUTE;
+   mtx_init(&shader->variants_lock, mtx_plain);
 
-       struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v));
-       v->type = MESA_SHADER_COMPUTE;
-       v->shader = shader;
-       v->const_state = rzalloc_size(v, sizeof(*v->const_state));
+   struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v));
+   v->type = MESA_SHADER_COMPUTE;
+   v->shader = shader;
+   v->const_state = rzalloc_size(v, sizeof(*v->const_state));
 
-       shader->variants = v;
-       shader->variant_count = 1;
+   shader->variants = v;
+   shader->variant_count = 1;
 
-       info->numwg = INVALID_REG;
+   info->numwg = INVALID_REG;
 
-       for (int i = 0; i < MAX_BUFS; i++) {
-               info->buf_addr_regs[i] = INVALID_REG;
-       }
+   for (int i = 0; i < MAX_BUFS; i++) {
+      info->buf_addr_regs[i] = INVALID_REG;
+   }
 
-       /* Provide a default local_size in case the shader doesn't set it, so that
-        * we don't crash at least.
-        */
-       v->local_size[0] = v->local_size[1] = v->local_size[2] = 1;
+   /* Provide a default local_size in case the shader doesn't set it, so that
+    * we don't crash at least.
+    */
+   v->local_size[0] = v->local_size[1] = v->local_size[2] = 1;
 
-       v->ir = ir3_parse(v, info, in);
-       if (!v->ir)
-               goto error;
+   v->ir = ir3_parse(v, info, in);
+   if (!v->ir)
+      goto error;
 
-       ir3_debug_print(v->ir, "AFTER PARSING");
+   ir3_debug_print(v->ir, "AFTER PARSING");
 
-       v->bin = ir3_shader_assemble(v);
-       if (!v->bin)
-               goto error;
+   v->bin = ir3_shader_assemble(v);
+   if (!v->bin)
+      goto error;
 
-       return shader;
+   return shader;
 
 error:
-       ralloc_free(shader);
-       return NULL;
+   ralloc_free(shader);
+   return NULL;
 }
index 1bbe4f6..3547e74 100644 (file)
 #define MAX_BUFS 4
 
 struct ir3_kernel_info {
-       uint32_t num_bufs;
-       uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
-       uint32_t buf_addr_regs[MAX_BUFS]; 
+   uint32_t num_bufs;
+   uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
+   uint32_t buf_addr_regs[MAX_BUFS];
 
-       /* driver-param uniforms: */
-       unsigned numwg;
+   /* driver-param uniforms: */
+   unsigned numwg;
 };
 
 struct ir3_shader;
 struct ir3_compiler;
 
-struct ir3_shader * ir3_parse_asm(struct ir3_compiler *c, struct ir3_kernel_info *info, FILE *in);
+struct ir3_shader *ir3_parse_asm(struct ir3_compiler *c,
+                                 struct ir3_kernel_info *info, FILE *in);
 
 #endif /* __IR3_ASSEMBLER_H__ */
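
As a usage illustration of the declaration above (a hypothetical caller, not code from the tree): create a compiler with ir3_compiler_create(), hand ir3_parse_asm() an open FILE with the assembly text, and get back a ralloc'd ir3_shader whose single compute variant already has its binary assembled, or NULL on failure.

#include <stdio.h>

#include "ir3_assembler.h"
#include "ir3_compiler.h"

/* Hypothetical helper; 'compiler' comes from ir3_compiler_create() and must
 * stay alive for as long as the returned shader is used.
 */
static struct ir3_shader *
assemble_ir3_file(struct ir3_compiler *compiler, const char *path)
{
   struct ir3_kernel_info info = {0};

   FILE *in = fopen(path, "r");
   if (!in)
      return NULL;

   /* Parses and assembles; also fills 'info' (buffer sizes/registers, numwg). */
   struct ir3_shader *shader = ir3_parse_asm(compiler, &info, in);

   fclose(in);
   return shader; /* ralloc'd; free with ralloc_free() when done */
}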
index d0f9041..dc05738 100644 (file)
 #include "ir3.h"
 
 static bool
-is_safe_conv(struct ir3_instruction *instr, type_t src_type,
-                        opc_t *src_opc)
+is_safe_conv(struct ir3_instruction *instr, type_t src_type, opc_t *src_opc)
 {
-       if (instr->opc != OPC_MOV)
-               return false;
-
-       /* Only allow half->full or full->half without any type conversion (like
-        * int to float).
-        */
-       if (type_size(instr->cat1.src_type) == type_size(instr->cat1.dst_type) ||
-               full_type(instr->cat1.src_type) != full_type(instr->cat1.dst_type))
-               return false;
-
-       struct ir3_register *dst = instr->dsts[0];
-       struct ir3_register *src = instr->srcs[0];
-
-       /* disallow conversions that cannot be folded into
-        * alu instructions:
-        */
-       if (instr->cat1.round != ROUND_ZERO)
-               return false;
-
-       if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
-               return false;
-       if (src->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
-               return false;
-
-       /* Check that the source of the conv matches the type of the src
-        * instruction.
-        */
-       if (src_type == instr->cat1.src_type)
-               return true;
-
-       /* We can handle mismatches with integer types by converting the opcode
-        * but not when an integer is reinterpreted as a float or vice-versa.
-        */
-       if (type_float(src_type) != type_float(instr->cat1.src_type))
-               return false;
-
-       /* We have types with mismatched signedness. Mismatches on the signedness
-        * don't matter when narrowing:
-        */
-       if (type_size(instr->cat1.dst_type) < type_size(instr->cat1.src_type))
-               return true;
-
-       /* Try swapping the opcode: */
-       bool can_swap = true;
-       *src_opc = ir3_try_swap_signedness(*src_opc, &can_swap);
-       return can_swap;
+   if (instr->opc != OPC_MOV)
+      return false;
+
+   /* Only allow half->full or full->half without any type conversion (like
+    * int to float).
+    */
+   if (type_size(instr->cat1.src_type) == type_size(instr->cat1.dst_type) ||
+       full_type(instr->cat1.src_type) != full_type(instr->cat1.dst_type))
+      return false;
+
+   struct ir3_register *dst = instr->dsts[0];
+   struct ir3_register *src = instr->srcs[0];
+
+   /* disallow conversions that cannot be folded into
+    * alu instructions:
+    */
+   if (instr->cat1.round != ROUND_ZERO)
+      return false;
+
+   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
+      return false;
+   if (src->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
+      return false;
+
+   /* Check that the source of the conv matches the type of the src
+    * instruction.
+    */
+   if (src_type == instr->cat1.src_type)
+      return true;
+
+   /* We can handle mismatches with integer types by converting the opcode
+    * but not when an integer is reinterpreted as a float or vice-versa.
+    */
+   if (type_float(src_type) != type_float(instr->cat1.src_type))
+      return false;
+
+   /* We have types with mismatched signedness. Mismatches on the signedness
+    * don't matter when narrowing:
+    */
+   if (type_size(instr->cat1.dst_type) < type_size(instr->cat1.src_type))
+      return true;
+
+   /* Try swapping the opcode: */
+   bool can_swap = true;
+   *src_opc = ir3_try_swap_signedness(*src_opc, &can_swap);
+   return can_swap;
 }
 
 static bool
 all_uses_safe_conv(struct ir3_instruction *conv_src, type_t src_type)
 {
-       opc_t opc = conv_src->opc;
-       bool first = true;
-       foreach_ssa_use (use, conv_src) {
-               opc_t new_opc = opc;
-               if (!is_safe_conv(use, src_type, &new_opc))
-                       return false;
-               /* Check if multiple uses have conflicting requirements on the opcode.
-                */
-               if (!first && opc != new_opc)
-                       return false;
-               first = false;
-               opc = new_opc;
-       }
-       conv_src->opc = opc;
-       return true;
+   opc_t opc = conv_src->opc;
+   bool first = true;
+   foreach_ssa_use (use, conv_src) {
+      opc_t new_opc = opc;
+      if (!is_safe_conv(use, src_type, &new_opc))
+         return false;
+      /* Check if multiple uses have conflicting requirements on the opcode.
+       */
+      if (!first && opc != new_opc)
+         return false;
+      first = false;
+      opc = new_opc;
+   }
+   conv_src->opc = opc;
+   return true;
 }
 
 /* For an instruction which has a conversion folded in, re-write the
@@ -105,74 +104,74 @@ all_uses_safe_conv(struct ir3_instruction *conv_src, type_t src_type)
 static void
 rewrite_src_uses(struct ir3_instruction *src)
 {
-       foreach_ssa_use (use, src) {
-               assert(use->opc == OPC_MOV);
+   foreach_ssa_use (use, src) {
+      assert(use->opc == OPC_MOV);
 
-               if (is_half(src)) {
-                       use->srcs[0]->flags |= IR3_REG_HALF;
-               } else {
-                       use->srcs[0]->flags &= ~IR3_REG_HALF;
-               }
+      if (is_half(src)) {
+         use->srcs[0]->flags |= IR3_REG_HALF;
+      } else {
+         use->srcs[0]->flags &= ~IR3_REG_HALF;
+      }
 
-               use->cat1.src_type = use->cat1.dst_type;
-       }
+      use->cat1.src_type = use->cat1.dst_type;
+   }
 }
 
 static bool
 try_conversion_folding(struct ir3_instruction *conv)
 {
-       struct ir3_instruction *src;
+   struct ir3_instruction *src;
 
-       if (conv->opc != OPC_MOV)
-               return false;
+   if (conv->opc != OPC_MOV)
+      return false;
 
-       /* NOTE: we can have non-ssa srcs after copy propagation: */
-       src = ssa(conv->srcs[0]);
-       if (!src)
-               return false;
+   /* NOTE: we can have non-ssa srcs after copy propagation: */
+   src = ssa(conv->srcs[0]);
+   if (!src)
+      return false;
 
-       if (!is_alu(src))
-               return false;
+   if (!is_alu(src))
+      return false;
 
-       bool can_fold;
-       type_t base_type = ir3_output_conv_type(src, &can_fold);
-       if (!can_fold)
-               return false;
+   bool can_fold;
+   type_t base_type = ir3_output_conv_type(src, &can_fold);
+   if (!can_fold)
+      return false;
 
-       type_t src_type = ir3_output_conv_src_type(src, base_type);
-       type_t dst_type = ir3_output_conv_dst_type(src, base_type);
+   type_t src_type = ir3_output_conv_src_type(src, base_type);
+   type_t dst_type = ir3_output_conv_dst_type(src, base_type);
 
-       /* Avoid cases where we've already folded in a conversion. We assume that
-        * if there is a chain of conversions that's foldable then it's been
-        * folded in NIR already.
-        */
-       if (src_type != dst_type)
-               return false;
+   /* Avoid cases where we've already folded in a conversion. We assume that
+    * if there is a chain of conversions that's foldable then it's been
+    * folded in NIR already.
+    */
+   if (src_type != dst_type)
+      return false;
 
-       if (!all_uses_safe_conv(src, src_type))
-               return false;
+   if (!all_uses_safe_conv(src, src_type))
+      return false;
 
-       ir3_set_dst_type(src, is_half(conv));
-       rewrite_src_uses(src);
+   ir3_set_dst_type(src, is_half(conv));
+   rewrite_src_uses(src);
 
-       return true;
+   return true;
 }
 
 bool
 ir3_cf(struct ir3 *ir)
 {
-       void *mem_ctx = ralloc_context(NULL);
-       bool progress = false;
+   void *mem_ctx = ralloc_context(NULL);
+   bool progress = false;
 
-       ir3_find_ssa_uses(ir, mem_ctx, false);
+   ir3_find_ssa_uses(ir, mem_ctx, false);
 
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       progress |= try_conversion_folding(instr);
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         progress |= try_conversion_folding(instr);
+      }
+   }
 
-       ralloc_free(mem_ctx);
+   ralloc_free(mem_ctx);
 
-       return progress;
+   return progress;
 }
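
The first check in is_safe_conv() above amounts to: the mov may only change precision (half <-> full), never the base type. A toy restatement of just that part, using a hypothetical type enum instead of ir3's type_t; the signedness-mismatch handling, the ROUND_ZERO requirement, and the relative/array register checks that the real pass also performs are omitted:

#include <stdbool.h>

enum toy_type { F16, F32, U16, U32, S16, S32 };

static unsigned
bits(enum toy_type t)
{
   return (t == F16 || t == U16 || t == S16) ? 16 : 32;
}

/* 32-bit ("full") variant of a type, mirroring what full_type() does. */
static enum toy_type
full(enum toy_type t)
{
   switch (t) {
   case F16: return F32;
   case U16: return U32;
   case S16: return S32;
   default:  return t;
   }
}

/* A conversion is a pure precision change, and hence a candidate for being
 * folded into the instruction producing its source, when it changes the
 * size but keeps the base type.
 */
static bool
is_precision_only_conv(enum toy_type src, enum toy_type dst)
{
   return bits(src) != bits(dst) && full(src) == full(dst);
}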
index 879260a..c37eae7 100644 (file)
@@ -51,8 +51,10 @@ static const struct debug_named_value shader_debug_options[] = {
    /* clang-format on */
 };
 
-DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)
-DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH", NULL)
+DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
+                            shader_debug_options, 0)
+DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
+                      NULL)
 
 enum ir3_shader_debug ir3_shader_debug = 0;
 const char *ir3_shader_override_path = NULL;
@@ -60,126 +62,127 @@ const char *ir3_shader_override_path = NULL;
 void
 ir3_compiler_destroy(struct ir3_compiler *compiler)
 {
-       disk_cache_destroy(compiler->disk_cache);
-       ralloc_free(compiler);
+   disk_cache_destroy(compiler->disk_cache);
+   ralloc_free(compiler);
 }
 
 struct ir3_compiler *
-ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_access)
+ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
+                    bool robust_ubo_access)
 {
-       struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
-
-       ir3_shader_debug = debug_get_option_ir3_shader_debug();
-       ir3_shader_override_path =
-               !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
-
-       if (ir3_shader_override_path) {
-               ir3_shader_debug |= IR3_DBG_NOCACHE;
-       }
-
-       compiler->dev = dev;
-       compiler->gpu_id = gpu_id;
-       compiler->robust_ubo_access = robust_ubo_access;
-
-       /* All known GPU's have 32k local memory (aka shared) */
-       compiler->local_mem_size = 32 * 1024;
-       /* TODO see if older GPU's were different here */
-       compiler->branchstack_size = 64;
-       compiler->wave_granularity = 2;
-       compiler->max_waves = 16;
-
-       if (compiler->gpu_id >= 600) {
-               compiler->samgq_workaround = true;
-               /* a6xx split the pipeline state into geometry and fragment state, in
-                * order to let the VS run ahead of the FS. As a result there are now
-                * separate const files for the fragment shader and everything
-                * else, and separate limits. There seems to be a shared limit, but
-                * it's higher than the vert or frag limits.
-                *
-                * TODO: The shared limit seems to be different on
-                * different models.
-                */
-               compiler->max_const_pipeline = 640;
-               compiler->max_const_frag = 512;
-               compiler->max_const_geom = 512;
-               compiler->max_const_safe = 128;
-
-               /* Compute shaders don't share a const file with the FS. Instead they
-                * have their own file, which is smaller than the FS one.
-                *
-                * TODO: is this true on earlier gen's?
-                */
-               compiler->max_const_compute = 256;
-
-               /* TODO: implement clip+cull distances on earlier gen's */
-               compiler->has_clip_cull = true;
-
-               /* TODO: implement private memory on earlier gen's */
-               compiler->has_pvtmem = true;
-
-               if (compiler->gpu_id == 650)
-                       compiler->tess_use_shared = true;
-       } else {
-               compiler->max_const_pipeline = 512;
-               compiler->max_const_geom = 512;
-               compiler->max_const_frag = 512;
-               compiler->max_const_compute = 512;
-
-               /* Note: this will have to change if/when we support tess+GS on
-                * earlier gen's.
-                */
-               compiler->max_const_safe = 256;
-       }
-
-       if (compiler->gpu_id == 650) {
-               /* This changed mid-generation for a650, so that using r32.x and above
-                * requires using the smallest threadsize.
-                */
-               compiler->reg_size_vec4 = 64;
-       } else if (compiler->gpu_id >= 600) {
-               compiler->reg_size_vec4 = 96;
-       } else if (compiler->gpu_id >= 400) {
-               /* On a4xx-a5xx, using r24.x and above requires using the smallest
-                * threadsize.
-                */
-               compiler->reg_size_vec4 = 48;
-       } else {
-               /* TODO: confirm this */
-               compiler->reg_size_vec4 = 96;
-       }
-
-       if (compiler->gpu_id >= 600) {
-               compiler->threadsize_base = 64;
-       } else if (compiler->gpu_id >= 400) {
-               /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
-                * 1.1 subgroupSize which is 32.
-                */
-               compiler->threadsize_base = 32;
-       } else {
-               compiler->threadsize_base = 8;
-       }
-
-       if (compiler->gpu_id >= 400) {
-               /* need special handling for "flat" */
-               compiler->flat_bypass = true;
-               compiler->levels_add_one = false;
-               compiler->unminify_coords = false;
-               compiler->txf_ms_with_isaml = false;
-               compiler->array_index_add_half = true;
-               compiler->instr_align = 16;
-               compiler->const_upload_unit = 4;
-       } else {
-               /* no special handling for "flat" */
-               compiler->flat_bypass = false;
-               compiler->levels_add_one = true;
-               compiler->unminify_coords = true;
-               compiler->txf_ms_with_isaml = true;
-               compiler->array_index_add_half = false;
-               compiler->instr_align = 4;
-               compiler->const_upload_unit = 8;
-       }
-
-       ir3_disk_cache_init(compiler);
-
-       return compiler;
+   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
+
+   ir3_shader_debug = debug_get_option_ir3_shader_debug();
+   ir3_shader_override_path =
+      !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
+
+   if (ir3_shader_override_path) {
+      ir3_shader_debug |= IR3_DBG_NOCACHE;
+   }
+
+   compiler->dev = dev;
+   compiler->gpu_id = gpu_id;
+   compiler->robust_ubo_access = robust_ubo_access;
+
+   /* All known GPU's have 32k local memory (aka shared) */
+   compiler->local_mem_size = 32 * 1024;
+   /* TODO see if older GPU's were different here */
+   compiler->branchstack_size = 64;
+   compiler->wave_granularity = 2;
+   compiler->max_waves = 16;
+
+   if (compiler->gpu_id >= 600) {
+      compiler->samgq_workaround = true;
+      /* a6xx split the pipeline state into geometry and fragment state, in
+       * order to let the VS run ahead of the FS. As a result there are now
+       * separate const files for the fragment shader and everything
+       * else, and separate limits. There seems to be a shared limit, but
+       * it's higher than the vert or frag limits.
+       *
+       * TODO: The shared limit seems to be different on
+       * different models.
+       */
+      compiler->max_const_pipeline = 640;
+      compiler->max_const_frag = 512;
+      compiler->max_const_geom = 512;
+      compiler->max_const_safe = 128;
+
+      /* Compute shaders don't share a const file with the FS. Instead they
+       * have their own file, which is smaller than the FS one.
+       *
+       * TODO: is this true on earlier gen's?
+       */
+      compiler->max_const_compute = 256;
+
+      /* TODO: implement clip+cull distances on earlier gen's */
+      compiler->has_clip_cull = true;
+
+      /* TODO: implement private memory on earlier gen's */
+      compiler->has_pvtmem = true;
+
+      if (compiler->gpu_id == 650)
+         compiler->tess_use_shared = true;
+   } else {
+      compiler->max_const_pipeline = 512;
+      compiler->max_const_geom = 512;
+      compiler->max_const_frag = 512;
+      compiler->max_const_compute = 512;
+
+      /* Note: this will have to change if/when we support tess+GS on
+       * earlier gen's.
+       */
+      compiler->max_const_safe = 256;
+   }
+
+   if (compiler->gpu_id == 650) {
+      /* This changed mid-generation for a650, so that using r32.x and above
+       * requires using the smallest threadsize.
+       */
+      compiler->reg_size_vec4 = 64;
+   } else if (compiler->gpu_id >= 600) {
+      compiler->reg_size_vec4 = 96;
+   } else if (compiler->gpu_id >= 400) {
+      /* On a4xx-a5xx, using r24.x and above requires using the smallest
+       * threadsize.
+       */
+      compiler->reg_size_vec4 = 48;
+   } else {
+      /* TODO: confirm this */
+      compiler->reg_size_vec4 = 96;
+   }
+
+   if (compiler->gpu_id >= 600) {
+      compiler->threadsize_base = 64;
+   } else if (compiler->gpu_id >= 400) {
+      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
+       * 1.1 subgroupSize which is 32.
+       */
+      compiler->threadsize_base = 32;
+   } else {
+      compiler->threadsize_base = 8;
+   }
+
+   if (compiler->gpu_id >= 400) {
+      /* need special handling for "flat" */
+      compiler->flat_bypass = true;
+      compiler->levels_add_one = false;
+      compiler->unminify_coords = false;
+      compiler->txf_ms_with_isaml = false;
+      compiler->array_index_add_half = true;
+      compiler->instr_align = 16;
+      compiler->const_upload_unit = 4;
+   } else {
+      /* no special handling for "flat" */
+      compiler->flat_bypass = false;
+      compiler->levels_add_one = true;
+      compiler->unminify_coords = true;
+      compiler->txf_ms_with_isaml = true;
+      compiler->array_index_add_half = false;
+      compiler->instr_align = 4;
+      compiler->const_upload_unit = 8;
+   }
+
+   ir3_disk_cache_init(compiler);
+
+   return compiler;
 }
index 62dfc62..427a3b2 100644 (file)
@@ -36,167 +36,167 @@ struct ir3_ra_reg_set;
 struct ir3_shader;
 
 struct ir3_compiler {
-       struct fd_device *dev;
-       uint32_t gpu_id;
-       uint32_t shader_count;
-
-       struct disk_cache *disk_cache;
-
-       /* If true, UBO accesses are assumed to be bounds-checked as defined by
-        * VK_EXT_robustness2 and optimizations may have to be more conservative.
-        */
-       bool robust_ubo_access;
-
-       /*
-        * Configuration options for things that are handled differently on
-        * different generations:
-        */
-
-       /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
-        * so we need to use ldlv.u32 to load the varying directly:
-        */
-       bool flat_bypass;
-
-       /* on a3xx, we need to add one to # of array levels:
-        */
-       bool levels_add_one;
-
-       /* on a3xx, we need to scale up integer coords for isaml based
-        * on LoD:
-        */
-       bool unminify_coords;
-
-       /* on a3xx do txf_ms w/ isaml and scaled coords: */
-       bool txf_ms_with_isaml;
-
-       /* on a4xx, for array textures we need to add 0.5 to the array
-        * index coordinate:
-        */
-       bool array_index_add_half;
-
-       /* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
-        */
-       bool samgq_workaround;
-
-       /* on a650, vertex shader <-> tess control io uses LDL/STL */
-       bool tess_use_shared;
-
-       /* The maximum number of constants, in vec4's, across the entire graphics
-        * pipeline.
-        */
-       uint16_t max_const_pipeline;
-
-       /* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
-       uint16_t max_const_geom;
-
-       /* The maximum number of constants, in vec4's, for FS. */
-       uint16_t max_const_frag;
-
-       /* A "safe" max constlen that can be applied to each shader in the
-        * pipeline which we guarantee will never exceed any combined limits.
-        */
-       uint16_t max_const_safe;
-
-       /* The maximum number of constants, in vec4's, for compute shaders. */
-       uint16_t max_const_compute;
-
-       /* Number of instructions that the shader's base address and length
-        * (instrlen divides instruction count by this) must be aligned to.
-        */
-       uint32_t instr_align;
-
-       /* on a3xx, the unit of indirect const load is higher than later gens (in
-        * vec4 units):
-        */
-       uint32_t const_upload_unit;
-
-       /* The base number of threads per wave. Some stages may be able to double
-        * this.
-        */
-       uint32_t threadsize_base;
-
-       /* On at least a6xx, waves are always launched in pairs. In calculations
-        * about occupancy, we pretend that each wave pair is actually one wave,
-        * which simplifies many of the calculations, but means we have to
-        * multiply threadsize_base by this number.
-        */
-       uint32_t wave_granularity;
-
-       /* The maximum number of simultaneous waves per core. */
-       uint32_t max_waves;
-
-       /* This is the theoretical maximum number of vec4 registers that one wave of
-        * the base threadsize could use. To get the actual size of the register
-        * file in bytes one would need to compute:
-        *
-        * reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
-        *
-        * However this number is more often what we actually need. For example, a
-        * max_reg more than half of this will result in a doubled threadsize
-        * being impossible (because double-sized waves take up twice as many
-        * registers). Also, the formula for the occupancy given a particular
-        * register footprint is simpler.
-        *
-        * It is in vec4 units because the register file is allocated
-        * with vec4 granularity, so it's in the same units as max_reg.
-        */
-       uint32_t reg_size_vec4;
-
-       /* The size of local memory in bytes */
-       uint32_t local_mem_size;
-
-       /* The number of total branch stack entries, divided by wave_granularity. */
-       uint32_t branchstack_size;
-
-       /* Whether clip+cull distances are supported */
-       bool has_clip_cull;
-
-       /* Whether private memory is supported */
-       bool has_pvtmem;
+   struct fd_device *dev;
+   uint32_t gpu_id;
+   uint32_t shader_count;
+
+   struct disk_cache *disk_cache;
+
+   /* If true, UBO accesses are assumed to be bounds-checked as defined by
+    * VK_EXT_robustness2 and optimizations may have to be more conservative.
+    */
+   bool robust_ubo_access;
+
+   /*
+    * Configuration options for things that are handled differently on
+    * different generations:
+    */
+
+   /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
+    * so we need to use ldlv.u32 to load the varying directly:
+    */
+   bool flat_bypass;
+
+   /* on a3xx, we need to add one to # of array levels:
+    */
+   bool levels_add_one;
+
+   /* on a3xx, we need to scale up integer coords for isaml based
+    * on LoD:
+    */
+   bool unminify_coords;
+
+   /* on a3xx do txf_ms w/ isaml and scaled coords: */
+   bool txf_ms_with_isaml;
+
+   /* on a4xx, for array textures we need to add 0.5 to the array
+    * index coordinate:
+    */
+   bool array_index_add_half;
+
+   /* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
+    */
+   bool samgq_workaround;
+
+   /* on a650, vertex shader <-> tess control io uses LDL/STL */
+   bool tess_use_shared;
+
+   /* The maximum number of constants, in vec4's, across the entire graphics
+    * pipeline.
+    */
+   uint16_t max_const_pipeline;
+
+   /* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
+   uint16_t max_const_geom;
+
+   /* The maximum number of constants, in vec4's, for FS. */
+   uint16_t max_const_frag;
+
+   /* A "safe" max constlen that can be applied to each shader in the
+    * pipeline which we guarantee will never exceed any combined limits.
+    */
+   uint16_t max_const_safe;
+
+   /* The maximum number of constants, in vec4's, for compute shaders. */
+   uint16_t max_const_compute;
+
+   /* Number of instructions that the shader's base address and length
+    * (instrlen divides instruction count by this) must be aligned to.
+    */
+   uint32_t instr_align;
+
+   /* on a3xx, the unit of indirect const load is higher than later gens (in
+    * vec4 units):
+    */
+   uint32_t const_upload_unit;
+
+   /* The base number of threads per wave. Some stages may be able to double
+    * this.
+    */
+   uint32_t threadsize_base;
+
+   /* On at least a6xx, waves are always launched in pairs. In calculations
+    * about occupancy, we pretend that each wave pair is actually one wave,
+    * which simplifies many of the calculations, but means we have to
+    * multiply threadsize_base by this number.
+    */
+   uint32_t wave_granularity;
+
+   /* The maximum number of simultaneous waves per core. */
+   uint32_t max_waves;
+
+   /* This is the theoretical maximum number of vec4 registers that one wave of
+    * the base threadsize could use. To get the actual size of the register
+    * file in bytes one would need to compute:
+    *
+    * reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
+    *
+    * However this number is more often what we actually need. For example, a
+    * max_reg more than half of this will result in a doubled threadsize
+    * being impossible (because double-sized waves take up twice as many
+    * registers). Also, the formula for the occupancy given a particular
+    * register footprint is simpler.
+    *
+    * It is in vec4 units because the register file is allocated
+    * with vec4 granularity, so it's in the same units as max_reg.
+    */
+   uint32_t reg_size_vec4;
+
+   /* The size of local memory in bytes */
+   uint32_t local_mem_size;
+
+   /* The number of total branch stack entries, divided by wave_granularity. */
+   uint32_t branchstack_size;
+
+   /* Whether clip+cull distances are supported */
+   bool has_clip_cull;
+
+   /* Whether private memory is supported */
+   bool has_pvtmem;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
-struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
-                                                                                 bool robust_ubo_access);
+struct ir3_compiler *ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
+                                         bool robust_ubo_access);
 
 void ir3_disk_cache_init(struct ir3_compiler *compiler);
 void ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
-               struct ir3_shader *shader);
+                                    struct ir3_shader *shader);
 bool ir3_disk_cache_retrieve(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *v);
+                             struct ir3_shader_variant *v);
 void ir3_disk_cache_store(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *v);
+                          struct ir3_shader_variant *v);
 
 int ir3_compile_shader_nir(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so);
+                           struct ir3_shader_variant *so);
 
 /* gpu pointer size in units of 32bit registers/slots */
-static inline
-unsigned ir3_pointer_size(struct ir3_compiler *compiler)
+static inline unsigned
+ir3_pointer_size(struct ir3_compiler *compiler)
 {
-       return (compiler->gpu_id >= 500) ? 2 : 1;
+   return (compiler->gpu_id >= 500) ? 2 : 1;
 }
 
 enum ir3_shader_debug {
-       IR3_DBG_SHADER_VS  = BITFIELD_BIT(0),
-       IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
-       IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
-       IR3_DBG_SHADER_GS  = BITFIELD_BIT(3),
-       IR3_DBG_SHADER_FS  = BITFIELD_BIT(4),
-       IR3_DBG_SHADER_CS  = BITFIELD_BIT(5),
-       IR3_DBG_DISASM     = BITFIELD_BIT(6),
-       IR3_DBG_OPTMSGS    = BITFIELD_BIT(7),
-       IR3_DBG_FORCES2EN  = BITFIELD_BIT(8),
-       IR3_DBG_NOUBOOPT   = BITFIELD_BIT(9),
-       IR3_DBG_NOFP16     = BITFIELD_BIT(10),
-       IR3_DBG_NOCACHE    = BITFIELD_BIT(11),
-
-       /* DEBUG-only options: */
-       IR3_DBG_SCHEDMSGS  = BITFIELD_BIT(20),
-       IR3_DBG_RAMSGS     = BITFIELD_BIT(21),
-
-       /* Only used for the disk-caching logic: */
-       IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
+   IR3_DBG_SHADER_VS = BITFIELD_BIT(0),
+   IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
+   IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
+   IR3_DBG_SHADER_GS = BITFIELD_BIT(3),
+   IR3_DBG_SHADER_FS = BITFIELD_BIT(4),
+   IR3_DBG_SHADER_CS = BITFIELD_BIT(5),
+   IR3_DBG_DISASM = BITFIELD_BIT(6),
+   IR3_DBG_OPTMSGS = BITFIELD_BIT(7),
+   IR3_DBG_FORCES2EN = BITFIELD_BIT(8),
+   IR3_DBG_NOUBOOPT = BITFIELD_BIT(9),
+   IR3_DBG_NOFP16 = BITFIELD_BIT(10),
+   IR3_DBG_NOCACHE = BITFIELD_BIT(11),
+
+   /* DEBUG-only options: */
+   IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
+   IR3_DBG_RAMSGS = BITFIELD_BIT(21),
+
+   /* Only used for the disk-caching logic: */
+   IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
 };
 
 extern enum ir3_shader_debug ir3_shader_debug;
@@ -205,29 +205,35 @@ extern const char *ir3_shader_override_path;
 static inline bool
 shader_debug_enabled(gl_shader_stage type)
 {
-       if (ir3_shader_debug & IR3_DBG_DISASM)
-               return true;
-
-       switch (type) {
-       case MESA_SHADER_VERTEX:      return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
-       case MESA_SHADER_TESS_CTRL:   return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
-       case MESA_SHADER_TESS_EVAL:   return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
-       case MESA_SHADER_GEOMETRY:    return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
-       case MESA_SHADER_FRAGMENT:    return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
-       case MESA_SHADER_COMPUTE:     return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
-       default:
-               debug_assert(0);
-               return false;
-       }
+   if (ir3_shader_debug & IR3_DBG_DISASM)
+      return true;
+
+   switch (type) {
+   case MESA_SHADER_VERTEX:
+      return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
+   case MESA_SHADER_TESS_CTRL:
+      return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
+   case MESA_SHADER_TESS_EVAL:
+      return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
+   case MESA_SHADER_GEOMETRY:
+      return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
+   case MESA_SHADER_FRAGMENT:
+      return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
+   case MESA_SHADER_COMPUTE:
+      return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
+   default:
+      debug_assert(0);
+      return false;
+   }
 }
 
 static inline void
 ir3_debug_print(struct ir3 *ir, const char *when)
 {
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               mesa_logi("%s:", when);
-               ir3_print(ir);
-       }
+   if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+      mesa_logi("%s:", when);
+      ir3_print(ir);
+   }
 }
 
 #endif /* IR3_COMPILER_H_ */
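
To make the reg_size_vec4 comment concrete: with the a6xx values set in ir3_compiler_create() (reg_size_vec4 = 96, threadsize_base = 64, wave_granularity = 2), the formula in that comment gives 96 * 64 * 2 * 16 = 196608 bytes, i.e. 192 KiB of registers, while the a650 value of 64 gives 128 KiB. A hypothetical helper (not in the tree) that simply restates the formula:

#include <stdint.h>

#include "ir3_compiler.h"

/* Register file size in bytes implied by the reg_size_vec4 comment:
 * vec4 registers * base threads per wave * wave pairing * 16 bytes per vec4.
 */
static inline uint32_t
reg_file_size_bytes(const struct ir3_compiler *compiler)
{
   return compiler->reg_size_vec4 * compiler->threadsize_base *
          compiler->wave_granularity * 16;
}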
index 8ffb929..0c5d052 100644 (file)
 
 #include <stdarg.h>
 
-#include "util/u_string.h"
-#include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
 
 #include "ir3_compiler.h"
 #include "ir3_image.h"
-#include "ir3_shader.h"
 #include "ir3_nir.h"
+#include "ir3_shader.h"
 
 #include "instr-a3xx.h"
 #include "ir3.h"
 #include "ir3_context.h"
 
 void
-ir3_handle_nonuniform(struct ir3_instruction *instr, nir_intrinsic_instr *intrin)
+ir3_handle_nonuniform(struct ir3_instruction *instr,
+                      nir_intrinsic_instr *intrin)
 {
-       if (nir_intrinsic_has_access(intrin) &&
-                       (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
-               instr->flags |= IR3_INSTR_NONUNIF;
-       }
+   if (nir_intrinsic_has_access(intrin) &&
+       (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
+      instr->flags |= IR3_INSTR_NONUNIF;
+   }
 }
 
 void
 ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
 {
-       nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
-       if (!intrin)
-               return;
+   nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
+   if (!intrin)
+      return;
 
-       instr->flags |= IR3_INSTR_B;
-       instr->cat6.base = nir_intrinsic_desc_set(intrin);
+   instr->flags |= IR3_INSTR_B;
+   instr->cat6.base = nir_intrinsic_desc_set(intrin);
 }
 
 static struct ir3_instruction *
 create_input(struct ir3_context *ctx, unsigned compmask)
 {
-       struct ir3_instruction *in;
+   struct ir3_instruction *in;
 
-       in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
-       in->input.sysval = ~0;
-       __ssa_dst(in)->wrmask = compmask;
+   in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
+   in->input.sysval = ~0;
+   __ssa_dst(in)->wrmask = compmask;
 
-       array_insert(ctx->ir, ctx->ir->inputs, in);
+   array_insert(ctx->ir, ctx->ir->inputs, in);
 
-       return in;
+   return in;
 }
 
 static struct ir3_instruction *
-create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord, unsigned n)
+create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
+                  unsigned n)
 {
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *instr;
-       /* packed inloc is fixed up later: */
-       struct ir3_instruction *inloc = create_immed(block, n);
-
-       if (coord) {
-               instr = ir3_BARY_F(block, inloc, 0, coord, 0);
-       } else if (ctx->compiler->flat_bypass) {
-               instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
-               instr->cat6.type = TYPE_U32;
-               instr->cat6.iim_val = 1;
-       } else {
-               instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
-               instr->srcs[1]->wrmask = 0x3;
-       }
-
-       return instr;
+   struct ir3_block *block = ctx->block;
+   struct ir3_instruction *instr;
+   /* packed inloc is fixed up later: */
+   struct ir3_instruction *inloc = create_immed(block, n);
+
+   if (coord) {
+      instr = ir3_BARY_F(block, inloc, 0, coord, 0);
+   } else if (ctx->compiler->flat_bypass) {
+      instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
+      instr->cat6.type = TYPE_U32;
+      instr->cat6.iim_val = 1;
+   } else {
+      instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
+      instr->srcs[1]->wrmask = 0x3;
+   }
+
+   return instr;
 }
 
 static struct ir3_instruction *
 create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
 {
-       /* first four vec4 sysval's reserved for UBOs: */
-       /* NOTE: dp is in scalar, but there can be >4 dp components: */
-       struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-       unsigned n = const_state->offsets.driver_param;
-       unsigned r = regid(n + dp / 4, dp % 4);
-       return create_uniform(ctx->block, r);
+   /* first four vec4 sysval's reserved for UBOs: */
+   /* NOTE: dp is in scalar, but there can be >4 dp components: */
+   struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+   unsigned n = const_state->offsets.driver_param;
+   unsigned r = regid(n + dp / 4, dp % 4);
+   return create_uniform(ctx->block, r);
 }
 
 /*
@@ -120,884 +122,880 @@ create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
 
 static struct ir3_instruction *
 create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
-               unsigned src_bitsize, nir_op op)
+           unsigned src_bitsize, nir_op op)
 {
-       type_t src_type, dst_type;
-
-       switch (op) {
-       case nir_op_f2f32:
-       case nir_op_f2f16_rtne:
-       case nir_op_f2f16_rtz:
-       case nir_op_f2f16:
-       case nir_op_f2i32:
-       case nir_op_f2i16:
-       case nir_op_f2i8:
-       case nir_op_f2u32:
-       case nir_op_f2u16:
-       case nir_op_f2u8:
-               switch (src_bitsize) {
-               case 32:
-                       src_type = TYPE_F32;
-                       break;
-               case 16:
-                       src_type = TYPE_F16;
-                       break;
-               default:
-                       ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
-               }
-               break;
-
-       case nir_op_i2f32:
-       case nir_op_i2f16:
-       case nir_op_i2i32:
-       case nir_op_i2i16:
-       case nir_op_i2i8:
-               switch (src_bitsize) {
-               case 32:
-                       src_type = TYPE_S32;
-                       break;
-               case 16:
-                       src_type = TYPE_S16;
-                       break;
-               case 8:
-                       src_type = TYPE_S8;
-                       break;
-               default:
-                       ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
-               }
-               break;
-
-       case nir_op_u2f32:
-       case nir_op_u2f16:
-       case nir_op_u2u32:
-       case nir_op_u2u16:
-       case nir_op_u2u8:
-               switch (src_bitsize) {
-               case 32:
-                       src_type = TYPE_U32;
-                       break;
-               case 16:
-                       src_type = TYPE_U16;
-                       break;
-               case 8:
-                       src_type = TYPE_U8;
-                       break;
-               default:
-                       ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
-               }
-               break;
-
-       case nir_op_b2f16:
-       case nir_op_b2f32:
-       case nir_op_b2i8:
-       case nir_op_b2i16:
-       case nir_op_b2i32:
-               src_type = TYPE_U32;
-               break;
-
-       default:
-               ir3_context_error(ctx, "invalid conversion op: %u", op);
-       }
-
-       switch (op) {
-       case nir_op_f2f32:
-       case nir_op_i2f32:
-       case nir_op_u2f32:
-       case nir_op_b2f32:
-               dst_type = TYPE_F32;
-               break;
-
-       case nir_op_f2f16_rtne:
-       case nir_op_f2f16_rtz:
-       case nir_op_f2f16:
-       case nir_op_i2f16:
-       case nir_op_u2f16:
-       case nir_op_b2f16:
-               dst_type = TYPE_F16;
-               break;
-
-       case nir_op_f2i32:
-       case nir_op_i2i32:
-       case nir_op_b2i32:
-               dst_type = TYPE_S32;
-               break;
-
-       case nir_op_f2i16:
-       case nir_op_i2i16:
-       case nir_op_b2i16:
-               dst_type = TYPE_S16;
-               break;
-
-       case nir_op_f2i8:
-       case nir_op_i2i8:
-       case nir_op_b2i8:
-               dst_type = TYPE_S8;
-               break;
-
-       case nir_op_f2u32:
-       case nir_op_u2u32:
-               dst_type = TYPE_U32;
-               break;
-
-       case nir_op_f2u16:
-       case nir_op_u2u16:
-               dst_type = TYPE_U16;
-               break;
-
-       case nir_op_f2u8:
-       case nir_op_u2u8:
-               dst_type = TYPE_U8;
-               break;
-
-       default:
-               ir3_context_error(ctx, "invalid conversion op: %u", op);
-       }
-
-       if (src_type == dst_type)
-               return src;
-
-       struct ir3_instruction *cov =
-               ir3_COV(ctx->block, src, src_type, dst_type);
-
-       if (op == nir_op_f2f16_rtne) {
-               cov->cat1.round = ROUND_EVEN;
-       } else if (op == nir_op_f2f16) {
-               unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
-               nir_rounding_mode rounding_mode =
-                       nir_get_rounding_mode_from_float_controls(execution_mode, nir_type_float16);
-               if (rounding_mode == nir_rounding_mode_rtne)
-                       cov->cat1.round = ROUND_EVEN;
-       }
-
-       return cov;
+   type_t src_type, dst_type;
+
+   switch (op) {
+   case nir_op_f2f32:
+   case nir_op_f2f16_rtne:
+   case nir_op_f2f16_rtz:
+   case nir_op_f2f16:
+   case nir_op_f2i32:
+   case nir_op_f2i16:
+   case nir_op_f2i8:
+   case nir_op_f2u32:
+   case nir_op_f2u16:
+   case nir_op_f2u8:
+      switch (src_bitsize) {
+      case 32:
+         src_type = TYPE_F32;
+         break;
+      case 16:
+         src_type = TYPE_F16;
+         break;
+      default:
+         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
+      }
+      break;
+
+   case nir_op_i2f32:
+   case nir_op_i2f16:
+   case nir_op_i2i32:
+   case nir_op_i2i16:
+   case nir_op_i2i8:
+      switch (src_bitsize) {
+      case 32:
+         src_type = TYPE_S32;
+         break;
+      case 16:
+         src_type = TYPE_S16;
+         break;
+      case 8:
+         src_type = TYPE_S8;
+         break;
+      default:
+         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
+      }
+      break;
+
+   case nir_op_u2f32:
+   case nir_op_u2f16:
+   case nir_op_u2u32:
+   case nir_op_u2u16:
+   case nir_op_u2u8:
+      switch (src_bitsize) {
+      case 32:
+         src_type = TYPE_U32;
+         break;
+      case 16:
+         src_type = TYPE_U16;
+         break;
+      case 8:
+         src_type = TYPE_U8;
+         break;
+      default:
+         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
+      }
+      break;
+
+   case nir_op_b2f16:
+   case nir_op_b2f32:
+   case nir_op_b2i8:
+   case nir_op_b2i16:
+   case nir_op_b2i32:
+      src_type = TYPE_U32;
+      break;
+
+   default:
+      ir3_context_error(ctx, "invalid conversion op: %u", op);
+   }
+
+   switch (op) {
+   case nir_op_f2f32:
+   case nir_op_i2f32:
+   case nir_op_u2f32:
+   case nir_op_b2f32:
+      dst_type = TYPE_F32;
+      break;
+
+   case nir_op_f2f16_rtne:
+   case nir_op_f2f16_rtz:
+   case nir_op_f2f16:
+   case nir_op_i2f16:
+   case nir_op_u2f16:
+   case nir_op_b2f16:
+      dst_type = TYPE_F16;
+      break;
+
+   case nir_op_f2i32:
+   case nir_op_i2i32:
+   case nir_op_b2i32:
+      dst_type = TYPE_S32;
+      break;
+
+   case nir_op_f2i16:
+   case nir_op_i2i16:
+   case nir_op_b2i16:
+      dst_type = TYPE_S16;
+      break;
+
+   case nir_op_f2i8:
+   case nir_op_i2i8:
+   case nir_op_b2i8:
+      dst_type = TYPE_S8;
+      break;
+
+   case nir_op_f2u32:
+   case nir_op_u2u32:
+      dst_type = TYPE_U32;
+      break;
+
+   case nir_op_f2u16:
+   case nir_op_u2u16:
+      dst_type = TYPE_U16;
+      break;
+
+   case nir_op_f2u8:
+   case nir_op_u2u8:
+      dst_type = TYPE_U8;
+      break;
+
+   default:
+      ir3_context_error(ctx, "invalid conversion op: %u", op);
+   }
+
+   if (src_type == dst_type)
+      return src;
+
+   struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
+
+   if (op == nir_op_f2f16_rtne) {
+      cov->cat1.round = ROUND_EVEN;
+   } else if (op == nir_op_f2f16) {
+      unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
+      nir_rounding_mode rounding_mode =
+         nir_get_rounding_mode_from_float_controls(execution_mode,
+                                                   nir_type_float16);
+      if (rounding_mode == nir_rounding_mode_rtne)
+         cov->cat1.round = ROUND_EVEN;
+   }
+
+   return cov;
 }
 
 /* For shift instructions NIR always has shift amount as 32 bit integer */
 static struct ir3_instruction *
-resize_shift_amount(struct ir3_context *ctx,
-                                       struct ir3_instruction *src, unsigned bs)
+resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
+                    unsigned bs)
 {
-       if (bs != 16)
-               return src;
+   if (bs != 16)
+      return src;
 
-       return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
+   return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
 }
 
 static void
 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
 {
-       const nir_op_info *info = &nir_op_infos[alu->op];
-       struct ir3_instruction **dst, *src[info->num_inputs];
-       unsigned bs[info->num_inputs];     /* bit size */
-       struct ir3_block *b = ctx->block;
-       unsigned dst_sz, wrmask;
-       type_t dst_type = nir_dest_bit_size(alu->dest.dest) == 16 ?
-                       TYPE_U16 : TYPE_U32;
-
-       if (alu->dest.dest.is_ssa) {
-               dst_sz = alu->dest.dest.ssa.num_components;
-               wrmask = (1 << dst_sz) - 1;
-       } else {
-               dst_sz = alu->dest.dest.reg.reg->num_components;
-               wrmask = alu->dest.write_mask;
-       }
-
-       dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);
-
-       /* Vectors are special in that they have non-scalarized writemasks,
-        * and just take the first swizzle channel for each argument in
-        * order into each writemask channel.
-        */
-       if ((alu->op == nir_op_vec2) ||
-                       (alu->op == nir_op_vec3) ||
-                       (alu->op == nir_op_vec4)) {
-
-               for (int i = 0; i < info->num_inputs; i++) {
-                       nir_alu_src *asrc = &alu->src[i];
-
-                       compile_assert(ctx, !asrc->abs);
-                       compile_assert(ctx, !asrc->negate);
-
-                       src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
-                       if (!src[i])
-                               src[i] = create_immed_typed(ctx->block, 0, dst_type);
-                       dst[i] = ir3_MOV(b, src[i], dst_type);
-               }
-
-               ir3_put_dst(ctx, &alu->dest.dest);
-               return;
-       }
-
-       /* We also get mov's with more than one component for mov's so
-        * handle those specially:
-        */
-       if (alu->op == nir_op_mov) {
-               nir_alu_src *asrc = &alu->src[0];
-               struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
-
-               for (unsigned i = 0; i < dst_sz; i++) {
-                       if (wrmask & (1 << i)) {
-                               dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
-                       } else {
-                               dst[i] = NULL;
-                       }
-               }
-
-               ir3_put_dst(ctx, &alu->dest.dest);
-               return;
-       }
-
-       /* General case: We can just grab the one used channel per src. */
-       for (int i = 0; i < info->num_inputs; i++) {
-               unsigned chan = ffs(alu->dest.write_mask) - 1;
-               nir_alu_src *asrc = &alu->src[i];
-
-               compile_assert(ctx, !asrc->abs);
-               compile_assert(ctx, !asrc->negate);
-
-               src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
-               bs[i] = nir_src_bit_size(asrc->src);
-
-               compile_assert(ctx, src[i]);
-       }
-
-       switch (alu->op) {
-       case nir_op_f2f32:
-       case nir_op_f2f16_rtne:
-       case nir_op_f2f16_rtz:
-       case nir_op_f2f16:
-       case nir_op_f2i32:
-       case nir_op_f2i16:
-       case nir_op_f2i8:
-       case nir_op_f2u32:
-       case nir_op_f2u16:
-       case nir_op_f2u8:
-       case nir_op_i2f32:
-       case nir_op_i2f16:
-       case nir_op_i2i32:
-       case nir_op_i2i16:
-       case nir_op_i2i8:
-       case nir_op_u2f32:
-       case nir_op_u2f16:
-       case nir_op_u2u32:
-       case nir_op_u2u16:
-       case nir_op_u2u8:
-       case nir_op_b2f16:
-       case nir_op_b2f32:
-       case nir_op_b2i8:
-       case nir_op_b2i16:
-       case nir_op_b2i32:
-               dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
-               break;
-
-       case nir_op_fquantize2f16:
-               dst[0] = create_cov(ctx,
-                                                       create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
-                                                       16, nir_op_f2f32);
-               break;
-       case nir_op_f2b1:
-               dst[0] = ir3_CMPS_F(b,
-                               src[0], 0,
-                               create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               break;
-
-       case nir_op_i2b1:
-               /* i2b1 will appear when translating from nir_load_ubo or
-                * nir_intrinsic_load_ssbo, where any non-zero value is true.
-                */
-               dst[0] = ir3_CMPS_S(b,
-                               src[0], 0,
-                               create_immed_typed(b, 0, bs[0] == 16 ? TYPE_U16 : TYPE_U32), 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               break;
-
-       case nir_op_b2b1:
-               /* b2b1 will appear when translating from
-                *
-                * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
-                * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
-                *
-                * A negate can turn those into a 1 or 0 for us.
-                */
-               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
-               break;
-
-       case nir_op_b2b32:
-               /* b2b32 will appear when converting our 1-bit bools to a store_shared
-                * argument.
-                *
-                * A negate can turn those into a ~0 for us.
-                */
-               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
-               break;
-
-       case nir_op_fneg:
-               dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
-               break;
-       case nir_op_fabs:
-               dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
-               break;
-       case nir_op_fmax:
-               dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fmin:
-               dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fsat:
-               /* if there is just a single use of the src, and it supports
-                * (sat) bit, we can just fold the (sat) flag back to the
-                * src instruction and create a mov.  This is easier for cp
-                * to eliminate.
-                */
-               if (alu->src[0].src.is_ssa &&
-                               is_sat_compatible(src[0]->opc) &&
-                               (list_length(&alu->src[0].src.ssa->uses) == 1)) {
-                       src[0]->flags |= IR3_INSTR_SAT;
-                       dst[0] = ir3_MOV(b, src[0], dst_type);
-               } else {
-                       /* otherwise generate a max.f that saturates.. blob does
-                        * similar (generating a cat2 mov using max.f)
-                        */
-                       dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
-                       dst[0]->flags |= IR3_INSTR_SAT;
-               }
-               break;
-       case nir_op_fmul:
-               dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fadd:
-               dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fsub:
-               dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
-               break;
-       case nir_op_ffma:
-               dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
-               break;
-       case nir_op_fddx:
-       case nir_op_fddx_coarse:
-               dst[0] = ir3_DSX(b, src[0], 0);
-               dst[0]->cat5.type = TYPE_F32;
-               break;
-       case nir_op_fddx_fine:
-               dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
-               dst[0]->cat5.type = TYPE_F32;
-               break;
-       case nir_op_fddy:
-       case nir_op_fddy_coarse:
-               dst[0] = ir3_DSY(b, src[0], 0);
-               dst[0]->cat5.type = TYPE_F32;
-               break;
-               break;
-       case nir_op_fddy_fine:
-               dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
-               dst[0]->cat5.type = TYPE_F32;
-               break;
-       case nir_op_flt:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_LT;
-               break;
-       case nir_op_fge:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_GE;
-               break;
-       case nir_op_feq:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_EQ;
-               break;
-       case nir_op_fneu:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               break;
-       case nir_op_fceil:
-               dst[0] = ir3_CEIL_F(b, src[0], 0);
-               break;
-       case nir_op_ffloor:
-               dst[0] = ir3_FLOOR_F(b, src[0], 0);
-               break;
-       case nir_op_ftrunc:
-               dst[0] = ir3_TRUNC_F(b, src[0], 0);
-               break;
-       case nir_op_fround_even:
-               dst[0] = ir3_RNDNE_F(b, src[0], 0);
-               break;
-       case nir_op_fsign:
-               dst[0] = ir3_SIGN_F(b, src[0], 0);
-               break;
-
-       case nir_op_fsin:
-               dst[0] = ir3_SIN(b, src[0], 0);
-               break;
-       case nir_op_fcos:
-               dst[0] = ir3_COS(b, src[0], 0);
-               break;
-       case nir_op_frsq:
-               dst[0] = ir3_RSQ(b, src[0], 0);
-               break;
-       case nir_op_frcp:
-               dst[0] = ir3_RCP(b, src[0], 0);
-               break;
-       case nir_op_flog2:
-               dst[0] = ir3_LOG2(b, src[0], 0);
-               break;
-       case nir_op_fexp2:
-               dst[0] = ir3_EXP2(b, src[0], 0);
-               break;
-       case nir_op_fsqrt:
-               dst[0] = ir3_SQRT(b, src[0], 0);
-               break;
-
-       case nir_op_iabs:
-               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
-               break;
-       case nir_op_iadd:
-               dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_iand:
-               dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_imax:
-               dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_umax:
-               dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_imin:
-               dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_umin:
-               dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_umul_low:
-               dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_imadsh_mix16:
-               dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
-               break;
-       case nir_op_imad24_ir3:
-               dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
-               break;
-       case nir_op_imul:
-               compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
-               dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_imul24:
-               dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ineg:
-               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
-               break;
-       case nir_op_inot:
-               if (bs[0] == 1) {
-                       dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0);
-               } else {
-                       dst[0] = ir3_NOT_B(b, src[0], 0);
-               }
-               break;
-       case nir_op_ior:
-               dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ishl:
-               dst[0] = ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
-               break;
-       case nir_op_ishr:
-               dst[0] = ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
-               break;
-       case nir_op_isub:
-               dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ixor:
-               dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ushr:
-               dst[0] = ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
-               break;
-       case nir_op_ilt:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_LT;
-               break;
-       case nir_op_ige:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_GE;
-               break;
-       case nir_op_ieq:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_EQ;
-               break;
-       case nir_op_ine:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               break;
-       case nir_op_ult:
-               dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_LT;
-               break;
-       case nir_op_uge:
-               dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_GE;
-               break;
-
-       case nir_op_bcsel: {
-               struct ir3_instruction *cond = src[0];
-
-               /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
-                * we can ignore that and use original cond, since the nonzero-ness of
-                * cond stays the same.
-                */
-               if (cond->opc == OPC_ABSNEG_S &&
-                               cond->flags == 0 &&
-                               (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) == IR3_REG_SNEG) {
-                       cond = cond->srcs[0]->def->instr;
-               }
-
-               compile_assert(ctx, bs[1] == bs[2]);
-               /* The condition's size has to match the other two arguments' size, so
-                * convert down if necessary.
-                */
-               if (bs[1] == 16) {
-                       struct hash_entry *prev_entry =
-                               _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
-                       if (prev_entry) {
-                               cond = prev_entry->data;
-                       } else {
-                               cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
-                               _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
-                       }
-               }
-
-               if (bs[1] != 16)
-                       dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
-               else
-                       dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
-               break;
-       }
-       case nir_op_bit_count: {
-               // TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
-               // double check on earlier gen's.  Once half-precision support is
-               // in place, this should probably move to a NIR lowering pass:
-               struct ir3_instruction *hi, *lo;
-
-               hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0),
-                               TYPE_U32, TYPE_U16);
-               lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
-
-               hi = ir3_CBITS_B(b, hi, 0);
-               lo = ir3_CBITS_B(b, lo, 0);
-
-               // TODO maybe the builders should default to making dst half-precision
-               // if the src's were half precision, to make this less awkward.. otoh
-               // we should probably just do this lowering in NIR.
-               hi->dsts[0]->flags |= IR3_REG_HALF;
-               lo->dsts[0]->flags |= IR3_REG_HALF;
-
-               dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
-               dst[0]->dsts[0]->flags |= IR3_REG_HALF;
-               dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
-               break;
-       }
-       case nir_op_ifind_msb: {
-               struct ir3_instruction *cmp;
-               dst[0] = ir3_CLZ_S(b, src[0], 0);
-               cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
-               cmp->cat2.condition = IR3_COND_GE;
-               dst[0] = ir3_SEL_B32(b,
-                               ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
-                               cmp, 0, dst[0], 0);
-               break;
-       }
-       case nir_op_ufind_msb:
-               dst[0] = ir3_CLZ_B(b, src[0], 0);
-               dst[0] = ir3_SEL_B32(b,
-                               ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
-                               src[0], 0, dst[0], 0);
-               break;
-       case nir_op_find_lsb:
-               dst[0] = ir3_BFREV_B(b, src[0], 0);
-               dst[0] = ir3_CLZ_B(b, dst[0], 0);
-               break;
-       case nir_op_bitfield_reverse:
-               dst[0] = ir3_BFREV_B(b, src[0], 0);
-               break;
-
-       default:
-               ir3_context_error(ctx, "Unhandled ALU op: %s\n",
-                               nir_op_infos[alu->op].name);
-               break;
-       }
-
-       if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
-               assert(nir_dest_bit_size(alu->dest.dest) == 1 ||
-                               alu->op == nir_op_b2b32);
-               assert(dst_sz == 1);
-       } else {
-               /* 1-bit values stored in 32-bit registers are only valid for certain
-                * ALU ops.
-                */
-               switch (alu->op) {
-               case nir_op_iand:
-               case nir_op_ior:
-               case nir_op_ixor:
-               case nir_op_inot:
-               case nir_op_bcsel:
-                       break;
-               default:
-                       compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
-               }
-       }
-
-       ir3_put_dst(ctx, &alu->dest.dest);
+   const nir_op_info *info = &nir_op_infos[alu->op];
+   struct ir3_instruction **dst, *src[info->num_inputs];
+   unsigned bs[info->num_inputs]; /* bit size */
+   struct ir3_block *b = ctx->block;
+   unsigned dst_sz, wrmask;
+   type_t dst_type =
+      nir_dest_bit_size(alu->dest.dest) == 16 ? TYPE_U16 : TYPE_U32;
+
+   if (alu->dest.dest.is_ssa) {
+      dst_sz = alu->dest.dest.ssa.num_components;
+      wrmask = (1 << dst_sz) - 1;
+   } else {
+      dst_sz = alu->dest.dest.reg.reg->num_components;
+      wrmask = alu->dest.write_mask;
+   }
+
+   dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);
+
+   /* Vectors are special in that they have non-scalarized writemasks,
+    * and just take the first swizzle channel for each argument in
+    * order into each writemask channel.
+    */
+   if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
+       (alu->op == nir_op_vec4)) {
+
+      for (int i = 0; i < info->num_inputs; i++) {
+         nir_alu_src *asrc = &alu->src[i];
+
+         compile_assert(ctx, !asrc->abs);
+         compile_assert(ctx, !asrc->negate);
+
+         src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
+         if (!src[i])
+            src[i] = create_immed_typed(ctx->block, 0, dst_type);
+         dst[i] = ir3_MOV(b, src[i], dst_type);
+      }
+
+      ir3_put_dst(ctx, &alu->dest.dest);
+      return;
+   }
+
+   /* We also get movs with more than one component, so handle
+    * those specially:
+    */
+   if (alu->op == nir_op_mov) {
+      nir_alu_src *asrc = &alu->src[0];
+      struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
+
+      for (unsigned i = 0; i < dst_sz; i++) {
+         if (wrmask & (1 << i)) {
+            dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
+         } else {
+            dst[i] = NULL;
+         }
+      }
+
+      ir3_put_dst(ctx, &alu->dest.dest);
+      return;
+   }
+
+   /* General case: We can just grab the one used channel per src. */
+   for (int i = 0; i < info->num_inputs; i++) {
+      unsigned chan = ffs(alu->dest.write_mask) - 1;
+      nir_alu_src *asrc = &alu->src[i];
+
+      compile_assert(ctx, !asrc->abs);
+      compile_assert(ctx, !asrc->negate);
+
+      src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
+      bs[i] = nir_src_bit_size(asrc->src);
+
+      compile_assert(ctx, src[i]);
+   }
+
+   switch (alu->op) {
+   case nir_op_f2f32:
+   case nir_op_f2f16_rtne:
+   case nir_op_f2f16_rtz:
+   case nir_op_f2f16:
+   case nir_op_f2i32:
+   case nir_op_f2i16:
+   case nir_op_f2i8:
+   case nir_op_f2u32:
+   case nir_op_f2u16:
+   case nir_op_f2u8:
+   case nir_op_i2f32:
+   case nir_op_i2f16:
+   case nir_op_i2i32:
+   case nir_op_i2i16:
+   case nir_op_i2i8:
+   case nir_op_u2f32:
+   case nir_op_u2f16:
+   case nir_op_u2u32:
+   case nir_op_u2u16:
+   case nir_op_u2u8:
+   case nir_op_b2f16:
+   case nir_op_b2f32:
+   case nir_op_b2i8:
+   case nir_op_b2i16:
+   case nir_op_b2i32:
+      dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
+      break;
+
+   case nir_op_fquantize2f16:
+      dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
+                          16, nir_op_f2f32);
+      break;
+   case nir_op_f2b1:
+      dst[0] = ir3_CMPS_F(
+         b, src[0], 0,
+         create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0);
+      dst[0]->cat2.condition = IR3_COND_NE;
+      break;
+
+   case nir_op_i2b1:
+      /* i2b1 will appear when translating from nir_load_ubo or
+       * nir_intrinsic_load_ssbo, where any non-zero value is true.
+       */
+      dst[0] = ir3_CMPS_S(
+         b, src[0], 0,
+         create_immed_typed(b, 0, bs[0] == 16 ? TYPE_U16 : TYPE_U32), 0);
+      dst[0]->cat2.condition = IR3_COND_NE;
+      break;
+
+   case nir_op_b2b1:
+      /* b2b1 will appear when translating from
+       *
+       * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
+       * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
+       *
+       * A negate can turn those into a 1 or 0 for us.
+       */
+      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+      break;
+
+   case nir_op_b2b32:
+      /* b2b32 will appear when converting our 1-bit bools to a store_shared
+       * argument.
+       *
+       * A negate can turn those into a ~0 for us.
+       */
+      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+      break;
+
+   case nir_op_fneg:
+      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
+      break;
+   case nir_op_fabs:
+      dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
+      break;
+   case nir_op_fmax:
+      dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_fmin:
+      dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_fsat:
+      /* if there is just a single use of the src, and it supports the
+       * (sat) bit, we can just fold the (sat) flag back to the
+       * src instruction and create a mov.  This is easier for cp
+       * to eliminate.
+       */
+      if (alu->src[0].src.is_ssa && is_sat_compatible(src[0]->opc) &&
+          (list_length(&alu->src[0].src.ssa->uses) == 1)) {
+         src[0]->flags |= IR3_INSTR_SAT;
+         dst[0] = ir3_MOV(b, src[0], dst_type);
+      } else {
+         /* otherwise generate a max.f that saturates.. blob does
+          * similar (generating a cat2 mov using max.f)
+          */
+         dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
+         dst[0]->flags |= IR3_INSTR_SAT;
+      }
+      break;
+   case nir_op_fmul:
+      dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_fadd:
+      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_fsub:
+      dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
+      break;
+   case nir_op_ffma:
+      dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
+      break;
+   case nir_op_fddx:
+   case nir_op_fddx_coarse:
+      dst[0] = ir3_DSX(b, src[0], 0);
+      dst[0]->cat5.type = TYPE_F32;
+      break;
+   case nir_op_fddx_fine:
+      dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
+      dst[0]->cat5.type = TYPE_F32;
+      break;
+   case nir_op_fddy:
+   case nir_op_fddy_coarse:
+      dst[0] = ir3_DSY(b, src[0], 0);
+      dst[0]->cat5.type = TYPE_F32;
+      break;
+   case nir_op_fddy_fine:
+      dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
+      dst[0]->cat5.type = TYPE_F32;
+      break;
+   case nir_op_flt:
+      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_LT;
+      break;
+   case nir_op_fge:
+      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_GE;
+      break;
+   case nir_op_feq:
+      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_EQ;
+      break;
+   case nir_op_fneu:
+      dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_NE;
+      break;
+   case nir_op_fceil:
+      dst[0] = ir3_CEIL_F(b, src[0], 0);
+      break;
+   case nir_op_ffloor:
+      dst[0] = ir3_FLOOR_F(b, src[0], 0);
+      break;
+   case nir_op_ftrunc:
+      dst[0] = ir3_TRUNC_F(b, src[0], 0);
+      break;
+   case nir_op_fround_even:
+      dst[0] = ir3_RNDNE_F(b, src[0], 0);
+      break;
+   case nir_op_fsign:
+      dst[0] = ir3_SIGN_F(b, src[0], 0);
+      break;
+
+   case nir_op_fsin:
+      dst[0] = ir3_SIN(b, src[0], 0);
+      break;
+   case nir_op_fcos:
+      dst[0] = ir3_COS(b, src[0], 0);
+      break;
+   case nir_op_frsq:
+      dst[0] = ir3_RSQ(b, src[0], 0);
+      break;
+   case nir_op_frcp:
+      dst[0] = ir3_RCP(b, src[0], 0);
+      break;
+   case nir_op_flog2:
+      dst[0] = ir3_LOG2(b, src[0], 0);
+      break;
+   case nir_op_fexp2:
+      dst[0] = ir3_EXP2(b, src[0], 0);
+      break;
+   case nir_op_fsqrt:
+      dst[0] = ir3_SQRT(b, src[0], 0);
+      break;
+
+   case nir_op_iabs:
+      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
+      break;
+   case nir_op_iadd:
+      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_iand:
+      dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_imax:
+      dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_umax:
+      dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_imin:
+      dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_umin:
+      dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_umul_low:
+      dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_imadsh_mix16:
+      dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
+      break;
+   case nir_op_imad24_ir3:
+      dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
+      break;
+   case nir_op_imul:
+      compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
+      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_imul24:
+      dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_ineg:
+      dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+      break;
+   case nir_op_inot:
+      if (bs[0] == 1) {
+         dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0);
+      } else {
+         dst[0] = ir3_NOT_B(b, src[0], 0);
+      }
+      break;
+   case nir_op_ior:
+      dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_ishl:
+      dst[0] =
+         ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
+      break;
+   case nir_op_ishr:
+      dst[0] =
+         ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
+      break;
+   case nir_op_isub:
+      dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_ixor:
+      dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
+      break;
+   case nir_op_ushr:
+      dst[0] =
+         ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
+      break;
+   case nir_op_ilt:
+      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_LT;
+      break;
+   case nir_op_ige:
+      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_GE;
+      break;
+   case nir_op_ieq:
+      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_EQ;
+      break;
+   case nir_op_ine:
+      dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_NE;
+      break;
+   case nir_op_ult:
+      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_LT;
+      break;
+   case nir_op_uge:
+      dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+      dst[0]->cat2.condition = IR3_COND_GE;
+      break;
+
+   case nir_op_bcsel: {
+      struct ir3_instruction *cond = src[0];
+
+      /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
+       * we can ignore that and use original cond, since the nonzero-ness of
+       * cond stays the same.
+       */
+      if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
+          (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
+             IR3_REG_SNEG) {
+         cond = cond->srcs[0]->def->instr;
+      }
+
+      compile_assert(ctx, bs[1] == bs[2]);
+      /* The condition's size has to match the other two arguments' size, so
+       * convert down if necessary.
+       */
+      if (bs[1] == 16) {
+         struct hash_entry *prev_entry =
+            _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
+         if (prev_entry) {
+            cond = prev_entry->data;
+         } else {
+            cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
+            _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
+         }
+      }
+
+      if (bs[1] != 16)
+         dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
+      else
+         dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
+      break;
+   }
+   case nir_op_bit_count: {
+      // TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
+      // double check on earlier gen's.  Once half-precision support is
+      // in place, this should probably move to a NIR lowering pass:
+      struct ir3_instruction *hi, *lo;
+
+      hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
+                   TYPE_U16);
+      lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
+
+      hi = ir3_CBITS_B(b, hi, 0);
+      lo = ir3_CBITS_B(b, lo, 0);
+
+      // TODO maybe the builders should default to making dst half-precision
+      // if the src's were half precision, to make this less awkward.. otoh
+      // we should probably just do this lowering in NIR.
+      hi->dsts[0]->flags |= IR3_REG_HALF;
+      lo->dsts[0]->flags |= IR3_REG_HALF;
+
+      dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
+      dst[0]->dsts[0]->flags |= IR3_REG_HALF;
+      dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
+      break;
+   }
+   case nir_op_ifind_msb: {
+      struct ir3_instruction *cmp;
+      dst[0] = ir3_CLZ_S(b, src[0], 0);
+      cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
+      cmp->cat2.condition = IR3_COND_GE;
+      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
+                           0, cmp, 0, dst[0], 0);
+      break;
+   }
+   case nir_op_ufind_msb:
+      dst[0] = ir3_CLZ_B(b, src[0], 0);
+      dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
+                           0, src[0], 0, dst[0], 0);
+      break;
+   case nir_op_find_lsb:
+      dst[0] = ir3_BFREV_B(b, src[0], 0);
+      dst[0] = ir3_CLZ_B(b, dst[0], 0);
+      break;
+   case nir_op_bitfield_reverse:
+      dst[0] = ir3_BFREV_B(b, src[0], 0);
+      break;
+
+   default:
+      ir3_context_error(ctx, "Unhandled ALU op: %s\n",
+                        nir_op_infos[alu->op].name);
+      break;
+   }
+
+   if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
+      assert(nir_dest_bit_size(alu->dest.dest) == 1 || alu->op == nir_op_b2b32);
+      assert(dst_sz == 1);
+   } else {
+      /* 1-bit values stored in 32-bit registers are only valid for certain
+       * ALU ops.
+       */
+      switch (alu->op) {
+      case nir_op_iand:
+      case nir_op_ior:
+      case nir_op_ixor:
+      case nir_op_inot:
+      case nir_op_bcsel:
+         break;
+      default:
+         compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
+      }
+   }
+
+   ir3_put_dst(ctx, &alu->dest.dest);
 }
 
 static void
 emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-                                                       struct ir3_instruction **dst)
+                            struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-
-       unsigned ncomp = intr->num_components;
-       struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
-       struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
-       struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
-       ldc->dsts[0]->wrmask = MASK(ncomp);
-       ldc->cat6.iim_val = ncomp;
-       ldc->cat6.d = nir_intrinsic_component(intr);
-       ldc->cat6.type = TYPE_U32;
-
-       ir3_handle_bindless_cat6(ldc, intr->src[0]);
-       if (ldc->flags & IR3_INSTR_B)
-               ctx->so->bindless_ubo = true;
-       ir3_handle_nonuniform(ldc, intr);
-
-       ir3_split_dest(b, dst, ldc, 0, ncomp);
+   struct ir3_block *b = ctx->block;
+
+   unsigned ncomp = intr->num_components;
+   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
+   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
+   struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
+   ldc->dsts[0]->wrmask = MASK(ncomp);
+   ldc->cat6.iim_val = ncomp;
+   ldc->cat6.d = nir_intrinsic_component(intr);
+   ldc->cat6.type = TYPE_U32;
+
+   ir3_handle_bindless_cat6(ldc, intr->src[0]);
+   if (ldc->flags & IR3_INSTR_B)
+      ctx->so->bindless_ubo = true;
+   ir3_handle_nonuniform(ldc, intr);
+
+   ir3_split_dest(b, dst, ldc, 0, ncomp);
 }
 
-
 /* handles direct/indirect UBO reads: */
 static void
 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                        struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
-       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-       unsigned ubo = regid(const_state->offsets.ubo, 0);
-       const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
-
-       int off = 0;
-
-       /* First src is ubo index, which could either be an immed or not: */
-       src0 = ir3_get_src(ctx, &intr->src[0])[0];
-       if (is_same_type_mov(src0) &&
-                       (src0->srcs[0]->flags & IR3_REG_IMMED)) {
-               base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
-               base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
-       } else {
-               base_lo = create_uniform_indirect(b, ubo, TYPE_U32, ir3_get_addr0(ctx, src0, ptrsz));
-               base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32, ir3_get_addr0(ctx, src0, ptrsz));
-
-               /* NOTE: since relative addressing is used, make sure constlen is
-                * at least big enough to cover all the UBO addresses, since the
-                * assembler won't know what the max address reg is.
-                */
-               ctx->so->constlen = MAX2(ctx->so->constlen,
-                       const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
-       }
-
-       /* note: on 32bit gpu's base_hi is ignored and DCE'd */
-       addr = base_lo;
-
-       if (nir_src_is_const(intr->src[1])) {
-               off += nir_src_as_uint(intr->src[1]);
-       } else {
-               /* For load_ubo_indirect, second src is indirect offset: */
-               src1 = ir3_get_src(ctx, &intr->src[1])[0];
-
-               /* and add offset to addr: */
-               addr = ir3_ADD_S(b, addr, 0, src1, 0);
-       }
-
-       /* if offset is to large to encode in the ldg, split it out: */
-       if ((off + (intr->num_components * 4)) > 1024) {
-               /* split out the minimal amount to improve the odds that
-                * cp can fit the immediate in the add.s instruction:
-                */
-               unsigned off2 = off + (intr->num_components * 4) - 1024;
-               addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
-               off -= off2;
-       }
-
-       if (ptrsz == 2) {
-               struct ir3_instruction *carry;
-
-               /* handle 32b rollover, ie:
-                *   if (addr < base_lo)
-                *      base_hi++
-                */
-               carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
-               carry->cat2.condition = IR3_COND_LT;
-               base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
-
-               addr = ir3_collect(ctx, addr, base_hi);
-       }
-
-       for (int i = 0; i < intr->num_components; i++) {
-               struct ir3_instruction *load =
-                       ir3_LDG(b, addr, 0,
-                                       create_immed(b, off + i * 4), 0,
-                                       create_immed(b, 1), 0); /* num components */
-               load->cat6.type = TYPE_U32;
-               dst[i] = load;
-       }
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
+   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+   unsigned ubo = regid(const_state->offsets.ubo, 0);
+   const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
+
+   int off = 0;
+
+   /* First src is ubo index, which could either be an immed or not: */
+   src0 = ir3_get_src(ctx, &intr->src[0])[0];
+   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
+      base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
+      base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
+   } else {
+      base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
+                                        ir3_get_addr0(ctx, src0, ptrsz));
+      base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
+                                        ir3_get_addr0(ctx, src0, ptrsz));
+
+      /* NOTE: since relative addressing is used, make sure constlen is
+       * at least big enough to cover all the UBO addresses, since the
+       * assembler won't know what the max address reg is.
+       */
+      ctx->so->constlen =
+         MAX2(ctx->so->constlen,
+              const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
+   }
+
+   /* note: on 32-bit GPUs, base_hi is ignored and DCE'd */
+   addr = base_lo;
+
+   if (nir_src_is_const(intr->src[1])) {
+      off += nir_src_as_uint(intr->src[1]);
+   } else {
+      /* For load_ubo_indirect, second src is indirect offset: */
+      src1 = ir3_get_src(ctx, &intr->src[1])[0];
+
+      /* and add offset to addr: */
+      addr = ir3_ADD_S(b, addr, 0, src1, 0);
+   }
+
+   /* if offset is too large to encode in the ldg, split it out: */
+   if ((off + (intr->num_components * 4)) > 1024) {
+      /* split out the minimal amount to improve the odds that
+       * cp can fit the immediate in the add.s instruction:
+       */
+      unsigned off2 = off + (intr->num_components * 4) - 1024;
+      addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
+      off -= off2;
+   }
+
+   if (ptrsz == 2) {
+      struct ir3_instruction *carry;
+
+      /* handle 32b rollover, ie:
+       *   if (addr < base_lo)
+       *      base_hi++
+       */
+      carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
+      carry->cat2.condition = IR3_COND_LT;
+      base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
+
+      addr = ir3_collect(ctx, addr, base_hi);
+   }
+
+   for (int i = 0; i < intr->num_components; i++) {
+      struct ir3_instruction *load =
+         ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
+                 create_immed(b, 1), 0); /* num components */
+      load->cat6.type = TYPE_U32;
+      dst[i] = load;
+   }
 }
 
 /* src[] = { block_index } */
 static void
 emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                         struct ir3_instruction **dst)
 {
-       if (ir3_bindless_resource(intr->src[0])) {
-               struct ir3_block *b = ctx->block;
-               struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
-               struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
-               resinfo->cat6.iim_val = 1;
-               resinfo->cat6.d = 1;
-               resinfo->cat6.type = TYPE_U32;
-               resinfo->cat6.typed = false;
-               /* resinfo has no writemask and always writes out 3 components */
-               resinfo->dsts[0]->wrmask = MASK(3);
-               ir3_handle_bindless_cat6(resinfo, intr->src[0]);
-               struct ir3_instruction *resinfo_dst;
-               ir3_split_dest(b, &resinfo_dst, resinfo, 0, 1);
-               /* Unfortunately resinfo returns the array length, i.e. in dwords,
-                * while NIR expects us to return the size in bytes.
-                *
-                * TODO: fix this in NIR.
-                */
-               *dst = ir3_SHL_B(b, resinfo_dst, 0, create_immed(b, 2), 0);
-               return;
-       }
-
-       /* SSBO size stored as a const starting at ssbo_sizes: */
-       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-       unsigned blk_idx = nir_src_as_uint(intr->src[0]);
-       unsigned idx = regid(const_state->offsets.ssbo_sizes, 0) +
-               const_state->ssbo_size.off[blk_idx];
-
-       debug_assert(const_state->ssbo_size.mask & (1 << blk_idx));
-
-       dst[0] = create_uniform(ctx->block, idx);
+   if (ir3_bindless_resource(intr->src[0])) {
+      struct ir3_block *b = ctx->block;
+      struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
+      struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
+      resinfo->cat6.iim_val = 1;
+      resinfo->cat6.d = 1;
+      resinfo->cat6.type = TYPE_U32;
+      resinfo->cat6.typed = false;
+      /* resinfo has no writemask and always writes out 3 components */
+      resinfo->dsts[0]->wrmask = MASK(3);
+      ir3_handle_bindless_cat6(resinfo, intr->src[0]);
+      struct ir3_instruction *resinfo_dst;
+      ir3_split_dest(b, &resinfo_dst, resinfo, 0, 1);
+      /* Unfortunately resinfo returns the array length, i.e. in dwords,
+       * while NIR expects us to return the size in bytes.
+       *
+       * TODO: fix this in NIR.
+       */
+      *dst = ir3_SHL_B(b, resinfo_dst, 0, create_immed(b, 2), 0);
+      return;
+   }
+
+   /* SSBO size stored as a const starting at ssbo_sizes: */
+   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+   unsigned blk_idx = nir_src_as_uint(intr->src[0]);
+   unsigned idx = regid(const_state->offsets.ssbo_sizes, 0) +
+                  const_state->ssbo_size.off[blk_idx];
+
+   debug_assert(const_state->ssbo_size.mask & (1 << blk_idx));
+
+   dst[0] = create_uniform(ctx->block, idx);
 }
 
 /* src[] = { offset }. const_index[] = { base } */
 static void
 emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                           struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *ldl, *offset;
-       unsigned base;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *ldl, *offset;
+   unsigned base;
 
-       offset = ir3_get_src(ctx, &intr->src[0])[0];
-       base   = nir_intrinsic_base(intr);
+   offset = ir3_get_src(ctx, &intr->src[0])[0];
+   base = nir_intrinsic_base(intr);
 
-       ldl = ir3_LDL(b, offset, 0,
-                       create_immed(b, base), 0,
-                       create_immed(b, intr->num_components), 0);
+   ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
+                 create_immed(b, intr->num_components), 0);
 
-       ldl->cat6.type = utype_dst(intr->dest);
-       ldl->dsts[0]->wrmask = MASK(intr->num_components);
+   ldl->cat6.type = utype_dst(intr->dest);
+   ldl->dsts[0]->wrmask = MASK(intr->num_components);
 
-       ldl->barrier_class = IR3_BARRIER_SHARED_R;
-       ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
+   ldl->barrier_class = IR3_BARRIER_SHARED_R;
+   ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
 
-       ir3_split_dest(b, dst, ldl, 0, intr->num_components);
+   ir3_split_dest(b, dst, ldl, 0, intr->num_components);
 }
 
 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
 static void
 emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stl, *offset;
-       struct ir3_instruction * const *value;
-       unsigned base, wrmask, ncomp;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *stl, *offset;
+   struct ir3_instruction *const *value;
+   unsigned base, wrmask, ncomp;
 
-       value  = ir3_get_src(ctx, &intr->src[0]);
-       offset = ir3_get_src(ctx, &intr->src[1])[0];
+   value = ir3_get_src(ctx, &intr->src[0]);
+   offset = ir3_get_src(ctx, &intr->src[1])[0];
 
-       base   = nir_intrinsic_base(intr);
-       wrmask = nir_intrinsic_write_mask(intr);
-       ncomp  = ffs(~wrmask) - 1;
+   base = nir_intrinsic_base(intr);
+   wrmask = nir_intrinsic_write_mask(intr);
+   ncomp = ffs(~wrmask) - 1;
 
-       assert(wrmask == BITFIELD_MASK(intr->num_components));
+   assert(wrmask == BITFIELD_MASK(intr->num_components));
 
-       stl = ir3_STL(b, offset, 0,
-               ir3_create_collect(ctx, value, ncomp), 0,
-               create_immed(b, ncomp), 0);
-       stl->cat6.dst_offset = base;
-       stl->cat6.type = utype_src(intr->src[0]);
-       stl->barrier_class = IR3_BARRIER_SHARED_W;
-       stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+   stl = ir3_STL(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0,
+                 create_immed(b, ncomp), 0);
+   stl->cat6.dst_offset = base;
+   stl->cat6.type = utype_src(intr->src[0]);
+   stl->barrier_class = IR3_BARRIER_SHARED_W;
+   stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
 
-       array_insert(b, b->keeps, stl);
+   array_insert(b, b->keeps, stl);
 }
 
 /* src[] = { offset }. const_index[] = { base } */
 static void
-emit_intrinsic_load_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
+                               nir_intrinsic_instr *intr,
+                               struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *load, *offset;
-       unsigned base;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *load, *offset;
+   unsigned base;
 
-       offset = ir3_get_src(ctx, &intr->src[0])[0];
-       base   = nir_intrinsic_base(intr);
+   offset = ir3_get_src(ctx, &intr->src[0])[0];
+   base = nir_intrinsic_base(intr);
 
-       load = ir3_LDLW(b, offset, 0,
-                       create_immed(b, base), 0,
-                       create_immed(b, intr->num_components), 0);
+   load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
+                   create_immed(b, intr->num_components), 0);
 
-       /* for a650, use LDL for tess ctrl inputs: */
-       if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
-               load->opc = OPC_LDL;
+   /* for a650, use LDL for tess ctrl inputs: */
+   if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
+      load->opc = OPC_LDL;
 
-       load->cat6.type = utype_dst(intr->dest);
-       load->dsts[0]->wrmask = MASK(intr->num_components);
+   load->cat6.type = utype_dst(intr->dest);
+   load->dsts[0]->wrmask = MASK(intr->num_components);
 
-       load->barrier_class = IR3_BARRIER_SHARED_R;
-       load->barrier_conflict = IR3_BARRIER_SHARED_W;
+   load->barrier_class = IR3_BARRIER_SHARED_R;
+   load->barrier_conflict = IR3_BARRIER_SHARED_W;
 
-       ir3_split_dest(b, dst, load, 0, intr->num_components);
+   ir3_split_dest(b, dst, load, 0, intr->num_components);
 }
 
 /* src[] = { value, offset }. const_index[] = { base } */
 static void
-emit_intrinsic_store_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
+                                nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *store, *offset;
-       struct ir3_instruction * const *value;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *store, *offset;
+   struct ir3_instruction *const *value;
 
-       value  = ir3_get_src(ctx, &intr->src[0]);
-       offset = ir3_get_src(ctx, &intr->src[1])[0];
+   value = ir3_get_src(ctx, &intr->src[0]);
+   offset = ir3_get_src(ctx, &intr->src[1])[0];
 
-       store = ir3_STLW(b, offset, 0,
-               ir3_create_collect(ctx, value, intr->num_components), 0,
-               create_immed(b, intr->num_components), 0);
+   store = ir3_STLW(b, offset, 0,
+                    ir3_create_collect(ctx, value, intr->num_components), 0,
+                    create_immed(b, intr->num_components), 0);
 
-       /* for a650, use STL for vertex outputs used by tess ctrl shader: */
-       if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
-               ctx->compiler->tess_use_shared)
-               store->opc = OPC_STL;
+   /* for a650, use STL for vertex outputs used by tess ctrl shader: */
+   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
+       ctx->compiler->tess_use_shared)
+      store->opc = OPC_STL;
 
-       store->cat6.dst_offset = nir_intrinsic_base(intr);
-       store->cat6.type = utype_src(intr->src[0]);
-       store->barrier_class = IR3_BARRIER_SHARED_W;
-       store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+   store->cat6.dst_offset = nir_intrinsic_base(intr);
+   store->cat6.type = utype_src(intr->src[0]);
+   store->barrier_class = IR3_BARRIER_SHARED_W;
+   store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
 
-       array_insert(b, b->keeps, store);
+   array_insert(b, b->keeps, store);
 }
 
 /*
@@ -1019,121 +1017,119 @@ emit_intrinsic_store_shared_ir3(struct ir3_context *ctx, nir_intrinsic_instr *in
 static struct ir3_instruction *
 emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *atomic, *src0, *src1;
-       type_t type = TYPE_U32;
-
-       src0 = ir3_get_src(ctx, &intr->src[0])[0];   /* offset */
-       src1 = ir3_get_src(ctx, &intr->src[1])[0];   /* value */
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_shared_atomic_add:
-               atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_imin:
-               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_shared_atomic_umin:
-               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_imax:
-               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_shared_atomic_umax:
-               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_and:
-               atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_or:
-               atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_xor:
-               atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_exchange:
-               atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_comp_swap:
-               /* for cmpxchg, src1 is [ui]vec2(data, compare): */
-               src1 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[2])[0], src1);
-               atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = 1;
-       atomic->cat6.type = type;
-       atomic->barrier_class = IR3_BARRIER_SHARED_W;
-       atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
-
-       /* even if nothing consumes the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       return atomic;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *atomic, *src0, *src1;
+   type_t type = TYPE_U32;
+
+   src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
+   src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_shared_atomic_add:
+      atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_shared_atomic_imin:
+      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+      type = TYPE_S32;
+      break;
+   case nir_intrinsic_shared_atomic_umin:
+      atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_shared_atomic_imax:
+      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+      type = TYPE_S32;
+      break;
+   case nir_intrinsic_shared_atomic_umax:
+      atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_shared_atomic_and:
+      atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_shared_atomic_or:
+      atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_shared_atomic_xor:
+      atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_shared_atomic_exchange:
+      atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
+      break;
+   case nir_intrinsic_shared_atomic_comp_swap:
+      /* for cmpxchg, src1 is [ui]vec2(data, compare): */
+      src1 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[2])[0], src1);
+      atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
+      break;
+   default:
+      unreachable("boo");
+   }
+
+   atomic->cat6.iim_val = 1;
+   atomic->cat6.d = 1;
+   atomic->cat6.type = type;
+   atomic->barrier_class = IR3_BARRIER_SHARED_W;
+   atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+
+   /* even if nothing consumes the result, we can't DCE the instruction: */
+   array_insert(b, b->keeps, atomic);
+
+   return atomic;
 }
 
 /* src[] = { offset }. */
 static void
 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                            struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *ldp, *offset;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *ldp, *offset;
 
-       offset = ir3_get_src(ctx, &intr->src[0])[0];
+   offset = ir3_get_src(ctx, &intr->src[0])[0];
 
-       ldp = ir3_LDP(b, offset, 0,
-                       create_immed(b, 0), 0,
-                       create_immed(b, intr->num_components), 0);
+   ldp = ir3_LDP(b, offset, 0, create_immed(b, 0), 0,
+                 create_immed(b, intr->num_components), 0);
 
-       ldp->cat6.type = utype_dst(intr->dest);
-       ldp->dsts[0]->wrmask = MASK(intr->num_components);
+   ldp->cat6.type = utype_dst(intr->dest);
+   ldp->dsts[0]->wrmask = MASK(intr->num_components);
 
-       ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
-       ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
+   ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
+   ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
 
-       ir3_split_dest(b, dst, ldp, 0, intr->num_components);
+   ir3_split_dest(b, dst, ldp, 0, intr->num_components);
 }
 
 /* src[] = { value, offset }. const_index[] = { write_mask } */
 static void
 emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stp, *offset;
-       struct ir3_instruction * const *value;
-       unsigned wrmask, ncomp;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *stp, *offset;
+   struct ir3_instruction *const *value;
+   unsigned wrmask, ncomp;
 
-       value  = ir3_get_src(ctx, &intr->src[0]);
-       offset = ir3_get_src(ctx, &intr->src[1])[0];
+   value = ir3_get_src(ctx, &intr->src[0]);
+   offset = ir3_get_src(ctx, &intr->src[1])[0];
 
-       wrmask = nir_intrinsic_write_mask(intr);
-       ncomp  = ffs(~wrmask) - 1;
+   wrmask = nir_intrinsic_write_mask(intr);
+   ncomp = ffs(~wrmask) - 1;
 
-       assert(wrmask == BITFIELD_MASK(intr->num_components));
+   assert(wrmask == BITFIELD_MASK(intr->num_components));
 
-       stp = ir3_STP(b, offset, 0,
-               ir3_create_collect(ctx, value, ncomp), 0,
-               create_immed(b, ncomp), 0);
-       stp->cat6.dst_offset = 0;
-       stp->cat6.type = utype_src(intr->src[0]);
-       stp->barrier_class = IR3_BARRIER_PRIVATE_W;
-       stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
+   stp = ir3_STP(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0,
+                 create_immed(b, ncomp), 0);
+   stp->cat6.dst_offset = 0;
+   stp->cat6.type = utype_src(intr->src[0]);
+   stp->barrier_class = IR3_BARRIER_PRIVATE_W;
+   stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
 
-       array_insert(b, b->keeps, stp);
+   array_insert(b, b->keeps, stp);
 }
 
 struct tex_src_info {
-       /* For prefetch */
-       unsigned tex_base, samp_base, tex_idx, samp_idx;
-       /* For normal tex instructions */
-       unsigned base, combined_idx, a1_val, flags;
-       struct ir3_instruction *samp_tex;
+   /* For prefetch */
+   unsigned tex_base, samp_base, tex_idx, samp_idx;
+   /* For normal tex instructions */
+   unsigned base, combined_idx, a1_val, flags;
+   struct ir3_instruction *samp_tex;
 };
 
 /* TODO handle actual indirect/dynamic case.. which is going to be weird
@@ -1142,482 +1138,485 @@ struct tex_src_info {
 static struct tex_src_info
 get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct tex_src_info info = { 0 };
-       nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(intr->src[0]);
-       ctx->so->bindless_tex = true;
-
-       if (bindless_tex) {
-               /* Bindless case */
-               info.flags |= IR3_INSTR_B;
-
-               /* Gather information required to determine which encoding to
-                * choose as well as for prefetch.
-                */
-               info.tex_base = nir_intrinsic_desc_set(bindless_tex);
-               bool tex_const = nir_src_is_const(bindless_tex->src[0]);
-               if (tex_const)
-                       info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
-               info.samp_idx = 0;
-
-               /* Choose encoding. */
-               if (tex_const && info.tex_idx < 256) {
-                       if (info.tex_idx < 16) {
-                               /* Everything fits within the instruction */
-                               info.base = info.tex_base;
-                               info.combined_idx = info.samp_idx | (info.tex_idx << 4);
-                       } else {
-                               info.base = info.tex_base;
-                               info.a1_val = info.tex_idx << 3;
-                               info.combined_idx = 0;
-                               info.flags |= IR3_INSTR_A1EN;
-                       }
-                       info.samp_tex = NULL;
-               } else {
-                       info.flags |= IR3_INSTR_S2EN;
-                       info.base = info.tex_base;
-
-                       /* Note: the indirect source is now a vec2 instead of hvec2 */
-                       struct ir3_instruction *texture, *sampler;
-
-                       texture = ir3_get_src(ctx, &intr->src[0])[0];
-                       sampler = create_immed(b, 0);
-                       info.samp_tex = ir3_collect(ctx, texture, sampler);
-               }
-       } else {
-               info.flags |= IR3_INSTR_S2EN;
-               unsigned slot = nir_src_as_uint(intr->src[0]);
-               unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
-               struct ir3_instruction *texture, *sampler;
-
-               texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
-               sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
-
-               info.samp_tex = ir3_collect(ctx, sampler, texture);
-       }
-       
-       return info;
+   struct ir3_block *b = ctx->block;
+   struct tex_src_info info = {0};
+   nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(intr->src[0]);
+   ctx->so->bindless_tex = true;
+
+   if (bindless_tex) {
+      /* Bindless case */
+      info.flags |= IR3_INSTR_B;
+
+      /* Gather information required to determine which encoding to
+       * choose as well as for prefetch.
+       */
+      info.tex_base = nir_intrinsic_desc_set(bindless_tex);
+      bool tex_const = nir_src_is_const(bindless_tex->src[0]);
+      if (tex_const)
+         info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
+      info.samp_idx = 0;
+
+      /* Choose encoding. */
+      if (tex_const && info.tex_idx < 256) {
+         if (info.tex_idx < 16) {
+            /* Everything fits within the instruction */
+            info.base = info.tex_base;
+            info.combined_idx = info.samp_idx | (info.tex_idx << 4);
+         } else {
+            info.base = info.tex_base;
+            info.a1_val = info.tex_idx << 3;
+            info.combined_idx = 0;
+            info.flags |= IR3_INSTR_A1EN;
+         }
+         info.samp_tex = NULL;
+      } else {
+         info.flags |= IR3_INSTR_S2EN;
+         info.base = info.tex_base;
+
+         /* Note: the indirect source is now a vec2 instead of hvec2 */
+         struct ir3_instruction *texture, *sampler;
+
+         texture = ir3_get_src(ctx, &intr->src[0])[0];
+         sampler = create_immed(b, 0);
+         info.samp_tex = ir3_collect(ctx, texture, sampler);
+      }
+   } else {
+      info.flags |= IR3_INSTR_S2EN;
+      unsigned slot = nir_src_as_uint(intr->src[0]);
+      unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
+      struct ir3_instruction *texture, *sampler;
+
+      texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
+      sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
+
+      info.samp_tex = ir3_collect(ctx, sampler, texture);
+   }
+
+   return info;
 }
 
 static struct ir3_instruction *
 emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
-                type_t type, unsigned wrmask, struct ir3_instruction *src0,
-                struct ir3_instruction *src1)
+         type_t type, unsigned wrmask, struct ir3_instruction *src0,
+         struct ir3_instruction *src1)
 {
-       struct ir3_instruction *sam, *addr;
-       if (info.flags & IR3_INSTR_A1EN) {
-               addr = ir3_get_addr1(ctx, info.a1_val);
-       }
-       sam = ir3_SAM(ctx->block, opc, type, 0b1111, info.flags,
-                       info.samp_tex, src0, src1);
-       if (info.flags & IR3_INSTR_A1EN) {
-               ir3_instr_set_address(sam, addr);
-       }
-       if (info.flags & IR3_INSTR_B) {
-               sam->cat5.tex_base = info.base;
-               sam->cat5.samp = info.combined_idx;
-       }
-       return sam;
+   struct ir3_instruction *sam, *addr;
+   if (info.flags & IR3_INSTR_A1EN) {
+      addr = ir3_get_addr1(ctx, info.a1_val);
+   }
+   sam = ir3_SAM(ctx->block, opc, type, 0b1111, info.flags, info.samp_tex, src0,
+                 src1);
+   if (info.flags & IR3_INSTR_A1EN) {
+      ir3_instr_set_address(sam, addr);
+   }
+   if (info.flags & IR3_INSTR_B) {
+      sam->cat5.tex_base = info.base;
+      sam->cat5.samp = info.combined_idx;
+   }
+   return sam;
 }
 
 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
 static void
 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                          struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
-       struct ir3_instruction *sam;
-       struct ir3_instruction * const *src0 = ir3_get_src(ctx, &intr->src[1]);
-       struct ir3_instruction *coords[4];
-       unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
-       type_t type = ir3_get_type_for_image_intrinsic(intr);
-
-       /* hmm, this seems a bit odd, but it is what blob does and (at least
-        * a5xx) just faults on bogus addresses otherwise:
-        */
-       if (flags & IR3_INSTR_3D) {
-               flags &= ~IR3_INSTR_3D;
-               flags |= IR3_INSTR_A;
-       }
-       info.flags |= flags;
-
-       for (unsigned i = 0; i < ncoords; i++)
-               coords[i] = src0[i];
-
-       if (ncoords == 1)
-               coords[ncoords++] = create_immed(b, 0);
-
-       sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
-                                  ir3_create_collect(ctx, coords, ncoords), NULL);
-
-       ir3_handle_nonuniform(sam, intr);
-
-       sam->barrier_class = IR3_BARRIER_IMAGE_R;
-       sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
-
-       ir3_split_dest(b, dst, sam, 0, 4);
+   struct ir3_block *b = ctx->block;
+   struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
+   struct ir3_instruction *sam;
+   struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
+   struct ir3_instruction *coords[4];
+   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
+   type_t type = ir3_get_type_for_image_intrinsic(intr);
+
+   /* hmm, this seems a bit odd, but it is what blob does and (at least
+    * a5xx) just faults on bogus addresses otherwise:
+    */
+   if (flags & IR3_INSTR_3D) {
+      flags &= ~IR3_INSTR_3D;
+      flags |= IR3_INSTR_A;
+   }
+   info.flags |= flags;
+
+   for (unsigned i = 0; i < ncoords; i++)
+      coords[i] = src0[i];
+
+   if (ncoords == 1)
+      coords[ncoords++] = create_immed(b, 0);
+
+   sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
+                  ir3_create_collect(ctx, coords, ncoords), NULL);
+
+   ir3_handle_nonuniform(sam, intr);
+
+   sam->barrier_class = IR3_BARRIER_IMAGE_R;
+   sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
+
+   ir3_split_dest(b, dst, sam, 0, 4);
 }
 
 /* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
 void
-emit_intrinsic_image_size_tex(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+emit_intrinsic_image_size_tex(struct ir3_context *ctx,
+                              nir_intrinsic_instr *intr,
+                              struct ir3_instruction **dst)
 {
-       struct ir3_block *b = ctx->block;
-       struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
-       struct ir3_instruction *sam, *lod;
-       unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
-       type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ?
-                       TYPE_U16 : TYPE_U32;
-
-       info.flags |= flags;
-       assert(nir_src_as_uint(intr->src[1]) == 0);
-       lod = create_immed(b, 0);
-       sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
-
-       /* Array size actually ends up in .w rather than .z. This doesn't
-        * matter for miplevel 0, but for higher mips the value in z is
-        * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
-        * returned, which means that we have to add 1 to it for arrays for
-        * a3xx.
-        *
-        * Note use a temporary dst and then copy, since the size of the dst
-        * array that is passed in is based on nir's understanding of the
-        * result size, not the hardware's
-        */
-       struct ir3_instruction *tmp[4];
-
-       ir3_split_dest(b, tmp, sam, 0, 4);
-
-       for (unsigned i = 0; i < ncoords; i++)
-               dst[i] = tmp[i];
-
-       if (flags & IR3_INSTR_A) {
-               if (ctx->compiler->levels_add_one) {
-                       dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
-               } else {
-                       dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32);
-               }
-       }
+   struct ir3_block *b = ctx->block;
+   struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
+   struct ir3_instruction *sam, *lod;
+   unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
+   type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32;
+
+   info.flags |= flags;
+   assert(nir_src_as_uint(intr->src[1]) == 0);
+   lod = create_immed(b, 0);
+   sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
+
+   /* Array size actually ends up in .w rather than .z. This doesn't
+    * matter for miplevel 0, but for higher mips the value in z is
+    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
+    * returned, which means that we have to add 1 to it for arrays for
+    * a3xx.
+    *
+    * Note use a temporary dst and then copy, since the size of the dst
+    * array that is passed in is based on nir's understanding of the
+    * result size, not the hardware's
+    */
+   struct ir3_instruction *tmp[4];
+
+   ir3_split_dest(b, tmp, sam, 0, 4);
+
+   for (unsigned i = 0; i < ncoords; i++)
+      dst[i] = tmp[i];
+
+   if (flags & IR3_INSTR_A) {
+      if (ctx->compiler->levels_add_one) {
+         dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
+      } else {
+         dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
+      }
+   }
 }
 
 static void
 emit_control_barrier(struct ir3_context *ctx)
 {
-       /* Hull shaders dispatch 32 wide so an entire patch will always
-        * fit in a single warp and execute in lock-step. Consequently,
-        * we don't need to do anything for TCS barriers. Emitting
-        * a barrier instruction will deadlock.
-        */
-       if (ctx->so->type == MESA_SHADER_TESS_CTRL)
-               return;
-
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *barrier = ir3_BAR(b);
-       barrier->cat7.g = true;
-       if (ctx->compiler->gpu_id < 600)
-               barrier->cat7.l = true;
-       barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
-       barrier->barrier_class = IR3_BARRIER_EVERYTHING;
-       array_insert(b, b->keeps, barrier);
+   /* Hull shaders dispatch 32 wide so an entire patch will always
+    * fit in a single warp and execute in lock-step. Consequently,
+    * we don't need to do anything for TCS barriers. Emitting
+    * a barrier instruction will deadlock.
+    */
+   if (ctx->so->type == MESA_SHADER_TESS_CTRL)
+      return;
+
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *barrier = ir3_BAR(b);
+   barrier->cat7.g = true;
+   if (ctx->compiler->gpu_id < 600)
+      barrier->cat7.l = true;
+   barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+   barrier->barrier_class = IR3_BARRIER_EVERYTHING;
+   array_insert(b, b->keeps, barrier);
 }
 
 static void
 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *barrier;
-
-       /* TODO: find out why there is a major difference in .l usage
-        * between a5xx and a6xx.
-        */
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_control_barrier:
-               emit_control_barrier(ctx);
-               return;
-       case nir_intrinsic_scoped_barrier: {
-               nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
-               nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
-
-               if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
-                       /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
-                        * because hull shaders dispatch 32 wide so an entire patch will
-                        * always fit in a single warp and execute in lock-step.
-                        *
-                        * TODO: memory barrier also tells us not to reorder stores, this
-                        * information is lost here (backend doesn't reorder stores so we
-                        * are safe for now).
-                        */
-                       modes &= ~nir_var_shader_out;
-               }
-
-               assert(!(modes & nir_var_shader_out));
-
-               if ((modes & (nir_var_mem_shared | nir_var_mem_ssbo |
-                               nir_var_mem_global))) {
-                       barrier = ir3_FENCE(b);
-                       barrier->cat7.r = true;
-                       barrier->cat7.w = true;
-
-                       if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
-                               barrier->cat7.g = true;
-                       }
-
-                       if (ctx->compiler->gpu_id > 600) {
-                               if (modes & nir_var_mem_ssbo) {
-                                       barrier->cat7.l = true;
-                               }
-                       } else {
-                               if (modes & (nir_var_mem_shared | nir_var_mem_ssbo)) {
-                                       barrier->cat7.l = true;
-                               }
-                       }
-
-                       barrier->barrier_class = 0;
-                       barrier->barrier_conflict = 0;
-
-                       if (modes & nir_var_mem_shared) {
-                               barrier->barrier_class |= IR3_BARRIER_SHARED_W;
-                               barrier->barrier_conflict |= IR3_BARRIER_SHARED_R |
-                                               IR3_BARRIER_SHARED_W;
-                       }
-
-                       if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
-                               barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
-                               barrier->barrier_conflict |=
-                                               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-                       }
-
-                       /* TODO: check for image mode when it has a separate one */
-                       if (modes & nir_var_mem_ssbo) {
-                               barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
-                               barrier->barrier_conflict |=
-                                               IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
-                       }
-                       array_insert(b, b->keeps, barrier);
-               }
-
-               if (exec_scope >= NIR_SCOPE_WORKGROUP) {
-                       emit_control_barrier(ctx);
-               }
-
-               return;
-       }
-       case nir_intrinsic_memory_barrier_tcs_patch:
-               /* Not applicable, see explanation for scoped_barrier + shader_out */
-               return;
-       case nir_intrinsic_memory_barrier_buffer:
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               if (ctx->compiler->gpu_id > 600)
-                       barrier->cat7.l = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_BUFFER_W;
-               barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
-                               IR3_BARRIER_BUFFER_W;
-               break;
-       case nir_intrinsic_memory_barrier_image:
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               barrier->cat7.l = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_IMAGE_W;
-               barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
-                               IR3_BARRIER_IMAGE_W;
-               break;
-       case nir_intrinsic_memory_barrier_shared:
-               barrier = ir3_FENCE(b);
-               if (ctx->compiler->gpu_id < 600)
-                       barrier->cat7.l = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_SHARED_W;
-               barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
-                               IR3_BARRIER_SHARED_W;
-               break;
-       case nir_intrinsic_memory_barrier:
-       case nir_intrinsic_group_memory_barrier:
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               barrier->cat7.l = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_SHARED_W |
-                               IR3_BARRIER_IMAGE_W |
-                               IR3_BARRIER_BUFFER_W;
-               barrier->barrier_conflict =
-                               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
-                               IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
-                               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       /* make sure barrier doesn't get DCE'd */
-       array_insert(b, b->keeps, barrier);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *barrier;
+
+   /* TODO: find out why there is a major difference in .l usage
+    * between a5xx and a6xx.
+    */
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_control_barrier:
+      emit_control_barrier(ctx);
+      return;
+   case nir_intrinsic_scoped_barrier: {
+      nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
+      nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
+
+      if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
+         /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
+          * because hull shaders dispatch 32 wide so an entire patch will
+          * always fit in a single warp and execute in lock-step.
+          *
+          * TODO: memory barrier also tells us not to reorder stores, this
+          * information is lost here (backend doesn't reorder stores so we
+          * are safe for now).
+          */
+         modes &= ~nir_var_shader_out;
+      }
+
+      assert(!(modes & nir_var_shader_out));
+
+      if ((modes &
+           (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global))) {
+         barrier = ir3_FENCE(b);
+         barrier->cat7.r = true;
+         barrier->cat7.w = true;
+
+         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
+            barrier->cat7.g = true;
+         }
+
+         if (ctx->compiler->gpu_id > 600) {
+            if (modes & nir_var_mem_ssbo) {
+               barrier->cat7.l = true;
+            }
+         } else {
+            if (modes & (nir_var_mem_shared | nir_var_mem_ssbo)) {
+               barrier->cat7.l = true;
+            }
+         }
+
+         barrier->barrier_class = 0;
+         barrier->barrier_conflict = 0;
+
+         if (modes & nir_var_mem_shared) {
+            barrier->barrier_class |= IR3_BARRIER_SHARED_W;
+            barrier->barrier_conflict |=
+               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+         }
+
+         if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
+            barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
+            barrier->barrier_conflict |=
+               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+         }
+
+         /* TODO: check for image mode when it has a separate one */
+         if (modes & nir_var_mem_ssbo) {
+            barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
+            barrier->barrier_conflict |=
+               IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
+         }
+         array_insert(b, b->keeps, barrier);
+      }
+
+      if (exec_scope >= NIR_SCOPE_WORKGROUP) {
+         emit_control_barrier(ctx);
+      }
+
+      return;
+   }
+   case nir_intrinsic_memory_barrier_tcs_patch:
+      /* Not applicable, see explanation for scoped_barrier + shader_out */
+      return;
+   case nir_intrinsic_memory_barrier_buffer:
+      barrier = ir3_FENCE(b);
+      barrier->cat7.g = true;
+      if (ctx->compiler->gpu_id > 600)
+         barrier->cat7.l = true;
+      barrier->cat7.r = true;
+      barrier->cat7.w = true;
+      barrier->barrier_class = IR3_BARRIER_BUFFER_W;
+      barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+      break;
+   case nir_intrinsic_memory_barrier_image:
+      barrier = ir3_FENCE(b);
+      barrier->cat7.g = true;
+      barrier->cat7.l = true;
+      barrier->cat7.r = true;
+      barrier->cat7.w = true;
+      barrier->barrier_class = IR3_BARRIER_IMAGE_W;
+      barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+      break;
+   case nir_intrinsic_memory_barrier_shared:
+      barrier = ir3_FENCE(b);
+      if (ctx->compiler->gpu_id < 600)
+         barrier->cat7.l = true;
+      barrier->cat7.r = true;
+      barrier->cat7.w = true;
+      barrier->barrier_class = IR3_BARRIER_SHARED_W;
+      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+      break;
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_group_memory_barrier:
+      barrier = ir3_FENCE(b);
+      barrier->cat7.g = true;
+      barrier->cat7.l = true;
+      barrier->cat7.r = true;
+      barrier->cat7.w = true;
+      barrier->barrier_class =
+         IR3_BARRIER_SHARED_W | IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
+      barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
+                                  IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+                                  IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+      break;
+   default:
+      unreachable("boo");
+   }
+
+   /* make sure barrier doesn't get DCE'd */
+   array_insert(b, b->keeps, barrier);
 }
 
-static void add_sysval_input_compmask(struct ir3_context *ctx,
-               gl_system_value slot, unsigned compmask,
-               struct ir3_instruction *instr)
+static void
+add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
+                          unsigned compmask, struct ir3_instruction *instr)
 {
-       struct ir3_shader_variant *so = ctx->so;
-       unsigned n = so->inputs_count++;
+   struct ir3_shader_variant *so = ctx->so;
+   unsigned n = so->inputs_count++;
 
-       assert(instr->opc == OPC_META_INPUT);
-       instr->input.inidx = n;
-       instr->input.sysval = slot;
+   assert(instr->opc == OPC_META_INPUT);
+   instr->input.inidx = n;
+   instr->input.sysval = slot;
 
-       so->inputs[n].sysval = true;
-       so->inputs[n].slot = slot;
-       so->inputs[n].compmask = compmask;
-       so->total_in++;
+   so->inputs[n].sysval = true;
+   so->inputs[n].slot = slot;
+   so->inputs[n].compmask = compmask;
+   so->total_in++;
 
-       so->sysval_in += util_last_bit(compmask);
+   so->sysval_in += util_last_bit(compmask);
 }
 
 static struct ir3_instruction *
 create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
-               unsigned compmask)
+                    unsigned compmask)
 {
-       assert(compmask);
-       struct ir3_instruction *sysval = create_input(ctx, compmask);
-       add_sysval_input_compmask(ctx, slot, compmask, sysval);
-       return sysval;
+   assert(compmask);
+   struct ir3_instruction *sysval = create_input(ctx, compmask);
+   add_sysval_input_compmask(ctx, slot, compmask, sysval);
+   return sysval;
 }
 
 static struct ir3_instruction *
 get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
 {
-       static const gl_system_value sysval_base = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
-
-       STATIC_ASSERT(sysval_base + IJ_PERSP_PIXEL == SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
-       STATIC_ASSERT(sysval_base + IJ_PERSP_SAMPLE == SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
-       STATIC_ASSERT(sysval_base + IJ_PERSP_CENTROID == SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
-       STATIC_ASSERT(sysval_base + IJ_PERSP_SIZE == SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE);
-       STATIC_ASSERT(sysval_base + IJ_LINEAR_PIXEL == SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
-       STATIC_ASSERT(sysval_base + IJ_LINEAR_CENTROID == SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
-       STATIC_ASSERT(sysval_base + IJ_LINEAR_SAMPLE == SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
-
-       if (!ctx->ij[bary]) {
-               struct ir3_instruction *xy[2];
-               struct ir3_instruction *ij;
-
-               ij = create_sysval_input(ctx, sysval_base + bary, 0x3);
-               ir3_split_dest(ctx->block, xy, ij, 0, 2);
-
-               ctx->ij[bary] = ir3_create_collect(ctx, xy, 2);
-       }
-
-       return ctx->ij[bary];
+   static const gl_system_value sysval_base =
+      SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
+
+   STATIC_ASSERT(sysval_base + IJ_PERSP_PIXEL ==
+                 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
+   STATIC_ASSERT(sysval_base + IJ_PERSP_SAMPLE ==
+                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
+   STATIC_ASSERT(sysval_base + IJ_PERSP_CENTROID ==
+                 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
+   STATIC_ASSERT(sysval_base + IJ_PERSP_SIZE ==
+                 SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE);
+   STATIC_ASSERT(sysval_base + IJ_LINEAR_PIXEL ==
+                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
+   STATIC_ASSERT(sysval_base + IJ_LINEAR_CENTROID ==
+                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
+   STATIC_ASSERT(sysval_base + IJ_LINEAR_SAMPLE ==
+                 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
+
+   if (!ctx->ij[bary]) {
+      struct ir3_instruction *xy[2];
+      struct ir3_instruction *ij;
+
+      ij = create_sysval_input(ctx, sysval_base + bary, 0x3);
+      ir3_split_dest(ctx->block, xy, ij, 0, 2);
+
+      ctx->ij[bary] = ir3_create_collect(ctx, xy, 2);
+   }
+
+   return ctx->ij[bary];
 }
 
 /* TODO: make this a common NIR helper?
- * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so it
- * can't be extended to work with this
+ * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
+ * it can't be extended to work with this
  */
 static gl_system_value
 nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
 {
-       enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
-       gl_system_value sysval;
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_load_barycentric_pixel:
-               if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
-               else
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
-               break;
-       case nir_intrinsic_load_barycentric_centroid:
-               if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
-               else
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
-               break;
-       case nir_intrinsic_load_barycentric_sample:
-               if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
-               else
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
-               break;
-       default:
-               unreachable("invalid barycentric intrinsic");
-       }
-
-       return sysval;
+   enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
+   gl_system_value sysval;
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_barycentric_pixel:
+      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
+         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
+      else
+         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
+      break;
+   case nir_intrinsic_load_barycentric_centroid:
+      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
+         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
+      else
+         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
+      break;
+   case nir_intrinsic_load_barycentric_sample:
+      if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
+         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
+      else
+         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
+      break;
+   default:
+      unreachable("invalid barycentric intrinsic");
+   }
+
+   return sysval;
 }
 
 static void
 emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
+                           struct ir3_instruction **dst)
 {
-       gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
-
-       if (!ctx->so->key.msaa) {
-               switch (sysval) {
-               case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
-                       break;
-               case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
-                       if (ctx->compiler->gpu_id < 600)
-                               sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
-                       break;
-               case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
-                       sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
-                       break;
-               case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
-                       if (ctx->compiler->gpu_id < 600)
-                               sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
-                       break;
-               default:
-                       break;
-               }
-       }
-
-       enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
-
-       struct ir3_instruction *ij = get_barycentric(ctx, bary);
-       ir3_split_dest(ctx->block, dst, ij, 0, 2);
+   gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
+
+   if (!ctx->so->key.msaa) {
+      switch (sysval) {
+      case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
+         sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
+         break;
+      case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
+         if (ctx->compiler->gpu_id < 600)
+            sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
+         break;
+      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
+         sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
+         break;
+      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
+         if (ctx->compiler->gpu_id < 600)
+            sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
+         break;
+      default:
+         break;
+      }
+   }
+
+   enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
+
+   struct ir3_instruction *ij = get_barycentric(ctx, bary);
+   ir3_split_dest(ctx->block, dst, ij, 0, 2);
 }
 
 static struct ir3_instruction *
 get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       if (!ctx->frag_coord) {
-               struct ir3_block *b = ctx->in_block;
-               struct ir3_instruction *xyzw[4];
-               struct ir3_instruction *hw_frag_coord;
-
-               hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
-               ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);
-
-               /* for frag_coord.xy, we get unsigned values.. we need
-                * to subtract (integer) 8 and divide by 16 (right-
-                * shift by 4) then convert to float:
-                *
-                *    sub.s tmp, src, 8
-                *    shr.b tmp, tmp, 4
-                *    mov.u32f32 dst, tmp
-                *
-                */
-               for (int i = 0; i < 2; i++) {
-                       xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
-                       xyzw[i] = ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
-               }
-
-               ctx->frag_coord = ir3_create_collect(ctx, xyzw, 4);
-       }
-
-       ctx->so->fragcoord_compmask |=
-                       nir_ssa_def_components_read(&intr->dest.ssa);
-
-       return ctx->frag_coord;
+   if (!ctx->frag_coord) {
+      struct ir3_block *b = ctx->in_block;
+      struct ir3_instruction *xyzw[4];
+      struct ir3_instruction *hw_frag_coord;
+
+      hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
+      ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);
+
+      /* for frag_coord.xy, we get unsigned values.. we need
+       * to subtract (integer) 8 and divide by 16 (right-
+       * shift by 4) then convert to float:
+       *
+       *    sub.s tmp, src, 8
+       *    shr.b tmp, tmp, 4
+       *    mov.u32f32 dst, tmp
+       *
+       */
+      for (int i = 0; i < 2; i++) {
+         xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
+         xyzw[i] =
+            ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
+      }
+
+      ctx->frag_coord = ir3_create_collect(ctx, xyzw, 4);
+   }
+
+   ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);
+
+   return ctx->frag_coord;
 }
 
 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
@@ -1626,562 +1625,561 @@ static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 static void
 emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
-       struct ir3_instruction **dst;
-       struct ir3_instruction * const *src;
-       struct ir3_block *b = ctx->block;
-       unsigned dest_components = nir_intrinsic_dest_components(intr);
-       int idx;
-
-       if (info->has_dest) {
-               dst = ir3_get_dst(ctx, &intr->dest, dest_components);
-       } else {
-               dst = NULL;
-       }
-
-       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-       const unsigned primitive_param = const_state->offsets.primitive_param * 4;
-       const unsigned primitive_map = const_state->offsets.primitive_map * 4;
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_load_uniform:
-               idx = nir_intrinsic_base(intr);
-               if (nir_src_is_const(intr->src[0])) {
-                       idx += nir_src_as_uint(intr->src[0]);
-                       for (int i = 0; i < dest_components; i++) {
-                               dst[i] = create_uniform_typed(b, idx + i,
-                                       nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32);
-                       }
-               } else {
-                       src = ir3_get_src(ctx, &intr->src[0]);
-                       for (int i = 0; i < dest_components; i++) {
-                               dst[i] = create_uniform_indirect(b, idx + i,
-                                               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32,
-                                               ir3_get_addr0(ctx, src[0], 1));
-                       }
-                       /* NOTE: if relative addressing is used, we set
-                        * constlen in the compiler (to worst-case value)
-                        * since we don't know in the assembler what the max
-                        * addr reg value can be:
-                        */
-                       ctx->so->constlen = MAX2(ctx->so->constlen,
-                                       const_state->ubo_state.size / 16);
-               }
-               break;
-
-       case nir_intrinsic_load_vs_primitive_stride_ir3:
-               dst[0] = create_uniform(b, primitive_param + 0);
-               break;
-       case nir_intrinsic_load_vs_vertex_stride_ir3:
-               dst[0] = create_uniform(b, primitive_param + 1);
-               break;
-       case nir_intrinsic_load_hs_patch_stride_ir3:
-               dst[0] = create_uniform(b, primitive_param + 2);
-               break;
-       case nir_intrinsic_load_patch_vertices_in:
-               dst[0] = create_uniform(b, primitive_param + 3);
-               break;
-       case nir_intrinsic_load_tess_param_base_ir3:
-               dst[0] = create_uniform(b, primitive_param + 4);
-               dst[1] = create_uniform(b, primitive_param + 5);
-               break;
-       case nir_intrinsic_load_tess_factor_base_ir3:
-               dst[0] = create_uniform(b, primitive_param + 6);
-               dst[1] = create_uniform(b, primitive_param + 7);
-               break;
-
-       case nir_intrinsic_load_primitive_location_ir3:
-               idx = nir_intrinsic_driver_location(intr);
-               dst[0] = create_uniform(b, primitive_map + idx);
-               break;
-
-       case nir_intrinsic_load_gs_header_ir3:
-               dst[0] = ctx->gs_header;
-               break;
-       case nir_intrinsic_load_tcs_header_ir3:
-               dst[0] = ctx->tcs_header;
-               break;
-
-       case nir_intrinsic_load_primitive_id:
-               dst[0] = ctx->primitive_id;
-               break;
-
-       case nir_intrinsic_load_tess_coord:
-               if (!ctx->tess_coord) {
-                       ctx->tess_coord =
-                               create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
-               }
-               ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
-
-               /* Unused, but ir3_put_dst() below wants to free something */
-               dst[2] = create_immed(b, 0);
-               break;
-
-       case nir_intrinsic_end_patch_ir3:
-               assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
-               struct ir3_instruction *end = ir3_PREDE(b);
-               array_insert(b, b->keeps, end);
-
-               end->barrier_class = IR3_BARRIER_EVERYTHING;
-               end->barrier_conflict = IR3_BARRIER_EVERYTHING;
-               break;
-
-       case nir_intrinsic_store_global_ir3:
-               ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
-               break;
-       case nir_intrinsic_load_global_ir3:
-               ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
-               break;
-
-       case nir_intrinsic_load_ubo:
-               emit_intrinsic_load_ubo(ctx, intr, dst);
-               break;
-       case nir_intrinsic_load_ubo_vec4:
-               emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
-               break;
-       case nir_intrinsic_load_frag_coord:
-               ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
-               break;
-       case nir_intrinsic_load_sample_pos_from_id: {
-               /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
-                * but that doesn't seem necessary.
-                */
-               struct ir3_instruction *offset =
-                       ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
-               offset->dsts[0]->wrmask = 0x3;
-               offset->cat5.type = TYPE_F32;
-
-               ir3_split_dest(b, dst, offset, 0, 2);
-
-               break;
-       }
-       case nir_intrinsic_load_size_ir3:
-               if (!ctx->ij[IJ_PERSP_SIZE]) {
-                       ctx->ij[IJ_PERSP_SIZE] =
-                               create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE, 0x1);
-               }
-               dst[0] = ctx->ij[IJ_PERSP_SIZE];
-               break;
-       case nir_intrinsic_load_barycentric_centroid:
-       case nir_intrinsic_load_barycentric_sample:
-       case nir_intrinsic_load_barycentric_pixel:
-               emit_intrinsic_barycentric(ctx, intr, dst);
-               break;
-       case nir_intrinsic_load_interpolated_input:
-       case nir_intrinsic_load_input:
-               setup_input(ctx, intr);
-               break;
-       /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
-        * pass and replaced by an ir3-specifc version that adds the
-        * dword-offset in the last source.
-        */
-       case nir_intrinsic_load_ssbo_ir3:
-               ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
-               break;
-       case nir_intrinsic_store_ssbo_ir3:
-               if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
-                               !ctx->s->info.fs.early_fragment_tests)
-                       ctx->so->no_earlyz = true;
-               ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
-               break;
-       case nir_intrinsic_get_ssbo_size:
-               emit_intrinsic_ssbo_size(ctx, intr, dst);
-               break;
-       case nir_intrinsic_ssbo_atomic_add_ir3:
-       case nir_intrinsic_ssbo_atomic_imin_ir3:
-       case nir_intrinsic_ssbo_atomic_umin_ir3:
-       case nir_intrinsic_ssbo_atomic_imax_ir3:
-       case nir_intrinsic_ssbo_atomic_umax_ir3:
-       case nir_intrinsic_ssbo_atomic_and_ir3:
-       case nir_intrinsic_ssbo_atomic_or_ir3:
-       case nir_intrinsic_ssbo_atomic_xor_ir3:
-       case nir_intrinsic_ssbo_atomic_exchange_ir3:
-       case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
-               if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
-                               !ctx->s->info.fs.early_fragment_tests)
-                       ctx->so->no_earlyz = true;
-               dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
-               break;
-       case nir_intrinsic_load_shared:
-               emit_intrinsic_load_shared(ctx, intr, dst);
-               break;
-       case nir_intrinsic_store_shared:
-               emit_intrinsic_store_shared(ctx, intr);
-               break;
-       case nir_intrinsic_shared_atomic_add:
-       case nir_intrinsic_shared_atomic_imin:
-       case nir_intrinsic_shared_atomic_umin:
-       case nir_intrinsic_shared_atomic_imax:
-       case nir_intrinsic_shared_atomic_umax:
-       case nir_intrinsic_shared_atomic_and:
-       case nir_intrinsic_shared_atomic_or:
-       case nir_intrinsic_shared_atomic_xor:
-       case nir_intrinsic_shared_atomic_exchange:
-       case nir_intrinsic_shared_atomic_comp_swap:
-               dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
-               break;
-       case nir_intrinsic_load_scratch:
-               emit_intrinsic_load_scratch(ctx, intr, dst);
-               break;
-       case nir_intrinsic_store_scratch:
-               emit_intrinsic_store_scratch(ctx, intr);
-               break;
-       case nir_intrinsic_image_load:
-               emit_intrinsic_load_image(ctx, intr, dst);
-               break;
-       case nir_intrinsic_bindless_image_load:
-               /* Bindless uses the IBO state, which doesn't have swizzle filled out,
-                * so using isam doesn't work.
-                *
-                * TODO: can we use isam if we fill out more fields?
-                */
-               ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
-               break;
-       case nir_intrinsic_image_store:
-       case nir_intrinsic_bindless_image_store:
-               if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
-                               !ctx->s->info.fs.early_fragment_tests)
-                       ctx->so->no_earlyz = true;
-               ctx->funcs->emit_intrinsic_store_image(ctx, intr);
-               break;
-       case nir_intrinsic_image_size:
-       case nir_intrinsic_bindless_image_size:
-               ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
-               break;
-       case nir_intrinsic_image_atomic_add:
-       case nir_intrinsic_bindless_image_atomic_add:
-       case nir_intrinsic_image_atomic_imin:
-       case nir_intrinsic_bindless_image_atomic_imin:
-       case nir_intrinsic_image_atomic_umin:
-       case nir_intrinsic_bindless_image_atomic_umin:
-       case nir_intrinsic_image_atomic_imax:
-       case nir_intrinsic_bindless_image_atomic_imax:
-       case nir_intrinsic_image_atomic_umax:
-       case nir_intrinsic_bindless_image_atomic_umax:
-       case nir_intrinsic_image_atomic_and:
-       case nir_intrinsic_bindless_image_atomic_and:
-       case nir_intrinsic_image_atomic_or:
-       case nir_intrinsic_bindless_image_atomic_or:
-       case nir_intrinsic_image_atomic_xor:
-       case nir_intrinsic_bindless_image_atomic_xor:
-       case nir_intrinsic_image_atomic_exchange:
-       case nir_intrinsic_bindless_image_atomic_exchange:
-       case nir_intrinsic_image_atomic_comp_swap:
-       case nir_intrinsic_bindless_image_atomic_comp_swap:
-               if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
-                               !ctx->s->info.fs.early_fragment_tests)
-                       ctx->so->no_earlyz = true;
-               dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
-               break;
-       case nir_intrinsic_scoped_barrier:
-       case nir_intrinsic_control_barrier:
-       case nir_intrinsic_memory_barrier:
-       case nir_intrinsic_group_memory_barrier:
-       case nir_intrinsic_memory_barrier_buffer:
-       case nir_intrinsic_memory_barrier_image:
-       case nir_intrinsic_memory_barrier_shared:
-       case nir_intrinsic_memory_barrier_tcs_patch:
-               emit_intrinsic_barrier(ctx, intr);
-               /* note that blk ptr no longer valid, make that obvious: */
-               b = NULL;
-               break;
-       case nir_intrinsic_store_output:
-               setup_output(ctx, intr);
-               break;
-       case nir_intrinsic_load_base_vertex:
-       case nir_intrinsic_load_first_vertex:
-               if (!ctx->basevertex) {
-                       ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
-               }
-               dst[0] = ctx->basevertex;
-               break;
-       case nir_intrinsic_load_draw_id:
-               if (!ctx->draw_id) {
-                       ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
-               }
-               dst[0] = ctx->draw_id;
-               break;
-       case nir_intrinsic_load_base_instance:
-               if (!ctx->base_instance) {
-                       ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
-               }
-               dst[0] = ctx->base_instance;
-               break;
-       case nir_intrinsic_load_view_index:
-               if (!ctx->view_index) {
-                       ctx->view_index = create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
-               }
-               dst[0] = ctx->view_index;
-               break;
-       case nir_intrinsic_load_vertex_id_zero_base:
-       case nir_intrinsic_load_vertex_id:
-               if (!ctx->vertex_id) {
-                       gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
-                               SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
-                       ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
-               }
-               dst[0] = ctx->vertex_id;
-               break;
-       case nir_intrinsic_load_instance_id:
-               if (!ctx->instance_id) {
-                       ctx->instance_id = create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
-               }
-               dst[0] = ctx->instance_id;
-               break;
-       case nir_intrinsic_load_sample_id:
-               ctx->so->per_samp = true;
-               FALLTHROUGH;
-       case nir_intrinsic_load_sample_id_no_per_sample:
-               if (!ctx->samp_id) {
-                       ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
-                       ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
-               }
-               dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
-               break;
-       case nir_intrinsic_load_sample_mask_in:
-               if (!ctx->samp_mask_in) {
-                       ctx->samp_mask_in = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
-               }
-               dst[0] = ctx->samp_mask_in;
-               break;
-       case nir_intrinsic_load_user_clip_plane:
-               idx = nir_intrinsic_ucp_id(intr);
-               for (int i = 0; i < dest_components; i++) {
-                       unsigned n = idx * 4 + i;
-                       dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
-               }
-               break;
-       case nir_intrinsic_load_front_face:
-               if (!ctx->frag_face) {
-                       ctx->so->frag_face = true;
-                       ctx->frag_face = create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
-                       ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
-               }
-               /* for fragface, we get -1 for back and 0 for front. However this is
-                * the inverse of what nir expects (where ~0 is true).
-                */
-               dst[0] = ir3_CMPS_S(b,
-                               ctx->frag_face, 0,
-                               create_immed_typed(b, 0, TYPE_U16), 0);
-               dst[0]->cat2.condition = IR3_COND_EQ;
-               break;
-       case nir_intrinsic_load_local_invocation_id:
-               if (!ctx->local_invocation_id) {
-                       ctx->local_invocation_id =
-                               create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
-               }
-               ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
-               break;
-       case nir_intrinsic_load_workgroup_id:
-       case nir_intrinsic_load_workgroup_id_zero_base:
-               if (!ctx->work_group_id) {
-                       ctx->work_group_id =
-                               create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
-                       ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
-               }
-               ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
-               break;
-       case nir_intrinsic_load_base_workgroup_id:
-               for (int i = 0; i < dest_components; i++) {
-                       dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
-               }
-               break;
-       case nir_intrinsic_load_num_workgroups:
-               for (int i = 0; i < dest_components; i++) {
-                       dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
-               }
-               break;
-       case nir_intrinsic_load_workgroup_size:
-               for (int i = 0; i < dest_components; i++) {
-                       dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
-               }
-               break;
-       case nir_intrinsic_load_subgroup_size:
-               dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
-               break;
-       case nir_intrinsic_load_subgroup_id_shift_ir3:
-               dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
-               break;
-       case nir_intrinsic_discard_if:
-       case nir_intrinsic_discard:
-       case nir_intrinsic_demote:
-       case nir_intrinsic_demote_if:
-       case nir_intrinsic_terminate:
-       case nir_intrinsic_terminate_if:
-       {
-               struct ir3_instruction *cond, *kill;
-
-               if (intr->intrinsic == nir_intrinsic_discard_if ||
-                       intr->intrinsic == nir_intrinsic_demote_if ||
-                       intr->intrinsic == nir_intrinsic_terminate_if) {
-                       /* conditional discard: */
-                       src = ir3_get_src(ctx, &intr->src[0]);
-                       cond = src[0];
-               } else {
-                       /* unconditional discard: */
-                       cond = create_immed(b, 1);
-               }
-
-               /* NOTE: only cmps.*.* can write p0.x: */
-               cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
-               cond->cat2.condition = IR3_COND_NE;
-
-               /* condition always goes in predicate register: */
-               cond->dsts[0]->num = regid(REG_P0, 0);
-               cond->dsts[0]->flags &= ~IR3_REG_SSA;
-
-               if (intr->intrinsic == nir_intrinsic_demote ||
-                       intr->intrinsic == nir_intrinsic_demote_if) {
-                       kill = ir3_DEMOTE(b, cond, 0);
-               } else {
-                       kill = ir3_KILL(b, cond, 0);
-               }
-
-               /* Side-effects should not be moved to the other side of the kill */
-               kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
-               kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
-               kill->srcs[0]->num = regid(REG_P0, 0);
-               array_insert(ctx->ir, ctx->ir->predicates, kill);
-
-               array_insert(b, b->keeps, kill);
-               ctx->so->has_kill = true;
-
-               break;
-       }
-
-       case nir_intrinsic_cond_end_ir3: {
-               struct ir3_instruction *cond, *kill;
-
-               src = ir3_get_src(ctx, &intr->src[0]);
-               cond = src[0];
-
-               /* NOTE: only cmps.*.* can write p0.x: */
-               cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
-               cond->cat2.condition = IR3_COND_NE;
-
-               /* condition always goes in predicate register: */
-               cond->dsts[0]->num = regid(REG_P0, 0);
-
-               kill = ir3_PREDT(b, cond, 0);
-
-               kill->barrier_class = IR3_BARRIER_EVERYTHING;
-               kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
-
-               array_insert(ctx->ir, ctx->ir->predicates, kill);
-               array_insert(b, b->keeps, kill);
-               break;
-       }
-
-       case nir_intrinsic_vote_any:
-       case nir_intrinsic_vote_all: {
-               struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
-               struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
-               if (intr->intrinsic == nir_intrinsic_vote_any)
-                       dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
-               else
-                       dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
-               dst[0]->srcs[0]->num = regid(REG_P0, 0);
-               array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
-               break;
-       }
-       case nir_intrinsic_elect:
-               dst[0] = ir3_ELECT_MACRO(ctx->block);
-               /* This may expand to a divergent if/then, so allocate stack space for
-                * it.
-                */
-               ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
-               break;
-
-       case nir_intrinsic_read_invocation_cond_ir3: {
-               struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
-               struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
-               dst[0] = ir3_READ_COND_MACRO(ctx->block,
-                                                                        ir3_get_predicate(ctx, cond), 0,
-                                                                        src, 0);
-               dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
-               dst[0]->srcs[0]->num = regid(REG_P0, 0);
-               array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
-               ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
-               break;
-       }
-
-       case nir_intrinsic_read_first_invocation: {
-               struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
-               dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
-               dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
-               ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
-               break;
-       }
-
-       case nir_intrinsic_ballot: {
-               struct ir3_instruction *ballot;
-               unsigned components = intr->dest.ssa.num_components;
-               if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
-                       /* ballot(true) is just MOVMSK */
-                       ballot = ir3_MOVMSK(ctx->block, components);
-               } else {
-                       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
-                       struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
-                       ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
-                       ballot->srcs[0]->num = regid(REG_P0, 0);
-                       array_insert(ctx->ir, ctx->ir->predicates, ballot);
-                       ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
-               }
-               ir3_split_dest(ctx->block, dst, ballot, 0, components);
-               break;
-       }
-
-       case nir_intrinsic_load_shared_ir3:
-               emit_intrinsic_load_shared_ir3(ctx, intr, dst);
-               break;
-       case nir_intrinsic_store_shared_ir3:
-               emit_intrinsic_store_shared_ir3(ctx, intr);
-               break;
-       case nir_intrinsic_bindless_resource_ir3:
-               dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
-               break;
-       default:
-               ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
-                               nir_intrinsic_infos[intr->intrinsic].name);
-               break;
-       }
-
-       if (info->has_dest)
-               ir3_put_dst(ctx, &intr->dest);
+   const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+   struct ir3_instruction **dst;
+   struct ir3_instruction *const *src;
+   struct ir3_block *b = ctx->block;
+   unsigned dest_components = nir_intrinsic_dest_components(intr);
+   int idx;
+
+   if (info->has_dest) {
+      dst = ir3_get_dst(ctx, &intr->dest, dest_components);
+   } else {
+      dst = NULL;
+   }
+
+   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+   const unsigned primitive_param = const_state->offsets.primitive_param * 4;
+   const unsigned primitive_map = const_state->offsets.primitive_map * 4;
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_uniform:
+      idx = nir_intrinsic_base(intr);
+      if (nir_src_is_const(intr->src[0])) {
+         idx += nir_src_as_uint(intr->src[0]);
+         for (int i = 0; i < dest_components; i++) {
+            dst[i] = create_uniform_typed(
+               b, idx + i,
+               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32);
+         }
+      } else {
+         src = ir3_get_src(ctx, &intr->src[0]);
+         for (int i = 0; i < dest_components; i++) {
+            dst[i] = create_uniform_indirect(
+               b, idx + i,
+               nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32,
+               ir3_get_addr0(ctx, src[0], 1));
+         }
+         /* NOTE: if relative addressing is used, we set
+          * constlen in the compiler (to worst-case value)
+          * since we don't know in the assembler what the max
+          * addr reg value can be:
+          */
+         ctx->so->constlen =
+            MAX2(ctx->so->constlen, const_state->ubo_state.size / 16);
+      }
+      break;
+
+   case nir_intrinsic_load_vs_primitive_stride_ir3:
+      dst[0] = create_uniform(b, primitive_param + 0);
+      break;
+   case nir_intrinsic_load_vs_vertex_stride_ir3:
+      dst[0] = create_uniform(b, primitive_param + 1);
+      break;
+   case nir_intrinsic_load_hs_patch_stride_ir3:
+      dst[0] = create_uniform(b, primitive_param + 2);
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      dst[0] = create_uniform(b, primitive_param + 3);
+      break;
+   case nir_intrinsic_load_tess_param_base_ir3:
+      dst[0] = create_uniform(b, primitive_param + 4);
+      dst[1] = create_uniform(b, primitive_param + 5);
+      break;
+   case nir_intrinsic_load_tess_factor_base_ir3:
+      dst[0] = create_uniform(b, primitive_param + 6);
+      dst[1] = create_uniform(b, primitive_param + 7);
+      break;
+
+   case nir_intrinsic_load_primitive_location_ir3:
+      idx = nir_intrinsic_driver_location(intr);
+      dst[0] = create_uniform(b, primitive_map + idx);
+      break;
+
+   case nir_intrinsic_load_gs_header_ir3:
+      dst[0] = ctx->gs_header;
+      break;
+   case nir_intrinsic_load_tcs_header_ir3:
+      dst[0] = ctx->tcs_header;
+      break;
+
+   case nir_intrinsic_load_primitive_id:
+      dst[0] = ctx->primitive_id;
+      break;
+
+   case nir_intrinsic_load_tess_coord:
+      if (!ctx->tess_coord) {
+         ctx->tess_coord =
+            create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
+      }
+      ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
+
+      /* Unused, but ir3_put_dst() below wants to free something */
+      dst[2] = create_immed(b, 0);
+      break;
+
+   case nir_intrinsic_end_patch_ir3:
+      assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
+      struct ir3_instruction *end = ir3_PREDE(b);
+      array_insert(b, b->keeps, end);
+
+      end->barrier_class = IR3_BARRIER_EVERYTHING;
+      end->barrier_conflict = IR3_BARRIER_EVERYTHING;
+      break;
+
+   case nir_intrinsic_store_global_ir3:
+      ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
+      break;
+   case nir_intrinsic_load_global_ir3:
+      ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
+      break;
+
+   case nir_intrinsic_load_ubo:
+      emit_intrinsic_load_ubo(ctx, intr, dst);
+      break;
+   case nir_intrinsic_load_ubo_vec4:
+      emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
+      break;
+   case nir_intrinsic_load_frag_coord:
+      ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
+      break;
+   case nir_intrinsic_load_sample_pos_from_id: {
+      /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
+       * but that doesn't seem necessary.
+       */
+      struct ir3_instruction *offset =
+         ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
+      offset->dsts[0]->wrmask = 0x3;
+      offset->cat5.type = TYPE_F32;
+
+      ir3_split_dest(b, dst, offset, 0, 2);
+
+      break;
+   }
+   case nir_intrinsic_load_size_ir3:
+      if (!ctx->ij[IJ_PERSP_SIZE]) {
+         ctx->ij[IJ_PERSP_SIZE] =
+            create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE, 0x1);
+      }
+      dst[0] = ctx->ij[IJ_PERSP_SIZE];
+      break;
+   case nir_intrinsic_load_barycentric_centroid:
+   case nir_intrinsic_load_barycentric_sample:
+   case nir_intrinsic_load_barycentric_pixel:
+      emit_intrinsic_barycentric(ctx, intr, dst);
+      break;
+   case nir_intrinsic_load_interpolated_input:
+   case nir_intrinsic_load_input:
+      setup_input(ctx, intr);
+      break;
+   /* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
+    * pass and replaced by an ir3-specific version that adds the
+    * dword-offset in the last source.
+    */
+   case nir_intrinsic_load_ssbo_ir3:
+      ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
+      break;
+   case nir_intrinsic_store_ssbo_ir3:
+      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+          !ctx->s->info.fs.early_fragment_tests)
+         ctx->so->no_earlyz = true;
+      ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
+      break;
+   case nir_intrinsic_get_ssbo_size:
+      emit_intrinsic_ssbo_size(ctx, intr, dst);
+      break;
+   case nir_intrinsic_ssbo_atomic_add_ir3:
+   case nir_intrinsic_ssbo_atomic_imin_ir3:
+   case nir_intrinsic_ssbo_atomic_umin_ir3:
+   case nir_intrinsic_ssbo_atomic_imax_ir3:
+   case nir_intrinsic_ssbo_atomic_umax_ir3:
+   case nir_intrinsic_ssbo_atomic_and_ir3:
+   case nir_intrinsic_ssbo_atomic_or_ir3:
+   case nir_intrinsic_ssbo_atomic_xor_ir3:
+   case nir_intrinsic_ssbo_atomic_exchange_ir3:
+   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
+      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+          !ctx->s->info.fs.early_fragment_tests)
+         ctx->so->no_earlyz = true;
+      dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
+      break;
+   case nir_intrinsic_load_shared:
+      emit_intrinsic_load_shared(ctx, intr, dst);
+      break;
+   case nir_intrinsic_store_shared:
+      emit_intrinsic_store_shared(ctx, intr);
+      break;
+   case nir_intrinsic_shared_atomic_add:
+   case nir_intrinsic_shared_atomic_imin:
+   case nir_intrinsic_shared_atomic_umin:
+   case nir_intrinsic_shared_atomic_imax:
+   case nir_intrinsic_shared_atomic_umax:
+   case nir_intrinsic_shared_atomic_and:
+   case nir_intrinsic_shared_atomic_or:
+   case nir_intrinsic_shared_atomic_xor:
+   case nir_intrinsic_shared_atomic_exchange:
+   case nir_intrinsic_shared_atomic_comp_swap:
+      dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
+      break;
+   case nir_intrinsic_load_scratch:
+      emit_intrinsic_load_scratch(ctx, intr, dst);
+      break;
+   case nir_intrinsic_store_scratch:
+      emit_intrinsic_store_scratch(ctx, intr);
+      break;
+   case nir_intrinsic_image_load:
+      emit_intrinsic_load_image(ctx, intr, dst);
+      break;
+   case nir_intrinsic_bindless_image_load:
+      /* Bindless uses the IBO state, which doesn't have swizzle filled out,
+       * so using isam doesn't work.
+       *
+       * TODO: can we use isam if we fill out more fields?
+       */
+      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
+      break;
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_bindless_image_store:
+      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+          !ctx->s->info.fs.early_fragment_tests)
+         ctx->so->no_earlyz = true;
+      ctx->funcs->emit_intrinsic_store_image(ctx, intr);
+      break;
+   case nir_intrinsic_image_size:
+   case nir_intrinsic_bindless_image_size:
+      ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
+      break;
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_bindless_image_atomic_add:
+   case nir_intrinsic_image_atomic_imin:
+   case nir_intrinsic_bindless_image_atomic_imin:
+   case nir_intrinsic_image_atomic_umin:
+   case nir_intrinsic_bindless_image_atomic_umin:
+   case nir_intrinsic_image_atomic_imax:
+   case nir_intrinsic_bindless_image_atomic_imax:
+   case nir_intrinsic_image_atomic_umax:
+   case nir_intrinsic_bindless_image_atomic_umax:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_bindless_image_atomic_and:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_bindless_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_bindless_image_atomic_xor:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_bindless_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_comp_swap:
+   case nir_intrinsic_bindless_image_atomic_comp_swap:
+      if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+          !ctx->s->info.fs.early_fragment_tests)
+         ctx->so->no_earlyz = true;
+      dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
+      break;
+   case nir_intrinsic_scoped_barrier:
+   case nir_intrinsic_control_barrier:
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_group_memory_barrier:
+   case nir_intrinsic_memory_barrier_buffer:
+   case nir_intrinsic_memory_barrier_image:
+   case nir_intrinsic_memory_barrier_shared:
+   case nir_intrinsic_memory_barrier_tcs_patch:
+      emit_intrinsic_barrier(ctx, intr);
+      /* note that blk ptr no longer valid, make that obvious: */
+      b = NULL;
+      break;
+   case nir_intrinsic_store_output:
+      setup_output(ctx, intr);
+      break;
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_first_vertex:
+      if (!ctx->basevertex) {
+         ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
+      }
+      dst[0] = ctx->basevertex;
+      break;
+   case nir_intrinsic_load_draw_id:
+      if (!ctx->draw_id) {
+         ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
+      }
+      dst[0] = ctx->draw_id;
+      break;
+   case nir_intrinsic_load_base_instance:
+      if (!ctx->base_instance) {
+         ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
+      }
+      dst[0] = ctx->base_instance;
+      break;
+   case nir_intrinsic_load_view_index:
+      if (!ctx->view_index) {
+         ctx->view_index =
+            create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
+      }
+      dst[0] = ctx->view_index;
+      break;
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_vertex_id:
+      if (!ctx->vertex_id) {
+         gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
+                                 ? SYSTEM_VALUE_VERTEX_ID
+                                 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+         ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
+      }
+      dst[0] = ctx->vertex_id;
+      break;
+   case nir_intrinsic_load_instance_id:
+      if (!ctx->instance_id) {
+         ctx->instance_id =
+            create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
+      }
+      dst[0] = ctx->instance_id;
+      break;
+   case nir_intrinsic_load_sample_id:
+      ctx->so->per_samp = true;
+      FALLTHROUGH;
+   case nir_intrinsic_load_sample_id_no_per_sample:
+      if (!ctx->samp_id) {
+         ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
+         ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
+      }
+      dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
+      break;
+   case nir_intrinsic_load_sample_mask_in:
+      if (!ctx->samp_mask_in) {
+         ctx->samp_mask_in =
+            create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
+      }
+      dst[0] = ctx->samp_mask_in;
+      break;
+   case nir_intrinsic_load_user_clip_plane:
+      idx = nir_intrinsic_ucp_id(intr);
+      for (int i = 0; i < dest_components; i++) {
+         unsigned n = idx * 4 + i;
+         dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
+      }
+      break;
+   case nir_intrinsic_load_front_face:
+      if (!ctx->frag_face) {
+         ctx->so->frag_face = true;
+         ctx->frag_face =
+            create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
+         ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
+      }
+      /* for fragface, we get -1 for back and 0 for front. However this is
+       * the inverse of what nir expects (where ~0 is true).
+       */
+      dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
+                          create_immed_typed(b, 0, TYPE_U16), 0);
+      dst[0]->cat2.condition = IR3_COND_EQ;
+      break;
+   case nir_intrinsic_load_local_invocation_id:
+      if (!ctx->local_invocation_id) {
+         ctx->local_invocation_id =
+            create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
+      }
+      ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
+      break;
+   case nir_intrinsic_load_workgroup_id:
+   case nir_intrinsic_load_workgroup_id_zero_base:
+      if (!ctx->work_group_id) {
+         ctx->work_group_id =
+            create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
+         ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
+      }
+      ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
+      break;
+   case nir_intrinsic_load_base_workgroup_id:
+      for (int i = 0; i < dest_components; i++) {
+         dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
+      }
+      break;
+   case nir_intrinsic_load_num_workgroups:
+      for (int i = 0; i < dest_components; i++) {
+         dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
+      }
+      break;
+   case nir_intrinsic_load_workgroup_size:
+      for (int i = 0; i < dest_components; i++) {
+         dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
+      }
+      break;
+   case nir_intrinsic_load_subgroup_size:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
+      break;
+   case nir_intrinsic_load_subgroup_id_shift_ir3:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
+      break;
+   case nir_intrinsic_discard_if:
+   case nir_intrinsic_discard:
+   case nir_intrinsic_demote:
+   case nir_intrinsic_demote_if:
+   case nir_intrinsic_terminate:
+   case nir_intrinsic_terminate_if: {
+      struct ir3_instruction *cond, *kill;
+
+      if (intr->intrinsic == nir_intrinsic_discard_if ||
+          intr->intrinsic == nir_intrinsic_demote_if ||
+          intr->intrinsic == nir_intrinsic_terminate_if) {
+         /* conditional discard: */
+         src = ir3_get_src(ctx, &intr->src[0]);
+         cond = src[0];
+      } else {
+         /* unconditional discard: */
+         cond = create_immed(b, 1);
+      }
+
+      /* NOTE: only cmps.*.* can write p0.x: */
+      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
+      cond->cat2.condition = IR3_COND_NE;
+
+      /* condition always goes in predicate register: */
+      cond->dsts[0]->num = regid(REG_P0, 0);
+      cond->dsts[0]->flags &= ~IR3_REG_SSA;
+
+      if (intr->intrinsic == nir_intrinsic_demote ||
+          intr->intrinsic == nir_intrinsic_demote_if) {
+         kill = ir3_DEMOTE(b, cond, 0);
+      } else {
+         kill = ir3_KILL(b, cond, 0);
+      }
+
+      /* Side-effects should not be moved to the other side of the kill */
+      kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
+      kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
+      kill->srcs[0]->num = regid(REG_P0, 0);
+      array_insert(ctx->ir, ctx->ir->predicates, kill);
+
+      array_insert(b, b->keeps, kill);
+      ctx->so->has_kill = true;
+
+      break;
+   }
+
+   case nir_intrinsic_cond_end_ir3: {
+      struct ir3_instruction *cond, *kill;
+
+      src = ir3_get_src(ctx, &intr->src[0]);
+      cond = src[0];
+
+      /* NOTE: only cmps.*.* can write p0.x: */
+      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
+      cond->cat2.condition = IR3_COND_NE;
+
+      /* condition always goes in predicate register: */
+      cond->dsts[0]->num = regid(REG_P0, 0);
+
+      kill = ir3_PREDT(b, cond, 0);
+
+      kill->barrier_class = IR3_BARRIER_EVERYTHING;
+      kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
+
+      array_insert(ctx->ir, ctx->ir->predicates, kill);
+      array_insert(b, b->keeps, kill);
+      break;
+   }
+
+   case nir_intrinsic_vote_any:
+   case nir_intrinsic_vote_all: {
+      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
+      struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
+      if (intr->intrinsic == nir_intrinsic_vote_any)
+         dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
+      else
+         dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
+      dst[0]->srcs[0]->num = regid(REG_P0, 0);
+      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
+      break;
+   }
+   case nir_intrinsic_elect:
+      dst[0] = ir3_ELECT_MACRO(ctx->block);
+      /* This may expand to a divergent if/then, so allocate stack space for
+       * it.
+       */
+      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
+      break;
+
+   case nir_intrinsic_read_invocation_cond_ir3: {
+      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
+      struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
+      dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
+                                   src, 0);
+      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
+      dst[0]->srcs[0]->num = regid(REG_P0, 0);
+      array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
+      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
+      break;
+   }
+
+   case nir_intrinsic_read_first_invocation: {
+      struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
+      dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
+      dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
+      ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
+      break;
+   }
+
+   case nir_intrinsic_ballot: {
+      struct ir3_instruction *ballot;
+      unsigned components = intr->dest.ssa.num_components;
+      if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
+         /* ballot(true) is just MOVMSK */
+         ballot = ir3_MOVMSK(ctx->block, components);
+      } else {
+         struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
+         struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
+         ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
+         ballot->srcs[0]->num = regid(REG_P0, 0);
+         array_insert(ctx->ir, ctx->ir->predicates, ballot);
+         ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
+      }
+      ir3_split_dest(ctx->block, dst, ballot, 0, components);
+      break;
+   }
+
+   case nir_intrinsic_load_shared_ir3:
+      emit_intrinsic_load_shared_ir3(ctx, intr, dst);
+      break;
+   case nir_intrinsic_store_shared_ir3:
+      emit_intrinsic_store_shared_ir3(ctx, intr);
+      break;
+   case nir_intrinsic_bindless_resource_ir3:
+      dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
+      break;
+   default:
+      ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
+                        nir_intrinsic_infos[intr->intrinsic].name);
+      break;
+   }
+
+   if (info->has_dest)
+      ir3_put_dst(ctx, &intr->dest);
 }
 
 static void
 emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
 {
-       struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &instr->def,
-                       instr->def.num_components);
-
-       if (instr->def.bit_size == 16) {
-               for (int i = 0; i < instr->def.num_components; i++)
-                       dst[i] = create_immed_typed(ctx->block,
-                                                                               instr->value[i].u16,
-                                                                               TYPE_U16);
-       } else {
-               for (int i = 0; i < instr->def.num_components; i++)
-                       dst[i] = create_immed_typed(ctx->block,
-                                                                               instr->value[i].u32,
-                                                                               TYPE_U32);
-       }
-
+   struct ir3_instruction **dst =
+      ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);
+
+   if (instr->def.bit_size == 16) {
+      for (int i = 0; i < instr->def.num_components; i++)
+         dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
+   } else {
+      for (int i = 0; i < instr->def.num_components; i++)
+         dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
+   }
 }
 
 static void
 emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
 {
-       struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &undef->def,
-                       undef->def.num_components);
-       type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32;
-
-       /* backend doesn't want undefined instructions, so just plug
-        * in 0.0..
-        */
-       for (int i = 0; i < undef->def.num_components; i++)
-               dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
+   struct ir3_instruction **dst =
+      ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
+   type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32;
+
+   /* backend doesn't want undefined instructions, so just plug
+    * in 0.0..
+    */
+   for (int i = 0; i < undef->def.num_components; i++)
+      dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
 }
 
 /*
@@ -2191,52 +2189,53 @@ emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
 static type_t
 get_tex_dest_type(nir_tex_instr *tex)
 {
-       type_t type;
-
-       switch (tex->dest_type) {
-       case nir_type_float32:
-               return TYPE_F32;
-       case nir_type_float16:
-               return TYPE_F16;
-       case nir_type_int32:
-               return TYPE_S32;
-       case nir_type_int16:
-               return TYPE_S16;
-       case nir_type_bool32:
-       case nir_type_uint32:
-               return TYPE_U32;
-       case nir_type_bool16:
-       case nir_type_uint16:
-               return TYPE_U16;
-       case nir_type_invalid:
-       default:
-               unreachable("bad dest_type");
-       }
-
-       return type;
+   type_t type;
+
+   switch (tex->dest_type) {
+   case nir_type_float32:
+      return TYPE_F32;
+   case nir_type_float16:
+      return TYPE_F16;
+   case nir_type_int32:
+      return TYPE_S32;
+   case nir_type_int16:
+      return TYPE_S16;
+   case nir_type_bool32:
+   case nir_type_uint32:
+      return TYPE_U32;
+   case nir_type_bool16:
+   case nir_type_uint16:
+      return TYPE_U16;
+   case nir_type_invalid:
+   default:
+      unreachable("bad dest_type");
+   }
+
+   return type;
 }
 
 static void
 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
 {
-       unsigned coords = glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
-       unsigned flags = 0;
+   unsigned coords =
+      glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
+   unsigned flags = 0;
 
-       /* note: would use tex->coord_components.. except txs.. also,
-        * since array index goes after shadow ref, we don't want to
-        * count it:
-        */
-       if (coords == 3)
-               flags |= IR3_INSTR_3D;
+   /* note: would use tex->coord_components.. except txs.. also,
+    * since array index goes after shadow ref, we don't want to
+    * count it:
+    */
+   if (coords == 3)
+      flags |= IR3_INSTR_3D;
 
-       if (tex->is_shadow && tex->op != nir_texop_lod)
-               flags |= IR3_INSTR_S;
+   if (tex->is_shadow && tex->op != nir_texop_lod)
+      flags |= IR3_INSTR_S;
 
-       if (tex->is_array && tex->op != nir_texop_lod)
-               flags |= IR3_INSTR_A;
+   if (tex->is_array && tex->op != nir_texop_lod)
+      flags |= IR3_INSTR_A;
 
-       *flagsp = flags;
-       *coordsp = coords;
+   *flagsp = flags;
+   *coordsp = coords;
 }
 
 /* Gets the sampler/texture idx as a hvec2.  Which could either be dynamic
@@ -2246,514 +2245,541 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
 static struct tex_src_info
 get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
 {
-       struct ir3_block *b = ctx->block;
-       struct tex_src_info info = { 0 };
-       int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
-       int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
-       struct ir3_instruction *texture, *sampler;
-
-       if (texture_idx >= 0 || sampler_idx >= 0) {
-               /* Bindless case */
-               info.flags |= IR3_INSTR_B;
-
-               if (tex->texture_non_uniform || tex->sampler_non_uniform)
-                       info.flags |= IR3_INSTR_NONUNIF;
-
-               /* Gather information required to determine which encoding to
-                * choose as well as for prefetch.
-                */
-               nir_intrinsic_instr *bindless_tex = NULL;
-               bool tex_const;
-               if (texture_idx >= 0) {
-                       ctx->so->bindless_tex = true;
-                       bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
-                       assert(bindless_tex);
-                       info.tex_base = nir_intrinsic_desc_set(bindless_tex);
-                       tex_const = nir_src_is_const(bindless_tex->src[0]);
-                       if (tex_const)
-                               info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
-               } else {
-                       /* To simplify some of the logic below, assume the index is
-                        * constant 0 when it's not enabled.
-                        */
-                       tex_const = true;
-                       info.tex_idx = 0;
-               }
-               nir_intrinsic_instr *bindless_samp = NULL;
-               bool samp_const;
-               if (sampler_idx >= 0) {
-                       ctx->so->bindless_samp = true;
-                       bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
-                       assert(bindless_samp);
-                       info.samp_base = nir_intrinsic_desc_set(bindless_samp);
-                       samp_const = nir_src_is_const(bindless_samp->src[0]);
-                       if (samp_const)
-                               info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
-               } else {
-                       samp_const = true;
-                       info.samp_idx = 0;
-               }
-
-               /* Choose encoding. */
-               if (tex_const && samp_const && info.tex_idx < 256 && info.samp_idx < 256) {
-                       if (info.tex_idx < 16 && info.samp_idx < 16 &&
-                               (!bindless_tex || !bindless_samp || info.tex_base == info.samp_base)) {
-                               /* Everything fits within the instruction */
-                               info.base = info.tex_base;
-                               info.combined_idx = info.samp_idx | (info.tex_idx << 4);
-                       } else {
-                               info.base = info.tex_base;
-                               info.a1_val = info.tex_idx << 3 | info.samp_base;
-                               info.combined_idx = info.samp_idx;
-                               info.flags |= IR3_INSTR_A1EN;
-                       }
-                       info.samp_tex = NULL;
-               } else {
-                       info.flags |= IR3_INSTR_S2EN;
-                       /* In the indirect case, we only use a1.x to store the sampler
-                        * base if it differs from the texture base.
-                        */
-                       if (!bindless_tex || !bindless_samp || info.tex_base == info.samp_base) {
-                               info.base = info.tex_base;
-                       } else {
-                               info.base = info.tex_base;
-                               info.a1_val = info.samp_base;
-                               info.flags |= IR3_INSTR_A1EN;
-                       }
-
-                       /* Note: the indirect source is now a vec2 instead of hvec2, and
-                        * for some reason the texture and sampler are swapped.
-                        */
-                       struct ir3_instruction *texture, *sampler;
-
-                       if (bindless_tex) {
-                               texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
-                       } else {
-                               texture = create_immed(b, 0);
-                       }
-
-                       if (bindless_samp) {
-                               sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
-                       } else {
-                               sampler = create_immed(b, 0);
-                       }
-                       info.samp_tex = ir3_collect(ctx, texture, sampler);
-               }
-       } else {
-               info.flags |= IR3_INSTR_S2EN;
-               texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
-               sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
-               if (texture_idx >= 0) {
-                       texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
-                       texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
-               } else {
-                       /* TODO what to do for dynamic case? I guess we only need the
-                        * max index for astc srgb workaround so maybe not a problem
-                        * to worry about if we don't enable indirect samplers for
-                        * a4xx?
-                        */
-                       ctx->max_texture_index = MAX2(ctx->max_texture_index, tex->texture_index);
-                       texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
-                       info.tex_idx = tex->texture_index;
-               }
-
-               if (sampler_idx >= 0) {
-                       sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
-                       sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
-               } else {
-                       sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
-                       info.samp_idx = tex->texture_index;
-               }
-
-               info.samp_tex = ir3_collect(ctx, sampler, texture);
-       }
-       
-       return info;
+   struct ir3_block *b = ctx->block;
+   struct tex_src_info info = {0};
+   int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
+   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
+   struct ir3_instruction *texture, *sampler;
+
+   if (texture_idx >= 0 || sampler_idx >= 0) {
+      /* Bindless case */
+      info.flags |= IR3_INSTR_B;
+
+      if (tex->texture_non_uniform || tex->sampler_non_uniform)
+         info.flags |= IR3_INSTR_NONUNIF;
+
+      /* Gather information required to determine which encoding to
+       * choose as well as for prefetch.
+       */
+      nir_intrinsic_instr *bindless_tex = NULL;
+      bool tex_const;
+      if (texture_idx >= 0) {
+         ctx->so->bindless_tex = true;
+         bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
+         assert(bindless_tex);
+         info.tex_base = nir_intrinsic_desc_set(bindless_tex);
+         tex_const = nir_src_is_const(bindless_tex->src[0]);
+         if (tex_const)
+            info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
+      } else {
+         /* To simplify some of the logic below, assume the index is
+          * constant 0 when it's not enabled.
+          */
+         tex_const = true;
+         info.tex_idx = 0;
+      }
+      nir_intrinsic_instr *bindless_samp = NULL;
+      bool samp_const;
+      if (sampler_idx >= 0) {
+         ctx->so->bindless_samp = true;
+         bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
+         assert(bindless_samp);
+         info.samp_base = nir_intrinsic_desc_set(bindless_samp);
+         samp_const = nir_src_is_const(bindless_samp->src[0]);
+         if (samp_const)
+            info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
+      } else {
+         samp_const = true;
+         info.samp_idx = 0;
+      }
+
+      /* Choose encoding. */
+      if (tex_const && samp_const && info.tex_idx < 256 &&
+          info.samp_idx < 256) {
+         if (info.tex_idx < 16 && info.samp_idx < 16 &&
+             (!bindless_tex || !bindless_samp ||
+              info.tex_base == info.samp_base)) {
+            /* Everything fits within the instruction */
+            info.base = info.tex_base;
+            info.combined_idx = info.samp_idx | (info.tex_idx << 4);
+         } else {
+            info.base = info.tex_base;
+            info.a1_val = info.tex_idx << 3 | info.samp_base;
+            info.combined_idx = info.samp_idx;
+            info.flags |= IR3_INSTR_A1EN;
+         }
+         info.samp_tex = NULL;
+      } else {
+         info.flags |= IR3_INSTR_S2EN;
+         /* In the indirect case, we only use a1.x to store the sampler
+          * base if it differs from the texture base.
+          */
+         if (!bindless_tex || !bindless_samp ||
+             info.tex_base == info.samp_base) {
+            info.base = info.tex_base;
+         } else {
+            info.base = info.tex_base;
+            info.a1_val = info.samp_base;
+            info.flags |= IR3_INSTR_A1EN;
+         }
+
+         /* Note: the indirect source is now a vec2 instead of hvec2, and
+          * for some reason the texture and sampler are swapped.
+          */
+         struct ir3_instruction *texture, *sampler;
+
+         if (bindless_tex) {
+            texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
+         } else {
+            texture = create_immed(b, 0);
+         }
+
+         if (bindless_samp) {
+            sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
+         } else {
+            sampler = create_immed(b, 0);
+         }
+         info.samp_tex = ir3_collect(ctx, texture, sampler);
+      }
+   } else {
+      info.flags |= IR3_INSTR_S2EN;
+      texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
+      sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
+      if (texture_idx >= 0) {
+         texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
+         texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
+      } else {
+         /* TODO what to do for dynamic case? I guess we only need the
+          * max index for astc srgb workaround so maybe not a problem
+          * to worry about if we don't enable indirect samplers for
+          * a4xx?
+          */
+         ctx->max_texture_index =
+            MAX2(ctx->max_texture_index, tex->texture_index);
+         texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
+         info.tex_idx = tex->texture_index;
+      }
+
+      if (sampler_idx >= 0) {
+         sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
+         sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
+      } else {
+         sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
+         info.samp_idx = tex->texture_index;
+      }
+
+      info.samp_tex = ir3_collect(ctx, sampler, texture);
+   }
+
+   return info;
 }
 
 static void
 emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
-       struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
-       struct ir3_instruction *lod, *compare, *proj, *sample_index;
-       struct tex_src_info info = { 0 };
-       bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
-       unsigned i, coords, flags, ncomp;
-       unsigned nsrc0 = 0, nsrc1 = 0;
-       type_t type;
-       opc_t opc = 0;
-
-       ncomp = nir_dest_num_components(tex->dest);
-
-       coord = off = ddx = ddy = NULL;
-       lod = proj = compare = sample_index = NULL;
-
-       dst = ir3_get_dst(ctx, &tex->dest, ncomp);
-
-       for (unsigned i = 0; i < tex->num_srcs; i++) {
-               switch (tex->src[i].src_type) {
-               case nir_tex_src_coord:
-                       coord = ir3_get_src(ctx, &tex->src[i].src);
-                       break;
-               case nir_tex_src_bias:
-                       lod = ir3_get_src(ctx, &tex->src[i].src)[0];
-                       has_bias = true;
-                       break;
-               case nir_tex_src_lod:
-                       lod = ir3_get_src(ctx, &tex->src[i].src)[0];
-                       has_lod = true;
-                       break;
-               case nir_tex_src_comparator: /* shadow comparator */
-                       compare = ir3_get_src(ctx, &tex->src[i].src)[0];
-                       break;
-               case nir_tex_src_projector:
-                       proj = ir3_get_src(ctx, &tex->src[i].src)[0];
-                       has_proj = true;
-                       break;
-               case nir_tex_src_offset:
-                       off = ir3_get_src(ctx, &tex->src[i].src);
-                       has_off = true;
-                       break;
-               case nir_tex_src_ddx:
-                       ddx = ir3_get_src(ctx, &tex->src[i].src);
-                       break;
-               case nir_tex_src_ddy:
-                       ddy = ir3_get_src(ctx, &tex->src[i].src);
-                       break;
-               case nir_tex_src_ms_index:
-                       sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
-                       break;
-               case nir_tex_src_texture_offset:
-               case nir_tex_src_sampler_offset:
-               case nir_tex_src_texture_handle:
-               case nir_tex_src_sampler_handle:
-                       /* handled in get_tex_samp_src() */
-                       break;
-               default:
-                       ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
-                                       tex->src[i].src_type);
-                       return;
-               }
-       }
-
-       switch (tex->op) {
-       case nir_texop_tex_prefetch:
-               compile_assert(ctx, !has_bias);
-               compile_assert(ctx, !has_lod);
-               compile_assert(ctx, !compare);
-               compile_assert(ctx, !has_proj);
-               compile_assert(ctx, !has_off);
-               compile_assert(ctx, !ddx);
-               compile_assert(ctx, !ddy);
-               compile_assert(ctx, !sample_index);
-               compile_assert(ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
-               compile_assert(ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
-
-               if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
-                       opc = OPC_META_TEX_PREFETCH;
-                       ctx->so->num_sampler_prefetch++;
-                       break;
-               }
-               FALLTHROUGH;
-       case nir_texop_tex:      opc = has_lod ? OPC_SAML : OPC_SAM; break;
-       case nir_texop_txb:      opc = OPC_SAMB;     break;
-       case nir_texop_txl:      opc = OPC_SAML;     break;
-       case nir_texop_txd:      opc = OPC_SAMGQ;    break;
-       case nir_texop_txf:      opc = OPC_ISAML;    break;
-       case nir_texop_lod:      opc = OPC_GETLOD;   break;
-       case nir_texop_tg4:
-               /* NOTE: a4xx might need to emulate gather w/ txf (this is
-                * what blob does, seems gather  is broken?), and a3xx did
-                * not support it (but probably could also emulate).
-                */
-               switch (tex->component) {
-               case 0:              opc = OPC_GATHER4R; break;
-               case 1:              opc = OPC_GATHER4G; break;
-               case 2:              opc = OPC_GATHER4B; break;
-               case 3:              opc = OPC_GATHER4A; break;
-               }
-               break;
-       case nir_texop_txf_ms_fb:
-       case nir_texop_txf_ms:   opc = OPC_ISAMM;    break;
-       default:
-               ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
-               return;
-       }
-
-       tex_info(tex, &flags, &coords);
-
-       /*
-        * lay out the first argument in the proper order:
-        *  - actual coordinates first
-        *  - shadow reference
-        *  - array index
-        *  - projection w
-        *  - starting at offset 4, dpdx.xy, dpdy.xy
-        *
-        * bias/lod go into the second arg
-        */
-
-       /* insert tex coords: */
-       for (i = 0; i < coords; i++)
-               src0[i] = coord[i];
-
-       nsrc0 = i;
-
-       /* scale up integer coords for TXF based on the LOD */
-       if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
-               assert(has_lod);
-               for (i = 0; i < coords; i++)
-                       src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
-       }
-
-       if (coords == 1) {
-               /* hw doesn't do 1d, so we treat it as 2d with
-                * height of 1, and patch up the y coord.
-                */
-               if (is_isam(opc)) {
-                       src0[nsrc0++] = create_immed(b, 0);
-               } else {
-                       src0[nsrc0++] = create_immed(b, fui(0.5));
-               }
-       }
-
-       if (tex->is_shadow && tex->op != nir_texop_lod)
-               src0[nsrc0++] = compare;
-
-       if (tex->is_array && tex->op != nir_texop_lod) {
-               struct ir3_instruction *idx = coord[coords];
-
-               /* the array coord for cube arrays needs 0.5 added to it */
-               if (ctx->compiler->array_index_add_half && !is_isam(opc))
-                       idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
-
-               src0[nsrc0++] = idx;
-       }
-
-       if (has_proj) {
-               src0[nsrc0++] = proj;
-               flags |= IR3_INSTR_P;
-       }
-
-       /* pad to 4, then ddx/ddy: */
-       if (tex->op == nir_texop_txd) {
-               while (nsrc0 < 4)
-                       src0[nsrc0++] = create_immed(b, fui(0.0));
-               for (i = 0; i < coords; i++)
-                       src0[nsrc0++] = ddx[i];
-               if (coords < 2)
-                       src0[nsrc0++] = create_immed(b, fui(0.0));
-               for (i = 0; i < coords; i++)
-                       src0[nsrc0++] = ddy[i];
-               if (coords < 2)
-                       src0[nsrc0++] = create_immed(b, fui(0.0));
-       }
-
-       /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
-        * with scaled x coord according to requested sample:
-        */
-       if (opc == OPC_ISAMM) {
-               if (ctx->compiler->txf_ms_with_isaml) {
-                       /* the samples are laid out in x dimension as
-                        *     0 1 2 3
-                        * x_ms = (x << ms) + sample_index;
-                        */
-                       struct ir3_instruction *ms;
-                       ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
-
-                       src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
-                       src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
-
-                       opc = OPC_ISAML;
-               } else {
-                       src0[nsrc0++] = sample_index;
-               }
-       }
-
-       /*
-        * second argument (if applicable):
-        *  - offsets
-        *  - lod
-        *  - bias
-        */
-       if (has_off | has_lod | has_bias) {
-               if (has_off) {
-                       unsigned off_coords = coords;
-                       if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
-                               off_coords--;
-                       for (i = 0; i < off_coords; i++)
-                               src1[nsrc1++] = off[i];
-                       if (off_coords < 2)
-                               src1[nsrc1++] = create_immed(b, fui(0.0));
-                       flags |= IR3_INSTR_O;
-               }
-
-               if (has_lod | has_bias)
-                       src1[nsrc1++] = lod;
-       }
-
-       type = get_tex_dest_type(tex);
-
-       if (opc == OPC_GETLOD)
-               type = TYPE_S32;
-
-
-       if (tex->op == nir_texop_txf_ms_fb) {
-               /* only expect a single txf_ms_fb per shader: */
-               compile_assert(ctx, !ctx->so->fb_read);
-               compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
-
-               ctx->so->fb_read = true;
-               info.samp_tex = ir3_collect(ctx,
-                       create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
-                       create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
-               info.flags = IR3_INSTR_S2EN;
-
-               ctx->so->num_samp++;
-       } else {
-               info = get_tex_samp_tex_src(ctx, tex);
-       }
-
-       struct ir3_instruction *col0 = ir3_create_collect(ctx, src0, nsrc0);
-       struct ir3_instruction *col1 = ir3_create_collect(ctx, src1, nsrc1);
-
-       if (opc == OPC_META_TEX_PREFETCH) {
-               int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
-
-               compile_assert(ctx, tex->src[idx].src.is_ssa);
-
-               sam = ir3_SAM(b, opc, type, MASK(ncomp), 0, NULL,
-                               get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
-               sam->prefetch.input_offset =
-                               ir3_nir_coord_offset(tex->src[idx].src.ssa);
-               /* make sure not to add irrelevant flags like S2EN */
-               sam->flags = flags | (info.flags & IR3_INSTR_B);
-               sam->prefetch.tex  = info.tex_idx;
-               sam->prefetch.samp = info.samp_idx;
-               sam->prefetch.tex_base = info.tex_base;
-               sam->prefetch.samp_base = info.samp_base;
-       } else {
-               info.flags |= flags;
-               sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
-       }
-
-       if ((ctx->astc_srgb & (1 << tex->texture_index)) && !nir_tex_instr_is_query(tex)) {
-               assert(opc != OPC_META_TEX_PREFETCH);
-
-               /* only need first 3 components: */
-               sam->dsts[0]->wrmask = 0x7;
-               ir3_split_dest(b, dst, sam, 0, 3);
-
-               /* we need to sample the alpha separately with a non-ASTC
-                * texture state:
-                */
-               sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags,
-                               info.samp_tex, col0, col1);
-
-               array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
-
-               /* fixup .w component: */
-               ir3_split_dest(b, &dst[3], sam, 3, 1);
-       } else {
-               /* normal (non-workaround) case: */
-               ir3_split_dest(b, dst, sam, 0, ncomp);
-       }
-
-       /* GETLOD returns results in 4.8 fixed point */
-       if (opc == OPC_GETLOD) {
-               struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
-
-               compile_assert(ctx, tex->dest_type == nir_type_float32);
-               for (i = 0; i < 2; i++) {
-                       dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_S32, TYPE_F32), 0,
-                                                          factor, 0);
-               }
-       }
-
-       ir3_put_dst(ctx, &tex->dest);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
+   struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
+   struct ir3_instruction *lod, *compare, *proj, *sample_index;
+   struct tex_src_info info = {0};
+   bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
+   unsigned i, coords, flags, ncomp;
+   unsigned nsrc0 = 0, nsrc1 = 0;
+   type_t type;
+   opc_t opc = 0;
+
+   ncomp = nir_dest_num_components(tex->dest);
+
+   coord = off = ddx = ddy = NULL;
+   lod = proj = compare = sample_index = NULL;
+
+   dst = ir3_get_dst(ctx, &tex->dest, ncomp);
+
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      switch (tex->src[i].src_type) {
+      case nir_tex_src_coord:
+         coord = ir3_get_src(ctx, &tex->src[i].src);
+         break;
+      case nir_tex_src_bias:
+         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
+         has_bias = true;
+         break;
+      case nir_tex_src_lod:
+         lod = ir3_get_src(ctx, &tex->src[i].src)[0];
+         has_lod = true;
+         break;
+      case nir_tex_src_comparator: /* shadow comparator */
+         compare = ir3_get_src(ctx, &tex->src[i].src)[0];
+         break;
+      case nir_tex_src_projector:
+         proj = ir3_get_src(ctx, &tex->src[i].src)[0];
+         has_proj = true;
+         break;
+      case nir_tex_src_offset:
+         off = ir3_get_src(ctx, &tex->src[i].src);
+         has_off = true;
+         break;
+      case nir_tex_src_ddx:
+         ddx = ir3_get_src(ctx, &tex->src[i].src);
+         break;
+      case nir_tex_src_ddy:
+         ddy = ir3_get_src(ctx, &tex->src[i].src);
+         break;
+      case nir_tex_src_ms_index:
+         sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
+         break;
+      case nir_tex_src_texture_offset:
+      case nir_tex_src_sampler_offset:
+      case nir_tex_src_texture_handle:
+      case nir_tex_src_sampler_handle:
+         /* handled in get_tex_samp_src() */
+         break;
+      default:
+         ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
+                           tex->src[i].src_type);
+         return;
+      }
+   }
+
+   switch (tex->op) {
+   case nir_texop_tex_prefetch:
+      compile_assert(ctx, !has_bias);
+      compile_assert(ctx, !has_lod);
+      compile_assert(ctx, !compare);
+      compile_assert(ctx, !has_proj);
+      compile_assert(ctx, !has_off);
+      compile_assert(ctx, !ddx);
+      compile_assert(ctx, !ddy);
+      compile_assert(ctx, !sample_index);
+      compile_assert(
+         ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
+      compile_assert(
+         ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
+
+      if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
+         opc = OPC_META_TEX_PREFETCH;
+         ctx->so->num_sampler_prefetch++;
+         break;
+      }
+      FALLTHROUGH;
+   case nir_texop_tex:
+      opc = has_lod ? OPC_SAML : OPC_SAM;
+      break;
+   case nir_texop_txb:
+      opc = OPC_SAMB;
+      break;
+   case nir_texop_txl:
+      opc = OPC_SAML;
+      break;
+   case nir_texop_txd:
+      opc = OPC_SAMGQ;
+      break;
+   case nir_texop_txf:
+      opc = OPC_ISAML;
+      break;
+   case nir_texop_lod:
+      opc = OPC_GETLOD;
+      break;
+   case nir_texop_tg4:
+      /* NOTE: a4xx might need to emulate gather w/ txf (this is
+       * what blob does, seems gather  is broken?), and a3xx did
+       * not support it (but probably could also emulate).
+       */
+      switch (tex->component) {
+      case 0:
+         opc = OPC_GATHER4R;
+         break;
+      case 1:
+         opc = OPC_GATHER4G;
+         break;
+      case 2:
+         opc = OPC_GATHER4B;
+         break;
+      case 3:
+         opc = OPC_GATHER4A;
+         break;
+      }
+      break;
+   case nir_texop_txf_ms_fb:
+   case nir_texop_txf_ms:
+      opc = OPC_ISAMM;
+      break;
+   default:
+      ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
+      return;
+   }
+
+   tex_info(tex, &flags, &coords);
+
+   /*
+    * lay out the first argument in the proper order:
+    *  - actual coordinates first
+    *  - shadow reference
+    *  - array index
+    *  - projection w
+    *  - starting at offset 4, dpdx.xy, dpdy.xy
+    *
+    * bias/lod go into the second arg
+    */
+
+   /* insert tex coords: */
+   for (i = 0; i < coords; i++)
+      src0[i] = coord[i];
+
+   nsrc0 = i;
+
+   /* scale up integer coords for TXF based on the LOD */
+   if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
+      assert(has_lod);
+      for (i = 0; i < coords; i++)
+         src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
+   }
+
+   if (coords == 1) {
+      /* hw doesn't do 1d, so we treat it as 2d with
+       * height of 1, and patch up the y coord.
+       */
+      if (is_isam(opc)) {
+         src0[nsrc0++] = create_immed(b, 0);
+      } else {
+         src0[nsrc0++] = create_immed(b, fui(0.5));
+      }
+   }
+
+   if (tex->is_shadow && tex->op != nir_texop_lod)
+      src0[nsrc0++] = compare;
+
+   if (tex->is_array && tex->op != nir_texop_lod) {
+      struct ir3_instruction *idx = coord[coords];
+
+      /* the array coord for cube arrays needs 0.5 added to it */
+      if (ctx->compiler->array_index_add_half && !is_isam(opc))
+         idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
+
+      src0[nsrc0++] = idx;
+   }
+
+   if (has_proj) {
+      src0[nsrc0++] = proj;
+      flags |= IR3_INSTR_P;
+   }
+
+   /* pad to 4, then ddx/ddy: */
+   if (tex->op == nir_texop_txd) {
+      while (nsrc0 < 4)
+         src0[nsrc0++] = create_immed(b, fui(0.0));
+      for (i = 0; i < coords; i++)
+         src0[nsrc0++] = ddx[i];
+      if (coords < 2)
+         src0[nsrc0++] = create_immed(b, fui(0.0));
+      for (i = 0; i < coords; i++)
+         src0[nsrc0++] = ddy[i];
+      if (coords < 2)
+         src0[nsrc0++] = create_immed(b, fui(0.0));
+   }
+
+   /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
+    * with scaled x coord according to requested sample:
+    */
+   if (opc == OPC_ISAMM) {
+      if (ctx->compiler->txf_ms_with_isaml) {
+         /* the samples are laid out in x dimension as
+          *     0 1 2 3
+          * x_ms = (x << ms) + sample_index;
+          */
+         struct ir3_instruction *ms;
+         ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
+
+         src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
+         src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
+
+         opc = OPC_ISAML;
+      } else {
+         src0[nsrc0++] = sample_index;
+      }
+   }
+
+   /*
+    * second argument (if applicable):
+    *  - offsets
+    *  - lod
+    *  - bias
+    */
+   if (has_off | has_lod | has_bias) {
+      if (has_off) {
+         unsigned off_coords = coords;
+         if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+            off_coords--;
+         for (i = 0; i < off_coords; i++)
+            src1[nsrc1++] = off[i];
+         if (off_coords < 2)
+            src1[nsrc1++] = create_immed(b, fui(0.0));
+         flags |= IR3_INSTR_O;
+      }
+
+      if (has_lod | has_bias)
+         src1[nsrc1++] = lod;
+   }
+
+   type = get_tex_dest_type(tex);
+
+   if (opc == OPC_GETLOD)
+      type = TYPE_S32;
+
+   if (tex->op == nir_texop_txf_ms_fb) {
+      /* only expect a single txf_ms_fb per shader: */
+      compile_assert(ctx, !ctx->so->fb_read);
+      compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
+
+      ctx->so->fb_read = true;
+      info.samp_tex = ir3_collect(
+         ctx, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
+         create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
+      info.flags = IR3_INSTR_S2EN;
+
+      ctx->so->num_samp++;
+   } else {
+      info = get_tex_samp_tex_src(ctx, tex);
+   }
+
+   struct ir3_instruction *col0 = ir3_create_collect(ctx, src0, nsrc0);
+   struct ir3_instruction *col1 = ir3_create_collect(ctx, src1, nsrc1);
+
+   if (opc == OPC_META_TEX_PREFETCH) {
+      int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
+
+      compile_assert(ctx, tex->src[idx].src.is_ssa);
+
+      sam = ir3_SAM(b, opc, type, MASK(ncomp), 0, NULL,
+                    get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
+      sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
+      /* make sure not to add irrelevant flags like S2EN */
+      sam->flags = flags | (info.flags & IR3_INSTR_B);
+      sam->prefetch.tex = info.tex_idx;
+      sam->prefetch.samp = info.samp_idx;
+      sam->prefetch.tex_base = info.tex_base;
+      sam->prefetch.samp_base = info.samp_base;
+   } else {
+      info.flags |= flags;
+      sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
+   }
+
+   if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
+       !nir_tex_instr_is_query(tex)) {
+      assert(opc != OPC_META_TEX_PREFETCH);
+
+      /* only need first 3 components: */
+      sam->dsts[0]->wrmask = 0x7;
+      ir3_split_dest(b, dst, sam, 0, 3);
+
+      /* we need to sample the alpha separately with a non-ASTC
+       * texture state:
+       */
+      sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
+                    col0, col1);
+
+      array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
+
+      /* fixup .w component: */
+      ir3_split_dest(b, &dst[3], sam, 3, 1);
+   } else {
+      /* normal (non-workaround) case: */
+      ir3_split_dest(b, dst, sam, 0, ncomp);
+   }
+
+   /* GETLOD returns results in 4.8 fixed point */
+   if (opc == OPC_GETLOD) {
+      struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
+
+      compile_assert(ctx, tex->dest_type == nir_type_float32);
+      for (i = 0; i < 2; i++) {
+         dst[i] =
+            ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_S32, TYPE_F32), 0, factor, 0);
+      }
+   }
+
+   ir3_put_dst(ctx, &tex->dest);
 }
 
 static void
 emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction **dst, *sam;
-       type_t dst_type = get_tex_dest_type(tex);
-       struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction **dst, *sam;
+   type_t dst_type = get_tex_dest_type(tex);
+   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
 
-       dst = ir3_get_dst(ctx, &tex->dest, 1);
+   dst = ir3_get_dst(ctx, &tex->dest, 1);
 
-       sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
+   sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
 
-       /* even though there is only one component, since it ends
-        * up in .y/.z/.w rather than .x, we need a split_dest()
-        */
-       ir3_split_dest(b, dst, sam, idx, 1);
+   /* even though there is only one component, since it ends
+    * up in .y/.z/.w rather than .x, we need a split_dest()
+    */
+   ir3_split_dest(b, dst, sam, idx, 1);
 
-       /* The # of levels comes from getinfo.z. We need to add 1 to it, since
-        * the value in TEX_CONST_0 is zero-based.
-        */
-       if (ctx->compiler->levels_add_one)
-               dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
+   /* The # of levels comes from getinfo.z. We need to add 1 to it, since
+    * the value in TEX_CONST_0 is zero-based.
+    */
+   if (ctx->compiler->levels_add_one)
+      dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
 
-       ir3_put_dst(ctx, &tex->dest);
+   ir3_put_dst(ctx, &tex->dest);
 }
 
 static void
 emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction **dst, *sam;
-       struct ir3_instruction *lod;
-       unsigned flags, coords;
-       type_t dst_type = get_tex_dest_type(tex);
-       struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
-
-       tex_info(tex, &flags, &coords);
-       info.flags |= flags;
-
-       /* Actually we want the number of dimensions, not coordinates. This
-        * distinction only matters for cubes.
-        */
-       if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
-               coords = 2;
-
-       dst = ir3_get_dst(ctx, &tex->dest, 4);
-
-       int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
-       compile_assert(ctx, lod_idx >= 0);
-
-       lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
-
-       if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
-               sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
-       } else {
-               /*
-                * The maximum value which OPC_GETSIZE could return for one dimension
-                * is 0x007ff0, however sampler buffer could be much bigger.
-                * Blob uses OPC_GETBUF for them.
-                */
-               sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
-       }
-
-       ir3_split_dest(b, dst, sam, 0, 4);
-
-       /* Array size actually ends up in .w rather than .z. This doesn't
-        * matter for miplevel 0, but for higher mips the value in z is
-        * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
-        * returned, which means that we have to add 1 to it for arrays.
-        */
-       if (tex->is_array) {
-               if (ctx->compiler->levels_add_one) {
-                       dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
-               } else {
-                       dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
-               }
-       }
-
-       ir3_put_dst(ctx, &tex->dest);
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction **dst, *sam;
+   struct ir3_instruction *lod;
+   unsigned flags, coords;
+   type_t dst_type = get_tex_dest_type(tex);
+   struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
+
+   tex_info(tex, &flags, &coords);
+   info.flags |= flags;
+
+   /* Actually we want the number of dimensions, not coordinates. This
+    * distinction only matters for cubes.
+    */
+   if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+      coords = 2;
+
+   dst = ir3_get_dst(ctx, &tex->dest, 4);
+
+   int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
+   compile_assert(ctx, lod_idx >= 0);
+
+   lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
+
+   if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
+      sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
+   } else {
+      /*
+       * The maximum value which OPC_GETSIZE could return for one dimension
+       * is 0x007ff0, however sampler buffer could be much bigger.
+       * Blob uses OPC_GETBUF for them.
+       */
+      sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
+   }
+
+   ir3_split_dest(b, dst, sam, 0, 4);
+
+   /* Array size actually ends up in .w rather than .z. This doesn't
+    * matter for miplevel 0, but for higher mips the value in z is
+    * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
+    * returned, which means that we have to add 1 to it for arrays.
+    */
+   if (tex->is_array) {
+      if (ctx->compiler->levels_add_one) {
+         dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
+      } else {
+         dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
+      }
+   }
+
+   ir3_put_dst(ctx, &tex->dest);
 }
 
 /* phi instructions are left partially constructed.  We don't resolve
@@ -2764,230 +2790,231 @@ emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
 static void
 emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
 {
-       struct ir3_instruction *phi, **dst;
+   struct ir3_instruction *phi, **dst;
 
-       /* NOTE: phi's should be lowered to scalar at this point */
-       compile_assert(ctx, nphi->dest.ssa.num_components == 1);
+   /* NOTE: phi's should be lowered to scalar at this point */
+   compile_assert(ctx, nphi->dest.ssa.num_components == 1);
 
-       dst = ir3_get_dst(ctx, &nphi->dest, 1);
+   dst = ir3_get_dst(ctx, &nphi->dest, 1);
 
-       phi = ir3_instr_create(ctx->block, OPC_META_PHI,
-                       1, exec_list_length(&nphi->srcs));
-       __ssa_dst(phi);
-       phi->phi.nphi = nphi;
+   phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
+                          exec_list_length(&nphi->srcs));
+   __ssa_dst(phi);
+   phi->phi.nphi = nphi;
 
-       dst[0] = phi;
+   dst[0] = phi;
 
-       ir3_put_dst(ctx, &nphi->dest);
+   ir3_put_dst(ctx, &nphi->dest);
 }
 
-static struct ir3_block *get_block(struct ir3_context *ctx, const nir_block *nblock);
+static struct ir3_block *get_block(struct ir3_context *ctx,
+                                   const nir_block *nblock);
 
-static struct ir3_instruction *read_phi_src(struct ir3_context *ctx,
-                                                                                       struct ir3_block *blk,
-                                                                                       struct ir3_instruction *phi,
-                                                                                       nir_phi_instr *nphi)
+static struct ir3_instruction *
+read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
+             struct ir3_instruction *phi, nir_phi_instr *nphi)
 {
-       if (!blk->nblock) {
-               struct ir3_instruction *continue_phi =
-                       ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
-               __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
-
-               for (unsigned i = 0; i < blk->predecessors_count; i++) {
-                       struct ir3_instruction *src =
-                               read_phi_src(ctx, blk->predecessors[i], phi, nphi);
-                       if (src)
-                               __ssa_src(continue_phi, src, 0);
-                       else
-                               ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
-               }
-
-               return continue_phi;
-       }
-
-       nir_foreach_phi_src(nsrc, nphi) {
-               if (blk->nblock == nsrc->pred) {
-                       if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
-                               /* Create an ir3 undef */
-                               return NULL;
-                       } else {
-                               return ir3_get_src(ctx, &nsrc->src)[0];
-                       }
-               }
-       }
-
-       unreachable("couldn't find phi node ir3 block");
-       return NULL;
+   if (!blk->nblock) {
+      struct ir3_instruction *continue_phi =
+         ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
+      __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
+
+      for (unsigned i = 0; i < blk->predecessors_count; i++) {
+         struct ir3_instruction *src =
+            read_phi_src(ctx, blk->predecessors[i], phi, nphi);
+         if (src)
+            __ssa_src(continue_phi, src, 0);
+         else
+            ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
+      }
+
+      return continue_phi;
+   }
+
+   nir_foreach_phi_src (nsrc, nphi) {
+      if (blk->nblock == nsrc->pred) {
+         if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
+            /* Create an ir3 undef */
+            return NULL;
+         } else {
+            return ir3_get_src(ctx, &nsrc->src)[0];
+         }
+      }
+   }
+
+   unreachable("couldn't find phi node ir3 block");
+   return NULL;
 }
 
 static void
 resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
 {
-       foreach_instr (phi, &block->instr_list) {
-               if (phi->opc != OPC_META_PHI)
-                       break;
-
-               nir_phi_instr *nphi = phi->phi.nphi;
-
-               if (!nphi) /* skip continue phis created above */
-                       continue;
-
-               for (unsigned i = 0; i < block->predecessors_count; i++) {
-                       struct ir3_block *pred = block->predecessors[i];
-                       struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
-                       if (src) {
-                               __ssa_src(phi, src, 0);
-                       } else {
-                               /* Create an ir3 undef */
-                               ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
-                       }
-               }
-       }
+   foreach_instr (phi, &block->instr_list) {
+      if (phi->opc != OPC_META_PHI)
+         break;
+
+      nir_phi_instr *nphi = phi->phi.nphi;
+
+      if (!nphi) /* skip continue phis created above */
+         continue;
+
+      for (unsigned i = 0; i < block->predecessors_count; i++) {
+         struct ir3_block *pred = block->predecessors[i];
+         struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
+         if (src) {
+            __ssa_src(phi, src, 0);
+         } else {
+            /* Create an ir3 undef */
+            ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
+         }
+      }
+   }
 }
 
 static void
 emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
 {
-       switch (jump->type) {
-       case nir_jump_break:
-       case nir_jump_continue:
-       case nir_jump_return:
-               /* I *think* we can simply just ignore this, and use the
-                * successor block link to figure out where we need to
-                * jump to for break/continue
-                */
-               break;
-       default:
-               ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
-               break;
-       }
+   switch (jump->type) {
+   case nir_jump_break:
+   case nir_jump_continue:
+   case nir_jump_return:
+      /* I *think* we can simply just ignore this, and use the
+       * successor block link to figure out where we need to
+       * jump to for break/continue
+       */
+      break;
+   default:
+      ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+      break;
+   }
 }
 
 static void
 emit_instr(struct ir3_context *ctx, nir_instr *instr)
 {
-       switch (instr->type) {
-       case nir_instr_type_alu:
-               emit_alu(ctx, nir_instr_as_alu(instr));
-               break;
-       case nir_instr_type_deref:
-               /* ignored, handled as part of the intrinsic they are src to */
-               break;
-       case nir_instr_type_intrinsic:
-               emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
-               break;
-       case nir_instr_type_load_const:
-               emit_load_const(ctx, nir_instr_as_load_const(instr));
-               break;
-       case nir_instr_type_ssa_undef:
-               emit_undef(ctx, nir_instr_as_ssa_undef(instr));
-               break;
-       case nir_instr_type_tex: {
-               nir_tex_instr *tex = nir_instr_as_tex(instr);
-               /* couple tex instructions get special-cased:
-                */
-               switch (tex->op) {
-               case nir_texop_txs:
-                       emit_tex_txs(ctx, tex);
-                       break;
-               case nir_texop_query_levels:
-                       emit_tex_info(ctx, tex, 2);
-                       break;
-               case nir_texop_texture_samples:
-                       emit_tex_info(ctx, tex, 3);
-                       break;
-               default:
-                       emit_tex(ctx, tex);
-                       break;
-               }
-               break;
-       }
-       case nir_instr_type_jump:
-               emit_jump(ctx, nir_instr_as_jump(instr));
-               break;
-       case nir_instr_type_phi:
-               emit_phi(ctx, nir_instr_as_phi(instr));
-               break;
-       case nir_instr_type_call:
-       case nir_instr_type_parallel_copy:
-               ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
-               break;
-       }
+   switch (instr->type) {
+   case nir_instr_type_alu:
+      emit_alu(ctx, nir_instr_as_alu(instr));
+      break;
+   case nir_instr_type_deref:
+      /* ignored, handled as part of the intrinsic they are src to */
+      break;
+   case nir_instr_type_intrinsic:
+      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+      break;
+   case nir_instr_type_load_const:
+      emit_load_const(ctx, nir_instr_as_load_const(instr));
+      break;
+   case nir_instr_type_ssa_undef:
+      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
+      break;
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+      /* couple tex instructions get special-cased:
+       */
+      switch (tex->op) {
+      case nir_texop_txs:
+         emit_tex_txs(ctx, tex);
+         break;
+      case nir_texop_query_levels:
+         emit_tex_info(ctx, tex, 2);
+         break;
+      case nir_texop_texture_samples:
+         emit_tex_info(ctx, tex, 3);
+         break;
+      default:
+         emit_tex(ctx, tex);
+         break;
+      }
+      break;
+   }
+   case nir_instr_type_jump:
+      emit_jump(ctx, nir_instr_as_jump(instr));
+      break;
+   case nir_instr_type_phi:
+      emit_phi(ctx, nir_instr_as_phi(instr));
+      break;
+   case nir_instr_type_call:
+   case nir_instr_type_parallel_copy:
+      ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
+                        instr->type);
+      break;
+   }
 }
 
 static struct ir3_block *
 get_block(struct ir3_context *ctx, const nir_block *nblock)
 {
-       struct ir3_block *block;
-       struct hash_entry *hentry;
+   struct ir3_block *block;
+   struct hash_entry *hentry;
 
-       hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
-       if (hentry)
-               return hentry->data;
+   hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
+   if (hentry)
+      return hentry->data;
 
-       block = ir3_block_create(ctx->ir);
-       block->nblock = nblock;
-       _mesa_hash_table_insert(ctx->block_ht, nblock, block);
+   block = ir3_block_create(ctx->ir);
+   block->nblock = nblock;
+   _mesa_hash_table_insert(ctx->block_ht, nblock, block);
 
-       return block;
+   return block;
 }
 
 static struct ir3_block *
 get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
 {
-       struct hash_entry *hentry;
+   struct hash_entry *hentry;
 
-       hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
-       if (hentry)
-               return hentry->data;
+   hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
+   if (hentry)
+      return hentry->data;
 
-       return get_block(ctx, nblock);
+   return get_block(ctx, nblock);
 }
 
 static struct ir3_block *
 create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
 {
-       struct ir3_block *block = ir3_block_create(ctx->ir);
-       block->nblock = NULL;
-       _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
-       return block;
+   struct ir3_block *block = ir3_block_create(ctx->ir);
+   block->nblock = NULL;
+   _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
+   return block;
 }
 
 static void
 emit_block(struct ir3_context *ctx, nir_block *nblock)
 {
-       ctx->block = get_block(ctx, nblock);
-
-       list_addtail(&ctx->block->node, &ctx->ir->block_list);
-
-       ctx->block->loop_id = ctx->loop_id;
-
-       /* re-emit addr register in each block if needed: */
-       for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
-               _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
-               ctx->addr0_ht[i] = NULL;
-       }
-
-       _mesa_hash_table_u64_destroy(ctx->addr1_ht);
-       ctx->addr1_ht = NULL;
-
-       nir_foreach_instr (instr, nblock) {
-               ctx->cur_instr = instr;
-               emit_instr(ctx, instr);
-               ctx->cur_instr = NULL;
-               if (ctx->error)
-                       return;
-       }
-
-       for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
-               if (nblock->successors[i]) {
-                       ctx->block->successors[i] =
-                               get_block_or_continue(ctx, nblock->successors[i]);
-                       ctx->block->physical_successors[i] = ctx->block->successors[i];
-               }
-       }
-
-       _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
+   ctx->block = get_block(ctx, nblock);
+
+   list_addtail(&ctx->block->node, &ctx->ir->block_list);
+
+   ctx->block->loop_id = ctx->loop_id;
+
+   /* re-emit addr register in each block if needed: */
+   for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
+      _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
+      ctx->addr0_ht[i] = NULL;
+   }
+
+   _mesa_hash_table_u64_destroy(ctx->addr1_ht);
+   ctx->addr1_ht = NULL;
+
+   nir_foreach_instr (instr, nblock) {
+      ctx->cur_instr = instr;
+      emit_instr(ctx, instr);
+      ctx->cur_instr = NULL;
+      if (ctx->error)
+         return;
+   }
+
+   for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
+      if (nblock->successors[i]) {
+         ctx->block->successors[i] =
+            get_block_or_continue(ctx, nblock->successors[i]);
+         ctx->block->physical_successors[i] = ctx->block->successors[i];
+      }
+   }
+
+   _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
 }
 
 static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
@@ -2995,104 +3022,107 @@ static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
 static void
 emit_if(struct ir3_context *ctx, nir_if *nif)
 {
-       struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];
-
-       if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
-               ctx->block->condition = ssa(condition->srcs[0]);
-               ctx->block->brtype = IR3_BRANCH_ANY;
-       } else if (condition->opc == OPC_ALL_MACRO && condition->block == ctx->block) {
-               ctx->block->condition = ssa(condition->srcs[0]);
-               ctx->block->brtype = IR3_BRANCH_ALL;
-       } else if (condition->opc == OPC_ELECT_MACRO && condition->block == ctx->block) {
-               ctx->block->condition = NULL;
-               ctx->block->brtype = IR3_BRANCH_GETONE;
-       } else {
-               ctx->block->condition = ir3_get_predicate(ctx, condition);
-               ctx->block->brtype = IR3_BRANCH_COND;
-       }
-
-       emit_cf_list(ctx, &nif->then_list);
-       emit_cf_list(ctx, &nif->else_list);
-
-       struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
-       struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
-       assert(last_then->physical_successors[0] && !last_then->physical_successors[1]);
-       last_then->physical_successors[1] = first_else;
-
-       struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
-       struct ir3_block *after_if =
-               get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
-       last_else->physical_successors[0] = after_if;
+   struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];
+
+   if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
+      ctx->block->condition = ssa(condition->srcs[0]);
+      ctx->block->brtype = IR3_BRANCH_ANY;
+   } else if (condition->opc == OPC_ALL_MACRO &&
+              condition->block == ctx->block) {
+      ctx->block->condition = ssa(condition->srcs[0]);
+      ctx->block->brtype = IR3_BRANCH_ALL;
+   } else if (condition->opc == OPC_ELECT_MACRO &&
+              condition->block == ctx->block) {
+      ctx->block->condition = NULL;
+      ctx->block->brtype = IR3_BRANCH_GETONE;
+   } else {
+      ctx->block->condition = ir3_get_predicate(ctx, condition);
+      ctx->block->brtype = IR3_BRANCH_COND;
+   }
+
+   emit_cf_list(ctx, &nif->then_list);
+   emit_cf_list(ctx, &nif->else_list);
+
+   struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
+   struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
+   assert(last_then->physical_successors[0] &&
+          !last_then->physical_successors[1]);
+   last_then->physical_successors[1] = first_else;
+
+   struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
+   struct ir3_block *after_if =
+      get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
+   last_else->physical_successors[0] = after_if;
 }
 
 static void
 emit_loop(struct ir3_context *ctx, nir_loop *nloop)
 {
-       unsigned old_loop_id = ctx->loop_id;
-       ctx->loop_id = ctx->so->loops + 1;
-
-       struct nir_block *nstart = nir_loop_first_block(nloop);
-       struct ir3_block *continue_blk = NULL;
-
-       /* There's always one incoming edge from outside the loop, and if there
-        * are more than two backedges from inside the loop (so more than 2 total
-        * edges) then we need to create a continue block after the loop to ensure
-        * that control reconverges at the end of each loop iteration.
-        */
-       if (nstart->predecessors->entries > 2) {
-               continue_blk = create_continue_block(ctx, nstart);
-       }
-
-       emit_cf_list(ctx, &nloop->body);
-
-       if (continue_blk) {
-               struct ir3_block *start = get_block(ctx, nstart);
-               continue_blk->successors[0] = start;
-               continue_blk->physical_successors[0] = start;
-               list_addtail(&continue_blk->node, &ctx->ir->block_list);
-       }
-
-       ctx->so->loops++;
-       ctx->loop_id = old_loop_id;
+   unsigned old_loop_id = ctx->loop_id;
+   ctx->loop_id = ctx->so->loops + 1;
+
+   struct nir_block *nstart = nir_loop_first_block(nloop);
+   struct ir3_block *continue_blk = NULL;
+
+   /* There's always one incoming edge from outside the loop, and if there
+    * are more than two backedges from inside the loop (so more than 2 total
+    * edges) then we need to create a continue block after the loop to ensure
+    * that control reconverges at the end of each loop iteration.
+    */
+   if (nstart->predecessors->entries > 2) {
+      continue_blk = create_continue_block(ctx, nstart);
+   }
+
+   emit_cf_list(ctx, &nloop->body);
+
+   if (continue_blk) {
+      struct ir3_block *start = get_block(ctx, nstart);
+      continue_blk->successors[0] = start;
+      continue_blk->physical_successors[0] = start;
+      list_addtail(&continue_blk->node, &ctx->ir->block_list);
+   }
+
+   ctx->so->loops++;
+   ctx->loop_id = old_loop_id;
 }
 
 static void
 stack_push(struct ir3_context *ctx)
 {
-       ctx->stack++;
-       ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
+   ctx->stack++;
+   ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
 }
 
 static void
 stack_pop(struct ir3_context *ctx)
 {
-       compile_assert(ctx, ctx->stack > 0);
-       ctx->stack--;
+   compile_assert(ctx, ctx->stack > 0);
+   ctx->stack--;
 }
 
 static void
 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
 {
-       foreach_list_typed (nir_cf_node, node, node, list) {
-               switch (node->type) {
-               case nir_cf_node_block:
-                       emit_block(ctx, nir_cf_node_as_block(node));
-                       break;
-               case nir_cf_node_if:
-                       stack_push(ctx);
-                       emit_if(ctx, nir_cf_node_as_if(node));
-                       stack_pop(ctx);
-                       break;
-               case nir_cf_node_loop:
-                       stack_push(ctx);
-                       emit_loop(ctx, nir_cf_node_as_loop(node));
-                       stack_pop(ctx);
-                       break;
-               case nir_cf_node_function:
-                       ir3_context_error(ctx, "TODO\n");
-                       break;
-               }
-       }
+   foreach_list_typed (nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_block:
+         emit_block(ctx, nir_cf_node_as_block(node));
+         break;
+      case nir_cf_node_if:
+         stack_push(ctx);
+         emit_if(ctx, nir_cf_node_as_if(node));
+         stack_pop(ctx);
+         break;
+      case nir_cf_node_loop:
+         stack_push(ctx);
+         emit_loop(ctx, nir_cf_node_as_loop(node));
+         stack_pop(ctx);
+         break;
+      case nir_cf_node_function:
+         ir3_context_error(ctx, "TODO\n");
+         break;
+      }
+   }
 }
 
 /* emit stream-out code.  At this point, the current block is the original
@@ -3118,241 +3148,239 @@ emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
 static void
 emit_stream_out(struct ir3_context *ctx)
 {
-       struct ir3 *ir = ctx->ir;
-       struct ir3_stream_output_info *strmout =
-                       &ctx->so->shader->stream_output;
-       struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
-       struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
-       struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
-
-       /* create vtxcnt input in input block at top of shader,
-        * so that it is seen as live over the entire duration
-        * of the shader:
-        */
-       vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
-       maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
-
-       /* at this point, we are at the original 'end' block,
-        * re-purpose this block to stream-out condition, then
-        * append stream-out block and new-end block
-        */
-       orig_end_block = ctx->block;
-
-// maybe w/ store_global intrinsic, we could do this
-// stuff in nir->nir pass
-
-       stream_out_block = ir3_block_create(ir);
-       list_addtail(&stream_out_block->node, &ir->block_list);
-
-       new_end_block = ir3_block_create(ir);
-       list_addtail(&new_end_block->node, &ir->block_list);
-
-       orig_end_block->successors[0] = stream_out_block;
-       orig_end_block->successors[1] = new_end_block;
-
-       stream_out_block->successors[0] = new_end_block;
-
-       /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
-       cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
-       cond->dsts[0]->num = regid(REG_P0, 0);
-       cond->dsts[0]->flags &= ~IR3_REG_SSA;
-       cond->cat2.condition = IR3_COND_LT;
-
-       /* condition goes on previous block to the conditional,
-        * since it is used to pick which of the two successor
-        * paths to take:
-        */
-       orig_end_block->condition = cond;
-
-       /* switch to stream_out_block to generate the stream-out
-        * instructions:
-        */
-       ctx->block = stream_out_block;
-
-       /* Calculate base addresses based on vtxcnt.  Instructions
-        * generated for bases not used in following loop will be
-        * stripped out in the backend.
-        */
-       for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
-               const struct ir3_const_state *const_state =
-                               ir3_const_state(ctx->so);
-               unsigned stride = strmout->stride[i];
-               struct ir3_instruction *base, *off;
-
-               base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
-
-               /* 24-bit should be enough: */
-               off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
-                               create_immed(ctx->block, stride * 4), 0);
-
-               bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
-       }
-
-       /* Generate the per-output store instructions: */
-       for (unsigned i = 0; i < strmout->num_outputs; i++) {
-               for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
-                       unsigned c = j + strmout->output[i].start_component;
-                       struct ir3_instruction *base, *out, *stg;
-
-                       base = bases[strmout->output[i].output_buffer];
-                       out = ctx->outputs[regid(strmout->output[i].register_index, c)];
-
-                       stg = ir3_STG(ctx->block,
-                                                 base, 0,
-                                                 create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4), 0,
-                                                 out, 0,
-                                                 create_immed(ctx->block, 1), 0);
-                       stg->cat6.type = TYPE_U32;
-
-                       array_insert(ctx->block, ctx->block->keeps, stg);
-               }
-       }
-
-       /* and finally switch to the new_end_block: */
-       ctx->block = new_end_block;
+   struct ir3 *ir = ctx->ir;
+   struct ir3_stream_output_info *strmout = &ctx->so->shader->stream_output;
+   struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+   struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+   struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
+
+   /* create vtxcnt input in input block at top of shader,
+    * so that it is seen as live over the entire duration
+    * of the shader:
+    */
+   vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
+   maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+   /* at this point, we are at the original 'end' block,
+    * re-purpose this block to stream-out condition, then
+    * append stream-out block and new-end block
+    */
+   orig_end_block = ctx->block;
+
+   // maybe w/ store_global intrinsic, we could do this
+   // stuff in nir->nir pass
+
+   stream_out_block = ir3_block_create(ir);
+   list_addtail(&stream_out_block->node, &ir->block_list);
+
+   new_end_block = ir3_block_create(ir);
+   list_addtail(&new_end_block->node, &ir->block_list);
+
+   orig_end_block->successors[0] = stream_out_block;
+   orig_end_block->successors[1] = new_end_block;
+
+   stream_out_block->successors[0] = new_end_block;
+
+   /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
+   cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+   cond->dsts[0]->num = regid(REG_P0, 0);
+   cond->dsts[0]->flags &= ~IR3_REG_SSA;
+   cond->cat2.condition = IR3_COND_LT;
+
+   /* condition goes on previous block to the conditional,
+    * since it is used to pick which of the two successor
+    * paths to take:
+    */
+   orig_end_block->condition = cond;
+
+   /* switch to stream_out_block to generate the stream-out
+    * instructions:
+    */
+   ctx->block = stream_out_block;
+
+   /* Calculate base addresses based on vtxcnt.  Instructions
+    * generated for bases not used in following loop will be
+    * stripped out in the backend.
+    */
+   for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+      const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+      unsigned stride = strmout->stride[i];
+      struct ir3_instruction *base, *off;
+
+      base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
+
+      /* 24-bit should be enough: */
+      off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
+                        create_immed(ctx->block, stride * 4), 0);
+
+      bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+   }
+
+   /* Generate the per-output store instructions: */
+   for (unsigned i = 0; i < strmout->num_outputs; i++) {
+      for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+         unsigned c = j + strmout->output[i].start_component;
+         struct ir3_instruction *base, *out, *stg;
+
+         base = bases[strmout->output[i].output_buffer];
+         out = ctx->outputs[regid(strmout->output[i].register_index, c)];
+
+         stg = ir3_STG(
+            ctx->block, base, 0,
+            create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
+            0, out, 0, create_immed(ctx->block, 1), 0);
+         stg->cat6.type = TYPE_U32;
+
+         array_insert(ctx->block, ctx->block->keeps, stg);
+      }
+   }
+
+   /* and finally switch to the new_end_block: */
+   ctx->block = new_end_block;
 }
 
 static void
 setup_predecessors(struct ir3 *ir)
 {
-       foreach_block(block, &ir->block_list) {
-               for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
-                       if (block->successors[i])
-                               ir3_block_add_predecessor(block->successors[i], block);
-                       if (block->physical_successors[i])
-                               ir3_block_add_physical_predecessor(block->physical_successors[i], block);
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
+         if (block->successors[i])
+            ir3_block_add_predecessor(block->successors[i], block);
+         if (block->physical_successors[i])
+            ir3_block_add_physical_predecessor(block->physical_successors[i],
+                                               block);
+      }
+   }
 }
 
 static void
 emit_function(struct ir3_context *ctx, nir_function_impl *impl)
 {
-       nir_metadata_require(impl, nir_metadata_block_index);
-
-       compile_assert(ctx, ctx->stack == 0);
-
-       emit_cf_list(ctx, &impl->body);
-       emit_block(ctx, impl->end_block);
-
-       compile_assert(ctx, ctx->stack == 0);
-
-       /* at this point, we should have a single empty block,
-        * into which we emit the 'end' instruction.
-        */
-       compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
-
-       /* If stream-out (aka transform-feedback) enabled, emit the
-        * stream-out instructions, followed by a new empty block (into
-        * which the 'end' instruction lands).
-        *
-        * NOTE: it is done in this order, rather than inserting before
-        * we emit end_block, because NIR guarantees that all blocks
-        * flow into end_block, and that end_block has no successors.
-        * So by re-purposing end_block as the first block of stream-
-        * out, we guarantee that all exit paths flow into the stream-
-        * out instructions.
-        */
-       if ((ctx->compiler->gpu_id < 500) &&
-                       (ctx->so->shader->stream_output.num_outputs > 0) &&
-                       !ctx->so->binning_pass) {
-               debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
-               emit_stream_out(ctx);
-       }
-
-       setup_predecessors(ctx->ir);
-       foreach_block (block, &ctx->ir->block_list) {
-               resolve_phis(ctx, block);
-       }
+   nir_metadata_require(impl, nir_metadata_block_index);
+
+   compile_assert(ctx, ctx->stack == 0);
+
+   emit_cf_list(ctx, &impl->body);
+   emit_block(ctx, impl->end_block);
+
+   compile_assert(ctx, ctx->stack == 0);
+
+   /* at this point, we should have a single empty block,
+    * into which we emit the 'end' instruction.
+    */
+   compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
+
+   /* If stream-out (aka transform-feedback) enabled, emit the
+    * stream-out instructions, followed by a new empty block (into
+    * which the 'end' instruction lands).
+    *
+    * NOTE: it is done in this order, rather than inserting before
+    * we emit end_block, because NIR guarantees that all blocks
+    * flow into end_block, and that end_block has no successors.
+    * So by re-purposing end_block as the first block of stream-
+    * out, we guarantee that all exit paths flow into the stream-
+    * out instructions.
+    */
+   if ((ctx->compiler->gpu_id < 500) &&
+       (ctx->so->shader->stream_output.num_outputs > 0) &&
+       !ctx->so->binning_pass) {
+      debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
+      emit_stream_out(ctx);
+   }
+
+   setup_predecessors(ctx->ir);
+   foreach_block (block, &ctx->ir->block_list) {
+      resolve_phis(ctx, block);
+   }
 }
 
 static void
 setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_shader_variant *so = ctx->so;
-       struct ir3_instruction *coord = NULL;
-
-       if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
-               coord = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), 2);
-
-       compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
-
-       unsigned frac = nir_intrinsic_component(intr);
-       unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
-       unsigned ncomp = nir_intrinsic_dest_components(intr);
-       unsigned n = nir_intrinsic_base(intr) + offset;
-       unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
-       unsigned compmask;
-
-       /* Inputs are loaded using ldlw or ldg for other stages. */
-       compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
-                                               ctx->so->type == MESA_SHADER_VERTEX);
-
-       if (ctx->so->type == MESA_SHADER_FRAGMENT)
-               compmask = BITFIELD_MASK(ncomp) << frac;
-       else
-               compmask = BITFIELD_MASK(ncomp + frac);
-
-       /* for a4xx+ rasterflat */
-       if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
-               coord = NULL;
-
-       so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
-
-       so->inputs[n].slot = slot;
-       so->inputs[n].compmask |= compmask;
-       so->inputs_count = MAX2(so->inputs_count, n + 1);
-       compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
-       so->inputs[n].flat = !coord;
-
-       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-               compile_assert(ctx, slot != VARYING_SLOT_POS);
-
-               so->inputs[n].bary = true;
-
-               for (int i = 0; i < ncomp; i++) {
-                       unsigned idx = (n * 4) + i + frac;
-                       ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
-               }
-       } else {
-               struct ir3_instruction *input = NULL;
-
-               foreach_input (in, ctx->ir) {
-                       if (in->input.inidx == n) {
-                               input = in;
-                               break;
-                       }
-               }
-
-               if (!input) {
-                       input = create_input(ctx, compmask);
-                       input->input.inidx = n;
-               } else {
-                       /* For aliased inputs, just append to the wrmask.. ie. if we
-                        * first see a vec2 index at slot N, and then later a vec4,
-                        * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
-                        */
-                       input->dsts[0]->wrmask |= compmask;
-               }
-
-               for (int i = 0; i < ncomp + frac; i++) {
-                       unsigned idx = (n * 4) + i;
-                       compile_assert(ctx, idx < ctx->ninputs);
-
-                       /* fixup the src wrmask to avoid validation fail */
-                       if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
-                               ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
-                               continue;
-                       }
-
-                       ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
-               }
-
-               for (int i = 0; i < ncomp; i++) {
-                       unsigned idx = (n * 4) + i + frac;
-                       ctx->last_dst[i] = ctx->inputs[idx];
-               }
-       }
+   struct ir3_shader_variant *so = ctx->so;
+   struct ir3_instruction *coord = NULL;
+
+   if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
+      coord = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), 2);
+
+   compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
+
+   unsigned frac = nir_intrinsic_component(intr);
+   unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
+   unsigned ncomp = nir_intrinsic_dest_components(intr);
+   unsigned n = nir_intrinsic_base(intr) + offset;
+   unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
+   unsigned compmask;
+
+   /* Inputs are loaded using ldlw or ldg for other stages. */
+   compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
+                          ctx->so->type == MESA_SHADER_VERTEX);
+
+   if (ctx->so->type == MESA_SHADER_FRAGMENT)
+      compmask = BITFIELD_MASK(ncomp) << frac;
+   else
+      compmask = BITFIELD_MASK(ncomp + frac);
+
+   /* for a4xx+ rasterflat */
+   if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
+      coord = NULL;
+
+   so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
+
+   so->inputs[n].slot = slot;
+   so->inputs[n].compmask |= compmask;
+   so->inputs_count = MAX2(so->inputs_count, n + 1);
+   compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
+   so->inputs[n].flat = !coord;
+
+   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+      compile_assert(ctx, slot != VARYING_SLOT_POS);
+
+      so->inputs[n].bary = true;
+
+      for (int i = 0; i < ncomp; i++) {
+         unsigned idx = (n * 4) + i + frac;
+         ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
+      }
+   } else {
+      struct ir3_instruction *input = NULL;
+
+      foreach_input (in, ctx->ir) {
+         if (in->input.inidx == n) {
+            input = in;
+            break;
+         }
+      }
+
+      if (!input) {
+         input = create_input(ctx, compmask);
+         input->input.inidx = n;
+      } else {
+         /* For aliased inputs, just append to the wrmask.. ie. if we
+          * first see a vec2 index at slot N, and then later a vec4,
+          * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
+          */
+         input->dsts[0]->wrmask |= compmask;
+      }
+
+      for (int i = 0; i < ncomp + frac; i++) {
+         unsigned idx = (n * 4) + i;
+         compile_assert(ctx, idx < ctx->ninputs);
+
+         /* fixup the src wrmask to avoid validation fail */
+         if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
+            ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
+            continue;
+         }
+
+         ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
+      }
+
+      for (int i = 0; i < ncomp; i++) {
+         unsigned idx = (n * 4) + i + frac;
+         ctx->last_dst[i] = ctx->inputs[idx];
+      }
+   }
 }
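
/* Illustrative sketch (standalone helper, hypothetical values) of how
 * setup_input() above builds the component mask from `frac` (first component
 * read) and `ncomp` (number of components): for fragment shaders only the
 * components actually read are marked, while for vertex shaders the mask also
 * covers the leading unread components.  A local expression stands in for
 * BITFIELD_MASK().
 */
static unsigned
example_input_compmask(unsigned is_frag, unsigned frac, unsigned ncomp)
{
   unsigned low_bits = (1u << ncomp) - 1;
   if (is_frag)
      return low_bits << frac;          /* frac=1, ncomp=2 -> 0x6 (.yz)  */
   return (1u << (ncomp + frac)) - 1;   /* frac=1, ncomp=2 -> 0x7 (.xyz) */
}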
 
 /* Initially we assign non-packed inloc's for varyings, as we don't really
@@ -3363,393 +3391,403 @@ setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 static void
 pack_inlocs(struct ir3_context *ctx)
 {
-       struct ir3_shader_variant *so = ctx->so;
-       uint8_t used_components[so->inputs_count];
-
-       memset(used_components, 0, sizeof(used_components));
-
-       /*
-        * First Step: scan shader to find which bary.f/ldlv remain:
-        */
-
-       foreach_block (block, &ctx->ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (is_input(instr)) {
-                               unsigned inloc = instr->srcs[0]->iim_val;
-                               unsigned i = inloc / 4;
-                               unsigned j = inloc % 4;
-
-                               compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
-                               compile_assert(ctx, i < so->inputs_count);
-
-                               used_components[i] |= 1 << j;
-                       } else if (instr->opc == OPC_META_TEX_PREFETCH) {
-                               for (int n = 0; n < 2; n++) {
-                                       unsigned inloc = instr->prefetch.input_offset + n;
-                                       unsigned i = inloc / 4;
-                                       unsigned j = inloc % 4;
-
-                                       compile_assert(ctx, i < so->inputs_count);
-
-                                       used_components[i] |= 1 << j;
-                               }
-                       }
-               }
-       }
-
-       /*
-        * Second Step: reassign varying inloc/slots:
-        */
-
-       unsigned actual_in = 0;
-       unsigned inloc = 0;
-
-       /* for clip+cull distances, unused components can't be eliminated because
-        * they're read by fixed-function, even if there's a hole.  Note that
-        * clip/cull distance arrays must be declared in the FS, so we can just
-        * use the NIR clip/cull distances to avoid reading ucp_enables in the
-        * shader key.
-        */
-       unsigned clip_cull_size =
-               ctx->so->shader->nir->info.clip_distance_array_size +
-               ctx->so->shader->nir->info.cull_distance_array_size;
-       unsigned clip_cull_mask = MASK(clip_cull_size);
-
-       for (unsigned i = 0; i < so->inputs_count; i++) {
-               unsigned compmask = 0, maxcomp = 0;
-
-               so->inputs[i].inloc = inloc;
-               so->inputs[i].bary = false;
-
-               if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
-                       so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
-                       if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
-                               compmask = clip_cull_mask & 0xf;
-                       else
-                               compmask = clip_cull_mask >> 4;
-                       used_components[i] = compmask;
-               }
-
-               for (unsigned j = 0; j < 4; j++) {
-                       if (!(used_components[i] & (1 << j)))
-                               continue;
-
-                       compmask |= (1 << j);
-                       actual_in++;
-                       maxcomp = j + 1;
-
-                       /* at this point, since used_components[i] mask is only
-                        * considering varyings (ie. not sysvals) we know this
-                        * is a varying:
-                        */
-                       so->inputs[i].bary = true;
-               }
-
-               if (so->inputs[i].bary) {
-                       so->varying_in++;
-                       so->inputs[i].compmask = (1 << maxcomp) - 1;
-                       inloc += maxcomp;
-               }
-       }
-
-       /*
-        * Third Step: reassign packed inloc's:
-        */
-
-       foreach_block (block, &ctx->ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (is_input(instr)) {
-                               unsigned inloc = instr->srcs[0]->iim_val;
-                               unsigned i = inloc / 4;
-                               unsigned j = inloc % 4;
-
-                               instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
-                       } else if (instr->opc == OPC_META_TEX_PREFETCH) {
-                               unsigned i = instr->prefetch.input_offset / 4;
-                               unsigned j = instr->prefetch.input_offset % 4;
-                               instr->prefetch.input_offset = so->inputs[i].inloc + j;
-                       }
-               }
-       }
+   struct ir3_shader_variant *so = ctx->so;
+   uint8_t used_components[so->inputs_count];
+
+   memset(used_components, 0, sizeof(used_components));
+
+   /*
+    * First Step: scan shader to find which bary.f/ldlv remain:
+    */
+
+   foreach_block (block, &ctx->ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (is_input(instr)) {
+            unsigned inloc = instr->srcs[0]->iim_val;
+            unsigned i = inloc / 4;
+            unsigned j = inloc % 4;
+
+            compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
+            compile_assert(ctx, i < so->inputs_count);
+
+            used_components[i] |= 1 << j;
+         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
+            for (int n = 0; n < 2; n++) {
+               unsigned inloc = instr->prefetch.input_offset + n;
+               unsigned i = inloc / 4;
+               unsigned j = inloc % 4;
+
+               compile_assert(ctx, i < so->inputs_count);
+
+               used_components[i] |= 1 << j;
+            }
+         }
+      }
+   }
+
+   /*
+    * Second Step: reassign varying inloc/slots:
+    */
+
+   unsigned actual_in = 0;
+   unsigned inloc = 0;
+
+   /* for clip+cull distances, unused components can't be eliminated because
+    * they're read by fixed-function, even if there's a hole.  Note that
+    * clip/cull distance arrays must be declared in the FS, so we can just
+    * use the NIR clip/cull distances to avoid reading ucp_enables in the
+    * shader key.
+    */
+   unsigned clip_cull_size =
+      ctx->so->shader->nir->info.clip_distance_array_size +
+      ctx->so->shader->nir->info.cull_distance_array_size;
+   unsigned clip_cull_mask = MASK(clip_cull_size);
+
+   for (unsigned i = 0; i < so->inputs_count; i++) {
+      unsigned compmask = 0, maxcomp = 0;
+
+      so->inputs[i].inloc = inloc;
+      so->inputs[i].bary = false;
+
+      if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
+          so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
+         if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
+            compmask = clip_cull_mask & 0xf;
+         else
+            compmask = clip_cull_mask >> 4;
+         used_components[i] = compmask;
+      }
+
+      for (unsigned j = 0; j < 4; j++) {
+         if (!(used_components[i] & (1 << j)))
+            continue;
+
+         compmask |= (1 << j);
+         actual_in++;
+         maxcomp = j + 1;
+
+         /* at this point, since used_components[i] mask is only
+          * considering varyings (ie. not sysvals) we know this
+          * is a varying:
+          */
+         so->inputs[i].bary = true;
+      }
+
+      if (so->inputs[i].bary) {
+         so->varying_in++;
+         so->inputs[i].compmask = (1 << maxcomp) - 1;
+         inloc += maxcomp;
+      }
+   }
+
+   /*
+    * Third Step: reassign packed inloc's:
+    */
+
+   foreach_block (block, &ctx->ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (is_input(instr)) {
+            unsigned inloc = instr->srcs[0]->iim_val;
+            unsigned i = inloc / 4;
+            unsigned j = inloc % 4;
+
+            instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
+         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
+            unsigned i = instr->prefetch.input_offset / 4;
+            unsigned j = instr->prefetch.input_offset % 4;
+            instr->prefetch.input_offset = so->inputs[i].inloc + j;
+         }
+      }
+   }
 }
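
/* Worked example (hypothetical inputs, standalone helper) of the packing
 * above: if the scan finds input 0 using components .x/.z and input 1 using
 * .y, input 0 keeps inloc 0 with compmask 0x7 (holes below the highest used
 * component are preserved) and input 1 gets inloc 3 with compmask 0x3, so a
 * bary.f that read input 1 component .y (old inloc 1*4 + 1 = 5) is rewritten
 * to inloc 3 + 1 = 4 in the third step.
 */
static void
example_pack_inlocs(void)
{
   unsigned used_components[2] = {0x5, 0x2}; /* input 0: .x/.z, input 1: .y */
   unsigned new_inloc[2], new_compmask[2];
   unsigned inloc = 0;

   for (unsigned i = 0; i < 2; i++) {
      unsigned maxcomp = 0;
      new_inloc[i] = inloc;
      for (unsigned j = 0; j < 4; j++)
         if (used_components[i] & (1u << j))
            maxcomp = j + 1;
      new_compmask[i] = (1u << maxcomp) - 1;
      inloc += maxcomp;
   }
   /* new_inloc = {0, 3}, new_compmask = {0x7, 0x3} */
   (void)new_inloc;
   (void)new_compmask;
}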
 
 static void
 setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
-       struct ir3_shader_variant *so = ctx->so;
-       nir_io_semantics io = nir_intrinsic_io_semantics(intr);
-
-       compile_assert(ctx, nir_src_is_const(intr->src[1]));
-
-       unsigned offset = nir_src_as_uint(intr->src[1]);
-       unsigned n = nir_intrinsic_base(intr) + offset;
-       unsigned frac = nir_intrinsic_component(intr);
-       unsigned ncomp = nir_intrinsic_src_components(intr, 0);
-
-       /* For per-view variables, each user-facing slot corresponds to multiple
-        * views, each with a corresponding driver_location, and the offset is for
-        * the driver_location. To properly figure out the slot, we'd need to
-        * plumb through the number of views. However, for now we only use
-        * per-view with gl_Position, so we assume that the variable is not an
-        * array or matrix (so there are no indirect accesses to the variable
-        * itself) and the indirect offset corresponds to the view.
-        */
-       unsigned slot = io.location + (io.per_view ? 0 : offset);
-
-       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-               switch (slot) {
-               case FRAG_RESULT_DEPTH:
-                       so->writes_pos = true;
-                       break;
-               case FRAG_RESULT_COLOR:
-                       if (!ctx->s->info.fs.color_is_dual_source) {
-                               so->color0_mrt = 1;
-                       } else {
-                               slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
-                       }
-                       break;
-               case FRAG_RESULT_SAMPLE_MASK:
-                       so->writes_smask = true;
-                       break;
-               case FRAG_RESULT_STENCIL:
-                       so->writes_stencilref = true;
-                       break;
-               default:
-                       slot += io.dual_source_blend_index; /* For dual-src blend */
-                       if (slot >= FRAG_RESULT_DATA0)
-                               break;
-                       ir3_context_error(ctx, "unknown FS output name: %s\n",
-                                       gl_frag_result_name(slot));
-               }
-       } else if (ctx->so->type == MESA_SHADER_VERTEX ||
-                       ctx->so->type == MESA_SHADER_TESS_EVAL ||
-                       ctx->so->type == MESA_SHADER_GEOMETRY) {
-               switch (slot) {
-               case VARYING_SLOT_POS:
-                       so->writes_pos = true;
-                       break;
-               case VARYING_SLOT_PSIZ:
-                       so->writes_psize = true;
-                       break;
-               case VARYING_SLOT_PRIMITIVE_ID:
-               case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
-                       debug_assert(ctx->so->type == MESA_SHADER_GEOMETRY);
-                       FALLTHROUGH;
-               case VARYING_SLOT_COL0:
-               case VARYING_SLOT_COL1:
-               case VARYING_SLOT_BFC0:
-               case VARYING_SLOT_BFC1:
-               case VARYING_SLOT_FOGC:
-               case VARYING_SLOT_CLIP_DIST0:
-               case VARYING_SLOT_CLIP_DIST1:
-               case VARYING_SLOT_CLIP_VERTEX:
-               case VARYING_SLOT_LAYER:
-               case VARYING_SLOT_VIEWPORT:
-                       break;
-               default:
-                       if (slot >= VARYING_SLOT_VAR0)
-                               break;
-                       if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
-                               break;
-                       ir3_context_error(ctx, "unknown %s shader output name: %s\n",
-                                       _mesa_shader_stage_to_string(ctx->so->type),
-                                       gl_varying_slot_name_for_stage(slot, ctx->so->type));
-               }
-       } else {
-               ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
-       }
-
-
-       so->outputs_count = MAX2(so->outputs_count, n + 1);
-       compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs));
-
-       so->outputs[n].slot = slot;
-       if (io.per_view)
-               so->outputs[n].view = offset;
-
-       for (int i = 0; i < ncomp; i++) {
-               unsigned idx = (n * 4) + i + frac;
-               compile_assert(ctx, idx < ctx->noutputs);
-               ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
-       }
-
-       /* if varying packing doesn't happen, we could end up in a situation
-        * with "holes" in the output, and since the per-generation code that
-        * sets up varying linkage registers doesn't expect to have more than
-        * one varying per vec4 slot, pad the holes.
-        *
-        * Note that this should probably generate a performance warning of
-        * some sort.
-        */
-       for (int i = 0; i < frac; i++) {
-               unsigned idx = (n * 4) + i;
-               if (!ctx->outputs[idx]) {
-                       ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
-               }
-       }
-
-       struct ir3_instruction * const *src = ir3_get_src(ctx, &intr->src[0]);
-       for (int i = 0; i < ncomp; i++) {
-               unsigned idx = (n * 4) + i + frac;
-               ctx->outputs[idx] = src[i];
-       }
+   struct ir3_shader_variant *so = ctx->so;
+   nir_io_semantics io = nir_intrinsic_io_semantics(intr);
+
+   compile_assert(ctx, nir_src_is_const(intr->src[1]));
+
+   unsigned offset = nir_src_as_uint(intr->src[1]);
+   unsigned n = nir_intrinsic_base(intr) + offset;
+   unsigned frac = nir_intrinsic_component(intr);
+   unsigned ncomp = nir_intrinsic_src_components(intr, 0);
+
+   /* For per-view variables, each user-facing slot corresponds to multiple
+    * views, each with a corresponding driver_location, and the offset is for
+    * the driver_location. To properly figure out the slot, we'd need to
+    * plumb through the number of views. However, for now we only use
+    * per-view with gl_Position, so we assume that the variable is not an
+    * array or matrix (so there are no indirect accesses to the variable
+    * itself) and the indirect offset corresponds to the view.
+    */
+   unsigned slot = io.location + (io.per_view ? 0 : offset);
+
+   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+      switch (slot) {
+      case FRAG_RESULT_DEPTH:
+         so->writes_pos = true;
+         break;
+      case FRAG_RESULT_COLOR:
+         if (!ctx->s->info.fs.color_is_dual_source) {
+            so->color0_mrt = 1;
+         } else {
+            slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
+         }
+         break;
+      case FRAG_RESULT_SAMPLE_MASK:
+         so->writes_smask = true;
+         break;
+      case FRAG_RESULT_STENCIL:
+         so->writes_stencilref = true;
+         break;
+      default:
+         slot += io.dual_source_blend_index; /* For dual-src blend */
+         if (slot >= FRAG_RESULT_DATA0)
+            break;
+         ir3_context_error(ctx, "unknown FS output name: %s\n",
+                           gl_frag_result_name(slot));
+      }
+   } else if (ctx->so->type == MESA_SHADER_VERTEX ||
+              ctx->so->type == MESA_SHADER_TESS_EVAL ||
+              ctx->so->type == MESA_SHADER_GEOMETRY) {
+      switch (slot) {
+      case VARYING_SLOT_POS:
+         so->writes_pos = true;
+         break;
+      case VARYING_SLOT_PSIZ:
+         so->writes_psize = true;
+         break;
+      case VARYING_SLOT_PRIMITIVE_ID:
+      case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
+         debug_assert(ctx->so->type == MESA_SHADER_GEOMETRY);
+         FALLTHROUGH;
+      case VARYING_SLOT_COL0:
+      case VARYING_SLOT_COL1:
+      case VARYING_SLOT_BFC0:
+      case VARYING_SLOT_BFC1:
+      case VARYING_SLOT_FOGC:
+      case VARYING_SLOT_CLIP_DIST0:
+      case VARYING_SLOT_CLIP_DIST1:
+      case VARYING_SLOT_CLIP_VERTEX:
+      case VARYING_SLOT_LAYER:
+      case VARYING_SLOT_VIEWPORT:
+         break;
+      default:
+         if (slot >= VARYING_SLOT_VAR0)
+            break;
+         if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
+            break;
+         ir3_context_error(ctx, "unknown %s shader output name: %s\n",
+                           _mesa_shader_stage_to_string(ctx->so->type),
+                           gl_varying_slot_name_for_stage(slot, ctx->so->type));
+      }
+   } else {
+      ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+   }
+
+   so->outputs_count = MAX2(so->outputs_count, n + 1);
+   compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs));
+
+   so->outputs[n].slot = slot;
+   if (io.per_view)
+      so->outputs[n].view = offset;
+
+   for (int i = 0; i < ncomp; i++) {
+      unsigned idx = (n * 4) + i + frac;
+      compile_assert(ctx, idx < ctx->noutputs);
+      ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
+   }
+
+   /* if varying packing doesn't happen, we could end up in a situation
+    * with "holes" in the output, and since the per-generation code that
+    * sets up varying linkage registers doesn't expect to have more than
+    * one varying per vec4 slot, pad the holes.
+    *
+    * Note that this should probably generate a performance warning of
+    * some sort.
+    */
+   for (int i = 0; i < frac; i++) {
+      unsigned idx = (n * 4) + i;
+      if (!ctx->outputs[idx]) {
+         ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
+      }
+   }
+
+   struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
+   for (int i = 0; i < ncomp; i++) {
+      unsigned idx = (n * 4) + i + frac;
+      ctx->outputs[idx] = src[i];
+   }
 }
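
/* Illustrative sketch (hypothetical values) of the flat ctx->outputs[]
 * indexing used above: vec4 output slot `n` starting at component `frac`
 * stores its i'th source at index n*4 + frac + i, and components below
 * `frac` are padded with a dummy 0.0 so the varying-linkage setup never
 * sees a hole inside a vec4 slot.
 */
static unsigned
example_output_index(unsigned n, unsigned frac, unsigned i)
{
   return (n * 4) + frac + i;   /* n=1, frac=2, i=0 -> outputs[6] */
}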
 
 static bool
 uses_load_input(struct ir3_shader_variant *so)
 {
-       return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
+   return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
 }
 
 static bool
 uses_store_output(struct ir3_shader_variant *so)
 {
-       switch (so->type) {
-               case MESA_SHADER_VERTEX:
-                       return !so->key.has_gs && !so->key.tessellation;
-               case MESA_SHADER_TESS_EVAL:
-                       return !so->key.has_gs;
-               case MESA_SHADER_GEOMETRY:
-               case MESA_SHADER_FRAGMENT:
-                       return true;
-               case MESA_SHADER_TESS_CTRL:
-               case MESA_SHADER_COMPUTE:
-                       return false;
-               default:
-                       unreachable("unknown stage");
-       }
+   switch (so->type) {
+   case MESA_SHADER_VERTEX:
+      return !so->key.has_gs && !so->key.tessellation;
+   case MESA_SHADER_TESS_EVAL:
+      return !so->key.has_gs;
+   case MESA_SHADER_GEOMETRY:
+   case MESA_SHADER_FRAGMENT:
+      return true;
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_COMPUTE:
+      return false;
+   default:
+      unreachable("unknown stage");
+   }
 }
 
 static void
 emit_instructions(struct ir3_context *ctx)
 {
-       nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
-
-       /* some varying setup which can't be done in setup_input(): */
-       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-               nir_foreach_shader_in_variable (var, ctx->s) {
-                       /* if any varyings have 'sample' qualifier, that triggers us
-                        * to run in per-sample mode:
-                        */
-                       if (var->data.sample)
-                               ctx->so->per_samp = true;
-
-                       /* set rasterflat flag for front/back color */
-                       if (var->data.interpolation == INTERP_MODE_NONE) {
-                               switch (var->data.location) {
-                               case VARYING_SLOT_COL0:
-                               case VARYING_SLOT_COL1:
-                               case VARYING_SLOT_BFC0:
-                               case VARYING_SLOT_BFC1:
-                                       ctx->so->inputs[var->data.driver_location].rasterflat = true;
-                                       break;
-                               default:
-                                       break;
-                               }
-                       }
-               }
-       }
-
-       if (uses_load_input(ctx->so)) {
-               ctx->so->inputs_count = ctx->s->num_inputs;
-               compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
-               ctx->ninputs = ctx->s->num_inputs * 4;
-               ctx->inputs  = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
-       } else {
-               ctx->ninputs = 0;
-               ctx->so->inputs_count = 0;
-       }
-
-       if (uses_store_output(ctx->so)) {
-               ctx->noutputs = ctx->s->num_outputs * 4;
-               ctx->outputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
-       } else {
-               ctx->noutputs = 0;
-       }
-
-       ctx->ir = ir3_create(ctx->compiler, ctx->so);
-
-       /* Create inputs in first block: */
-       ctx->block = get_block(ctx, nir_start_block(fxn));
-       ctx->in_block = ctx->block;
-
-       /* for fragment shader, the vcoord input register is used as the
-        * base for bary.f varying fetch instrs:
-        *
-        * TODO defer creating ctx->ij_pixel and corresponding sysvals
-        * until emit_intrinsic when we know they are actually needed.
-        * For now, we defer creating ctx->ij_centroid, etc, since we
-        * only need ij_pixel for "old style" varying inputs (ie.
-        * tgsi_to_nir)
-        */
-       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-               ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
-       }
-
-       /* Defer add_sysval_input() stuff until after setup_inputs(),
-        * because sysvals need to be appended after varyings:
-        */
-       if (ctx->ij[IJ_PERSP_PIXEL]) {
-               add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
-                               0x3, ctx->ij[IJ_PERSP_PIXEL]);
-       }
-
-
-       /* Tessellation shaders always need primitive ID for indexing the
-        * BO. Geometry shaders don't always need it but when they do it has to be
-        * delivered and unclobbered in the VS. To make things easy, we always
-        * make room for it in VS/DS.
-        */
-       bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
-       bool has_gs = ctx->so->key.has_gs;
-       switch (ctx->so->type) {
-       case MESA_SHADER_VERTEX:
-               if (has_tess) {
-                       ctx->tcs_header = create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
-                       ctx->primitive_id = create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
-               } else if (has_gs) {
-                       ctx->gs_header = create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
-                       ctx->primitive_id = create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
-               }
-               break;
-       case MESA_SHADER_TESS_CTRL:
-               ctx->tcs_header = create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
-               ctx->primitive_id = create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
-               break;
-       case MESA_SHADER_TESS_EVAL:
-               if (has_gs)
-                       ctx->gs_header = create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
-               ctx->primitive_id = create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
-               break;
-       case MESA_SHADER_GEOMETRY:
-               ctx->gs_header = create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
-               ctx->primitive_id = create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
-               break;
-       default:
-               break;
-       }
-
-       /* Find # of samplers. Just assume that we'll be reading from images.. if
-        * it is write-only we don't have to count it, but after lowering derefs
-        * is too late to compact indices for that.
-        */
-       ctx->so->num_samp = BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
-
-       /* Save off clip+cull information. */
-       ctx->so->clip_mask = MASK(ctx->s->info.clip_distance_array_size);
-       ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size) <<
-               ctx->s->info.clip_distance_array_size;
-
-       ctx->so->pvtmem_size = ctx->s->scratch_size;
-       ctx->so->shared_size = ctx->s->info.shared_size;
-
-       /* NOTE: need to do something more clever when we support >1 fxn */
-       nir_foreach_register (reg, &fxn->registers) {
-               ir3_declare_array(ctx, reg);
-       }
-       /* And emit the body: */
-       ctx->impl = fxn;
-       emit_function(ctx, fxn);
+   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
+
+   /* some varying setup which can't be done in setup_input(): */
+   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+      nir_foreach_shader_in_variable (var, ctx->s) {
+         /* if any varyings have 'sample' qualifier, that triggers us
+          * to run in per-sample mode:
+          */
+         if (var->data.sample)
+            ctx->so->per_samp = true;
+
+         /* set rasterflat flag for front/back color */
+         if (var->data.interpolation == INTERP_MODE_NONE) {
+            switch (var->data.location) {
+            case VARYING_SLOT_COL0:
+            case VARYING_SLOT_COL1:
+            case VARYING_SLOT_BFC0:
+            case VARYING_SLOT_BFC1:
+               ctx->so->inputs[var->data.driver_location].rasterflat = true;
+               break;
+            default:
+               break;
+            }
+         }
+      }
+   }
+
+   if (uses_load_input(ctx->so)) {
+      ctx->so->inputs_count = ctx->s->num_inputs;
+      compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
+      ctx->ninputs = ctx->s->num_inputs * 4;
+      ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
+   } else {
+      ctx->ninputs = 0;
+      ctx->so->inputs_count = 0;
+   }
+
+   if (uses_store_output(ctx->so)) {
+      ctx->noutputs = ctx->s->num_outputs * 4;
+      ctx->outputs =
+         rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
+   } else {
+      ctx->noutputs = 0;
+   }
+
+   ctx->ir = ir3_create(ctx->compiler, ctx->so);
+
+   /* Create inputs in first block: */
+   ctx->block = get_block(ctx, nir_start_block(fxn));
+   ctx->in_block = ctx->block;
+
+   /* for fragment shader, the vcoord input register is used as the
+    * base for bary.f varying fetch instrs:
+    *
+    * TODO defer creating ctx->ij_pixel and corresponding sysvals
+    * until emit_intrinsic when we know they are actually needed.
+    * For now, we defer creating ctx->ij_centroid, etc, since we
+    * only need ij_pixel for "old style" varying inputs (ie.
+    * tgsi_to_nir)
+    */
+   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+      ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
+   }
+
+   /* Defer add_sysval_input() stuff until after setup_inputs(),
+    * because sysvals need to be appended after varyings:
+    */
+   if (ctx->ij[IJ_PERSP_PIXEL]) {
+      add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
+                                ctx->ij[IJ_PERSP_PIXEL]);
+   }
+
+   /* Tessellation shaders always need primitive ID for indexing the
+    * BO. Geometry shaders don't always need it but when they do it has to be
+    * delivered and unclobbered in the VS. To make things easy, we always
+    * make room for it in VS/DS.
+    */
+   bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
+   bool has_gs = ctx->so->key.has_gs;
+   switch (ctx->so->type) {
+   case MESA_SHADER_VERTEX:
+      if (has_tess) {
+         ctx->tcs_header =
+            create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
+         ctx->primitive_id =
+            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
+      } else if (has_gs) {
+         ctx->gs_header =
+            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
+         ctx->primitive_id =
+            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
+      }
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      ctx->tcs_header =
+         create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
+      ctx->primitive_id =
+         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      if (has_gs)
+         ctx->gs_header =
+            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
+      ctx->primitive_id =
+         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
+      break;
+   case MESA_SHADER_GEOMETRY:
+      ctx->gs_header =
+         create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
+      ctx->primitive_id =
+         create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
+      break;
+   default:
+      break;
+   }
+
+   /* Find # of samplers. Just assume that we'll be reading from images.. if
+    * it is write-only we don't have to count it, but after lowering derefs
+    * is too late to compact indices for that.
+    */
+   ctx->so->num_samp =
+      BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
+
+   /* Save off clip+cull information. */
+   ctx->so->clip_mask = MASK(ctx->s->info.clip_distance_array_size);
+   ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
+                        << ctx->s->info.clip_distance_array_size;
+
+   ctx->so->pvtmem_size = ctx->s->scratch_size;
+   ctx->so->shared_size = ctx->s->info.shared_size;
+
+   /* NOTE: need to do something more clever when we support >1 fxn */
+   nir_foreach_register (reg, &fxn->registers) {
+      ir3_declare_array(ctx, reg);
+   }
+   /* And emit the body: */
+   ctx->impl = fxn;
+   emit_function(ctx, fxn);
 }
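
/* Worked example (hypothetical sizes, local stand-in for MASK()) of the
 * clip/cull masks saved above: with clip_distance_array_size = 2 and
 * cull_distance_array_size = 1, clip_mask = 0x3 and cull_mask = 0x4, i.e.
 * the cull distances occupy the component slots immediately after the clip
 * distances.
 */
static void
example_clip_cull_masks(void)
{
   unsigned clip_size = 2, cull_size = 1;
   unsigned clip_mask = (1u << clip_size) - 1;                 /* 0x3 */
   unsigned cull_mask = ((1u << cull_size) - 1) << clip_size;  /* 0x4 */
   (void)clip_mask;
   (void)cull_mask;
}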
 
 /* Fixup tex sampler state for astc/srgb workaround instructions.  We
@@ -3759,526 +3797,523 @@ emit_instructions(struct ir3_context *ctx)
 static void
 fixup_astc_srgb(struct ir3_context *ctx)
 {
-       struct ir3_shader_variant *so = ctx->so;
-       /* indexed by original tex idx, value is newly assigned alpha sampler
-        * state tex idx.  Zero is invalid since there is at least one sampler
-        * if we get here.
-        */
-       unsigned alt_tex_state[16] = {0};
-       unsigned tex_idx = ctx->max_texture_index + 1;
-       unsigned idx = 0;
-
-       so->astc_srgb.base = tex_idx;
-
-       for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
-               struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
-
-               compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
-
-               if (alt_tex_state[sam->cat5.tex] == 0) {
-                       /* assign new alternate/alpha tex state slot: */
-                       alt_tex_state[sam->cat5.tex] = tex_idx++;
-                       so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
-                       so->astc_srgb.count++;
-               }
-
-               sam->cat5.tex = alt_tex_state[sam->cat5.tex];
-       }
+   struct ir3_shader_variant *so = ctx->so;
+   /* indexed by original tex idx, value is newly assigned alpha sampler
+    * state tex idx.  Zero is invalid since there is at least one sampler
+    * if we get here.
+    */
+   unsigned alt_tex_state[16] = {0};
+   unsigned tex_idx = ctx->max_texture_index + 1;
+   unsigned idx = 0;
+
+   so->astc_srgb.base = tex_idx;
+
+   for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
+      struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
+
+      compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
+
+      if (alt_tex_state[sam->cat5.tex] == 0) {
+         /* assign new alternate/alpha tex state slot: */
+         alt_tex_state[sam->cat5.tex] = tex_idx++;
+         so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
+         so->astc_srgb.count++;
+      }
+
+      sam->cat5.tex = alt_tex_state[sam->cat5.tex];
+   }
 }
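
/* Worked example (hypothetical indices, standalone) of the remapping above:
 * with max_texture_index = 3 the alternate/alpha sampler states start at
 * tex_idx = 4, so sam instructions originally using tex 1, 0, 1 end up using
 * 4, 5, 4, with orig_idx = {1, 0} and count = 2 recorded so the driver can
 * build the extra tex/sampler state from the original entries.
 */
static void
example_astc_srgb_remap(void)
{
   unsigned alt_tex_state[16] = {0};
   unsigned sam_tex[3] = {1, 0, 1}; /* original tex indices of the sam instrs */
   unsigned tex_idx = 3 + 1;        /* max_texture_index = 3 */
   unsigned orig_idx[16], count = 0;

   for (unsigned i = 0; i < 3; i++) {
      if (alt_tex_state[sam_tex[i]] == 0) {
         alt_tex_state[sam_tex[i]] = tex_idx++;
         orig_idx[count++] = sam_tex[i];
      }
      sam_tex[i] = alt_tex_state[sam_tex[i]];
   }
   /* sam_tex = {4, 5, 4}, orig_idx = {1, 0}, count = 2 */
   (void)orig_idx;
}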
 
 static bool
 output_slot_used_for_binning(gl_varying_slot slot)
 {
-       return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
-                  slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1;
+   return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
+          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1;
 }
 
-
-static struct ir3_instruction *find_end(struct ir3 *ir)
+static struct ir3_instruction *
+find_end(struct ir3 *ir)
 {
-       foreach_block_rev (block, &ir->block_list) {
-               foreach_instr_rev(instr, &block->instr_list) {
-                       if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
-                               return instr;
-               }
-       }
-       unreachable("couldn't find end instruction");
+   foreach_block_rev (block, &ir->block_list) {
+      foreach_instr_rev (instr, &block->instr_list) {
+         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
+            return instr;
+      }
+   }
+   unreachable("couldn't find end instruction");
 }
 
 static void
 fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
 {
-       struct ir3_shader_variant *so = ctx->so;
-       unsigned i, j;
-
-       /* first pass, remove unused outputs from the IR level outputs: */
-       for (i = 0, j = 0; i < end->srcs_count; i++) {
-               unsigned outidx = end->end.outidxs[i];
-               unsigned slot = so->outputs[outidx].slot;
-
-               if (output_slot_used_for_binning(slot)) {
-                       end->srcs[j] = end->srcs[i];
-                       end->end.outidxs[j] = end->end.outidxs[i];
-                       j++;
-               }
-       }
-       end->srcs_count = j;
-
-       /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
-        * table:
-        */
-       for (i = 0, j = 0; i < so->outputs_count; i++) {
-               unsigned slot = so->outputs[i].slot;
-
-               if (output_slot_used_for_binning(slot)) {
-                       so->outputs[j] = so->outputs[i];
-
-                       /* fixup outidx to point to new output table entry: */
-                       for (unsigned k = 0; k < end->srcs_count; k++) {
-                               if (end->end.outidxs[k] == i) {
-                                       end->end.outidxs[k] = j;
-                                       break;
-                               }
-                       }
-
-                       j++;
-               }
-       }
-       so->outputs_count = j;
+   struct ir3_shader_variant *so = ctx->so;
+   unsigned i, j;
+
+   /* first pass, remove unused outputs from the IR level outputs: */
+   for (i = 0, j = 0; i < end->srcs_count; i++) {
+      unsigned outidx = end->end.outidxs[i];
+      unsigned slot = so->outputs[outidx].slot;
+
+      if (output_slot_used_for_binning(slot)) {
+         end->srcs[j] = end->srcs[i];
+         end->end.outidxs[j] = end->end.outidxs[i];
+         j++;
+      }
+   }
+   end->srcs_count = j;
+
+   /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
+    * table:
+    */
+   for (i = 0, j = 0; i < so->outputs_count; i++) {
+      unsigned slot = so->outputs[i].slot;
+
+      if (output_slot_used_for_binning(slot)) {
+         so->outputs[j] = so->outputs[i];
+
+         /* fixup outidx to point to new output table entry: */
+         for (unsigned k = 0; k < end->srcs_count; k++) {
+            if (end->end.outidxs[k] == i) {
+               end->end.outidxs[k] = j;
+               break;
+            }
+         }
+
+         j++;
+      }
+   }
+   so->outputs_count = j;
 }
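
/* Illustrative sketch (generic standalone helper) of the compaction pattern
 * used above: keep only the entries a predicate accepts, shifting the rest
 * down in place and returning the new count.  fixup_binning_pass() applies
 * this idea to both the end instruction's sources and the variant's outputs
 * table, patching end.outidxs so the indices match the compacted table.
 */
static unsigned
example_compact(unsigned *slots, unsigned count, int (*keep)(unsigned slot))
{
   unsigned j = 0;
   for (unsigned i = 0; i < count; i++)
      if (keep(slots[i]))
         slots[j++] = slots[i];
   return j;
}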
 
 static void
 collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
 {
-       unsigned idx = 0;
-
-       /* Collect sampling instructions eligible for pre-dispatch. */
-       foreach_block (block, &ir->block_list) {
-               foreach_instr_safe (instr, &block->instr_list) {
-                       if (instr->opc == OPC_META_TEX_PREFETCH) {
-                               assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
-                               struct ir3_sampler_prefetch *fetch =
-                                       &ctx->so->sampler_prefetch[idx];
-                               idx++;
-
-                               if (instr->flags & IR3_INSTR_B) {
-                                       fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD;
-                                       /* In bindless mode, the index is actually the base */
-                                       fetch->tex_id = instr->prefetch.tex_base;
-                                       fetch->samp_id = instr->prefetch.samp_base;
-                                       fetch->tex_bindless_id = instr->prefetch.tex;
-                                       fetch->samp_bindless_id = instr->prefetch.samp;
-                               } else {
-                                       fetch->cmd = IR3_SAMPLER_PREFETCH_CMD;
-                                       fetch->tex_id = instr->prefetch.tex;
-                                       fetch->samp_id = instr->prefetch.samp;
-                               }
-                               fetch->wrmask = instr->dsts[0]->wrmask;
-                               fetch->dst = instr->dsts[0]->num;
-                               fetch->src = instr->prefetch.input_offset;
-
-                               /* These are the limits on a5xx/a6xx, we might need to
-                                * revisit if SP_FS_PREFETCH[n] changes on later gens:
-                                */
-                               assert(fetch->dst <= 0x3f);
-                               assert(fetch->tex_id <= 0x1f);
-                               assert(fetch->samp_id < 0xf);
-
-                               ctx->so->total_in =
-                                       MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
-
-                               fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
-
-                               /* Remove the prefetch placeholder instruction: */
-                               list_delinit(&instr->node);
-                       }
-               }
-       }
+   unsigned idx = 0;
+
+   /* Collect sampling instructions eligible for pre-dispatch. */
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->opc == OPC_META_TEX_PREFETCH) {
+            assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
+            struct ir3_sampler_prefetch *fetch =
+               &ctx->so->sampler_prefetch[idx];
+            idx++;
+
+            if (instr->flags & IR3_INSTR_B) {
+               fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD;
+               /* In bindless mode, the index is actually the base */
+               fetch->tex_id = instr->prefetch.tex_base;
+               fetch->samp_id = instr->prefetch.samp_base;
+               fetch->tex_bindless_id = instr->prefetch.tex;
+               fetch->samp_bindless_id = instr->prefetch.samp;
+            } else {
+               fetch->cmd = IR3_SAMPLER_PREFETCH_CMD;
+               fetch->tex_id = instr->prefetch.tex;
+               fetch->samp_id = instr->prefetch.samp;
+            }
+            fetch->wrmask = instr->dsts[0]->wrmask;
+            fetch->dst = instr->dsts[0]->num;
+            fetch->src = instr->prefetch.input_offset;
+
+            /* These are the limits on a5xx/a6xx, we might need to
+             * revisit if SP_FS_PREFETCH[n] changes on later gens:
+             */
+            assert(fetch->dst <= 0x3f);
+            assert(fetch->tex_id <= 0x1f);
+            assert(fetch->samp_id < 0xf);
+
+            ctx->so->total_in =
+               MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
+
+            fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
+
+            /* Remove the prefetch placeholder instruction: */
+            list_delinit(&instr->node);
+         }
+      }
+   }
 }
 
 int
 ir3_compile_shader_nir(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so)
+                       struct ir3_shader_variant *so)
 {
-       struct ir3_context *ctx;
-       struct ir3 *ir;
-       int ret = 0, max_bary;
-       bool progress;
-
-       assert(!so->ir);
-
-       ctx = ir3_context_init(compiler, so);
-       if (!ctx) {
-               DBG("INIT failed!");
-               ret = -1;
-               goto out;
-       }
-
-       emit_instructions(ctx);
-
-       if (ctx->error) {
-               DBG("EMIT failed!");
-               ret = -1;
-               goto out;
-       }
-
-       ir = so->ir = ctx->ir;
-
-       /* Vertex shaders in a tessellation or geometry pipeline treat END as a
-        * NOP and have an epilogue that writes the VS outputs to local storage, to
-        * be read by the HS.  Then it resets execution mask (chmask) and chains
-        * to the next shader (chsh). There are also a few output values which we
-        * must send to the next stage via registers, and in order for both stages
-        * to agree on the register used we must force these to be in specific
-        * registers.
-        */
-       if ((so->type == MESA_SHADER_VERTEX &&
-                               (so->key.has_gs || so->key.tessellation)) ||
-                       (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
-               struct ir3_instruction *outputs[3];
-               unsigned outidxs[3];
-               unsigned regids[3];
-               unsigned outputs_count = 0;
-
-               if (ctx->primitive_id) {
-                       unsigned n = so->outputs_count++;
-                       so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
-
-                       struct ir3_instruction *out = ir3_collect(ctx, ctx->primitive_id);
-                       outputs[outputs_count] = out;
-                       outidxs[outputs_count] = n;
-                       regids[outputs_count] = regid(0, 1);
-                       outputs_count++;
-               }
-
-               if (ctx->gs_header) {
-                       unsigned n = so->outputs_count++;
-                       so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
-                       struct ir3_instruction *out = ir3_collect(ctx, ctx->gs_header);
-                       outputs[outputs_count] = out;
-                       outidxs[outputs_count] = n;
-                       regids[outputs_count] = regid(0, 0);
-                       outputs_count++;
-               }
-
-               if (ctx->tcs_header) {
-                       unsigned n = so->outputs_count++;
-                       so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
-                       struct ir3_instruction *out = ir3_collect(ctx, ctx->tcs_header);
-                       outputs[outputs_count] = out;
-                       outidxs[outputs_count] = n;
-                       regids[outputs_count] = regid(0, 0);
-                       outputs_count++;
-               }
-
-               struct ir3_instruction *chmask =
-                       ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
-               chmask->barrier_class = IR3_BARRIER_EVERYTHING;
-               chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
-
-               for (unsigned i = 0; i < outputs_count; i++)
-                       __ssa_src(chmask, outputs[i], 0)->num = regids[i];
-
-               chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
-               memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
-
-               array_insert(ctx->block, ctx->block->keeps, chmask);
-
-               struct ir3_instruction *chsh =
-                       ir3_CHSH(ctx->block);
-               chsh->barrier_class = IR3_BARRIER_EVERYTHING;
-               chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
-       } else {
-               assert((ctx->noutputs % 4) == 0);
-               unsigned outidxs[ctx->noutputs / 4];
-               struct ir3_instruction *outputs[ctx->noutputs / 4];
-               unsigned outputs_count = 0;
-
-               struct ir3_block *old_block = ctx->block;
-               /* Insert these collect's in the block before the end-block if
-                * possible, so that any moves they generate can be shuffled around to
-                * reduce nop's:
-                */
-               if (ctx->block->predecessors_count == 1)
-                       ctx->block = ctx->block->predecessors[0];
-
-
-               /* Setup IR level outputs, which are "collects" that gather
-                * the scalar components of outputs.
-                */
-               for (unsigned i = 0; i < ctx->noutputs; i += 4) {
-                       unsigned ncomp = 0;
-                       /* figure out the # of components written:
-                        *
-                        * TODO do we need to handle holes, ie. if .x and .z
-                        * components written, but .y component not written?
-                        */
-                       for (unsigned j = 0; j < 4; j++) {
-                               if (!ctx->outputs[i + j])
-                                       break;
-                               ncomp++;
-                       }
-
-                       /* Note that in some stages, like TCS, store_output is
-                        * lowered to memory writes, so no components of the output
-                        * are "written" from the PoV of traditional store-
-                        * output instructions:
-                        */
-                       if (!ncomp)
-                               continue;
-
-                       struct ir3_instruction *out =
-                               ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
-
-                       int outidx = i / 4;
-                       assert(outidx < so->outputs_count);
-
-                       outidxs[outputs_count] = outidx;
-                       outputs[outputs_count] = out;
-                       outputs_count++;
-               }
-
-               /* for a6xx+, binning and draw pass VS use same VBO state, so we
-                * need to make sure not to remove any inputs that are used by
-                * the nonbinning VS.
-                */
-               if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
-                               so->type == MESA_SHADER_VERTEX) {
-                       for (int i = 0; i < ctx->ninputs; i++) {
-                               struct ir3_instruction *in = ctx->inputs[i];
-
-                               if (!in)
-                                       continue;
-
-                               unsigned n = i / 4;
-                               unsigned c = i % 4;
-
-                               debug_assert(n < so->nonbinning->inputs_count);
-
-                               if (so->nonbinning->inputs[n].sysval)
-                                       continue;
-
-                               /* be sure to keep inputs, even if only used in VS */
-                               if (so->nonbinning->inputs[n].compmask & (1 << c))
-                                       array_insert(in->block, in->block->keeps, in);
-                       }
-               }
-
-               ctx->block = old_block;
-
-               struct ir3_instruction *end = ir3_instr_create(ctx->block, OPC_END,
-                               0, outputs_count);
-
-               for (unsigned i = 0; i < outputs_count; i++) {
-                       __ssa_src(end, outputs[i], 0);
-               }
-
-               end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
-               memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
-
-               array_insert(ctx->block, ctx->block->keeps, end);
-
-               /* at this point, for binning pass, throw away unneeded outputs: */
-               if (so->binning_pass && (ctx->compiler->gpu_id < 600))
-                       fixup_binning_pass(ctx, end);
-
-       }
-
-
-       ir3_debug_print(ir, "AFTER: nir->ir3");
-       ir3_validate(ir);
-
-       IR3_PASS(ir, ir3_array_to_ssa);
-
-       do {
-               progress = false;
-
-               progress |= IR3_PASS(ir, ir3_cf);
-               progress |= IR3_PASS(ir, ir3_cp, so);
-               progress |= IR3_PASS(ir, ir3_cse);
-               progress |= IR3_PASS(ir, ir3_dce, so);
-       } while (progress);
-
-       /* at this point, for binning pass, throw away unneeded outputs:
-        * Note that for a6xx and later, we do this after ir3_cp to ensure
-        * that the uniform/constant layout for BS and VS matches, so that
-        * we can re-use same VS_CONST state group.
-        */
-       if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
-               fixup_binning_pass(ctx, find_end(ctx->so->ir));
-               /* cleanup the result of removing unneeded outputs: */
-               while (IR3_PASS(ir, ir3_dce, so)) {}
-       }
-
-       IR3_PASS(ir, ir3_sched_add_deps);
-
-       /* At this point, all the dead code should be long gone: */
-       assert(!IR3_PASS(ir, ir3_dce, so));
-
-       ret = ir3_sched(ir);
-       if (ret) {
-               DBG("SCHED failed!");
-               goto out;
-       }
-
-       ir3_debug_print(ir, "AFTER: ir3_sched");
-
-       if (IR3_PASS(ir, ir3_cp_postsched)) {
-               /* cleanup the result of removing unneeded mov's: */
-               while (IR3_PASS(ir, ir3_dce, so)) {}
-       }
-
-       /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
-        * with draw pass VS, so binning and draw pass can both use the
-        * same VBO state.
-        *
-        * Note that VS inputs are expected to be full precision.
-        */
-       bool pre_assign_inputs = (ir->compiler->gpu_id >= 600) &&
-                       (ir->type == MESA_SHADER_VERTEX) &&
-                       so->binning_pass;
-
-       if (pre_assign_inputs) {
-               foreach_input (in, ir) {
-                       assert(in->opc == OPC_META_INPUT);
-                       unsigned inidx = in->input.inidx;
-
-                       in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
-               }
-       } else if (ctx->tcs_header) {
-               /* We need to have these values in the same registers between VS and TCS
-                * since the VS chains to TCS and doesn't get the sysvals redelivered.
-                */
-
-               ctx->tcs_header->dsts[0]->num = regid(0, 0);
-               ctx->primitive_id->dsts[0]->num = regid(0, 1);
-       } else if (ctx->gs_header) {
-               /* We need to have these values in the same registers between producer
-                * (VS or DS) and GS since the producer chains to GS and doesn't get
-                * the sysvals redelivered.
-                */
-
-               ctx->gs_header->dsts[0]->num = regid(0, 0);
-               ctx->primitive_id->dsts[0]->num = regid(0, 1);
-       } else if (so->num_sampler_prefetch) {
-               assert(so->type == MESA_SHADER_FRAGMENT);
-               int idx = 0;
-
-               foreach_input (instr, ir) {
-                       if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
-                               continue;
-
-                       assert(idx < 2);
-                       instr->dsts[0]->num = idx;
-                       idx++;
-               }
-       }
-
-       ret = ir3_ra(so);
-
-       if (ret) {
-               mesa_loge("ir3_ra() failed!");
-               goto out;
-       }
-
-       IR3_PASS(ir, ir3_postsched, so);
-
-       IR3_PASS(ir, ir3_lower_subgroups);
-
-       if (so->type == MESA_SHADER_FRAGMENT)
-               pack_inlocs(ctx);
-
-       /*
-        * Fixup inputs/outputs to point to the actual registers assigned:
-        *
-        * 1) initialize to r63.x (invalid/unused)
-        * 2) iterate IR level inputs/outputs and update the variants
-        *    inputs/outputs table based on the assigned registers for
-        *    the remaining inputs/outputs.
-        */
-
-       for (unsigned i = 0; i < so->inputs_count; i++)
-               so->inputs[i].regid = INVALID_REG;
-       for (unsigned i = 0; i < so->outputs_count; i++)
-               so->outputs[i].regid = INVALID_REG;
-
-       struct ir3_instruction *end = find_end(so->ir);
-
-       for (unsigned i = 0; i < end->srcs_count; i++) {
-               unsigned outidx = end->end.outidxs[i];
-               struct ir3_register *reg = end->srcs[i];
-
-               so->outputs[outidx].regid = reg->num;
-               so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
-       }
-
-       foreach_input (in, ir) {
-               assert(in->opc == OPC_META_INPUT);
-               unsigned inidx = in->input.inidx;
-
-               if (pre_assign_inputs && !so->inputs[inidx].sysval) {
-                       if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
-                               compile_assert(ctx, in->dsts[0]->num ==
-                                               so->nonbinning->inputs[inidx].regid);
-                               compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
-                                               so->nonbinning->inputs[inidx].half);
-                       }
-                       so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
-                       so->inputs[inidx].half  = so->nonbinning->inputs[inidx].half;
-               } else {
-                       so->inputs[inidx].regid = in->dsts[0]->num;
-                       so->inputs[inidx].half  = !!(in->dsts[0]->flags & IR3_REG_HALF);
-               }
-       }
-
-       if (ctx->astc_srgb)
-               fixup_astc_srgb(ctx);
-
-       /* We need to do legalize after (for frag shader's) the "bary.f"
-        * offsets (inloc) have been assigned.
-        */
-       IR3_PASS(ir, ir3_legalize, so, &max_bary);
-
-       /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
-        * know what we might have to wait on when coming in from VS chsh.
-        */
-       if (so->type == MESA_SHADER_TESS_CTRL ||
-               so->type == MESA_SHADER_GEOMETRY ) {
-               foreach_block (block, &ir->block_list) {
-                       foreach_instr (instr, &block->instr_list) {
-                               instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-                               break;
-                       }
-               }
-       }
-
-       so->branchstack = ctx->max_stack;
-
-       /* Note that actual_in counts inputs that are not bary.f'd for FS: */
-       if (so->type == MESA_SHADER_FRAGMENT)
-               so->total_in = max_bary + 1;
-
-       /* Collect sampling instructions eligible for pre-dispatch. */
-       collect_tex_prefetches(ctx, ir);
-
-       if (so->type == MESA_SHADER_FRAGMENT &&
-                       ctx->s->info.fs.needs_quad_helper_invocations)
-               so->need_pixlod = true;
-
-       if (so->type == MESA_SHADER_COMPUTE) {
-               so->local_size[0] = ctx->s->info.workgroup_size[0];
-               so->local_size[1] = ctx->s->info.workgroup_size[1];
-               so->local_size[2] = ctx->s->info.workgroup_size[2];
-               so->local_size_variable = ctx->s->info.workgroup_size_variable;
-       }
+   struct ir3_context *ctx;
+   struct ir3 *ir;
+   int ret = 0, max_bary;
+   bool progress;
+
+   assert(!so->ir);
+
+   ctx = ir3_context_init(compiler, so);
+   if (!ctx) {
+      DBG("INIT failed!");
+      ret = -1;
+      goto out;
+   }
+
+   emit_instructions(ctx);
+
+   if (ctx->error) {
+      DBG("EMIT failed!");
+      ret = -1;
+      goto out;
+   }
+
+   ir = so->ir = ctx->ir;
+
+   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
+    * NOP and have an epilogue that writes the VS outputs to local storage, to
+    * be read by the HS.  Then they reset the execution mask (chmask) and chain
+    * to the next shader (chsh). There are also a few output values which we
+    * must send to the next stage via registers, and in order for both stages
+    * to agree on the register used we must force these to be in specific
+    * registers.
+    */
+   if ((so->type == MESA_SHADER_VERTEX &&
+        (so->key.has_gs || so->key.tessellation)) ||
+       (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
+      struct ir3_instruction *outputs[3];
+      unsigned outidxs[3];
+      unsigned regids[3];
+      unsigned outputs_count = 0;
+
+      if (ctx->primitive_id) {
+         unsigned n = so->outputs_count++;
+         so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
+
+         struct ir3_instruction *out = ir3_collect(ctx, ctx->primitive_id);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         regids[outputs_count] = regid(0, 1);
+         outputs_count++;
+      }
+
+      if (ctx->gs_header) {
+         unsigned n = so->outputs_count++;
+         so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
+         struct ir3_instruction *out = ir3_collect(ctx, ctx->gs_header);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         regids[outputs_count] = regid(0, 0);
+         outputs_count++;
+      }
+
+      if (ctx->tcs_header) {
+         unsigned n = so->outputs_count++;
+         so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
+         struct ir3_instruction *out = ir3_collect(ctx, ctx->tcs_header);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         regids[outputs_count] = regid(0, 0);
+         outputs_count++;
+      }
+
+      struct ir3_instruction *chmask =
+         ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
+      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
+      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
+
+      for (unsigned i = 0; i < outputs_count; i++)
+         __ssa_src(chmask, outputs[i], 0)->num = regids[i];
+
+      chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
+      memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+      array_insert(ctx->block, ctx->block->keeps, chmask);
+
+      struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
+      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
+      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
+   } else {
+      assert((ctx->noutputs % 4) == 0);
+      unsigned outidxs[ctx->noutputs / 4];
+      struct ir3_instruction *outputs[ctx->noutputs / 4];
+      unsigned outputs_count = 0;
+
+      struct ir3_block *old_block = ctx->block;
+      /* Insert these collect's in the block before the end-block if
+       * possible, so that any moves they generate can be shuffled around to
+       * reduce nop's:
+       */
+      if (ctx->block->predecessors_count == 1)
+         ctx->block = ctx->block->predecessors[0];
+
+      /* Setup IR level outputs, which are "collects" that gather
+       * the scalar components of outputs.
+       */
+      for (unsigned i = 0; i < ctx->noutputs; i += 4) {
+         unsigned ncomp = 0;
+         /* figure out the # of components written:
+          *
+          * TODO do we need to handle holes, ie. if .x and .z
+          * components written, but .y component not written?
+          */
+         for (unsigned j = 0; j < 4; j++) {
+            if (!ctx->outputs[i + j])
+               break;
+            ncomp++;
+         }
+
+         /* Note that in some stages, like TCS, store_output is
+          * lowered to memory writes, so no components of the output
+          * are "written" from the PoV of traditional store-output
+          * instructions:
+          */
+         if (!ncomp)
+            continue;
+
+         struct ir3_instruction *out =
+            ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
+
+         int outidx = i / 4;
+         assert(outidx < so->outputs_count);
+
+         outidxs[outputs_count] = outidx;
+         outputs[outputs_count] = out;
+         outputs_count++;
+      }
+
+      /* for a6xx+, binning and draw pass VS use the same VBO state, so we
+       * need to make sure not to remove any inputs that are used by
+       * the nonbinning VS.
+       */
+      if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
+          so->type == MESA_SHADER_VERTEX) {
+         for (int i = 0; i < ctx->ninputs; i++) {
+            struct ir3_instruction *in = ctx->inputs[i];
+
+            if (!in)
+               continue;
+
+            unsigned n = i / 4;
+            unsigned c = i % 4;
+
+            debug_assert(n < so->nonbinning->inputs_count);
+
+            if (so->nonbinning->inputs[n].sysval)
+               continue;
+
+            /* be sure to keep inputs, even if only used in VS */
+            if (so->nonbinning->inputs[n].compmask & (1 << c))
+               array_insert(in->block, in->block->keeps, in);
+         }
+      }
+
+      ctx->block = old_block;
+
+      struct ir3_instruction *end =
+         ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);
+
+      for (unsigned i = 0; i < outputs_count; i++) {
+         __ssa_src(end, outputs[i], 0);
+      }
+
+      end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
+      memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+      array_insert(ctx->block, ctx->block->keeps, end);
+
+      /* at this point, for binning pass, throw away unneeded outputs: */
+      if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+         fixup_binning_pass(ctx, end);
+   }
+
+   ir3_debug_print(ir, "AFTER: nir->ir3");
+   ir3_validate(ir);
+
+   IR3_PASS(ir, ir3_array_to_ssa);
+
+   do {
+      progress = false;
+
+      progress |= IR3_PASS(ir, ir3_cf);
+      progress |= IR3_PASS(ir, ir3_cp, so);
+      progress |= IR3_PASS(ir, ir3_cse);
+      progress |= IR3_PASS(ir, ir3_dce, so);
+   } while (progress);
+
+   /* at this point, for binning pass, throw away unneeded outputs:
+    * Note that for a6xx and later, we do this after ir3_cp to ensure
+    * that the uniform/constant layout for BS and VS matches, so that
+    * we can re-use the same VS_CONST state group.
+    */
+   if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
+      fixup_binning_pass(ctx, find_end(ctx->so->ir));
+      /* cleanup the result of removing unneeded outputs: */
+      while (IR3_PASS(ir, ir3_dce, so)) {
+      }
+   }
+
+   IR3_PASS(ir, ir3_sched_add_deps);
+
+   /* At this point, all the dead code should be long gone: */
+   assert(!IR3_PASS(ir, ir3_dce, so));
+
+   ret = ir3_sched(ir);
+   if (ret) {
+      DBG("SCHED failed!");
+      goto out;
+   }
+
+   ir3_debug_print(ir, "AFTER: ir3_sched");
+
+   if (IR3_PASS(ir, ir3_cp_postsched)) {
+      /* cleanup the result of removing unneeded mov's: */
+      while (IR3_PASS(ir, ir3_dce, so)) {
+      }
+   }
+
+   /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
+    * with draw pass VS, so binning and draw pass can both use the
+    * same VBO state.
+    *
+    * Note that VS inputs are expected to be full precision.
+    */
+   bool pre_assign_inputs = (ir->compiler->gpu_id >= 600) &&
+                            (ir->type == MESA_SHADER_VERTEX) &&
+                            so->binning_pass;
+
+   if (pre_assign_inputs) {
+      foreach_input (in, ir) {
+         assert(in->opc == OPC_META_INPUT);
+         unsigned inidx = in->input.inidx;
+
+         in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
+      }
+   } else if (ctx->tcs_header) {
+      /* We need to have these values in the same registers between VS and TCS
+       * since the VS chains to TCS and doesn't get the sysvals redelivered.
+       */
+
+      ctx->tcs_header->dsts[0]->num = regid(0, 0);
+      ctx->primitive_id->dsts[0]->num = regid(0, 1);
+   } else if (ctx->gs_header) {
+      /* We need to have these values in the same registers between producer
+       * (VS or DS) and GS since the producer chains to GS and doesn't get
+       * the sysvals redelivered.
+       */
+
+      ctx->gs_header->dsts[0]->num = regid(0, 0);
+      ctx->primitive_id->dsts[0]->num = regid(0, 1);
+   } else if (so->num_sampler_prefetch) {
+      assert(so->type == MESA_SHADER_FRAGMENT);
+      int idx = 0;
+
+      foreach_input (instr, ir) {
+         if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
+            continue;
+
+         assert(idx < 2);
+         instr->dsts[0]->num = idx;
+         idx++;
+      }
+   }
+
+   ret = ir3_ra(so);
+
+   if (ret) {
+      mesa_loge("ir3_ra() failed!");
+      goto out;
+   }
+
+   IR3_PASS(ir, ir3_postsched, so);
+
+   IR3_PASS(ir, ir3_lower_subgroups);
+
+   if (so->type == MESA_SHADER_FRAGMENT)
+      pack_inlocs(ctx);
+
+   /*
+    * Fixup inputs/outputs to point to the actual registers assigned:
+    *
+    * 1) initialize to r63.x (invalid/unused)
+    * 2) iterate IR level inputs/outputs and update the variants
+    *    inputs/outputs table based on the assigned registers for
+    *    the remaining inputs/outputs.
+    */
+
+   for (unsigned i = 0; i < so->inputs_count; i++)
+      so->inputs[i].regid = INVALID_REG;
+   for (unsigned i = 0; i < so->outputs_count; i++)
+      so->outputs[i].regid = INVALID_REG;
+
+   struct ir3_instruction *end = find_end(so->ir);
+
+   for (unsigned i = 0; i < end->srcs_count; i++) {
+      unsigned outidx = end->end.outidxs[i];
+      struct ir3_register *reg = end->srcs[i];
+
+      so->outputs[outidx].regid = reg->num;
+      so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
+   }
+
+   foreach_input (in, ir) {
+      assert(in->opc == OPC_META_INPUT);
+      unsigned inidx = in->input.inidx;
+
+      if (pre_assign_inputs && !so->inputs[inidx].sysval) {
+         if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
+            compile_assert(
+               ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
+            compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
+                                   so->nonbinning->inputs[inidx].half);
+         }
+         so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
+         so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
+      } else {
+         so->inputs[inidx].regid = in->dsts[0]->num;
+         so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
+      }
+   }
+
+   if (ctx->astc_srgb)
+      fixup_astc_srgb(ctx);
+
+   /* We need to run legalize after the "bary.f" offsets (inloc) have been
+    * assigned (for frag shaders).
+    */
+   IR3_PASS(ir, ir3_legalize, so, &max_bary);
+
+   /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
+    * know what we might have to wait on when coming in from VS chsh.
+    */
+   if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
+      foreach_block (block, &ir->block_list) {
+         foreach_instr (instr, &block->instr_list) {
+            instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+            break;
+         }
+      }
+   }
+
+   so->branchstack = ctx->max_stack;
+
+   /* Note that actual_in counts inputs that are not bary.f'd for FS: */
+   if (so->type == MESA_SHADER_FRAGMENT)
+      so->total_in = max_bary + 1;
+
+   /* Collect sampling instructions eligible for pre-dispatch. */
+   collect_tex_prefetches(ctx, ir);
+
+   if (so->type == MESA_SHADER_FRAGMENT &&
+       ctx->s->info.fs.needs_quad_helper_invocations)
+      so->need_pixlod = true;
+
+   if (so->type == MESA_SHADER_COMPUTE) {
+      so->local_size[0] = ctx->s->info.workgroup_size[0];
+      so->local_size[1] = ctx->s->info.workgroup_size[1];
+      so->local_size[2] = ctx->s->info.workgroup_size[2];
+      so->local_size_variable = ctx->s->info.workgroup_size_variable;
+   }
 
 out:
-       if (ret) {
-               if (so->ir)
-                       ir3_destroy(so->ir);
-               so->ir = NULL;
-       }
-       ir3_context_free(ctx);
-
-       return ret;
+   if (ret) {
+      if (so->ir)
+         ir3_destroy(so->ir);
+      so->ir = NULL;
+   }
+   ir3_context_free(ctx);
+
+   return ret;
 }
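
The optimization loop above (ir3_cf / ir3_cp / ir3_cse / ir3_dce) runs to a
fixed point: each pass reports whether it changed anything, and the loop
repeats until a whole round makes no progress.  A minimal standalone sketch of
that pattern, with toy passes standing in for the real IR3_PASS machinery (all
names below are illustrative, not ir3 API):

#include <stdbool.h>
#include <stdio.h>

struct toy_ir {
   int value;
};

/* each "pass" returns true iff it changed something, like IR3_PASS(): */
static bool
toy_pass_halve(struct toy_ir *ir)
{
   if (ir->value > 1) {
      ir->value /= 2;
      return true;
   }
   return false;
}

static bool
toy_pass_make_even(struct toy_ir *ir)
{
   if ((ir->value & 1) && ir->value > 1) {
      ir->value--;
      return true;
   }
   return false;
}

int
main(void)
{
   struct toy_ir ir = {.value = 37};
   bool progress;

   /* run every pass repeatedly until a full round makes no progress: */
   do {
      progress = false;
      progress |= toy_pass_halve(&ir);
      progress |= toy_pass_make_even(&ir);
   } while (progress);

   printf("fixed point reached at %d\n", ir.value);
   return 0;
}
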
index 4e3d9fc..e534143 100644 (file)
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-#include "ir3_compiler.h"
 #include "ir3_context.h"
+#include "ir3_compiler.h"
 #include "ir3_image.h"
-#include "ir3_shader.h"
 #include "ir3_nir.h"
+#include "ir3_shader.h"
 
 struct ir3_context *
-ir3_context_init(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so)
+ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader_variant *so)
 {
-       struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);
-
-       if (compiler->gpu_id >= 400) {
-               if (so->type == MESA_SHADER_VERTEX) {
-                       ctx->astc_srgb = so->key.vastc_srgb;
-               } else if (so->type == MESA_SHADER_FRAGMENT) {
-                       ctx->astc_srgb = so->key.fastc_srgb;
-               }
-
-       } else {
-               if (so->type == MESA_SHADER_VERTEX) {
-                       ctx->samples = so->key.vsamples;
-               } else if (so->type == MESA_SHADER_FRAGMENT) {
-                       ctx->samples = so->key.fsamples;
-               }
-       }
-
-       if (compiler->gpu_id >= 600) {
-               ctx->funcs = &ir3_a6xx_funcs;
-       } else if (compiler->gpu_id >= 400) {
-               ctx->funcs = &ir3_a4xx_funcs;
-       }
-
-       ctx->compiler = compiler;
-       ctx->so = so;
-       ctx->def_ht = _mesa_hash_table_create(ctx,
-                       _mesa_hash_pointer, _mesa_key_pointer_equal);
-       ctx->block_ht = _mesa_hash_table_create(ctx,
-                       _mesa_hash_pointer, _mesa_key_pointer_equal);
-       ctx->continue_block_ht = _mesa_hash_table_create(ctx,
-                       _mesa_hash_pointer, _mesa_key_pointer_equal);
-       ctx->sel_cond_conversions = _mesa_hash_table_create(ctx,
-                       _mesa_hash_pointer, _mesa_key_pointer_equal);
-
-       /* TODO: maybe generate some sort of bitmask of what key
-        * lowers vs what shader has (ie. no need to lower
-        * texture clamp lowering if no texture sample instrs)..
-        * although should be done further up the stack to avoid
-        * creating duplicate variants..
-        */
-
-       ctx->s = nir_shader_clone(ctx, so->shader->nir);
-       ir3_nir_lower_variant(so, ctx->s);
-
-       /* this needs to be the last pass run, so do this here instead of
-        * in ir3_optimize_nir():
-        */
-       bool progress = false;
-       NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs);
-
-       /* we could need cleanup after lower_locals_to_regs */
-       while (progress) {
-               progress = false;
-               NIR_PASS(progress, ctx->s, nir_opt_algebraic);
-               NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
-       }
-
-       /* We want to lower nir_op_imul as late as possible, to catch also
-        * those generated by earlier passes (e.g, nir_lower_locals_to_regs).
-        * However, we want a final swing of a few passes to have a chance
-        * at optimizing the result.
-        */
-       progress = false;
-       NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
-       while (progress) {
-               progress = false;
-               NIR_PASS(progress, ctx->s, nir_opt_algebraic);
-               NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars);
-               NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars);
-               NIR_PASS(progress, ctx->s, nir_opt_dce);
-               NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
-       }
-
-       /* Enable the texture pre-fetch feature only a4xx onwards.  But
-        * only enable it on generations that have been tested:
-        */
-       if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
-               NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
-
-       NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true);
-
-       /* Super crude heuristic to limit # of tex prefetch in small
-        * shaders.  This completely ignores loops.. but that's really
-        * not the worst of it's problems.  (A frag shader that has
-        * loops is probably going to be big enough to not trigger a
-        * lower threshold.)
-        *
-        *   1) probably want to do this in terms of ir3 instructions
-        *   2) probably really want to decide this after scheduling
-        *      (or at least pre-RA sched) so we have a rough idea about
-        *      nops, and don't count things that get cp'd away
-        *   3) blob seems to use higher thresholds with a mix of more
-        *      SFU instructions.  Which partly makes sense, more SFU
-        *      instructions probably means you want to get the real
-        *      shader started sooner, but that considers where in the
-        *      shader the SFU instructions are, which blob doesn't seem
-        *      to do.
-        *
-        * This uses more conservative thresholds assuming a more alu
-        * than sfu heavy instruction mix.
-        */
-       if (so->type == MESA_SHADER_FRAGMENT) {
-               nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
-
-               unsigned instruction_count = 0;
-               nir_foreach_block (block, fxn) {
-                       instruction_count += exec_list_length(&block->instr_list);
-               }
-
-               if (instruction_count < 50) {
-                       ctx->prefetch_limit = 2;
-               } else if (instruction_count < 70) {
-                       ctx->prefetch_limit = 3;
-               } else {
-                       ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
-               }
-       }
-
-       if (shader_debug_enabled(so->type)) {
-               mesa_logi("NIR (final form) for %s shader %s:",
-                       ir3_shader_stage(so), so->shader->nir->info.name);
-               nir_log_shaderi(ctx->s);
-       }
-
-       ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);
-
-       return ctx;
+   struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);
+
+   if (compiler->gpu_id >= 400) {
+      if (so->type == MESA_SHADER_VERTEX) {
+         ctx->astc_srgb = so->key.vastc_srgb;
+      } else if (so->type == MESA_SHADER_FRAGMENT) {
+         ctx->astc_srgb = so->key.fastc_srgb;
+      }
+
+   } else {
+      if (so->type == MESA_SHADER_VERTEX) {
+         ctx->samples = so->key.vsamples;
+      } else if (so->type == MESA_SHADER_FRAGMENT) {
+         ctx->samples = so->key.fsamples;
+      }
+   }
+
+   if (compiler->gpu_id >= 600) {
+      ctx->funcs = &ir3_a6xx_funcs;
+   } else if (compiler->gpu_id >= 400) {
+      ctx->funcs = &ir3_a4xx_funcs;
+   }
+
+   ctx->compiler = compiler;
+   ctx->so = so;
+   ctx->def_ht =
+      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   ctx->block_ht =
+      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   ctx->continue_block_ht =
+      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   ctx->sel_cond_conversions =
+      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
+
+   /* TODO: maybe generate some sort of bitmask of what key
+    * lowers vs what the shader has (ie. no need to run
+    * texture clamp lowering if there are no texture sample instrs)..
+    * although should be done further up the stack to avoid
+    * creating duplicate variants..
+    */
+
+   ctx->s = nir_shader_clone(ctx, so->shader->nir);
+   ir3_nir_lower_variant(so, ctx->s);
+
+   /* this needs to be the last pass run, so do this here instead of
+    * in ir3_optimize_nir():
+    */
+   bool progress = false;
+   NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs);
+
+   /* we could need cleanup after lower_locals_to_regs */
+   while (progress) {
+      progress = false;
+      NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+      NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
+   }
+
+   /* We want to lower nir_op_imul as late as possible, to also catch
+    * those generated by earlier passes (e.g., nir_lower_locals_to_regs).
+    * However, we want a final swing of a few passes to have a chance
+    * at optimizing the result.
+    */
+   progress = false;
+   NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
+   while (progress) {
+      progress = false;
+      NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+      NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars);
+      NIR_PASS(progress, ctx->s, nir_opt_dce);
+      NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
+   }
+
+   /* Enable the texture pre-fetch feature only on a4xx onwards.  But
+    * only enable it on generations that have been tested:
+    */
+   if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
+      NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
+
+   NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true);
+
+   /* Super crude heuristic to limit # of tex prefetch in small
+    * shaders.  This completely ignores loops.. but that's really
+    * not the worst of its problems.  (A frag shader that has
+    * loops is probably going to be big enough to not trigger a
+    * lower threshold.)
+    *
+    *   1) probably want to do this in terms of ir3 instructions
+    *   2) probably really want to decide this after scheduling
+    *      (or at least pre-RA sched) so we have a rough idea about
+    *      nops, and don't count things that get cp'd away
+    *   3) blob seems to use higher thresholds with a mix of more
+    *      SFU instructions.  Which partly makes sense, more SFU
+    *      instructions probably means you want to get the real
+    *      shader started sooner, but that considers where in the
+    *      shader the SFU instructions are, which blob doesn't seem
+    *      to do.
+    *
+    * This uses more conservative thresholds, assuming an instruction
+    * mix that is more alu-heavy than sfu-heavy.
+    */
+   if (so->type == MESA_SHADER_FRAGMENT) {
+      nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
+
+      unsigned instruction_count = 0;
+      nir_foreach_block (block, fxn) {
+         instruction_count += exec_list_length(&block->instr_list);
+      }
+
+      if (instruction_count < 50) {
+         ctx->prefetch_limit = 2;
+      } else if (instruction_count < 70) {
+         ctx->prefetch_limit = 3;
+      } else {
+         ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
+      }
+   }
+
+   if (shader_debug_enabled(so->type)) {
+      mesa_logi("NIR (final form) for %s shader %s:", ir3_shader_stage(so),
+                so->shader->nir->info.name);
+      nir_log_shaderi(ctx->s);
+   }
+
+   ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);
+
+   return ctx;
 }
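
The prefetch_limit values chosen in ir3_context_init() come from a simple pair
of instruction-count thresholds (50 and 70).  A standalone sketch of the same
decision; TOY_MAX_PREFETCH is a stand-in for IR3_MAX_SAMPLER_PREFETCH, whose
real value is defined in the ir3 headers:

#include <stdio.h>

/* stand-in for IR3_MAX_SAMPLER_PREFETCH, whose real value lives in ir3: */
#define TOY_MAX_PREFETCH 4

/* same thresholds as ir3_context_init(): small fragment shaders get
 * fewer pre-dispatched texture fetches:
 */
static unsigned
toy_prefetch_limit(unsigned instruction_count)
{
   if (instruction_count < 50)
      return 2;
   else if (instruction_count < 70)
      return 3;
   else
      return TOY_MAX_PREFETCH;
}

int
main(void)
{
   unsigned counts[] = {20, 60, 200};

   for (unsigned i = 0; i < 3; i++)
      printf("%u instructions -> prefetch limit %u\n", counts[i],
             toy_prefetch_limit(counts[i]));
   return 0;
}
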
 
 void
 ir3_context_free(struct ir3_context *ctx)
 {
-       ralloc_free(ctx);
+   ralloc_free(ctx);
 }
 
 /*
@@ -179,178 +178,178 @@ ir3_context_free(struct ir3_context *ctx)
 struct ir3_instruction **
 ir3_get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n)
 {
-       struct ir3_instruction **value =
-               ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
-       _mesa_hash_table_insert(ctx->def_ht, dst, value);
-       return value;
+   struct ir3_instruction **value =
+      ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
+   _mesa_hash_table_insert(ctx->def_ht, dst, value);
+   return value;
 }
 
 struct ir3_instruction **
 ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
 {
-       struct ir3_instruction **value;
-
-       if (dst->is_ssa) {
-               value = ir3_get_dst_ssa(ctx, &dst->ssa, n);
-       } else {
-               value = ralloc_array(ctx, struct ir3_instruction *, n);
-       }
-
-       /* NOTE: in non-ssa case, we don't really need to store last_dst
-        * but this helps us catch cases where put_dst() call is forgotten
-        */
-       compile_assert(ctx, !ctx->last_dst);
-       ctx->last_dst = value;
-       ctx->last_dst_n = n;
-
-       return value;
+   struct ir3_instruction **value;
+
+   if (dst->is_ssa) {
+      value = ir3_get_dst_ssa(ctx, &dst->ssa, n);
+   } else {
+      value = ralloc_array(ctx, struct ir3_instruction *, n);
+   }
+
+   /* NOTE: in non-ssa case, we don't really need to store last_dst
+    * but this helps us catch cases where a put_dst() call is forgotten
+    */
+   compile_assert(ctx, !ctx->last_dst);
+   ctx->last_dst = value;
+   ctx->last_dst_n = n;
+
+   return value;
 }
 
-struct ir3_instruction * const *
+struct ir3_instruction *const *
 ir3_get_src(struct ir3_context *ctx, nir_src *src)
 {
-       if (src->is_ssa) {
-               struct hash_entry *entry;
-               entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
-               compile_assert(ctx, entry);
-               return entry->data;
-       } else {
-               nir_register *reg = src->reg.reg;
-               struct ir3_array *arr = ir3_get_array(ctx, reg);
-               unsigned num_components = arr->r->num_components;
-               struct ir3_instruction *addr = NULL;
-               struct ir3_instruction **value =
-                       ralloc_array(ctx, struct ir3_instruction *, num_components);
-
-               if (src->reg.indirect)
-                       addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
-                                       reg->num_components);
-
-               for (unsigned i = 0; i < num_components; i++) {
-                       unsigned n = src->reg.base_offset * reg->num_components + i;
-                       compile_assert(ctx, n < arr->length);
-                       value[i] = ir3_create_array_load(ctx, arr, n, addr);
-               }
-
-               return value;
-       }
+   if (src->is_ssa) {
+      struct hash_entry *entry;
+      entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
+      compile_assert(ctx, entry);
+      return entry->data;
+   } else {
+      nir_register *reg = src->reg.reg;
+      struct ir3_array *arr = ir3_get_array(ctx, reg);
+      unsigned num_components = arr->r->num_components;
+      struct ir3_instruction *addr = NULL;
+      struct ir3_instruction **value =
+         ralloc_array(ctx, struct ir3_instruction *, num_components);
+
+      if (src->reg.indirect)
+         addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
+                              reg->num_components);
+
+      for (unsigned i = 0; i < num_components; i++) {
+         unsigned n = src->reg.base_offset * reg->num_components + i;
+         compile_assert(ctx, n < arr->length);
+         value[i] = ir3_create_array_load(ctx, arr, n, addr);
+      }
+
+      return value;
+   }
 }
 
 void
 ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
 {
-       unsigned bit_size = nir_dest_bit_size(*dst);
-
-       /* add extra mov if dst value is shared reg.. in some cases not all
-        * instructions can read from shared regs, in cases where they can
-        * ir3_cp will clean up the extra mov:
-        */
-       for (unsigned i = 0; i < ctx->last_dst_n; i++) {
-               if (!ctx->last_dst[i])
-                       continue;
-               if (ctx->last_dst[i]->dsts[0]->flags & IR3_REG_SHARED) {
-                       ctx->last_dst[i] = ir3_MOV(ctx->block, ctx->last_dst[i], TYPE_U32);
-               }
-       }
-
-       /* Note: 1-bit bools are stored in 32-bit regs */
-       if (bit_size == 16) {
-               for (unsigned i = 0; i < ctx->last_dst_n; i++) {
-                       struct ir3_instruction *dst = ctx->last_dst[i];
-                       ir3_set_dst_type(dst, true);
-                       ir3_fixup_src_type(dst);
-                       if (dst->opc == OPC_META_SPLIT) {
-                               ir3_set_dst_type(ssa(dst->srcs[0]), true);
-                               ir3_fixup_src_type(ssa(dst->srcs[0]));
-                               dst->srcs[0]->flags |= IR3_REG_HALF;
-                       }
-               }
-       }
-
-       if (!dst->is_ssa) {
-               nir_register *reg = dst->reg.reg;
-               struct ir3_array *arr = ir3_get_array(ctx, reg);
-               unsigned num_components = ctx->last_dst_n;
-               struct ir3_instruction *addr = NULL;
-
-               if (dst->reg.indirect)
-                       addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
-                                       reg->num_components);
-
-               for (unsigned i = 0; i < num_components; i++) {
-                       unsigned n = dst->reg.base_offset * reg->num_components + i;
-                       compile_assert(ctx, n < arr->length);
-                       if (!ctx->last_dst[i])
-                               continue;
-                       ir3_create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
-               }
-
-               ralloc_free(ctx->last_dst);
-       }
-
-       ctx->last_dst = NULL;
-       ctx->last_dst_n = 0;
+   unsigned bit_size = nir_dest_bit_size(*dst);
+
+   /* add an extra mov if the dst value is a shared reg.. not all
+    * instructions can read from shared regs, and in the cases where they
+    * can, ir3_cp will clean up the extra mov:
+    */
+   for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+      if (!ctx->last_dst[i])
+         continue;
+      if (ctx->last_dst[i]->dsts[0]->flags & IR3_REG_SHARED) {
+         ctx->last_dst[i] = ir3_MOV(ctx->block, ctx->last_dst[i], TYPE_U32);
+      }
+   }
+
+   /* Note: 1-bit bools are stored in 32-bit regs */
+   if (bit_size == 16) {
+      for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+         struct ir3_instruction *dst = ctx->last_dst[i];
+         ir3_set_dst_type(dst, true);
+         ir3_fixup_src_type(dst);
+         if (dst->opc == OPC_META_SPLIT) {
+            ir3_set_dst_type(ssa(dst->srcs[0]), true);
+            ir3_fixup_src_type(ssa(dst->srcs[0]));
+            dst->srcs[0]->flags |= IR3_REG_HALF;
+         }
+      }
+   }
+
+   if (!dst->is_ssa) {
+      nir_register *reg = dst->reg.reg;
+      struct ir3_array *arr = ir3_get_array(ctx, reg);
+      unsigned num_components = ctx->last_dst_n;
+      struct ir3_instruction *addr = NULL;
+
+      if (dst->reg.indirect)
+         addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
+                              reg->num_components);
+
+      for (unsigned i = 0; i < num_components; i++) {
+         unsigned n = dst->reg.base_offset * reg->num_components + i;
+         compile_assert(ctx, n < arr->length);
+         if (!ctx->last_dst[i])
+            continue;
+         ir3_create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
+      }
+
+      ralloc_free(ctx->last_dst);
+   }
+
+   ctx->last_dst = NULL;
+   ctx->last_dst_n = 0;
 }
 
 static unsigned
 dest_flags(struct ir3_instruction *instr)
 {
-       return instr->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
+   return instr->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
 }
 
 struct ir3_instruction *
 ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
-               unsigned arrsz)
+                   unsigned arrsz)
 {
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *collect;
-
-       if (arrsz == 0)
-               return NULL;
-
-       unsigned flags = dest_flags(arr[0]);
-
-       collect = ir3_instr_create(block, OPC_META_COLLECT, 1, arrsz);
-       __ssa_dst(collect)->flags |= flags;
-       for (unsigned i = 0; i < arrsz; i++) {
-               struct ir3_instruction *elem = arr[i];
-
-               /* Since arrays are pre-colored in RA, we can't assume that
-                * things will end up in the right place.  (Ie. if a collect
-                * joins elements from two different arrays.)  So insert an
-                * extra mov.
-                *
-                * We could possibly skip this if all the collected elements
-                * are contiguous elements in a single array.. not sure how
-                * likely that is to happen.
-                *
-                * Fixes a problem with glamor shaders, that in effect do
-                * something like:
-                *
-                *   if (foo)
-                *     texcoord = ..
-                *   else
-                *     texcoord = ..
-                *   color = texture2D(tex, texcoord);
-                *
-                * In this case, texcoord will end up as nir registers (which
-                * translate to ir3 array's of length 1.  And we can't assume
-                * the two (or more) arrays will get allocated in consecutive
-                * scalar registers.
-                *
-                */
-               if (elem->dsts[0]->flags & IR3_REG_ARRAY) {
-                       type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-                       elem = ir3_MOV(block, elem, type);
-               }
-
-               compile_assert(ctx, dest_flags(elem) == flags);
-               __ssa_src(collect, elem, flags);
-       }
-
-       collect->dsts[0]->wrmask = MASK(arrsz);
-
-       return collect;
+   struct ir3_block *block = ctx->block;
+   struct ir3_instruction *collect;
+
+   if (arrsz == 0)
+      return NULL;
+
+   unsigned flags = dest_flags(arr[0]);
+
+   collect = ir3_instr_create(block, OPC_META_COLLECT, 1, arrsz);
+   __ssa_dst(collect)->flags |= flags;
+   for (unsigned i = 0; i < arrsz; i++) {
+      struct ir3_instruction *elem = arr[i];
+
+      /* Since arrays are pre-colored in RA, we can't assume that
+       * things will end up in the right place.  (Ie. if a collect
+       * joins elements from two different arrays.)  So insert an
+       * extra mov.
+       *
+       * We could possibly skip this if all the collected elements
+       * are contiguous elements in a single array.. not sure how
+       * likely that is to happen.
+       *
+       * Fixes a problem with glamor shaders, that in effect do
+       * something like:
+       *
+       *   if (foo)
+       *     texcoord = ..
+       *   else
+       *     texcoord = ..
+       *   color = texture2D(tex, texcoord);
+       *
+       * In this case, texcoord will end up as nir registers (which
+       * translate to ir3 arrays of length 1).  And we can't assume
+       * the two (or more) arrays will get allocated in consecutive
+       * scalar registers.
+       *
+       */
+      if (elem->dsts[0]->flags & IR3_REG_ARRAY) {
+         type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+         elem = ir3_MOV(block, elem, type);
+      }
+
+      compile_assert(ctx, dest_flags(elem) == flags);
+      __ssa_src(collect, elem, flags);
+   }
+
+   collect->dsts[0]->wrmask = MASK(arrsz);
+
+   return collect;
 }
 
 /* helper for instructions that produce multiple consecutive scalar
@@ -358,108 +357,107 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
  */
 void
 ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
-               struct ir3_instruction *src, unsigned base, unsigned n)
+               struct ir3_instruction *src, unsigned base, unsigned n)
 {
-       if ((n == 1) && (src->dsts[0]->wrmask == 0x1) &&
-               /* setup_input needs ir3_split_dest to generate a SPLIT instruction */
-               src->opc != OPC_META_INPUT) {
-               dst[0] = src;
-               return;
-
-       }
-
-       if (src->opc == OPC_META_COLLECT) {
-               debug_assert((base + n) <= src->srcs_count);
-
-               for (int i = 0; i < n; i++) {
-                       dst[i] = ssa(src->srcs[i + base]);
-               }
-
-               return;
-       }
-
-       unsigned flags = dest_flags(src);
-
-       for (int i = 0, j = 0; i < n; i++) {
-               struct ir3_instruction *split =
-                               ir3_instr_create(block, OPC_META_SPLIT, 1, 1);
-               __ssa_dst(split)->flags |= flags;
-               __ssa_src(split, src, flags);
-               split->split.off = i + base;
-
-               if (src->dsts[0]->wrmask & (1 << (i + base)))
-                       dst[j++] = split;
-       }
+   if ((n == 1) && (src->dsts[0]->wrmask == 0x1) &&
+       /* setup_input needs ir3_split_dest to generate a SPLIT instruction */
+       src->opc != OPC_META_INPUT) {
+      dst[0] = src;
+      return;
+   }
+
+   if (src->opc == OPC_META_COLLECT) {
+      debug_assert((base + n) <= src->srcs_count);
+
+      for (int i = 0; i < n; i++) {
+         dst[i] = ssa(src->srcs[i + base]);
+      }
+
+      return;
+   }
+
+   unsigned flags = dest_flags(src);
+
+   for (int i = 0, j = 0; i < n; i++) {
+      struct ir3_instruction *split =
+         ir3_instr_create(block, OPC_META_SPLIT, 1, 1);
+      __ssa_dst(split)->flags |= flags;
+      __ssa_src(split, src, flags);
+      split->split.off = i + base;
+
+      if (src->dsts[0]->wrmask & (1 << (i + base)))
+         dst[j++] = split;
+   }
 }
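
The tail of ir3_split_dest() only keeps a split for components actually
present in the source's write-mask, packing them into dst[] with the j++
counter.  A standalone sketch of that index mapping, using plain integers
rather than ir3 instructions (names are illustrative):

#include <stdio.h>

/* For each component i in [base, base + n), record which packed slot it
 * would land in, skipping components not set in wrmask -- the same i/j
 * walk as the loop at the end of ir3_split_dest().  Returns the number
 * of packed destinations.
 */
static unsigned
toy_compact_by_wrmask(unsigned wrmask, unsigned base, unsigned n, int *slot)
{
   unsigned j = 0;
   for (unsigned i = 0; i < n; i++) {
      if (wrmask & (1u << (i + base)))
         slot[i] = j++;
      else
         slot[i] = -1; /* component never written, split not kept */
   }
   return j;
}

int
main(void)
{
   int slot[4];
   /* wrmask 0b1011: .x, .y and .w written, .z missing */
   unsigned packed = toy_compact_by_wrmask(0xb, 0, 4, slot);

   for (unsigned i = 0; i < 4; i++)
      printf("component %u -> slot %d\n", i, slot[i]);
   printf("%u packed destinations\n", packed);
   return 0;
}
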
 
 NORETURN void
 ir3_context_error(struct ir3_context *ctx, const char *format, ...)
 {
-       struct hash_table *errors = NULL;
-       va_list ap;
-       va_start(ap, format);
-       if (ctx->cur_instr) {
-               errors = _mesa_hash_table_create(NULL,
-                               _mesa_hash_pointer,
-                               _mesa_key_pointer_equal);
-               char *msg = ralloc_vasprintf(errors, format, ap);
-               _mesa_hash_table_insert(errors, ctx->cur_instr, msg);
-       } else {
-               mesa_loge_v(format, ap);
-       }
-       va_end(ap);
-       nir_log_shader_annotated(ctx->s, errors);
-       ralloc_free(errors);
-       ctx->error = true;
-       unreachable("");
+   struct hash_table *errors = NULL;
+   va_list ap;
+   va_start(ap, format);
+   if (ctx->cur_instr) {
+      errors = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+                                       _mesa_key_pointer_equal);
+      char *msg = ralloc_vasprintf(errors, format, ap);
+      _mesa_hash_table_insert(errors, ctx->cur_instr, msg);
+   } else {
+      mesa_loge_v(format, ap);
+   }
+   va_end(ap);
+   nir_log_shader_annotated(ctx->s, errors);
+   ralloc_free(errors);
+   ctx->error = true;
+   unreachable("");
 }
 
 static struct ir3_instruction *
 create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
 {
-       struct ir3_instruction *instr, *immed;
-
-       instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
-
-       switch(align){
-       case 1:
-               /* src *= 1: */
-               break;
-       case 2:
-               /* src *= 2     => src <<= 1: */
-               immed = create_immed_typed(block, 1, TYPE_S16);
-               instr = ir3_SHL_B(block, instr, 0, immed, 0);
-               break;
-       case 3:
-               /* src *= 3: */
-               immed = create_immed_typed(block, 3, TYPE_S16);
-               instr = ir3_MULL_U(block, instr, 0, immed, 0);
-               break;
-       case 4:
-               /* src *= 4 => src <<= 2: */
-               immed = create_immed_typed(block, 2, TYPE_S16);
-               instr = ir3_SHL_B(block, instr, 0, immed, 0);
-               break;
-       default:
-               unreachable("bad align");
-               return NULL;
-       }
-
-       instr->dsts[0]->flags |= IR3_REG_HALF;
-
-       instr = ir3_MOV(block, instr, TYPE_S16);
-       instr->dsts[0]->num = regid(REG_A0, 0);
-
-       return instr;
+   struct ir3_instruction *instr, *immed;
+
+   instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
+
+   switch (align) {
+   case 1:
+      /* src *= 1: */
+      break;
+   case 2:
+      /* src *= 2      => src <<= 1: */
+      immed = create_immed_typed(block, 1, TYPE_S16);
+      instr = ir3_SHL_B(block, instr, 0, immed, 0);
+      break;
+   case 3:
+      /* src *= 3: */
+      immed = create_immed_typed(block, 3, TYPE_S16);
+      instr = ir3_MULL_U(block, instr, 0, immed, 0);
+      break;
+   case 4:
+      /* src *= 4 => src <<= 2: */
+      immed = create_immed_typed(block, 2, TYPE_S16);
+      instr = ir3_SHL_B(block, instr, 0, immed, 0);
+      break;
+   default:
+      unreachable("bad align");
+      return NULL;
+   }
+
+   instr->dsts[0]->flags |= IR3_REG_HALF;
+
+   instr = ir3_MOV(block, instr, TYPE_S16);
+   instr->dsts[0]->num = regid(REG_A0, 0);
+
+   return instr;
 }
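
create_addr0() scales the index by align using the cheapest available
operation: nothing for 1, a shift for 2 and 4, and a real multiply only for 3.
A standalone arithmetic sketch of the same mapping, on plain integers rather
than ir3 instructions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same scaling choices as the switch in create_addr0():
 * align 1 -> unchanged, 2 -> <<1, 3 -> *3, 4 -> <<2
 */
static uint32_t
toy_scale_for_align(uint32_t src, int align)
{
   switch (align) {
   case 1:
      return src;
   case 2:
      return src << 1;
   case 3:
      return src * 3;
   case 4:
      return src << 2;
   default:
      assert(!"bad align");
      return 0;
   }
}

int
main(void)
{
   for (int align = 1; align <= 4; align++)
      printf("index 5, align %d -> %u\n", align,
             toy_scale_for_align(5, align));
   return 0;
}
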
 
 static struct ir3_instruction *
 create_addr1(struct ir3_block *block, unsigned const_val)
 {
-       struct ir3_instruction *immed = create_immed_typed(block, const_val, TYPE_U16);
-       struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_U16);
-       instr->dsts[0]->num = regid(REG_A0, 1);
-       return instr;
+   struct ir3_instruction *immed =
+      create_immed_typed(block, const_val, TYPE_U16);
+   struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_U16);
+   instr->dsts[0]->num = regid(REG_A0, 1);
+   return instr;
 }
 
 /* caches addr values to avoid generating multiple cov/shl/mova
@@ -468,62 +466,62 @@ create_addr1(struct ir3_block *block, unsigned const_val)
 struct ir3_instruction *
 ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align)
 {
-       struct ir3_instruction *addr;
-       unsigned idx = align - 1;
+   struct ir3_instruction *addr;
+   unsigned idx = align - 1;
 
-       compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));
+   compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));
 
-       if (!ctx->addr0_ht[idx]) {
-               ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx,
-                               _mesa_hash_pointer, _mesa_key_pointer_equal);
-       } else {
-               struct hash_entry *entry;
-               entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
-               if (entry)
-                       return entry->data;
-       }
+   if (!ctx->addr0_ht[idx]) {
+      ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx, _mesa_hash_pointer,
+                                                   _mesa_key_pointer_equal);
+   } else {
+      struct hash_entry *entry;
+      entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
+      if (entry)
+         return entry->data;
+   }
 
-       addr = create_addr0(ctx->block, src, align);
-       _mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);
+   addr = create_addr0(ctx->block, src, align);
+   _mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);
 
-       return addr;
+   return addr;
 }
 
 /* Similar to ir3_get_addr0, but for a1.x. */
 struct ir3_instruction *
 ir3_get_addr1(struct ir3_context *ctx, unsigned const_val)
 {
-       struct ir3_instruction *addr;
+   struct ir3_instruction *addr;
 
-       if (!ctx->addr1_ht) {
-               ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
-       } else {
-               addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
-               if (addr)
-                       return addr;
-       }
+   if (!ctx->addr1_ht) {
+      ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
+   } else {
+      addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
+      if (addr)
+         return addr;
+   }
 
-       addr = create_addr1(ctx->block, const_val);
-       _mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);
+   addr = create_addr1(ctx->block, const_val);
+   _mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);
 
-       return addr;
+   return addr;
 }
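
ir3_get_addr0() and ir3_get_addr1() are memoization wrappers: look the key up
in a per-context table and only emit a new address-setup sequence on a miss.
A standalone sketch of that lookup-or-create pattern, with a small fixed-size
linear cache standing in for Mesa's hash tables (all names here are
illustrative):

#include <stdio.h>

struct toy_addr_entry {
   unsigned key; /* e.g. the constant value wanted in a1.x */
   int cooked;   /* stand-in for the generated address-setup instruction */
};

struct toy_addr_cache {
   struct toy_addr_entry entries[8];
   unsigned count;
};

static int
toy_create_addr(unsigned key)
{
   printf("building address setup for %u\n", key);
   return (int)key * 100; /* pretend result */
}

/* look up first, create and remember on a miss, like ir3_get_addr1(): */
static int
toy_get_addr(struct toy_addr_cache *cache, unsigned key)
{
   for (unsigned i = 0; i < cache->count; i++)
      if (cache->entries[i].key == key)
         return cache->entries[i].cooked;

   int cooked = toy_create_addr(key);
   if (cache->count < 8) {
      cache->entries[cache->count].key = key;
      cache->entries[cache->count].cooked = cooked;
      cache->count++;
   }
   return cooked;
}

int
main(void)
{
   struct toy_addr_cache cache = {.count = 0};

   toy_get_addr(&cache, 7);
   toy_get_addr(&cache, 7); /* hit: no second "building" line */
   toy_get_addr(&cache, 3);
   return 0;
}
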
 
 struct ir3_instruction *
 ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
 {
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *cond;
+   struct ir3_block *b = ctx->block;
+   struct ir3_instruction *cond;
 
-       /* NOTE: only cmps.*.* can write p0.x: */
-       cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
-       cond->cat2.condition = IR3_COND_NE;
+   /* NOTE: only cmps.*.* can write p0.x: */
+   cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
+   cond->cat2.condition = IR3_COND_NE;
 
-       /* condition always goes in predicate register: */
-       cond->dsts[0]->num = regid(REG_P0, 0);
-       cond->dsts[0]->flags &= ~IR3_REG_SSA;
+   /* condition always goes in predicate register: */
+   cond->dsts[0]->num = regid(REG_P0, 0);
+   cond->dsts[0]->flags &= ~IR3_REG_SSA;
 
-       return cond;
+   return cond;
 }
 
 /*
@@ -533,144 +531,146 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
 void
 ir3_declare_array(struct ir3_context *ctx, nir_register *reg)
 {
-       struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
-       arr->id = ++ctx->num_arrays;
-       /* NOTE: sometimes we get non array regs, for example for arrays of
-        * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
-        * treat a non-array as if it was an array of length 1.
-        *
-        * It would be nice if there was a nir pass to convert arrays of
-        * length 1 to ssa.
-        */
-       arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
-       compile_assert(ctx, arr->length > 0);
-       arr->r = reg;
-       arr->half = reg->bit_size <= 16;
-       // HACK one-bit bools still end up as 32b:
-       if (reg->bit_size == 1)
-               arr->half = false;
-       list_addtail(&arr->node, &ctx->ir->array_list);
+   struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
+   arr->id = ++ctx->num_arrays;
+   /* NOTE: sometimes we get non array regs, for example for arrays of
+    * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
+    * treat a non-array as if it was an array of length 1.
+    *
+    * It would be nice if there was a nir pass to convert arrays of
+    * length 1 to ssa.
+    */
+   arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
+   compile_assert(ctx, arr->length > 0);
+   arr->r = reg;
+   arr->half = reg->bit_size <= 16;
+   // HACK one-bit bools still end up as 32b:
+   if (reg->bit_size == 1)
+      arr->half = false;
+   list_addtail(&arr->node, &ctx->ir->array_list);
 }
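
ir3_declare_array() flattens a nir_register into one linear array of
num_components * MAX2(1, num_array_elems) scalars, and the load/store helpers
then address element n = base_offset * num_components + component (see
ir3_get_src() / ir3_put_dst() above).  A standalone sketch of that indexing,
with illustrative helper names:

#include <stdio.h>

#define TOY_MAX2(a, b) ((a) > (b) ? (a) : (b))

/* flattened length, as in ir3_declare_array(): */
static unsigned
toy_array_length(unsigned num_components, unsigned num_array_elems)
{
   return num_components * TOY_MAX2(1u, num_array_elems);
}

/* flat element index, as used by the array load/store loops: */
static unsigned
toy_flat_index(unsigned base_offset, unsigned num_components,
               unsigned component)
{
   return base_offset * num_components + component;
}

int
main(void)
{
   /* vec3 foo[4] -> 12 scalar slots; foo[2].y -> slot 7 */
   printf("length = %u\n", toy_array_length(3, 4));
   printf("foo[2].y -> slot %u\n", toy_flat_index(2, 3, 1));
   return 0;
}
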
 
 struct ir3_array *
 ir3_get_array(struct ir3_context *ctx, nir_register *reg)
 {
-       foreach_array (arr, &ctx->ir->array_list) {
-               if (arr->r == reg)
-                       return arr;
-       }
-       ir3_context_error(ctx, "bogus reg: r%d\n", reg->index);
-       return NULL;
+   foreach_array (arr, &ctx->ir->array_list) {
+      if (arr->r == reg)
+         return arr;
+   }
+   ir3_context_error(ctx, "bogus reg: r%d\n", reg->index);
+   return NULL;
 }
 
 /* relative (indirect) if address!=NULL */
 struct ir3_instruction *
 ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
-               struct ir3_instruction *address)
+                      struct ir3_instruction *address)
 {
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *mov;
-       struct ir3_register *src;
-       unsigned flags = 0;
-
-       mov = ir3_instr_create(block, OPC_MOV, 1, 1);
-       if (arr->half) {
-               mov->cat1.src_type = TYPE_U16;
-               mov->cat1.dst_type = TYPE_U16;
-               flags |= IR3_REG_HALF;
-       } else {
-               mov->cat1.src_type = TYPE_U32;
-               mov->cat1.dst_type = TYPE_U32;
-       }
-
-       mov->barrier_class = IR3_BARRIER_ARRAY_R;
-       mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
-       __ssa_dst(mov)->flags |= flags;
-       src = ir3_src_create(mov, 0, IR3_REG_ARRAY |
-                       COND(address, IR3_REG_RELATIV) | flags);
-       src->def = (arr->last_write && arr->last_write->instr->block == block) ?
-               arr->last_write : NULL;
-       src->size  = arr->length;
-       src->array.id = arr->id;
-       src->array.offset = n;
-       src->array.base = INVALID_REG;
-
-       if (address)
-               ir3_instr_set_address(mov, address);
-
-       return mov;
+   struct ir3_block *block = ctx->block;
+   struct ir3_instruction *mov;
+   struct ir3_register *src;
+   unsigned flags = 0;
+
+   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   if (arr->half) {
+      mov->cat1.src_type = TYPE_U16;
+      mov->cat1.dst_type = TYPE_U16;
+      flags |= IR3_REG_HALF;
+   } else {
+      mov->cat1.src_type = TYPE_U32;
+      mov->cat1.dst_type = TYPE_U32;
+   }
+
+   mov->barrier_class = IR3_BARRIER_ARRAY_R;
+   mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
+   __ssa_dst(mov)->flags |= flags;
+   src = ir3_src_create(mov, 0,
+                        IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV) | flags);
+   src->def = (arr->last_write && arr->last_write->instr->block == block)
+                 ? arr->last_write
+                 : NULL;
+   src->size = arr->length;
+   src->array.id = arr->id;
+   src->array.offset = n;
+   src->array.base = INVALID_REG;
+
+   if (address)
+      ir3_instr_set_address(mov, address);
+
+   return mov;
 }
 
 /* relative (indirect) if address!=NULL */
 void
 ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
-               struct ir3_instruction *src, struct ir3_instruction *address)
+                       struct ir3_instruction *src,
+                       struct ir3_instruction *address)
 {
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *mov;
-       struct ir3_register *dst;
-       unsigned flags = 0;
-
-       /* if not relative store, don't create an extra mov, since that
-        * ends up being difficult for cp to remove.
-        *
-        * Also, don't skip the mov if the src is meta (like fanout/split),
-        * since that creates a situation that RA can't really handle properly.
-        */
-       if (!address && !is_meta(src)) {
-               dst = src->dsts[0];
-
-               src->barrier_class |= IR3_BARRIER_ARRAY_W;
-               src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
-
-               dst->flags |= IR3_REG_ARRAY;
-               dst->size = arr->length;
-               dst->array.id = arr->id;
-               dst->array.offset = n;
-               dst->array.base = INVALID_REG;
-
-               if (arr->last_write && arr->last_write->instr->block == src->block)
-                       ir3_reg_set_last_array(src, dst, arr->last_write);
-
-               arr->last_write = dst;
-
-               array_insert(block, block->keeps, src);
-
-               return;
-       }
-
-       mov = ir3_instr_create(block, OPC_MOV, 1, 1);
-       if (arr->half) {
-               mov->cat1.src_type = TYPE_U16;
-               mov->cat1.dst_type = TYPE_U16;
-               flags |= IR3_REG_HALF;
-       } else {
-               mov->cat1.src_type = TYPE_U32;
-               mov->cat1.dst_type = TYPE_U32;
-       }
-       mov->barrier_class = IR3_BARRIER_ARRAY_W;
-       mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
-       dst = ir3_dst_create(mov, 0, IR3_REG_SSA | IR3_REG_ARRAY |
-                       flags |
-                       COND(address, IR3_REG_RELATIV));
-       dst->instr = mov;
-       dst->size  = arr->length;
-       dst->array.id = arr->id;
-       dst->array.offset = n;
-       dst->array.base = INVALID_REG;
-       ir3_src_create(mov, 0, IR3_REG_SSA | flags)->def = src->dsts[0];
-
-       if (arr->last_write && arr->last_write->instr->block == block)
-               ir3_reg_set_last_array(mov, dst, arr->last_write);
-
-       if (address)
-               ir3_instr_set_address(mov, address);
-
-       arr->last_write = dst;
-
-       /* the array store may only matter to something in an earlier
-        * block (ie. loops), but since arrays are not in SSA, depth
-        * pass won't know this.. so keep all array stores:
-        */
-       array_insert(block, block->keeps, mov);
+   struct ir3_block *block = ctx->block;
+   struct ir3_instruction *mov;
+   struct ir3_register *dst;
+   unsigned flags = 0;
+
+   /* if not relative store, don't create an extra mov, since that
+    * ends up being difficult for cp to remove.
+    *
+    * Also, don't skip the mov if the src is meta (like fanout/split),
+    * since that creates a situation that RA can't really handle properly.
+    */
+   if (!address && !is_meta(src)) {
+      dst = src->dsts[0];
+
+      src->barrier_class |= IR3_BARRIER_ARRAY_W;
+      src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+
+      dst->flags |= IR3_REG_ARRAY;
+      dst->size = arr->length;
+      dst->array.id = arr->id;
+      dst->array.offset = n;
+      dst->array.base = INVALID_REG;
+
+      if (arr->last_write && arr->last_write->instr->block == src->block)
+         ir3_reg_set_last_array(src, dst, arr->last_write);
+
+      arr->last_write = dst;
+
+      array_insert(block, block->keeps, src);
+
+      return;
+   }
+
+   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   if (arr->half) {
+      mov->cat1.src_type = TYPE_U16;
+      mov->cat1.dst_type = TYPE_U16;
+      flags |= IR3_REG_HALF;
+   } else {
+      mov->cat1.src_type = TYPE_U32;
+      mov->cat1.dst_type = TYPE_U32;
+   }
+   mov->barrier_class = IR3_BARRIER_ARRAY_W;
+   mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+   dst = ir3_dst_create(
+      mov, 0,
+      IR3_REG_SSA | IR3_REG_ARRAY | flags | COND(address, IR3_REG_RELATIV));
+   dst->instr = mov;
+   dst->size = arr->length;
+   dst->array.id = arr->id;
+   dst->array.offset = n;
+   dst->array.base = INVALID_REG;
+   ir3_src_create(mov, 0, IR3_REG_SSA | flags)->def = src->dsts[0];
+
+   if (arr->last_write && arr->last_write->instr->block == block)
+      ir3_reg_set_last_array(mov, dst, arr->last_write);
+
+   if (address)
+      ir3_instr_set_address(mov, address);
+
+   arr->last_write = dst;
+
+   /* the array store may only matter to something in an earlier
+    * block (ie. loops), but since arrays are not in SSA, depth
+    * pass won't know this.. so keep all array stores:
+    */
+   array_insert(block, block->keeps, mov);
 }
index aea6823..17bfb48 100644 (file)
 #ifndef IR3_CONTEXT_H_
 #define IR3_CONTEXT_H_
 
+#include "ir3.h"
 #include "ir3_compiler.h"
 #include "ir3_nir.h"
-#include "ir3.h"
 
 /* for conditionally setting boolean flag(s): */
 #define COND(bool, val) ((bool) ? (val) : 0)
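For illustration, a minimal sketch of how COND() gets used when assembling register flags, mirroring the array-load path earlier in this diff (variable names taken from that code):

   unsigned flags = IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV);
   /* expands to IR3_REG_ARRAY | (address ? IR3_REG_RELATIV : 0) */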
 
-#define DBG(fmt, ...) \
-               do { mesa_logd("%s:%d: "fmt, \
-                               __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
+#define DBG(fmt, ...)                                                          \
+   do {                                                                        \
+      mesa_logd("%s:%d: " fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__);         \
+   } while (0)
 
 /**
  * The context for compilation of a single shader.
  */
 struct ir3_context {
-       struct ir3_compiler *compiler;
-       const struct ir3_context_funcs *funcs;
-
-       struct nir_shader *s;
-
-       struct nir_instr *cur_instr;  /* current instruction, just for debug */
-
-       struct ir3 *ir;
-       struct ir3_shader_variant *so;
-
-       /* Tables of scalar inputs/outputs.  Because of the way varying packing
-        * works, we could have inputs w/ fractional location, which is a bit
-        * awkward to deal with unless we keep track of the split scalar in/
-        * out components.
-        *
-        * These *only* have inputs/outputs that are touched by load_*input and
-        * store_output.
-        */
-       unsigned ninputs, noutputs;
-       struct ir3_instruction **inputs;
-       struct ir3_instruction **outputs;
-
-       struct ir3_block *block;      /* the current block */
-       struct ir3_block *in_block;   /* block created for shader inputs */
-
-       nir_function_impl *impl;
-
-       /* For fragment shaders, varyings are not actual shader inputs,
-        * instead the hw passes an ij coord which is used with
-        * bary.f.
-        *
-        * But NIR doesn't know that, it still declares varyings as
-        * inputs.  So we do all the input tracking normally and fix
-        * things up after compile_instructions()
-        */
-       struct ir3_instruction *ij[IJ_COUNT];
-
-       /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
-       struct ir3_instruction *frag_face, *frag_coord;
-
-       /* For vertex shaders, keep track of the system values sources */
-       struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance, *draw_id, *view_index;
-
-       /* For fragment shaders: */
-       struct ir3_instruction *samp_id, *samp_mask_in;
-
-       /* For geometry shaders: */
-       struct ir3_instruction *primitive_id;
-       struct ir3_instruction *gs_header;
-
-       /* For tessellation shaders: */
-       struct ir3_instruction *patch_vertices_in;
-       struct ir3_instruction *tcs_header;
-       struct ir3_instruction *tess_coord;
-
-       /* Compute shader inputs: */
-       struct ir3_instruction *local_invocation_id, *work_group_id;
-
-       /* mapping from nir_register to defining instruction: */
-       struct hash_table *def_ht;
-
-       unsigned num_arrays;
-
-       /* Tracking for max level of flowcontrol (branchstack) needed
-        * by a5xx+:
-        */
-       unsigned stack, max_stack;
-
-       unsigned loop_id;
-
-       /* a common pattern for indirect addressing is to request the
-        * same address register multiple times.  To avoid generating
-        * duplicate instruction sequences (which our backend does not
-        * try to clean up, since that should be done at the NIR stage)
-        * we cache the address value generated for a given src value:
-        *
-        * Note that we have to cache these per alignment, since the same
-        * src used for an array of vec1 cannot also be used for an
-        * array of vec4.
-        */
-       struct hash_table *addr0_ht[4];
-
-       /* The same for a1.x. We only support immediate values for a1.x, as this
-        * is the only use so far.
-        */
-       struct hash_table_u64 *addr1_ht;
-
-       struct hash_table *sel_cond_conversions;
-
-       /* last dst array, for indirect we need to insert a var-store.
-        */
-       struct ir3_instruction **last_dst;
-       unsigned last_dst_n;
-
-       /* maps nir_block to ir3_block, mostly for the purposes of
-        * figuring out the blocks successors
-        * figuring out the block's successors
-       struct hash_table *block_ht;
-
-       /* maps nir_block at the top of a loop to ir3_block collecting continue
-        * edges.
-        */
-       struct hash_table *continue_block_ht;
-
-       /* on a4xx, bitmask of samplers which need astc+srgb workaround: */
-       unsigned astc_srgb;
-
-       unsigned samples;             /* bitmask of x,y sample shifts */
-
-       unsigned max_texture_index;
-
-       unsigned prefetch_limit;
-
-       /* set if we encounter something we can't handle yet, so we
-        * can bail cleanly and fallback to TGSI compiler f/e
-        * can bail cleanly and fall back to TGSI compiler f/e
-       bool error;
+   struct ir3_compiler *compiler;
+   const struct ir3_context_funcs *funcs;
+
+   struct nir_shader *s;
+
+   struct nir_instr *cur_instr; /* current instruction, just for debug */
+
+   struct ir3 *ir;
+   struct ir3_shader_variant *so;
+
+   /* Tables of scalar inputs/outputs.  Because of the way varying packing
+    * works, we could have inputs w/ fractional location, which is a bit
+    * awkward to deal with unless we keep track of the split scalar in/
+    * out components.
+    *
+    * These *only* have inputs/outputs that are touched by load_*input and
+    * store_output.
+    */
+   unsigned ninputs, noutputs;
+   struct ir3_instruction **inputs;
+   struct ir3_instruction **outputs;
+
+   struct ir3_block *block;    /* the current block */
+   struct ir3_block *in_block; /* block created for shader inputs */
+
+   nir_function_impl *impl;
+
+   /* For fragment shaders, varyings are not actual shader inputs,
+    * instead the hw passes an ij coord which is used with
+    * bary.f.
+    *
+    * But NIR doesn't know that, it still declares varyings as
+    * inputs.  So we do all the input tracking normally and fix
+    * things up after compile_instructions()
+    */
+   struct ir3_instruction *ij[IJ_COUNT];
+
+   /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
+   struct ir3_instruction *frag_face, *frag_coord;
+
+   /* For vertex shaders, keep track of the system values sources */
+   struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance,
+      *draw_id, *view_index;
+
+   /* For fragment shaders: */
+   struct ir3_instruction *samp_id, *samp_mask_in;
+
+   /* For geometry shaders: */
+   struct ir3_instruction *primitive_id;
+   struct ir3_instruction *gs_header;
+
+   /* For tessellation shaders: */
+   struct ir3_instruction *patch_vertices_in;
+   struct ir3_instruction *tcs_header;
+   struct ir3_instruction *tess_coord;
+
+   /* Compute shader inputs: */
+   struct ir3_instruction *local_invocation_id, *work_group_id;
+
+   /* mapping from nir_register to defining instruction: */
+   struct hash_table *def_ht;
+
+   unsigned num_arrays;
+
+   /* Tracking for max level of flowcontrol (branchstack) needed
+    * by a5xx+:
+    */
+   unsigned stack, max_stack;
+
+   unsigned loop_id;
+
+   /* a common pattern for indirect addressing is to request the
+    * same address register multiple times.  To avoid generating
+    * duplicate instruction sequences (which our backend does not
+    * try to clean up, since that should be done at the NIR stage)
+    * we cache the address value generated for a given src value:
+    *
+    * Note that we have to cache these per alignment, since the same
+    * src used for an array of vec1 cannot also be used for an
+    * array of vec4.
+    */
+   struct hash_table *addr0_ht[4];
+
+   /* The same for a1.x. We only support immediate values for a1.x, as this
+    * is the only use so far.
+    */
+   struct hash_table_u64 *addr1_ht;
+
+   struct hash_table *sel_cond_conversions;
+
+   /* last dst array, for indirect we need to insert a var-store.
+    */
+   struct ir3_instruction **last_dst;
+   unsigned last_dst_n;
+
+   /* maps nir_block to ir3_block, mostly for the purposes of
+    * figuring out the blocks successors
+    */
+   struct hash_table *block_ht;
+
+   /* maps nir_block at the top of a loop to ir3_block collecting continue
+    * edges.
+    */
+   struct hash_table *continue_block_ht;
+
+   /* on a4xx, bitmask of samplers which need astc+srgb workaround: */
+   unsigned astc_srgb;
+
+   unsigned samples; /* bitmask of x,y sample shifts */
+
+   unsigned max_texture_index;
+
+   unsigned prefetch_limit;
+
+   /* set if we encounter something we can't handle yet, so we
+    * can bail cleanly and fallback to TGSI compiler f/e
+    */
+   bool error;
 };
 
 struct ir3_context_funcs {
-       void (*emit_intrinsic_load_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-                       struct ir3_instruction **dst);
-       void (*emit_intrinsic_store_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
-       struct ir3_instruction * (*emit_intrinsic_atomic_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
-       void (*emit_intrinsic_load_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-                       struct ir3_instruction **dst);
-       void (*emit_intrinsic_store_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
-       struct ir3_instruction * (*emit_intrinsic_atomic_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
-       void (*emit_intrinsic_image_size)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-                       struct ir3_instruction **dst);
-       void (*emit_intrinsic_load_global_ir3)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-                       struct ir3_instruction **dst);
-       void (*emit_intrinsic_store_global_ir3)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
+   void (*emit_intrinsic_load_ssbo)(struct ir3_context *ctx,
+                                    nir_intrinsic_instr *intr,
+                                    struct ir3_instruction **dst);
+   void (*emit_intrinsic_store_ssbo)(struct ir3_context *ctx,
+                                     nir_intrinsic_instr *intr);
+   struct ir3_instruction *(*emit_intrinsic_atomic_ssbo)(
+      struct ir3_context *ctx, nir_intrinsic_instr *intr);
+   void (*emit_intrinsic_load_image)(struct ir3_context *ctx,
+                                     nir_intrinsic_instr *intr,
+                                     struct ir3_instruction **dst);
+   void (*emit_intrinsic_store_image)(struct ir3_context *ctx,
+                                      nir_intrinsic_instr *intr);
+   struct ir3_instruction *(*emit_intrinsic_atomic_image)(
+      struct ir3_context *ctx, nir_intrinsic_instr *intr);
+   void (*emit_intrinsic_image_size)(struct ir3_context *ctx,
+                                     nir_intrinsic_instr *intr,
+                                     struct ir3_instruction **dst);
+   void (*emit_intrinsic_load_global_ir3)(struct ir3_context *ctx,
+                                          nir_intrinsic_instr *intr,
+                                          struct ir3_instruction **dst);
+   void (*emit_intrinsic_store_global_ir3)(struct ir3_context *ctx,
+                                           nir_intrinsic_instr *intr);
 };
 
 extern const struct ir3_context_funcs ir3_a4xx_funcs;
 extern const struct ir3_context_funcs ir3_a6xx_funcs;
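As a hedged sketch of how these tables are meant to be consumed (intr and dst are placeholders), callers dispatch through the funcs pointer stored in the context:

   ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);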
 
-struct ir3_context * ir3_context_init(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so);
+struct ir3_context *ir3_context_init(struct ir3_compiler *compiler,
+                                     struct ir3_shader_variant *so);
 void ir3_context_free(struct ir3_context *ctx);
 
-struct ir3_instruction ** ir3_get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n);
-struct ir3_instruction ** ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n);
-struct ir3_instruction * const * ir3_get_src(struct ir3_context *ctx, nir_src *src);
+struct ir3_instruction **ir3_get_dst_ssa(struct ir3_context *ctx,
+                                         nir_ssa_def *dst, unsigned n);
+struct ir3_instruction **ir3_get_dst(struct ir3_context *ctx, nir_dest *dst,
+                                     unsigned n);
+struct ir3_instruction *const *ir3_get_src(struct ir3_context *ctx,
+                                           nir_src *src);
 void ir3_put_dst(struct ir3_context *ctx, nir_dest *dst);
-struct ir3_instruction * ir3_create_collect(struct ir3_context *ctx,
-               struct ir3_instruction *const *arr, unsigned arrsz);
+struct ir3_instruction *ir3_create_collect(struct ir3_context *ctx,
+                                           struct ir3_instruction *const *arr,
+                                           unsigned arrsz);
 void ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
-               struct ir3_instruction *src, unsigned base, unsigned n);
+                    struct ir3_instruction *src, unsigned base, unsigned n);
 void ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc);
-void ir3_handle_nonuniform(struct ir3_instruction *instr, nir_intrinsic_instr *intrin);
-void emit_intrinsic_image_size_tex(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst);
-
-#define ir3_collect(ctx, ...) ({ \
-       struct ir3_instruction *__arr[] = { __VA_ARGS__ }; \
-       ir3_create_collect(ctx, __arr, ARRAY_SIZE(__arr)); \
-})
-
-NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format, ...);
-
-#define compile_assert(ctx, cond) do { \
-               if (!(cond)) ir3_context_error((ctx), "failed assert: "#cond"\n"); \
-       } while (0)
-
-struct ir3_instruction * ir3_get_addr0(struct ir3_context *ctx,
-               struct ir3_instruction *src, int align);
-struct ir3_instruction * ir3_get_addr1(struct ir3_context *ctx,
-               unsigned const_val);
-struct ir3_instruction * ir3_get_predicate(struct ir3_context *ctx,
-               struct ir3_instruction *src);
+void ir3_handle_nonuniform(struct ir3_instruction *instr,
+                           nir_intrinsic_instr *intrin);
+void emit_intrinsic_image_size_tex(struct ir3_context *ctx,
+                                   nir_intrinsic_instr *intr,
+                                   struct ir3_instruction **dst);
+
+#define ir3_collect(ctx, ...)                                                  \
+   ({                                                                          \
+      struct ir3_instruction *__arr[] = {__VA_ARGS__};                         \
+      ir3_create_collect(ctx, __arr, ARRAY_SIZE(__arr));                       \
+   })
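A short usage sketch (x and y are hypothetical scalar values): the macro wraps its arguments in a temporary array so ARRAY_SIZE() can size the collect:

   struct ir3_instruction *xy = ir3_collect(ctx, x, y);
   /* same as ir3_create_collect(ctx, (struct ir3_instruction *[]){x, y}, 2) */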
+
+NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format,
+                                ...);
+
+#define compile_assert(ctx, cond)                                              \
+   do {                                                                        \
+      if (!(cond))                                                             \
+         ir3_context_error((ctx), "failed assert: " #cond "\n");               \
+   } while (0)
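Usage sketch (the condition shown is hypothetical): a failed compile_assert() routes through ir3_context_error(), which is NORETURN, so compilation bails cleanly instead of crashing later:

   compile_assert(ctx, n < arr->length);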
+
+struct ir3_instruction *ir3_get_addr0(struct ir3_context *ctx,
+                                      struct ir3_instruction *src, int align);
+struct ir3_instruction *ir3_get_addr1(struct ir3_context *ctx,
+                                      unsigned const_val);
+struct ir3_instruction *ir3_get_predicate(struct ir3_context *ctx,
+                                          struct ir3_instruction *src);
 
 void ir3_declare_array(struct ir3_context *ctx, nir_register *reg);
-struct ir3_array * ir3_get_array(struct ir3_context *ctx, nir_register *reg);
+struct ir3_array *ir3_get_array(struct ir3_context *ctx, nir_register *reg);
 struct ir3_instruction *ir3_create_array_load(struct ir3_context *ctx,
-               struct ir3_array *arr, int n, struct ir3_instruction *address);
-void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
-               struct ir3_instruction *src, struct ir3_instruction *address);
-
-static inline type_t utype_for_size(unsigned bit_size)
+                                              struct ir3_array *arr, int n,
+                                              struct ir3_instruction *address);
+void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr,
+                            int n, struct ir3_instruction *src,
+                            struct ir3_instruction *address);
+
+static inline type_t
+utype_for_size(unsigned bit_size)
 {
-       switch (bit_size) {
-       case 32: return TYPE_U32;
-       case 16: return TYPE_U16;
-       case  8: return TYPE_U8;
-       default: unreachable("bad bitsize"); return ~0;
-       }
+   switch (bit_size) {
+   case 32:
+      return TYPE_U32;
+   case 16:
+      return TYPE_U16;
+   case 8:
+      return TYPE_U8;
+   default:
+      unreachable("bad bitsize");
+      return ~0;
+   }
 }
 
-static inline type_t utype_src(nir_src src)
-{ return utype_for_size(nir_src_bit_size(src)); }
+static inline type_t
+utype_src(nir_src src)
+{
+   return utype_for_size(nir_src_bit_size(src));
+}
 
-static inline type_t utype_dst(nir_dest dst)
-{ return utype_for_size(nir_dest_bit_size(dst)); }
+static inline type_t
+utype_dst(nir_dest dst)
+{
+   return utype_for_size(nir_dest_bit_size(dst));
+}
 
 #endif /* IR3_CONTEXT_H_ */
index a0c5ac9..78a5247 100644 (file)
 #include "ir3_compiler.h"
 #include "ir3_shader.h"
 
-#define swap(a, b) \
-       do { __typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+#define swap(a, b)                                                             \
+   do {                                                                        \
+      __typeof(a) __tmp = (a);                                                 \
+      (a) = (b);                                                               \
+      (b) = __tmp;                                                             \
+   } while (0)
 
 /*
  * Copy Propagate:
  */
 
 struct ir3_cp_ctx {
-       struct ir3 *shader;
-       struct ir3_shader_variant *so;
-       bool progress;
+   struct ir3 *shader;
+   struct ir3_shader_variant *so;
+   bool progress;
 };
 
 /* is it a type preserving mov, with ok flags?
@@ -53,35 +57,36 @@ struct ir3_cp_ctx {
  * TODO maybe drop allow_flags since this is only false when dst is
  * NULL (ie. outputs)
  */
-static bool is_eligible_mov(struct ir3_instruction *instr,
-               struct ir3_instruction *dst_instr, bool allow_flags)
+static bool
+is_eligible_mov(struct ir3_instruction *instr,
+                struct ir3_instruction *dst_instr, bool allow_flags)
 {
-       if (is_same_type_mov(instr)) {
-               struct ir3_register *dst = instr->dsts[0];
-               struct ir3_register *src = instr->srcs[0];
-               struct ir3_instruction *src_instr = ssa(src);
-
-               /* only if mov src is SSA (not const/immed): */
-               if (!src_instr)
-                       return false;
-
-               /* no indirect: */
-               if (dst->flags & IR3_REG_RELATIV)
-                       return false;
-               if (src->flags & IR3_REG_RELATIV)
-                       return false;
-
-               if (src->flags & IR3_REG_ARRAY)
-                       return false;
-
-               if (!allow_flags)
-                       if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
-                                       IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
-                               return false;
-
-               return true;
-       }
-       return false;
+   if (is_same_type_mov(instr)) {
+      struct ir3_register *dst = instr->dsts[0];
+      struct ir3_register *src = instr->srcs[0];
+      struct ir3_instruction *src_instr = ssa(src);
+
+      /* only if mov src is SSA (not const/immed): */
+      if (!src_instr)
+         return false;
+
+      /* no indirect: */
+      if (dst->flags & IR3_REG_RELATIV)
+         return false;
+      if (src->flags & IR3_REG_RELATIV)
+         return false;
+
+      if (src->flags & IR3_REG_ARRAY)
+         return false;
+
+      if (!allow_flags)
+         if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG | IR3_REG_SABS |
+                           IR3_REG_SNEG | IR3_REG_BNOT))
+            return false;
+
+      return true;
+   }
+   return false;
 }
 
 /* we can end up with extra cmps.s from frontend, which uses a
@@ -91,59 +96,60 @@ static bool is_eligible_mov(struct ir3_instruction *instr,
  * as a way to mov into the predicate register.  But frequently 'cond'
  * is itself a cmps.s/cmps.f/cmps.u. So detect this special case.
  */
-static bool is_foldable_double_cmp(struct ir3_instruction *cmp)
+static bool
+is_foldable_double_cmp(struct ir3_instruction *cmp)
 {
-       struct ir3_instruction *cond = ssa(cmp->srcs[0]);
-       return (cmp->dsts[0]->num == regid(REG_P0, 0)) &&
-                               cond &&
-                               (cmp->srcs[1]->flags & IR3_REG_IMMED) &&
-                               (cmp->srcs[1]->iim_val == 0) &&
-                               (cmp->cat2.condition == IR3_COND_NE) &&
-                               (!cond->address || cond->address->def->instr->block == cmp->block);
+   struct ir3_instruction *cond = ssa(cmp->srcs[0]);
+   return (cmp->dsts[0]->num == regid(REG_P0, 0)) && cond &&
+          (cmp->srcs[1]->flags & IR3_REG_IMMED) &&
+          (cmp->srcs[1]->iim_val == 0) &&
+          (cmp->cat2.condition == IR3_COND_NE) &&
+          (!cond->address || cond->address->def->instr->block == cmp->block);
 }
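Concretely, as a hedged example: the frontend may emit one compare producing 'cond' and then a second compare, roughly 'cmps.s.ne p0.x, cond, 0' in disassembly terms, purely to move the result into the predicate register. When the pattern above matches, instr_cp() below rewrites that second compare to take over cond's opcode, flags and sources, so p0.x ends up written by a single cmps.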
 
 /* propagate register flags from src to dst.. negates need special
  * handling to cancel each other out.
  */
-static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
+static void
+combine_flags(unsigned *dstflags, struct ir3_instruction *src)
 {
-       unsigned srcflags = src->srcs[0]->flags;
-
-       /* if what we are combining into already has (abs) flags,
-        * we can drop (neg) from src:
-        */
-       if (*dstflags & IR3_REG_FABS)
-               srcflags &= ~IR3_REG_FNEG;
-       if (*dstflags & IR3_REG_SABS)
-               srcflags &= ~IR3_REG_SNEG;
-
-       if (srcflags & IR3_REG_FABS)
-               *dstflags |= IR3_REG_FABS;
-       if (srcflags & IR3_REG_SABS)
-               *dstflags |= IR3_REG_SABS;
-       if (srcflags & IR3_REG_FNEG)
-               *dstflags ^= IR3_REG_FNEG;
-       if (srcflags & IR3_REG_SNEG)
-               *dstflags ^= IR3_REG_SNEG;
-       if (srcflags & IR3_REG_BNOT)
-               *dstflags ^= IR3_REG_BNOT;
-
-       *dstflags &= ~IR3_REG_SSA;
-       *dstflags |= srcflags & IR3_REG_SSA;
-       *dstflags |= srcflags & IR3_REG_CONST;
-       *dstflags |= srcflags & IR3_REG_IMMED;
-       *dstflags |= srcflags & IR3_REG_RELATIV;
-       *dstflags |= srcflags & IR3_REG_ARRAY;
-       *dstflags |= srcflags & IR3_REG_SHARED;
-
-       /* if src of the src is boolean we can drop the (abs) since we know
-        * the source value is already a positive integer.  This cleans
-        * up the absnegs that get inserted when converting between nir and
-        * native boolean (see ir3_b2n/n2b)
-        */
-       struct ir3_instruction *srcsrc = ssa(src->srcs[0]);
-       if (srcsrc && is_bool(srcsrc))
-               *dstflags &= ~IR3_REG_SABS;
+   unsigned srcflags = src->srcs[0]->flags;
+
+   /* if what we are combining into already has (abs) flags,
+    * we can drop (neg) from src:
+    */
+   if (*dstflags & IR3_REG_FABS)
+      srcflags &= ~IR3_REG_FNEG;
+   if (*dstflags & IR3_REG_SABS)
+      srcflags &= ~IR3_REG_SNEG;
+
+   if (srcflags & IR3_REG_FABS)
+      *dstflags |= IR3_REG_FABS;
+   if (srcflags & IR3_REG_SABS)
+      *dstflags |= IR3_REG_SABS;
+   if (srcflags & IR3_REG_FNEG)
+      *dstflags ^= IR3_REG_FNEG;
+   if (srcflags & IR3_REG_SNEG)
+      *dstflags ^= IR3_REG_SNEG;
+   if (srcflags & IR3_REG_BNOT)
+      *dstflags ^= IR3_REG_BNOT;
+
+   *dstflags &= ~IR3_REG_SSA;
+   *dstflags |= srcflags & IR3_REG_SSA;
+   *dstflags |= srcflags & IR3_REG_CONST;
+   *dstflags |= srcflags & IR3_REG_IMMED;
+   *dstflags |= srcflags & IR3_REG_RELATIV;
+   *dstflags |= srcflags & IR3_REG_ARRAY;
+   *dstflags |= srcflags & IR3_REG_SHARED;
+
+   /* if src of the src is boolean we can drop the (abs) since we know
+    * the source value is already a positive integer.  This cleans
+    * up the absnegs that get inserted when converting between nir and
+    * native boolean (see ir3_b2n/n2b)
+    */
+   struct ir3_instruction *srcsrc = ssa(src->srcs[0]);
+   if (srcsrc && is_bool(srcsrc))
+      *dstflags &= ~IR3_REG_SABS;
 }
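To make the negate handling concrete, a small sketch that operates on the flag bits directly rather than on a full instruction:

   unsigned dstflags = IR3_REG_FNEG;  /* the use already applies (neg) */
   dstflags ^= IR3_REG_FNEG;          /* folding in a (neg) mov: negates cancel */
   /* if IR3_REG_FABS were set instead, the source's (neg) would simply be
    * dropped up front, since |-x| == |x|
    */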
 
 /* Tries lowering an immediate register argument to a const buffer access by
@@ -152,111 +158,111 @@ static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
  */
 static bool
 lower_immed(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, unsigned n,
-               struct ir3_register *reg, unsigned new_flags)
+            struct ir3_register *reg, unsigned new_flags)
 {
-       if (!(new_flags & IR3_REG_IMMED))
-               return false;
-
-       new_flags &= ~IR3_REG_IMMED;
-       new_flags |= IR3_REG_CONST;
-
-       if (!ir3_valid_flags(instr, n, new_flags))
-               return false;
-
-       reg = ir3_reg_clone(ctx->shader, reg);
-
-       /* Half constant registers seem to handle only 32-bit values
-        * within floating-point opcodes. So convert back to 32-bit values.
-        */
-       bool f_opcode = (is_cat2_float(instr->opc) ||
-                       is_cat3_float(instr->opc)) ? true : false;
-       if (f_opcode && (new_flags & IR3_REG_HALF))
-               reg->uim_val = fui(_mesa_half_to_float(reg->uim_val));
-
-       /* in some cases, there are restrictions on (abs)/(neg) plus const..
-        * so just evaluate those and clear the flags:
-        */
-       if (new_flags & IR3_REG_SABS) {
-               reg->iim_val = abs(reg->iim_val);
-               new_flags &= ~IR3_REG_SABS;
-       }
-
-       if (new_flags & IR3_REG_FABS) {
-               reg->fim_val = fabs(reg->fim_val);
-               new_flags &= ~IR3_REG_FABS;
-       }
-
-       if (new_flags & IR3_REG_SNEG) {
-               reg->iim_val = -reg->iim_val;
-               new_flags &= ~IR3_REG_SNEG;
-       }
-
-       if (new_flags & IR3_REG_FNEG) {
-               reg->fim_val = -reg->fim_val;
-               new_flags &= ~IR3_REG_FNEG;
-       }
-
-       /* Reallocate for 4 more elements whenever it's necessary.  Note that ir3
-        * printing relies on having groups of 4 dwords, so we fill the unused
-        * slots with a dummy value.
-        */
-       struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-       if (const_state->immediates_count == const_state->immediates_size) {
-               const_state->immediates = rerzalloc(const_state,
-                               const_state->immediates,
-                               __typeof__(const_state->immediates[0]),
-                               const_state->immediates_size,
-                               const_state->immediates_size + 4);
-               const_state->immediates_size += 4;
-
-               for (int i = const_state->immediates_count; i < const_state->immediates_size; i++)
-                       const_state->immediates[i] = 0xd0d0d0d0;
-       }
-
-       int i;
-       for (i = 0; i < const_state->immediates_count; i++) {
-               if (const_state->immediates[i] == reg->uim_val)
-                       break;
-       }
-
-       if (i == const_state->immediates_count) {
-               /* Add on a new immediate to be pushed, if we have space left in the
-                * constbuf.
-                */
-               if (const_state->offsets.immediate + const_state->immediates_count / 4 >=
-                               ir3_max_const(ctx->so))
-                       return false;
-
-               const_state->immediates[i] = reg->uim_val;
-               const_state->immediates_count++;
-       }
-
-       reg->flags = new_flags;
-       reg->num = i + (4 * const_state->offsets.immediate);
-
-       instr->srcs[n] = reg;
-
-       return true;
+   if (!(new_flags & IR3_REG_IMMED))
+      return false;
+
+   new_flags &= ~IR3_REG_IMMED;
+   new_flags |= IR3_REG_CONST;
+
+   if (!ir3_valid_flags(instr, n, new_flags))
+      return false;
+
+   reg = ir3_reg_clone(ctx->shader, reg);
+
+   /* Half constant registers seem to handle only 32-bit values
+    * within floating-point opcodes. So convert back to 32-bit values.
+    */
+   bool f_opcode =
+      (is_cat2_float(instr->opc) || is_cat3_float(instr->opc)) ? true : false;
+   if (f_opcode && (new_flags & IR3_REG_HALF))
+      reg->uim_val = fui(_mesa_half_to_float(reg->uim_val));
+
+   /* in some cases, there are restrictions on (abs)/(neg) plus const..
+    * so just evaluate those and clear the flags:
+    */
+   if (new_flags & IR3_REG_SABS) {
+      reg->iim_val = abs(reg->iim_val);
+      new_flags &= ~IR3_REG_SABS;
+   }
+
+   if (new_flags & IR3_REG_FABS) {
+      reg->fim_val = fabs(reg->fim_val);
+      new_flags &= ~IR3_REG_FABS;
+   }
+
+   if (new_flags & IR3_REG_SNEG) {
+      reg->iim_val = -reg->iim_val;
+      new_flags &= ~IR3_REG_SNEG;
+   }
+
+   if (new_flags & IR3_REG_FNEG) {
+      reg->fim_val = -reg->fim_val;
+      new_flags &= ~IR3_REG_FNEG;
+   }
+
+   /* Reallocate for 4 more elements whenever it's necessary.  Note that ir3
+    * printing relies on having groups of 4 dwords, so we fill the unused
+    * slots with a dummy value.
+    */
+   struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+   if (const_state->immediates_count == const_state->immediates_size) {
+      const_state->immediates = rerzalloc(
+         const_state, const_state->immediates,
+         __typeof__(const_state->immediates[0]), const_state->immediates_size,
+         const_state->immediates_size + 4);
+      const_state->immediates_size += 4;
+
+      for (int i = const_state->immediates_count;
+           i < const_state->immediates_size; i++)
+         const_state->immediates[i] = 0xd0d0d0d0;
+   }
+
+   int i;
+   for (i = 0; i < const_state->immediates_count; i++) {
+      if (const_state->immediates[i] == reg->uim_val)
+         break;
+   }
+
+   if (i == const_state->immediates_count) {
+      /* Add on a new immediate to be pushed, if we have space left in the
+       * constbuf.
+       */
+      if (const_state->offsets.immediate + const_state->immediates_count / 4 >=
+          ir3_max_const(ctx->so))
+         return false;
+
+      const_state->immediates[i] = reg->uim_val;
+      const_state->immediates_count++;
+   }
+
+   reg->flags = new_flags;
+   reg->num = i + (4 * const_state->offsets.immediate);
+
+   instr->srcs[n] = reg;
+
+   return true;
 }
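A quick arithmetic sketch of the placement above (values hypothetical): with offsets.immediate == 8 (in vec4 units) and the matching value found at dword slot i == 5, the rewritten source gets reg->num = 5 + 4 * 8 = 37, i.e. component 1 of const vec4 9 (c9.y) under the regid(reg, comp) = reg * 4 + comp convention seen elsewhere in this diff.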
 
 static void
 unuse(struct ir3_instruction *instr)
 {
-       debug_assert(instr->use_count > 0);
+   debug_assert(instr->use_count > 0);
 
-       if (--instr->use_count == 0) {
-               struct ir3_block *block = instr->block;
+   if (--instr->use_count == 0) {
+      struct ir3_block *block = instr->block;
 
-               instr->barrier_class = 0;
-               instr->barrier_conflict = 0;
+      instr->barrier_class = 0;
+      instr->barrier_conflict = 0;
 
-               /* we don't want to remove anything in keeps (which could
-                * be things like array store's)
-                * be things like array stores)
-               for (unsigned i = 0; i < block->keeps_count; i++) {
-                       debug_assert(block->keeps[i] != instr);
-               }
-       }
+      /* we don't want to remove anything in keeps (which could
+       * be things like array store's)
+       */
+      for (unsigned i = 0; i < block->keeps_count; i++) {
+         debug_assert(block->keeps[i] != instr);
+      }
+   }
 }
 
 /**
@@ -267,34 +273,34 @@ unuse(struct ir3_instruction *instr)
 static bool
 try_swap_mad_two_srcs(struct ir3_instruction *instr, unsigned new_flags)
 {
-       if (!is_mad(instr->opc))
-               return false;
-
-       /* NOTE: pre-swap first two src's before valid_flags(),
-        * which might try to dereference the n'th src:
-        */
-       swap(instr->srcs[0], instr->srcs[1]);
-
-       /* cat3 doesn't encode immediate, but we can lower immediate
-        * to const if that helps:
-        */
-       if (new_flags & IR3_REG_IMMED) {
-               new_flags &= ~IR3_REG_IMMED;
-               new_flags |=  IR3_REG_CONST;
-       }
-
-       bool valid_swap =
-               /* can we propagate mov if we move 2nd src to first? */
-               ir3_valid_flags(instr, 0, new_flags) &&
-               /* and does first src fit in second slot? */
-               ir3_valid_flags(instr, 1, instr->srcs[1]->flags);
-
-       if (!valid_swap) {
-               /* put things back the way they were: */
-               swap(instr->srcs[0], instr->srcs[1]);
-       }   /* otherwise leave things swapped */
-
-       return valid_swap;
+   if (!is_mad(instr->opc))
+      return false;
+
+   /* NOTE: pre-swap first two src's before valid_flags(),
+    * which might try to dereference the n'th src:
+    */
+   swap(instr->srcs[0], instr->srcs[1]);
+
+   /* cat3 doesn't encode immediate, but we can lower immediate
+    * to const if that helps:
+    */
+   if (new_flags & IR3_REG_IMMED) {
+      new_flags &= ~IR3_REG_IMMED;
+      new_flags |= IR3_REG_CONST;
+   }
+
+   bool valid_swap =
+      /* can we propagate mov if we move 2nd src to first? */
+      ir3_valid_flags(instr, 0, new_flags) &&
+      /* and does first src fit in second slot? */
+      ir3_valid_flags(instr, 1, instr->srcs[1]->flags);
+
+   if (!valid_swap) {
+      /* put things back the way they were: */
+      swap(instr->srcs[0], instr->srcs[1]);
+   } /* otherwise leave things swapped */
+
+   return valid_swap;
 }
 
 /**
@@ -306,180 +312,178 @@ try_swap_mad_two_srcs(struct ir3_instruction *instr, unsigned new_flags)
  */
 static bool
 reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
-               struct ir3_register *reg, unsigned n)
+       struct ir3_register *reg, unsigned n)
 {
-       struct ir3_instruction *src = ssa(reg);
-
-       /* Values that are uniform inside a loop can become divergent outside
-        * it if the loop has a divergent trip count. This means that we can't
-        * propagate a copy of a shared to non-shared register if it would
-        * make the shared reg's live range extend outside of its loop. Users
-        * outside the loop would see the value for the thread(s) that last
-        * exited the loop, rather than for their own thread.
-        */
-       if ((src->dsts[0]->flags & IR3_REG_SHARED) &&
-               src->block->loop_id != instr->block->loop_id)
-               return false;
-
-       if (is_eligible_mov(src, instr, true)) {
-               /* simple case, no immed/const/relativ, only mov's w/ ssa src: */
-               struct ir3_register *src_reg = src->srcs[0];
-               unsigned new_flags = reg->flags;
-
-               combine_flags(&new_flags, src);
-
-               if (ir3_valid_flags(instr, n, new_flags)) {
-                       if (new_flags & IR3_REG_ARRAY) {
-                               debug_assert(!(reg->flags & IR3_REG_ARRAY));
-                               reg->array = src_reg->array;
-                       }
-                       reg->flags = new_flags;
-                       reg->def = src_reg->def;
-
-                       instr->barrier_class |= src->barrier_class;
-                       instr->barrier_conflict |= src->barrier_conflict;
-
-                       unuse(src);
-                       reg->def->instr->use_count++;
-
-                       return true;
-               }
-       } else if ((is_same_type_mov(src) || is_const_mov(src)) &&
-                       /* cannot collapse const/immed/etc into control flow: */
-                       opc_cat(instr->opc) != 0) {
-               /* immed/const/etc cases, which require some special handling: */
-               struct ir3_register *src_reg = src->srcs[0];
-               unsigned new_flags = reg->flags;
-
-               if (src_reg->flags & IR3_REG_ARRAY)
-                       return false;
-
-               combine_flags(&new_flags, src);
-
-               if (!ir3_valid_flags(instr, n, new_flags)) {
-                       /* See if lowering an immediate to const would help. */
-                       if (lower_immed(ctx, instr, n, src_reg, new_flags))
-                               return true;
-
-                       /* special case for "normal" mad instructions, we can
-                        * try swapping the first two args if that fits better.
-                        *
-                        * the "plain" MAD's (ie. the ones that don't shift first
-                        * src prior to multiply) can swap their first two srcs if
-                        * src[0] is !CONST and src[1] is CONST:
-                        */
-                       if ((n == 1) && try_swap_mad_two_srcs(instr, new_flags)) {
-                               return true;
-                       } else {
-                               return false;
-                       }
-               }
-
-               /* Here we handle the special case of mov from
-                * CONST and/or RELATIV.  These need to be handled
-                * specially, because in the case of move from CONST
-                * there is no src ir3_instruction so we need to
-                * replace the ir3_register.  And in the case of
-                * RELATIV we need to handle the address register
-                * dependency.
-                */
-               if (src_reg->flags & IR3_REG_CONST) {
-                       /* an instruction cannot reference two different
-                        * address registers:
-                        */
-                       if ((src_reg->flags & IR3_REG_RELATIV) &&
-                                       conflicts(instr->address, reg->def->instr->address))
-                               return false;
-
-                       /* This seems to be a hw bug, or something where the timings
-                        * just somehow don't work out.  This restriction may only
-                        * apply if the first src is also CONST.
-                        */
-                       if ((opc_cat(instr->opc) == 3) && (n == 2) &&
-                                       (src_reg->flags & IR3_REG_RELATIV) &&
-                                       (src_reg->array.offset == 0))
-                               return false;
-
-                       /* When narrowing constant from 32b to 16b, it seems
-                        * to work only for float. So we should do this only with
-                        * float opcodes.
-                        */
-                       if (src->cat1.dst_type == TYPE_F16) {
-                               /* TODO: should we have a way to tell phi/collect to use a
-                                * float move so that this is legal?
-                                */
-                               if (is_meta(instr))
-                                       return false;
-                               if (instr->opc == OPC_MOV && !type_float(instr->cat1.src_type))
-                                       return false;
-                               if (!is_cat2_float(instr->opc) && !is_cat3_float(instr->opc))
-                                       return false;
-                       } else if (src->cat1.dst_type == TYPE_U16) {
-                               if (is_meta(instr))
-                                       return true;
-                               /* Since we set CONSTANT_DEMOTION_ENABLE, a float reference of
-                                * what was a U16 value read from the constbuf would incorrectly
-                                * do 32f->16f conversion, when we want to read a 16f value.
-                                */
-                               if (is_cat2_float(instr->opc) || is_cat3_float(instr->opc))
-                                       return false;
-                       }
-
-                       src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-                       src_reg->flags = new_flags;
-                       instr->srcs[n] = src_reg;
-
-                       if (src_reg->flags & IR3_REG_RELATIV)
-                               ir3_instr_set_address(instr, reg->def->instr->address->def->instr);
-
-                       return true;
-               }
-
-               /* NOTE: seems we can only do immed integers, so don't
-                * need to care about float.  But we do need to handle
-                * abs/neg *before* checking that the immediate requires
-                * few enough bits to encode:
-                *
-                * TODO: do we need to do something to avoid accidentally
-                * catching a float immed?
-                */
-               if (src_reg->flags & IR3_REG_IMMED) {
-                       int32_t iim_val = src_reg->iim_val;
-
-                       debug_assert((opc_cat(instr->opc) == 1) ||
-                                       (opc_cat(instr->opc) == 6) ||
-                                       is_meta(instr) ||
-                                       ir3_cat2_int(instr->opc) ||
-                                       (is_mad(instr->opc) && (n == 0)));
-
-                       if (new_flags & IR3_REG_SABS)
-                               iim_val = abs(iim_val);
-
-                       if (new_flags & IR3_REG_SNEG)
-                               iim_val = -iim_val;
-
-                       if (new_flags & IR3_REG_BNOT)
-                               iim_val = ~iim_val;
-
-                       /* other than category 1 (mov) we can only encode up to 10 bits: */
-                       if (ir3_valid_flags(instr, n, new_flags) &&
-                                       ((instr->opc == OPC_MOV) || is_meta(instr) ||
-                                        !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff)))) {
-                               new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
-                               src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-                               src_reg->flags = new_flags;
-                               src_reg->iim_val = iim_val;
-                               instr->srcs[n] = src_reg;
-
-                               return true;
-                       } else if (lower_immed(ctx, instr, n, src_reg, new_flags)) {
-                               /* Fell back to loading the immediate as a const */
-                               return true;
-                       }
-               }
-       }
-
-       return false;
+   struct ir3_instruction *src = ssa(reg);
+
+   /* Values that are uniform inside a loop can become divergent outside
+    * it if the loop has a divergent trip count. This means that we can't
+    * propagate a copy of a shared to non-shared register if it would
+    * make the shared reg's live range extend outside of its loop. Users
+    * outside the loop would see the value for the thread(s) that last
+    * exited the loop, rather than for their own thread.
+    */
+   if ((src->dsts[0]->flags & IR3_REG_SHARED) &&
+       src->block->loop_id != instr->block->loop_id)
+      return false;
+
+   if (is_eligible_mov(src, instr, true)) {
+      /* simple case, no immed/const/relativ, only mov's w/ ssa src: */
+      struct ir3_register *src_reg = src->srcs[0];
+      unsigned new_flags = reg->flags;
+
+      combine_flags(&new_flags, src);
+
+      if (ir3_valid_flags(instr, n, new_flags)) {
+         if (new_flags & IR3_REG_ARRAY) {
+            debug_assert(!(reg->flags & IR3_REG_ARRAY));
+            reg->array = src_reg->array;
+         }
+         reg->flags = new_flags;
+         reg->def = src_reg->def;
+
+         instr->barrier_class |= src->barrier_class;
+         instr->barrier_conflict |= src->barrier_conflict;
+
+         unuse(src);
+         reg->def->instr->use_count++;
+
+         return true;
+      }
+   } else if ((is_same_type_mov(src) || is_const_mov(src)) &&
+              /* cannot collapse const/immed/etc into control flow: */
+              opc_cat(instr->opc) != 0) {
+      /* immed/const/etc cases, which require some special handling: */
+      struct ir3_register *src_reg = src->srcs[0];
+      unsigned new_flags = reg->flags;
+
+      if (src_reg->flags & IR3_REG_ARRAY)
+         return false;
+
+      combine_flags(&new_flags, src);
+
+      if (!ir3_valid_flags(instr, n, new_flags)) {
+         /* See if lowering an immediate to const would help. */
+         if (lower_immed(ctx, instr, n, src_reg, new_flags))
+            return true;
+
+         /* special case for "normal" mad instructions, we can
+          * try swapping the first two args if that fits better.
+          *
+          * the "plain" MAD's (ie. the ones that don't shift first
+          * src prior to multiply) can swap their first two srcs if
+          * src[0] is !CONST and src[1] is CONST:
+          */
+         if ((n == 1) && try_swap_mad_two_srcs(instr, new_flags)) {
+            return true;
+         } else {
+            return false;
+         }
+      }
+
+      /* Here we handle the special case of mov from
+       * CONST and/or RELATIV.  These need to be handled
+       * specially, because in the case of move from CONST
+       * there is no src ir3_instruction so we need to
+       * replace the ir3_register.  And in the case of
+       * RELATIV we need to handle the address register
+       * dependency.
+       */
+      if (src_reg->flags & IR3_REG_CONST) {
+         /* an instruction cannot reference two different
+          * address registers:
+          */
+         if ((src_reg->flags & IR3_REG_RELATIV) &&
+             conflicts(instr->address, reg->def->instr->address))
+            return false;
+
+         /* This seems to be a hw bug, or something where the timings
+          * just somehow don't work out.  This restriction may only
+          * apply if the first src is also CONST.
+          */
+         if ((opc_cat(instr->opc) == 3) && (n == 2) &&
+             (src_reg->flags & IR3_REG_RELATIV) && (src_reg->array.offset == 0))
+            return false;
+
+         /* When narrowing constant from 32b to 16b, it seems
+          * to work only for float. So we should do this only with
+          * float opcodes.
+          */
+         if (src->cat1.dst_type == TYPE_F16) {
+            /* TODO: should we have a way to tell phi/collect to use a
+             * float move so that this is legal?
+             */
+            if (is_meta(instr))
+               return false;
+            if (instr->opc == OPC_MOV && !type_float(instr->cat1.src_type))
+               return false;
+            if (!is_cat2_float(instr->opc) && !is_cat3_float(instr->opc))
+               return false;
+         } else if (src->cat1.dst_type == TYPE_U16) {
+            if (is_meta(instr))
+               return true;
+            /* Since we set CONSTANT_DEMOTION_ENABLE, a float reference of
+             * what was a U16 value read from the constbuf would incorrectly
+             * do 32f->16f conversion, when we want to read a 16f value.
+             */
+            if (is_cat2_float(instr->opc) || is_cat3_float(instr->opc))
+               return false;
+         }
+
+         src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+         src_reg->flags = new_flags;
+         instr->srcs[n] = src_reg;
+
+         if (src_reg->flags & IR3_REG_RELATIV)
+            ir3_instr_set_address(instr, reg->def->instr->address->def->instr);
+
+         return true;
+      }
+
+      /* NOTE: seems we can only do immed integers, so don't
+       * need to care about float.  But we do need to handle
+       * abs/neg *before* checking that the immediate requires
+       * few enough bits to encode:
+       *
+       * TODO: do we need to do something to avoid accidentally
+       * catching a float immed?
+       */
+      if (src_reg->flags & IR3_REG_IMMED) {
+         int32_t iim_val = src_reg->iim_val;
+
+         debug_assert((opc_cat(instr->opc) == 1) ||
+                      (opc_cat(instr->opc) == 6) || is_meta(instr) ||
+                      ir3_cat2_int(instr->opc) ||
+                      (is_mad(instr->opc) && (n == 0)));
+
+         if (new_flags & IR3_REG_SABS)
+            iim_val = abs(iim_val);
+
+         if (new_flags & IR3_REG_SNEG)
+            iim_val = -iim_val;
+
+         if (new_flags & IR3_REG_BNOT)
+            iim_val = ~iim_val;
+
+         /* other than category 1 (mov) we can only encode up to 10 bits: */
+         if (ir3_valid_flags(instr, n, new_flags) &&
+             ((instr->opc == OPC_MOV) || is_meta(instr) ||
+              !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff)))) {
+            new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
+            src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+            src_reg->flags = new_flags;
+            src_reg->iim_val = iim_val;
+            instr->srcs[n] = src_reg;
+
+            return true;
+         } else if (lower_immed(ctx, instr, n, src_reg, new_flags)) {
+            /* Fell back to loading the immediate as a const */
+            return true;
+         }
+      }
+   }
+
+   return false;
 }
 
 /* Handle special case of eliminating output mov, and similar cases where
@@ -490,16 +494,16 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
 static struct ir3_instruction *
 eliminate_output_mov(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (is_eligible_mov(instr, NULL, false)) {
-               struct ir3_register *reg = instr->srcs[0];
-               if (!(reg->flags & IR3_REG_ARRAY)) {
-                       struct ir3_instruction *src_instr = ssa(reg);
-                       debug_assert(src_instr);
-                       ctx->progress = true;
-                       return src_instr;
-               }
-       }
-       return instr;
+   if (is_eligible_mov(instr, NULL, false)) {
+      struct ir3_register *reg = instr->srcs[0];
+      if (!(reg->flags & IR3_REG_ARRAY)) {
+         struct ir3_instruction *src_instr = ssa(reg);
+         debug_assert(src_instr);
+         ctx->progress = true;
+         return src_instr;
+      }
+   }
+   return instr;
 }
 
 /**
@@ -509,172 +513,170 @@ eliminate_output_mov(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
 static void
 instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (instr->srcs_count == 0)
-               return;
-
-       if (ir3_instr_check_mark(instr))
-               return;
-
-       /* walk down the graph from each src: */
-       bool progress;
-       do {
-               progress = false;
-               foreach_src_n (reg, n, instr) {
-                       struct ir3_instruction *src = ssa(reg);
-
-                       if (!src)
-                               continue;
-
-                       instr_cp(ctx, src);
-
-                       /* TODO non-indirect access we could figure out which register
-                        * we actually want and allow cp..
-                        */
-                       if (reg->flags & IR3_REG_ARRAY)
-                               continue;
-
-                       /* Don't CP absneg into meta instructions, that won't end well: */
-                       if (is_meta(instr) && (src->opc != OPC_MOV))
-                               continue;
-
-                       /* Don't CP mova and mova1 into their users */
-                       if (writes_addr0(src) || writes_addr1(src))
-                               continue;
-
-                       progress |= reg_cp(ctx, instr, reg, n);
-                       ctx->progress |= progress;
-               }
-       } while (progress);
-
-       /* After folding a mov's source we may wind up with a type-converting mov
-        * of an immediate. This happens e.g. with texture descriptors, since we
-        * narrow the descriptor (which may be a constant) to a half-reg in ir3.
-        * By converting the immediate in-place to the destination type, we can
-        * turn the mov into a same-type mov so that it can be further propagated.
-        */
-       if (instr->opc == OPC_MOV &&
-                       (instr->srcs[0]->flags & IR3_REG_IMMED) &&
-                       instr->cat1.src_type != instr->cat1.dst_type &&
-                       /* Only do uint types for now, until we generate other types of
-                        * mov's during instruction selection.
-                        */
-                       full_type(instr->cat1.src_type) == TYPE_U32 &&
-                       full_type(instr->cat1.dst_type) == TYPE_U32) {
-               uint32_t uimm = instr->srcs[0]->uim_val;
-               if (instr->cat1.dst_type == TYPE_U16)
-                       uimm &= 0xffff;
-               instr->srcs[0]->uim_val = uimm;
-               if (instr->dsts[0]->flags & IR3_REG_HALF)
-                       instr->srcs[0]->flags |= IR3_REG_HALF;
-               else
-                       instr->srcs[0]->flags &= ~IR3_REG_HALF;
-               instr->cat1.src_type = instr->cat1.dst_type;
-               ctx->progress = true;
-       }
-
-       /* Re-write the instruction writing predicate register to get rid
-        * of the double cmps.
-        */
-       if ((instr->opc == OPC_CMPS_S) && is_foldable_double_cmp(instr)) {
-               struct ir3_instruction *cond = ssa(instr->srcs[0]);
-               switch (cond->opc) {
-               case OPC_CMPS_S:
-               case OPC_CMPS_F:
-               case OPC_CMPS_U:
-                       instr->opc   = cond->opc;
-                       instr->flags = cond->flags;
-                       instr->cat2  = cond->cat2;
-                       if (cond->address)
-                               ir3_instr_set_address(instr, cond->address->def->instr);
-                       instr->srcs[0] = ir3_reg_clone(ctx->shader, cond->srcs[0]);
-                       instr->srcs[1] = ir3_reg_clone(ctx->shader, cond->srcs[1]);
-                       instr->barrier_class |= cond->barrier_class;
-                       instr->barrier_conflict |= cond->barrier_conflict;
-                       unuse(cond);
-                       ctx->progress = true;
-                       break;
-               default:
-                       break;
-               }
-       }
-
-       /* Handle converting a sam.s2en (taking samp/tex idx params via register)
-        * into a normal sam (encoding immediate samp/tex idx) if they are
-        * immediate. This saves some instructions and regs in the common case
-        * where we know samp/tex at compile time. This needs to be done in the
-        * frontend for bindless tex, though, so don't replicate it here.
-        */
-       if (is_tex(instr) && (instr->flags & IR3_INSTR_S2EN) &&
-                       !(instr->flags & IR3_INSTR_B) &&
-                       !(ir3_shader_debug & IR3_DBG_FORCES2EN)) {
-               /* The first src will be a collect, if both of it's
-                * two sources are mov from imm, then we can
-                */
-               struct ir3_instruction *samp_tex = ssa(instr->srcs[0]);
-
-               debug_assert(samp_tex->opc == OPC_META_COLLECT);
-
-               struct ir3_register *samp = samp_tex->srcs[0];
-               struct ir3_register *tex  = samp_tex->srcs[1];
-
-               if ((samp->flags & IR3_REG_IMMED) &&
-                       (tex->flags & IR3_REG_IMMED)) {
-                       instr->flags &= ~IR3_INSTR_S2EN;
-                       instr->cat5.samp = samp->iim_val;
-                       instr->cat5.tex  = tex->iim_val;
-
-                       /* shuffle around the regs to remove the first src: */
-                       instr->srcs_count--;
-                       for (unsigned i = 0; i < instr->srcs_count; i++) {
-                               instr->srcs[i] = instr->srcs[i + 1];
-                       }
-
-                       ctx->progress = true;
-               }
-       }
+   if (instr->srcs_count == 0)
+      return;
+
+   if (ir3_instr_check_mark(instr))
+      return;
+
+   /* walk down the graph from each src: */
+   bool progress;
+   do {
+      progress = false;
+      foreach_src_n (reg, n, instr) {
+         struct ir3_instruction *src = ssa(reg);
+
+         if (!src)
+            continue;
+
+         instr_cp(ctx, src);
+
+         /* TODO: for non-indirect access we could figure out which register
+          * we actually want and allow cp..
+          */
+         if (reg->flags & IR3_REG_ARRAY)
+            continue;
+
+         /* Don't CP absneg into meta instructions, that won't end well: */
+         if (is_meta(instr) && (src->opc != OPC_MOV))
+            continue;
+
+         /* Don't CP mova and mova1 into their users */
+         if (writes_addr0(src) || writes_addr1(src))
+            continue;
+
+         progress |= reg_cp(ctx, instr, reg, n);
+         ctx->progress |= progress;
+      }
+   } while (progress);
+
+   /* After folding a mov's source we may wind up with a type-converting mov
+    * of an immediate. This happens e.g. with texture descriptors, since we
+    * narrow the descriptor (which may be a constant) to a half-reg in ir3.
+    * By converting the immediate in-place to the destination type, we can
+    * turn the mov into a same-type mov so that it can be further propagated.
+    */
+   if (instr->opc == OPC_MOV && (instr->srcs[0]->flags & IR3_REG_IMMED) &&
+       instr->cat1.src_type != instr->cat1.dst_type &&
+       /* Only do uint types for now, until we generate other types of
+        * mov's during instruction selection.
+        */
+       full_type(instr->cat1.src_type) == TYPE_U32 &&
+       full_type(instr->cat1.dst_type) == TYPE_U32) {
+      uint32_t uimm = instr->srcs[0]->uim_val;
+      if (instr->cat1.dst_type == TYPE_U16)
+         uimm &= 0xffff;
+      instr->srcs[0]->uim_val = uimm;
+      if (instr->dsts[0]->flags & IR3_REG_HALF)
+         instr->srcs[0]->flags |= IR3_REG_HALF;
+      else
+         instr->srcs[0]->flags &= ~IR3_REG_HALF;
+      instr->cat1.src_type = instr->cat1.dst_type;
+      ctx->progress = true;
+   }
+
+   /* Rewrite the instruction writing the predicate register to get rid
+    * of the double cmps.
+    */
+   if ((instr->opc == OPC_CMPS_S) && is_foldable_double_cmp(instr)) {
+      struct ir3_instruction *cond = ssa(instr->srcs[0]);
+      switch (cond->opc) {
+      case OPC_CMPS_S:
+      case OPC_CMPS_F:
+      case OPC_CMPS_U:
+         instr->opc = cond->opc;
+         instr->flags = cond->flags;
+         instr->cat2 = cond->cat2;
+         if (cond->address)
+            ir3_instr_set_address(instr, cond->address->def->instr);
+         instr->srcs[0] = ir3_reg_clone(ctx->shader, cond->srcs[0]);
+         instr->srcs[1] = ir3_reg_clone(ctx->shader, cond->srcs[1]);
+         instr->barrier_class |= cond->barrier_class;
+         instr->barrier_conflict |= cond->barrier_conflict;
+         unuse(cond);
+         ctx->progress = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* Handle converting a sam.s2en (taking samp/tex idx params via register)
+    * into a normal sam (encoding immediate samp/tex idx) if they are
+    * immediate. This saves some instructions and regs in the common case
+    * where we know samp/tex at compile time. This needs to be done in the
+    * frontend for bindless tex, though, so don't replicate it here.
+    */
+   if (is_tex(instr) && (instr->flags & IR3_INSTR_S2EN) &&
+       !(instr->flags & IR3_INSTR_B) &&
+       !(ir3_shader_debug & IR3_DBG_FORCES2EN)) {
+      /* The first src will be a collect; if both of its
+       * two sources are movs from immediates, we can fold them into
+       * immediate samp/tex indices and drop the collect:
+       */
+      struct ir3_instruction *samp_tex = ssa(instr->srcs[0]);
+
+      debug_assert(samp_tex->opc == OPC_META_COLLECT);
+
+      struct ir3_register *samp = samp_tex->srcs[0];
+      struct ir3_register *tex = samp_tex->srcs[1];
+
+      if ((samp->flags & IR3_REG_IMMED) && (tex->flags & IR3_REG_IMMED)) {
+         instr->flags &= ~IR3_INSTR_S2EN;
+         instr->cat5.samp = samp->iim_val;
+         instr->cat5.tex = tex->iim_val;
+
+         /* shuffle around the regs to remove the first src: */
+         instr->srcs_count--;
+         for (unsigned i = 0; i < instr->srcs_count; i++) {
+            instr->srcs[i] = instr->srcs[i + 1];
+         }
+
+         ctx->progress = true;
+      }
+   }
 }
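
As a side note on the immediate rewrite inside instr_cp() above, here is a rough
standalone sketch of the narrowing step. It is not part of this change; the helper
name and its bool parameter are invented for illustration, and it only mirrors the
masking that turns a u32-to-u16 converting mov of an immediate into a same-type mov.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical distillation of the in-place immediate conversion: the value
 * the immediate should hold once the mov has been rewritten with matching
 * src/dst types.
 */
static uint32_t
narrow_uint_immed(uint32_t uimm, bool dst_is_u16)
{
   if (dst_is_u16)
      uimm &= 0xffff; /* keep only the low 16 bits, as in the TYPE_U16 case */
   return uimm;
}

int
main(void)
{
   /* a wide immediate feeding a u32->u16 mov keeps only its low half */
   assert(narrow_uint_immed(0x12345678, true) == 0x5678);
   assert(narrow_uint_immed(0x12345678, false) == 0x12345678);
   return 0;
}
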
 
 bool
 ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
 {
-       struct ir3_cp_ctx ctx = {
-                       .shader = ir,
-                       .so = so,
-       };
-
-       /* This is a bit annoying, and probably wouldn't be necessary if we
-        * tracked a reverse link from producing instruction to consumer.
-        * But we need to know when we've eliminated the last consumer of
-        * a mov, so we need to do a pass to first count consumers of a
-        * mov.
-        */
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-
-                       /* by the way, we don't account for false-dep's, so the CP
-                        * pass should always happen before false-dep's are inserted
-                        */
-                       debug_assert(instr->deps_count == 0);
-
-                       foreach_ssa_src (src, instr) {
-                               src->use_count++;
-                       }
-               }
-       }
-
-       ir3_clear_mark(ir);
-
-       foreach_block (block, &ir->block_list) {
-               if (block->condition) {
-                       instr_cp(&ctx, block->condition);
-                       block->condition = eliminate_output_mov(&ctx, block->condition);
-               }
-
-               for (unsigned i = 0; i < block->keeps_count; i++) {
-                       instr_cp(&ctx, block->keeps[i]);
-                       block->keeps[i] = eliminate_output_mov(&ctx, block->keeps[i]);
-               }
-       }
-
-       return ctx.progress;
+   struct ir3_cp_ctx ctx = {
+      .shader = ir,
+      .so = so,
+   };
+
+   /* This is a bit annoying, and probably wouldn't be necessary if we
+    * tracked a reverse link from producing instruction to consumer.
+    * But we need to know when we've eliminated the last consumer of
+    * a mov, so we need to do a pass to first count consumers of a
+    * mov.
+    */
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+
+         /* by the way, we don't account for false-dep's, so the CP
+          * pass should always happen before false-dep's are inserted
+          */
+         debug_assert(instr->deps_count == 0);
+
+         foreach_ssa_src (src, instr) {
+            src->use_count++;
+         }
+      }
+   }
+
+   ir3_clear_mark(ir);
+
+   foreach_block (block, &ir->block_list) {
+      if (block->condition) {
+         instr_cp(&ctx, block->condition);
+         block->condition = eliminate_output_mov(&ctx, block->condition);
+      }
+
+      for (unsigned i = 0; i < block->keeps_count; i++) {
+         instr_cp(&ctx, block->keeps[i]);
+         block->keeps[i] = eliminate_output_mov(&ctx, block->keeps[i]);
+      }
+   }
+
+   return ctx.progress;
 }
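
To make the sam.s2en conversion at the end of instr_cp() above easier to follow,
here is a small sketch against made-up toy structures. None of these types or the
helper exist in ir3; they only restate the logic: when both the sampler and texture
indices feeding the collect are immediates, they are baked into the instruction and
the first source is dropped by shifting the rest down.

#include <stdbool.h>
#include <stdint.h>

/* Toy stand-ins for the real ir3 structures, for illustration only. */
struct toy_src {
   bool is_immed;
   uint32_t imm;
};

struct toy_sam {
   bool s2en;          /* samp/tex indices arrive via register */
   uint32_t samp, tex; /* immediate indices once s2en is dropped */
   struct toy_src srcs[4];
   unsigned srcs_count;
};

/* Fold immediate samp/tex indices into the instruction and remove the
 * now-unneeded first source, mirroring the shuffle in the pass above.
 */
bool
fold_immediate_samp_tex(struct toy_sam *sam, struct toy_src samp,
                        struct toy_src tex)
{
   if (!sam->s2en || !samp.is_immed || !tex.is_immed)
      return false;

   sam->s2en = false;
   sam->samp = samp.imm;
   sam->tex = tex.imm;

   /* drop srcs[0] by shifting the remaining sources down one slot */
   sam->srcs_count--;
   for (unsigned i = 0; i < sam->srcs_count; i++)
      sam->srcs[i] = sam->srcs[i + 1];

   return true;
}
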
index 89efd34..967b50f 100644 (file)
@@ -36,7 +36,6 @@
  * one.  It is basically anything that is not SSA.
  */
 
-
 /**
  * Check if any instruction before `use` and after `src` writes to the
  * specified array.  If `offset` is negative, it is a relative (a0.x)
  * the correct array write.
  */
 static bool
-has_conflicting_write(struct ir3_instruction *src,
-               struct ir3_instruction *use,
-               struct ir3_register **def,
-               unsigned id, int offset)
+has_conflicting_write(struct ir3_instruction *src, struct ir3_instruction *use,
+                      struct ir3_register **def, unsigned id, int offset)
 {
-       assert(src->block == use->block);
-       bool last_write = true;
-
-       /* NOTE that since src and use are in the same block, src by
-        * definition appears in the block's instr_list before use:
-        */
-       foreach_instr_rev (instr, &use->node) {
-               if (instr == src)
-                       break;
-
-               /* if we are looking at a RELATIV read, we can't move
-                * it past an a0.x write:
-                */
-               if ((offset < 0) && (dest_regs(instr) > 0) &&
-                               (instr->dsts[0]->num == regid(REG_A0, 0)))
-                       return true;
-
-               if (!writes_gpr(instr))
-                       continue;
-
-               struct ir3_register *dst = instr->dsts[0];
-               if (!(dst->flags & IR3_REG_ARRAY))
-                       continue;
-
-               if (dst->array.id != id)
-                       continue;
-
-               /*
-                * At this point, we have narrowed down an instruction
-                * that writes to the same array.. check if it the write
-                * is to an array element that we care about:
-                */
-
-               /* is write to an unknown array element? */
-               if (dst->flags & IR3_REG_RELATIV)
-                       return true;
-
-               /* is read from an unknown array element? */
-               if (offset < 0)
-                       return true;
-
-               /* is write to same array element? */
-               if (dst->array.offset == offset)
-                       return true;
-
-               if (last_write)
-                       *def = dst;
-
-               last_write = false;
-       }
-
-       return false;
+   assert(src->block == use->block);
+   bool last_write = true;
+
+   /* NOTE that since src and use are in the same block, src by
+    * definition appears in the block's instr_list before use:
+    */
+   foreach_instr_rev (instr, &use->node) {
+      if (instr == src)
+         break;
+
+      /* if we are looking at a RELATIV read, we can't move
+       * it past an a0.x write:
+       */
+      if ((offset < 0) && (dest_regs(instr) > 0) &&
+          (instr->dsts[0]->num == regid(REG_A0, 0)))
+         return true;
+
+      if (!writes_gpr(instr))
+         continue;
+
+      struct ir3_register *dst = instr->dsts[0];
+      if (!(dst->flags & IR3_REG_ARRAY))
+         continue;
+
+      if (dst->array.id != id)
+         continue;
+
+      /*
+       * At this point, we have narrowed down an instruction
+       * that writes to the same array.. check if the write
+       * is to an array element that we care about:
+       */
+
+      /* is write to an unknown array element? */
+      if (dst->flags & IR3_REG_RELATIV)
+         return true;
+
+      /* is read from an unknown array element? */
+      if (offset < 0)
+         return true;
+
+      /* is write to same array element? */
+      if (dst->array.offset == offset)
+         return true;
+
+      if (last_write)
+         *def = dst;
+
+      last_write = false;
+   }
+
+   return false;
 }
 
 /* Can we fold the mov src into use without invalid flags? */
 static bool
 valid_flags(struct ir3_instruction *use, struct ir3_instruction *mov)
 {
-       struct ir3_register *src = mov->srcs[0];
+   struct ir3_register *src = mov->srcs[0];
 
-       foreach_src_n (reg, n, use) {
-               if (ssa(reg) != mov)
-                       continue;
+   foreach_src_n (reg, n, use) {
+      if (ssa(reg) != mov)
+         continue;
 
-               if (!ir3_valid_flags(use, n, reg->flags | src->flags))
-                       return false;
-       }
+      if (!ir3_valid_flags(use, n, reg->flags | src->flags))
+         return false;
+   }
 
-       return true;
+   return true;
 }
 
 static bool
 instr_cp_postsched(struct ir3_instruction *mov)
 {
-       struct ir3_register *src = mov->srcs[0];
-
-       /* only consider mov's from "arrays", other cases we have
-        * already considered already:
-        */
-       if (!(src->flags & IR3_REG_ARRAY))
-               return false;
-
-       int offset = (src->flags & IR3_REG_RELATIV) ? -1 : src->array.offset;
-
-       /* Once we move the array read directly into the consuming
-        * instruction(s), we will also need to update instructions
-        * that had a false-dep on the original mov to have deps
-        * on the consuming instructions:
-        */
-       struct util_dynarray newdeps;
-       util_dynarray_init(&newdeps, mov->uses);
-
-       foreach_ssa_use (use, mov) {
-               if (use->block != mov->block)
-                       continue;
-
-               if (is_meta(use))
-                       continue;
-
-               struct ir3_register *def = src->def;
-               if (has_conflicting_write(mov, use, &def, src->array.id, offset))
-                       continue;
-
-               if (conflicts(mov->address, use->address))
-                       continue;
-
-               if (!valid_flags(use, mov))
-                       continue;
-
-               /* Ok, we've established that it is safe to remove this copy: */
-
-               bool removed = false;
-               foreach_src_n (reg, n, use) {
-                       if (ssa(reg) != mov)
-                               continue;
-
-                       use->srcs[n] = ir3_reg_clone(mov->block->shader, src);
-
-                       /* preserve (abs)/etc modifiers: */
-                       use->srcs[n]-> flags |= reg->flags;
-
-                       /* If we're sinking the array read past any writes, make
-                        * sure to update it to point to the new previous write:
-                        */
-                       use->srcs[n]->def = def;
-
-                       removed = true;
-               }
-
-               /* the use could have been only a false-dep, only add to the newdeps
-                * array and update the address if we've actually updated a real src
-                * reg for the use:
-                */
-               if (removed) {
-                       if (src->flags & IR3_REG_RELATIV)
-                               ir3_instr_set_address(use, mov->address->def->instr);
-
-                       util_dynarray_append(&newdeps, struct ir3_instruction *, use);
-
-                       /* Remove the use from the src instruction: */
-                       _mesa_set_remove_key(mov->uses, use);
-               }
-       }
-
-       /* Once we have the complete set of instruction(s) that are are now
-        * directly reading from the array, update any false-dep uses to
-        * now depend on these instructions.  The only remaining uses at
-        * this point should be false-deps:
-        */
-       foreach_ssa_use (use, mov) {
-               util_dynarray_foreach(&newdeps, struct ir3_instruction *, instrp) {
-                       struct ir3_instruction *newdep = *instrp;
-                       ir3_instr_add_dep(use, newdep);
-               }
-       }
-
-       return util_dynarray_num_elements(&newdeps, struct ir3_instruction **) > 0;
+   struct ir3_register *src = mov->srcs[0];
+
+   /* only consider mov's from "arrays"; other cases we have
+    * already considered:
+    */
+   if (!(src->flags & IR3_REG_ARRAY))
+      return false;
+
+   int offset = (src->flags & IR3_REG_RELATIV) ? -1 : src->array.offset;
+
+   /* Once we move the array read directly into the consuming
+    * instruction(s), we will also need to update instructions
+    * that had a false-dep on the original mov to have deps
+    * on the consuming instructions:
+    */
+   struct util_dynarray newdeps;
+   util_dynarray_init(&newdeps, mov->uses);
+
+   foreach_ssa_use (use, mov) {
+      if (use->block != mov->block)
+         continue;
+
+      if (is_meta(use))
+         continue;
+
+      struct ir3_register *def = src->def;
+      if (has_conflicting_write(mov, use, &def, src->array.id, offset))
+         continue;
+
+      if (conflicts(mov->address, use->address))
+         continue;
+
+      if (!valid_flags(use, mov))
+         continue;
+
+      /* Ok, we've established that it is safe to remove this copy: */
+
+      bool removed = false;
+      foreach_src_n (reg, n, use) {
+         if (ssa(reg) != mov)
+            continue;
+
+         use->srcs[n] = ir3_reg_clone(mov->block->shader, src);
+
+         /* preserve (abs)/etc modifiers: */
+         use->srcs[n]->flags |= reg->flags;
+
+         /* If we're sinking the array read past any writes, make
+          * sure to update it to point to the new previous write:
+          */
+         use->srcs[n]->def = def;
+
+         removed = true;
+      }
+
+      /* the use could have been only a false-dep, only add to the newdeps
+       * array and update the address if we've actually updated a real src
+       * reg for the use:
+       */
+      if (removed) {
+         if (src->flags & IR3_REG_RELATIV)
+            ir3_instr_set_address(use, mov->address->def->instr);
+
+         util_dynarray_append(&newdeps, struct ir3_instruction *, use);
+
+         /* Remove the use from the src instruction: */
+         _mesa_set_remove_key(mov->uses, use);
+      }
+   }
+
+   /* Once we have the complete set of instruction(s) that are now
+    * directly reading from the array, update any false-dep uses to
+    * now depend on these instructions.  The only remaining uses at
+    * this point should be false-deps:
+    */
+   foreach_ssa_use (use, mov) {
+      util_dynarray_foreach (&newdeps, struct ir3_instruction *, instrp) {
+         struct ir3_instruction *newdep = *instrp;
+         ir3_instr_add_dep(use, newdep);
+      }
+   }
+
+   return util_dynarray_num_elements(&newdeps, struct ir3_instruction **) > 0;
 }
 
 bool
 ir3_cp_postsched(struct ir3 *ir)
 {
-       void *mem_ctx = ralloc_context(NULL);
-       bool progress = false;
+   void *mem_ctx = ralloc_context(NULL);
+   bool progress = false;
 
-       ir3_find_ssa_uses(ir, mem_ctx, false);
+   ir3_find_ssa_uses(ir, mem_ctx, false);
 
-       foreach_block (block, &ir->block_list) {
-               foreach_instr_safe (instr, &block->instr_list) {
-                       if (is_same_type_mov(instr))
-                               progress |= instr_cp_postsched(instr);
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (is_same_type_mov(instr))
+            progress |= instr_cp_postsched(instr);
+      }
+   }
 
-       ralloc_free(mem_ctx);
+   ralloc_free(mem_ctx);
 
-       return progress;
+   return progress;
 }
index 6de040b..712730f 100644 (file)
 static uint32_t
 hash_instr(const void *data)
 {
-       const struct ir3_instruction *instr = data;
-       uint32_t hash = 0;
-
-       hash = HASH(hash, instr->opc);
-       hash = HASH(hash, instr->dsts[0]->flags);
-       foreach_src (src, (struct ir3_instruction *) instr) {
-               if (src->flags & IR3_REG_CONST)
-                       hash = HASH(hash, src->num);
-               else if (src->flags & IR3_REG_IMMED)
-                       hash = HASH(hash, src->uim_val);
-               else
-                       hash = HASH(hash, src->def);
-       }
-
-       return hash;
+   const struct ir3_instruction *instr = data;
+   uint32_t hash = 0;
+
+   hash = HASH(hash, instr->opc);
+   hash = HASH(hash, instr->dsts[0]->flags);
+   foreach_src (src, (struct ir3_instruction *)instr) {
+      if (src->flags & IR3_REG_CONST)
+         hash = HASH(hash, src->num);
+      else if (src->flags & IR3_REG_IMMED)
+         hash = HASH(hash, src->uim_val);
+      else
+         hash = HASH(hash, src->def);
+   }
+
+   return hash;
 }
 
 static bool
 instrs_equal(const struct ir3_instruction *i1, const struct ir3_instruction *i2)
 {
-       if (i1->opc != i2->opc)
-               return false;
+   if (i1->opc != i2->opc)
+      return false;
 
-       if (i1->dsts_count != i2->dsts_count)
-               return false;
+   if (i1->dsts_count != i2->dsts_count)
+      return false;
 
-       if (i1->srcs_count != i2->srcs_count)
-               return false;
+   if (i1->srcs_count != i2->srcs_count)
+      return false;
 
-       if (i1->dsts[0]->flags != i2->dsts[0]->flags)
-               return false;
+   if (i1->dsts[0]->flags != i2->dsts[0]->flags)
+      return false;
 
-       for (unsigned i = 0; i < i1->srcs_count; i++) {
-               const struct ir3_register *i1_reg = i1->srcs[i], *i2_reg = i2->srcs[i];
+   for (unsigned i = 0; i < i1->srcs_count; i++) {
+      const struct ir3_register *i1_reg = i1->srcs[i], *i2_reg = i2->srcs[i];
 
-               if (i1_reg->flags != i2_reg->flags)
-                       return false;
+      if (i1_reg->flags != i2_reg->flags)
+         return false;
 
-               if (i1_reg->flags & IR3_REG_CONST) {
-                       if (i1_reg->num != i2_reg->num)
-                               return false;
-               } else if (i1_reg->flags & IR3_REG_IMMED) {
-                       if (i1_reg->uim_val != i2_reg->uim_val)
-                               return false;
-               } else {
-                       if (i1_reg->def != i2_reg->def)
-                               return false;
-               }
-       }
+      if (i1_reg->flags & IR3_REG_CONST) {
+         if (i1_reg->num != i2_reg->num)
+            return false;
+      } else if (i1_reg->flags & IR3_REG_IMMED) {
+         if (i1_reg->uim_val != i2_reg->uim_val)
+            return false;
+      } else {
+         if (i1_reg->def != i2_reg->def)
+            return false;
+      }
+   }
 
-       return true;
+   return true;
 }
 
 static bool
 instr_can_cse(const struct ir3_instruction *instr)
 {
-       if (instr->opc != OPC_META_COLLECT)
-               return false;
+   if (instr->opc != OPC_META_COLLECT)
+      return false;
 
-       return true;
+   return true;
 }
 
 static bool
 cmp_func(const void *data1, const void *data2)
 {
-       return instrs_equal(data1, data2);
+   return instrs_equal(data1, data2);
 }
 
 bool
 ir3_cse(struct ir3 *ir)
 {
-       struct set *instr_set = _mesa_set_create(NULL, hash_instr, cmp_func);
-       foreach_block (block, &ir->block_list) {
-               _mesa_set_clear(instr_set, NULL);
-
-               foreach_instr (instr, &block->instr_list) {
-                       instr->data = NULL;
-
-                       if (!instr_can_cse(instr))
-                               continue;
-
-                       bool found;
-                       struct set_entry *entry =
-                               _mesa_set_search_or_add(instr_set, instr, &found);
-                       if (found)
-                               instr->data = (void *) entry->key;
-               }
-       }
-
-
-       bool progress = false;
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       foreach_src(src, instr) {
-                               if ((src->flags & IR3_REG_SSA) &&
-                                       src->def &&
-                                       src->def->instr->data) {
-                                       progress = true;
-                                       struct ir3_instruction *instr = src->def->instr->data;
-                                       src->def = instr->dsts[0];
-                               }
-                       }
-               }
-       }
-
-       _mesa_set_destroy(instr_set, NULL);
-       return progress;
+   struct set *instr_set = _mesa_set_create(NULL, hash_instr, cmp_func);
+   foreach_block (block, &ir->block_list) {
+      _mesa_set_clear(instr_set, NULL);
+
+      foreach_instr (instr, &block->instr_list) {
+         instr->data = NULL;
+
+         if (!instr_can_cse(instr))
+            continue;
+
+         bool found;
+         struct set_entry *entry =
+            _mesa_set_search_or_add(instr_set, instr, &found);
+         if (found)
+            instr->data = (void *)entry->key;
+      }
+   }
+
+   bool progress = false;
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         foreach_src (src, instr) {
+            if ((src->flags & IR3_REG_SSA) && src->def &&
+                src->def->instr->data) {
+               progress = true;
+               struct ir3_instruction *instr = src->def->instr->data;
+               src->def = instr->dsts[0];
+            }
+         }
+      }
+   }
+
+   _mesa_set_destroy(instr_set, NULL);
+   return progress;
 }
-
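
The CSE key built by hash_instr() above folds the opcode, the destination flags,
and each source (const register number, immediate value, or SSA def pointer) into
one running hash. The HASH() macro itself is not shown in this hunk, so the combine
step below is only an illustrative stand-in using an FNV-1a style mix, not the real
macro.

#include <stddef.h>
#include <stdint.h>

/* Illustrative hash-combine only; the real HASH() macro in ir3_cse.c may
 * mix bits differently.
 */
uint32_t
hash_combine(uint32_t hash, const void *data, size_t size)
{
   const uint8_t *bytes = data;
   for (size_t i = 0; i < size; i++)
      hash = (hash ^ bytes[i]) * 0x01000193u; /* FNV-1a prime */
   return hash;
}

/* usage mirroring hash_instr(): start from 0, then e.g.
 *    hash = hash_combine(hash, &opc, sizeof(opc));
 *    hash = hash_combine(hash, &dst_flags, sizeof(dst_flags));
 * and one call per source field.
 */
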
index 76aaebd..76298e6 100644 (file)
 static void
 mark_array_use(struct ir3_instruction *instr, struct ir3_register *reg)
 {
-       if (reg->flags & IR3_REG_ARRAY) {
-               struct ir3_array *arr =
-                       ir3_lookup_array(instr->block->shader, reg->array.id);
-               arr->unused = false;
-       }
+   if (reg->flags & IR3_REG_ARRAY) {
+      struct ir3_array *arr =
+         ir3_lookup_array(instr->block->shader, reg->array.id);
+      arr->unused = false;
+   }
 }
 
 static void
 instr_dce(struct ir3_instruction *instr, bool falsedep)
 {
-       /* don't mark falsedep's as used, but otherwise process them normally: */
-       if (!falsedep)
-               instr->flags &= ~IR3_INSTR_UNUSED;
+   /* don't mark falsedep's as used, but otherwise process them normally: */
+   if (!falsedep)
+      instr->flags &= ~IR3_INSTR_UNUSED;
 
-       if (ir3_instr_check_mark(instr))
-               return;
+   if (ir3_instr_check_mark(instr))
+      return;
 
-       if (writes_gpr(instr))
-               mark_array_use(instr, instr->dsts[0]);   /* dst */
+   if (writes_gpr(instr))
+      mark_array_use(instr, instr->dsts[0]); /* dst */
 
-       foreach_src (reg, instr)
-               mark_array_use(instr, reg);              /* src */
+   foreach_src (reg, instr)
+      mark_array_use(instr, reg); /* src */
 
-       foreach_ssa_src_n (src, i, instr) {
-               instr_dce(src, __is_false_dep(instr, i));
-       }
+   foreach_ssa_src_n (src, i, instr) {
+      instr_dce(src, __is_false_dep(instr, i));
+   }
 }
 
 static bool
 remove_unused_by_block(struct ir3_block *block)
 {
-       bool progress = false;
-       foreach_instr_safe (instr, &block->instr_list) {
-               if (instr->opc == OPC_END || instr->opc == OPC_CHSH || instr->opc == OPC_CHMASK)
-                       continue;
-               if (instr->flags & IR3_INSTR_UNUSED) {
-                       if (instr->opc == OPC_META_SPLIT) {
-                               struct ir3_instruction *src = ssa(instr->srcs[0]);
-                               /* tex (cat5) instructions have a writemask, so we can
-                                * mask off unused components.  Other instructions do not.
-                                */
-                               if (src && is_tex_or_prefetch(src) && (src->dsts[0]->wrmask > 1)) {
-                                       src->dsts[0]->wrmask &= ~(1 << instr->split.off);
-                               }
-                       }
-
-                       /* prune false-deps, etc: */
-                       foreach_ssa_use (use, instr)
-                               foreach_ssa_srcp_n (srcp, n, use)
-                                       if (*srcp == instr)
-                                               *srcp = NULL;
-
-                       list_delinit(&instr->node);
-                       progress = true;
-               }
-       }
-       return progress;
+   bool progress = false;
+   foreach_instr_safe (instr, &block->instr_list) {
+      if (instr->opc == OPC_END || instr->opc == OPC_CHSH ||
+          instr->opc == OPC_CHMASK)
+         continue;
+      if (instr->flags & IR3_INSTR_UNUSED) {
+         if (instr->opc == OPC_META_SPLIT) {
+            struct ir3_instruction *src = ssa(instr->srcs[0]);
+            /* tex (cat5) instructions have a writemask, so we can
+             * mask off unused components.  Other instructions do not.
+             */
+            if (src && is_tex_or_prefetch(src) && (src->dsts[0]->wrmask > 1)) {
+               src->dsts[0]->wrmask &= ~(1 << instr->split.off);
+            }
+         }
+
+         /* prune false-deps, etc: */
+         foreach_ssa_use (use, instr)
+            foreach_ssa_srcp_n (srcp, n, use)
+               if (*srcp == instr)
+                  *srcp = NULL;
+
+         list_delinit(&instr->node);
+         progress = true;
+      }
+   }
+   return progress;
 }
 
 static bool
 find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
 {
-       unsigned i;
-       bool progress = false;
-
-       ir3_clear_mark(ir);
-
-       /* initially mark everything as unused, we'll clear the flag as we
-        * visit the instructions:
-        */
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       /* special case, if pre-fs texture fetch used, we cannot
-                        * eliminate the barycentric i/j input
-                        */
-                       if (so->num_sampler_prefetch &&
-                                       (instr->opc == OPC_META_INPUT) &&
-                                       (instr->input.sysval == SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL))
-                               continue;
-                       instr->flags |= IR3_INSTR_UNUSED;
-               }
-       }
-
-       foreach_array (arr, &ir->array_list)
-               arr->unused = true;
-
-       foreach_block (block, &ir->block_list) {
-               for (i = 0; i < block->keeps_count; i++)
-                       instr_dce(block->keeps[i], false);
-
-               /* We also need to account for if-condition: */
-               if (block->condition)
-                       instr_dce(block->condition, false);
-       }
-
-       /* remove un-used instructions: */
-       foreach_block (block, &ir->block_list) {
-               progress |= remove_unused_by_block(block);
-       }
-
-       /* remove un-used arrays: */
-       foreach_array_safe (arr, &ir->array_list) {
-               if (arr->unused)
-                       list_delinit(&arr->node);
-       }
-
-       /* fixup wrmask of split instructions to account for adjusted tex
-        * wrmask's:
-        */
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (instr->opc != OPC_META_SPLIT)
-                               continue;
-
-                       struct ir3_instruction *src = ssa(instr->srcs[0]);
-                       if (!is_tex_or_prefetch(src))
-                               continue;
-
-                       instr->srcs[0]->wrmask = src->dsts[0]->wrmask;
-               }
-       }
-
-       for (i = 0; i < ir->a0_users_count; i++) {
-               struct ir3_instruction *instr = ir->a0_users[i];
-               if (instr && (instr->flags & IR3_INSTR_UNUSED))
-                       ir->a0_users[i] = NULL;
-       }
-
-       for (i = 0; i < ir->a1_users_count; i++) {
-               struct ir3_instruction *instr = ir->a1_users[i];
-               if (instr && (instr->flags & IR3_INSTR_UNUSED))
-                       ir->a1_users[i] = NULL;
-       }
-
-       for (i = 0; i < ir->predicates_count; i++) {
-               struct ir3_instruction *instr = ir->predicates[i];
-               if (instr && (instr->flags & IR3_INSTR_UNUSED))
-                       ir->predicates[i] = NULL;
-       }
-
-       /* cleanup unused inputs: */
-       foreach_input_n (in, n, ir)
-               if (in->flags & IR3_INSTR_UNUSED)
-                       ir->inputs[n] = NULL;
-
-       return progress;
+   unsigned i;
+   bool progress = false;
+
+   ir3_clear_mark(ir);
+
+   /* initially mark everything as unused, we'll clear the flag as we
+    * visit the instructions:
+    */
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         /* special case: if a pre-fs texture fetch is used, we cannot
+          * eliminate the barycentric i/j input
+          */
+         if (so->num_sampler_prefetch && (instr->opc == OPC_META_INPUT) &&
+             (instr->input.sysval == SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL))
+            continue;
+         instr->flags |= IR3_INSTR_UNUSED;
+      }
+   }
+
+   foreach_array (arr, &ir->array_list)
+      arr->unused = true;
+
+   foreach_block (block, &ir->block_list) {
+      for (i = 0; i < block->keeps_count; i++)
+         instr_dce(block->keeps[i], false);
+
+      /* We also need to account for if-condition: */
+      if (block->condition)
+         instr_dce(block->condition, false);
+   }
+
+   /* remove un-used instructions: */
+   foreach_block (block, &ir->block_list) {
+      progress |= remove_unused_by_block(block);
+   }
+
+   /* remove un-used arrays: */
+   foreach_array_safe (arr, &ir->array_list) {
+      if (arr->unused)
+         list_delinit(&arr->node);
+   }
+
+   /* fixup wrmask of split instructions to account for adjusted tex
+    * wrmask's:
+    */
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc != OPC_META_SPLIT)
+            continue;
+
+         struct ir3_instruction *src = ssa(instr->srcs[0]);
+         if (!is_tex_or_prefetch(src))
+            continue;
+
+         instr->srcs[0]->wrmask = src->dsts[0]->wrmask;
+      }
+   }
+
+   for (i = 0; i < ir->a0_users_count; i++) {
+      struct ir3_instruction *instr = ir->a0_users[i];
+      if (instr && (instr->flags & IR3_INSTR_UNUSED))
+         ir->a0_users[i] = NULL;
+   }
+
+   for (i = 0; i < ir->a1_users_count; i++) {
+      struct ir3_instruction *instr = ir->a1_users[i];
+      if (instr && (instr->flags & IR3_INSTR_UNUSED))
+         ir->a1_users[i] = NULL;
+   }
+
+   for (i = 0; i < ir->predicates_count; i++) {
+      struct ir3_instruction *instr = ir->predicates[i];
+      if (instr && (instr->flags & IR3_INSTR_UNUSED))
+         ir->predicates[i] = NULL;
+   }
+
+   /* cleanup unused inputs: */
+   foreach_input_n (in, n, ir)
+      if (in->flags & IR3_INSTR_UNUSED)
+         ir->inputs[n] = NULL;
+
+   return progress;
 }
 
 bool
 ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so)
 {
-       void *mem_ctx = ralloc_context(NULL);
-       bool progress, made_progress = false;
+   void *mem_ctx = ralloc_context(NULL);
+   bool progress, made_progress = false;
 
-       ir3_find_ssa_uses(ir, mem_ctx, true);
+   ir3_find_ssa_uses(ir, mem_ctx, true);
 
-       do {
-               progress = find_and_remove_unused(ir, so);
-               made_progress |= progress;
-       } while (progress);
+   do {
+      progress = find_and_remove_unused(ir, so);
+      made_progress |= progress;
+   } while (progress);
 
-       ralloc_free(mem_ctx);
+   ralloc_free(mem_ctx);
 
-       return made_progress;
+   return made_progress;
 }
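
One detail of the DCE pass above that benefits from a concrete example is the
writemask trimming: when an unused split of a tex result is deleted, the matching
bit of the producing instruction's wrmask is cleared, and surviving splits later
copy the adjusted mask back. A one-line restatement with a worked example; the
helper name is invented.

#include <assert.h>
#include <stdint.h>

/* Clear the component selected by a dead split from a tex writemask. */
static uint32_t
trim_component(uint32_t wrmask, unsigned split_off)
{
   return wrmask & ~(1u << split_off);
}

int
main(void)
{
   /* a .xyzw result whose .z component is never read keeps only .xy and .w */
   assert(trim_component(0xf, 2) == 0xb);
   return 0;
}
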
index 08fbf27..d06347f 100644 (file)
  */
 int
 ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n, bool soft)
+               struct ir3_instruction *consumer, unsigned n, bool soft)
 {
-       /* generally don't count false dependencies, since this can just be
-        * something like a barrier, or SSBO store.
-        */
-       if (__is_false_dep(consumer, n))
-               return 0;
-
-       /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
-        * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
-        * handled with sync bits
-        */
-
-       if (is_meta(assigner) || is_meta(consumer))
-               return 0;
-
-       if (writes_addr0(assigner) || writes_addr1(assigner))
-               return 6;
-
-       if (soft && is_sfu(assigner))
-               return SOFT_SS_NOPS;
-
-       /* handled via sync flags: */
-       if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
-               return 0;
-
-       /* As far as we know, shader outputs don't need any delay. */
-       if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
-               return 0;
-
-       /* assigner must be alu: */
-       if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
-                       is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
-               return 6;
-       } else {
-               /* In mergedregs mode, there is an extra 2-cycle penalty when half of
-                * a full-reg is read as a half-reg or when a half-reg is read as a
-                * full-reg.
-                */
-               bool mismatched_half =
-                       (assigner->dsts[0]->flags & IR3_REG_HALF) !=
-                       (consumer->srcs[n]->flags & IR3_REG_HALF);
-               unsigned penalty = mismatched_half ? 2 : 0;
-               if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
-                       (n == 2)) {
-                       /* special case, 3rd src to cat3 not required on first cycle */
-                       return 1 + penalty;
-               } else {
-                       return 3 + penalty;
-               }
-       }
+   /* generally don't count false dependencies, since this can just be
+    * something like a barrier, or SSBO store.
+    */
+   if (__is_false_dep(consumer, n))
+      return 0;
+
+   /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+    * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
+    * handled with sync bits
+    */
+
+   if (is_meta(assigner) || is_meta(consumer))
+      return 0;
+
+   if (writes_addr0(assigner) || writes_addr1(assigner))
+      return 6;
+
+   if (soft && is_sfu(assigner))
+      return SOFT_SS_NOPS;
+
+   /* handled via sync flags: */
+   if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+      return 0;
+
+   /* As far as we know, shader outputs don't need any delay. */
+   if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
+      return 0;
+
+   /* assigner must be alu: */
+   if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
+       is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
+      return 6;
+   } else {
+      /* In mergedregs mode, there is an extra 2-cycle penalty when half of
+       * a full-reg is read as a half-reg or when a half-reg is read as a
+       * full-reg.
+       */
+      bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
+                             (consumer->srcs[n]->flags & IR3_REG_HALF);
+      unsigned penalty = mismatched_half ? 2 : 0;
+      if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
+         /* special case, 3rd src to cat3 not required on first cycle */
+         return 1 + penalty;
+      } else {
+         return 3 + penalty;
+      }
+   }
 }
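
As a worked example of the ALU-to-ALU cases in ir3_delayslots() above: a regular
dependency needs 3 cycles, the third source of a (mad)-style consumer needs only 1,
and a half/full size mismatch in mergedregs mode adds 2 to either. A standalone
distillation follows, covering only those cases, with an invented helper name.

#include <stdbool.h>

/* Distilled from ir3_delayslots() above: an ALU assigner feeding an ALU
 * consumer, ignoring the sync-flag, address-register and shared-reg cases.
 */
unsigned
alu_to_alu_delay(bool third_src_of_mad, bool mismatched_half)
{
   unsigned penalty = mismatched_half ? 2 : 0; /* mergedregs size mismatch */
   unsigned base = third_src_of_mad ? 1 : 3;   /* (mad) 3rd src vs. regular */
   return base + penalty;
}

/* e.g. alu_to_alu_delay(false, true) == 5: a result read back at the wrong
 * register size by a regular ALU source needs five cycles of separation.
 */
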
 
 static bool
 count_instruction(struct ir3_instruction *n)
 {
-       /* NOTE: don't count branch/jump since we don't know yet if they will
-        * be eliminated later in resolve_jumps().. really should do that
-        * earlier so we don't have this constraint.
-        */
-       return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
+   /* NOTE: don't count branch/jump since we don't know yet if they will
+    * be eliminated later in resolve_jumps().. really should do that
+    * earlier so we don't have this constraint.
+    */
+   return is_alu(n) ||
+          (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
 }
 
 static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
-               unsigned maxd)
+distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
 {
-       unsigned d = 0;
-
-       /* Note that this relies on incrementally building up the block's
-        * instruction list.. but this is how scheduling and nopsched
-        * work.
-        */
-       foreach_instr_rev (n, &block->instr_list) {
-               if ((n == instr) || (d >= maxd))
-                       return MIN2(maxd, d + n->nop);
-               if (count_instruction(n))
-                       d = MIN2(maxd, d + 1 + n->repeat + n->nop);
-       }
-
-       return maxd;
+   unsigned d = 0;
+
+   /* Note that this relies on incrementally building up the block's
+    * instruction list.. but this is how scheduling and nopsched
+    * work.
+    */
+   foreach_instr_rev (n, &block->instr_list) {
+      if ((n == instr) || (d >= maxd))
+         return MIN2(maxd, d + n->nop);
+      if (count_instruction(n))
+         d = MIN2(maxd, d + 1 + n->repeat + n->nop);
+   }
+
+   return maxd;
 }
 
 static unsigned
-delay_calc_srcn_prera(struct ir3_block *block,
-               struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer,
-               unsigned srcn)
+delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
+                      struct ir3_instruction *consumer, unsigned srcn)
 {
-       unsigned delay = 0;
+   unsigned delay = 0;
 
-       if (assigner->opc == OPC_META_PHI)
-               return 0;
+   if (assigner->opc == OPC_META_PHI)
+      return 0;
 
-       if (is_meta(assigner)) {
-               foreach_src_n (src, n, assigner) {
-                       unsigned d;
+   if (is_meta(assigner)) {
+      foreach_src_n (src, n, assigner) {
+         unsigned d;
 
-                       if (!src->def)
-                               continue;
+         if (!src->def)
+            continue;
 
-                       d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
-                       delay = MAX2(delay, d);
-               }
-       } else {
-               delay = ir3_delayslots(assigner, consumer, srcn, false);
-               delay -= distance(block, assigner, delay);
-       }
+         d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
+         delay = MAX2(delay, d);
+      }
+   } else {
+      delay = ir3_delayslots(assigner, consumer, srcn, false);
+      delay -= distance(block, assigner, delay);
+   }
 
-       return delay;
+   return delay;
 }
 
 /**
@@ -176,19 +172,19 @@ delay_calc_srcn_prera(struct ir3_block *block,
 unsigned
 ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
 {
-       unsigned delay = 0;
+   unsigned delay = 0;
 
-       foreach_src_n (src, i, instr) {
-               unsigned d = 0;
+   foreach_src_n (src, i, instr) {
+      unsigned d = 0;
 
-               if (src->def && src->def->instr->block == block) {
-                       d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
-               }
+      if (src->def && src->def->instr->block == block) {
+         d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
+      }
 
-               delay = MAX2(delay, d);
-       }
+      delay = MAX2(delay, d);
+   }
 
-       return delay;
+   return delay;
 }
 
 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
@@ -198,185 +194,186 @@ ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
 static unsigned
 post_ra_reg_elems(struct ir3_register *reg)
 {
-       if (reg->flags & IR3_REG_RELATIV)
-               return reg->size;
-       return reg_elems(reg);
+   if (reg->flags & IR3_REG_RELATIV)
+      return reg->size;
+   return reg_elems(reg);
 }
 
 static unsigned
 post_ra_reg_num(struct ir3_register *reg)
 {
-       if (reg->flags & IR3_REG_RELATIV)
-               return reg->array.base;
-       return reg->num;
+   if (reg->flags & IR3_REG_RELATIV)
+      return reg->array.base;
+   return reg->num;
 }
 
 static unsigned
-delay_calc_srcn_postra(struct ir3_instruction *assigner, struct ir3_instruction *consumer,
-                                          unsigned assigner_n, unsigned consumer_n, bool soft, bool mergedregs)
+delay_calc_srcn_postra(struct ir3_instruction *assigner,
+                       struct ir3_instruction *consumer, unsigned assigner_n,
+                       unsigned consumer_n, bool soft, bool mergedregs)
 {
-       struct ir3_register *src = consumer->srcs[consumer_n];
-       struct ir3_register *dst = assigner->dsts[assigner_n];
-       bool mismatched_half =
-               (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
-
-       /* In the mergedregs case or when the register is a special register,
-        * half-registers do not alias with full registers.
-        */
-       if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
-               mismatched_half)
-               return 0;
-
-       unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
-       unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
-       unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
-       unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
-
-       if (dst_start >= src_end || src_start >= dst_end)
-               return 0;
-
-       unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
-
-       if (assigner->repeat == 0 && consumer->repeat == 0)
-               return delay;
-
-       /* If either side is a relative access, we can't really apply most of the
-        * reasoning below because we don't know which component aliases which.
-        * Just bail in this case.
-        */
-       if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
-               return delay;
-
-       /* MOVMSK seems to require that all users wait until the entire
-        * instruction is finished, so just bail here.
-        */
-       if (assigner->opc == OPC_MOVMSK)
-               return delay;
-
-       /* TODO: Handle the combination of (rpt) and different component sizes
-        * better like below. This complicates things significantly because the
-        * components don't line up.
-        */
-       if (mismatched_half)
-               return delay;
-
-       /* If an instruction has a (rpt), then it acts as a sequence of
-        * instructions, reading its non-(r) sources at each cycle. First, get the
-        * register num for the first instruction where they interfere:
-        */
-
-       unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
-
-       /* Now, for that first conflicting half/full register, figure out the
-        * sub-instruction within assigner/consumer it corresponds to. For (r)
-        * sources, this should already return the correct answer of 0. However we
-        * have to special-case the multi-mov instructions, where the
-        * sub-instructions sometimes come from the src/dst indices instead.
-        */
-       unsigned first_src_instr;
-       if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
-               first_src_instr = consumer_n;
-       else
-               first_src_instr = first_num - src->num;
-
-       unsigned first_dst_instr;
-       if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
-               first_dst_instr = assigner_n;
-       else
-               first_dst_instr = first_num - dst->num;
-
-       /* The delay we return is relative to the *end* of assigner and the
-        * *beginning* of consumer, because it's the number of nops (or other
-        * things) needed between them. Any instructions after first_dst_instr
-        * subtract from the delay, and so do any instructions before
-        * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
-        * delay to account for that.
-        *
-        * Now, a priori, we need to go through this process for every
-        * conflicting regnum and take the minimum of the offsets to make sure
-        * that the appropriate number of nop's is inserted for every conflicting
-        * pair of sub-instructions. However, as we go to the next conflicting
-        * regnum (if any), the number of instructions after first_dst_instr
-        * decreases by 1 and the number of source instructions before
-        * first_src_instr correspondingly increases by 1, so the offset stays the
-        * same for all conflicting registers.
-        */
-       unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
-       return offset > delay ? 0 : delay - offset;
+   struct ir3_register *src = consumer->srcs[consumer_n];
+   struct ir3_register *dst = assigner->dsts[assigner_n];
+   bool mismatched_half =
+      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
+
+   /* Outside the mergedregs case, or when the register is a special
+    * register, half-registers do not alias with full registers.
+    */
+   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
+       mismatched_half)
+      return 0;
+
+   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
+   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
+   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
+   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
+
+   if (dst_start >= src_end || src_start >= dst_end)
+      return 0;
+
+   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
+
+   if (assigner->repeat == 0 && consumer->repeat == 0)
+      return delay;
+
+   /* If either side is a relative access, we can't really apply most of the
+    * reasoning below because we don't know which component aliases which.
+    * Just bail in this case.
+    */
+   if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
+      return delay;
+
+   /* MOVMSK seems to require that all users wait until the entire
+    * instruction is finished, so just bail here.
+    */
+   if (assigner->opc == OPC_MOVMSK)
+      return delay;
+
+   /* TODO: Handle the combination of (rpt) and different component sizes
+    * better like below. This complicates things significantly because the
+    * components don't line up.
+    */
+   if (mismatched_half)
+      return delay;
+
+   /* If an instruction has a (rpt), then it acts as a sequence of
+    * instructions, reading its non-(r) sources at each cycle. First, get the
+    * register num for the first instruction where they interfere:
+    */
+
+   unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
+
+   /* Now, for that first conflicting half/full register, figure out the
+    * sub-instruction within assigner/consumer it corresponds to. For (r)
+    * sources, this should already return the correct answer of 0. However we
+    * have to special-case the multi-mov instructions, where the
+    * sub-instructions sometimes come from the src/dst indices instead.
+    */
+   unsigned first_src_instr;
+   if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
+      first_src_instr = consumer_n;
+   else
+      first_src_instr = first_num - src->num;
+
+   unsigned first_dst_instr;
+   if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
+      first_dst_instr = assigner_n;
+   else
+      first_dst_instr = first_num - dst->num;
+
+   /* The delay we return is relative to the *end* of assigner and the
+    * *beginning* of consumer, because it's the number of nops (or other
+    * things) needed between them. Any instructions after first_dst_instr
+    * subtract from the delay, and so do any instructions before
+    * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
+    * delay to account for that.
+    *
+    * Now, a priori, we need to go through this process for every
+    * conflicting regnum and take the minimum of the offsets to make sure
+    * that the appropriate number of nop's is inserted for every conflicting
+    * pair of sub-instructions. However, as we go to the next conflicting
+    * regnum (if any), the number of instructions after first_dst_instr
+    * decreases by 1 and the number of source instructions before
+    * first_src_instr correspondingly increases by 1, so the offset stays the
+    * same for all conflicting registers.
+    */
+   unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
+   return offset > delay ? 0 : delay - offset;
 }
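
Two pieces of delay_calc_srcn_postra() above are easy to restate standalone: the
half-open byte-range overlap test that decides whether the assigner's destination
can alias the consumer's source at all, and the (rpt) offset subtracted from the
raw delay. Both helpers below use invented names and only mirror those expressions;
the worked example in the trailing comment assumes an assigner with (rpt2) whose
conflict falls on its middle sub-instruction.

#include <stdbool.h>

/* [start, end) byte ranges for the written and read registers conflict
 * only if they overlap.
 */
bool
reg_ranges_overlap(unsigned src_start, unsigned src_end, unsigned dst_start,
                   unsigned dst_end)
{
   return !(dst_start >= src_end || src_start >= dst_end);
}

/* Offset subtracted from the raw delay when (rpt) is involved:
 * sub-instructions issued after the first conflicting write and before the
 * first conflicting read both shrink the required gap.
 */
unsigned
rpt_adjusted_delay(unsigned delay, unsigned first_src_instr,
                   unsigned first_dst_instr, unsigned assigner_repeat)
{
   unsigned offset = first_src_instr + (assigner_repeat - first_dst_instr);
   return offset > delay ? 0 : delay - offset;
}

/* e.g. rpt_adjusted_delay(3, 0, 1, 2) == 2: with (rpt2) on the assigner and
 * the conflict on its middle sub-instruction, the trailing sub-instruction
 * already covers one of the three required cycles.
 */
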
 
 static unsigned
-delay_calc_postra(struct ir3_block *block,
-                                 struct ir3_instruction *start,
-                                 struct ir3_instruction *consumer,
-                                 unsigned distance, bool soft, bool pred, bool mergedregs)
+delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
+                  struct ir3_instruction *consumer, unsigned distance,
+                  bool soft, bool pred, bool mergedregs)
 {
-       unsigned delay = 0;
-       /* Search backwards starting at the instruction before start, unless it's
-        * NULL then search backwards from the block end.
-        */
-       struct list_head *start_list = start ? start->node.prev : block->instr_list.prev;
-       list_for_each_entry_from_rev(struct ir3_instruction, assigner, start_list, &block->instr_list, node) {
-               if (count_instruction(assigner))
-                       distance += assigner->nop;
-
-               if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
-                       return delay;
-
-               if (is_meta(assigner))
-                       continue;
-
-               unsigned new_delay = 0;
-
-               foreach_dst_n (dst, dst_n, assigner) {
-                       if (dst->wrmask == 0)
-                               continue;
-                       foreach_src_n (src, src_n, consumer) {
-                               if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
-                                       continue;
-
-                               unsigned src_delay =
-                                       delay_calc_srcn_postra(assigner, consumer, dst_n,
-                                                                                  src_n, soft, mergedregs);
-                               new_delay = MAX2(new_delay, src_delay);
-                       }
-               }
-
-               new_delay = new_delay > distance ? new_delay - distance : 0;
-               delay = MAX2(delay, new_delay);
-
-               if (count_instruction(assigner))
-                       distance += 1 + assigner->repeat;
-       }
-
-       /* Note: this allows recursion into "block" if it has already been
-        * visited, but *not* recursion into its predecessors. We may have to
-        * visit the original block twice, for the loop case where we have to
-        * consider definititons in an earlier iterations of the same loop:
-        *
-        * while (...) {
-        *              mov.u32u32 ..., r0.x
-        *              ...
-        *              mov.u32u32 r0.x, ...
-        * }
-        *
-        * However any other recursion would be unnecessary.
-        */
-
-       if (pred && block->data != block) {
-               block->data = block;
-
-               for (unsigned i = 0; i < block->predecessors_count; i++) {
-                       struct ir3_block *pred = block->predecessors[i];
-                       unsigned pred_delay =
-                               delay_calc_postra(pred, NULL, consumer, distance, soft, pred, mergedregs);
-                       delay = MAX2(delay, pred_delay);
-               }
-
-               block->data = NULL;
-       }
-
-       return delay;
+   unsigned delay = 0;
+   /* Search backwards starting at the instruction before start, or from the
+    * end of the block if start is NULL.
+    */
+   struct list_head *start_list =
+      start ? start->node.prev : block->instr_list.prev;
+   list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
+                                 &block->instr_list, node) {
+      if (count_instruction(assigner))
+         distance += assigner->nop;
+
+      if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
+         return delay;
+
+      if (is_meta(assigner))
+         continue;
+
+      unsigned new_delay = 0;
+
+      foreach_dst_n (dst, dst_n, assigner) {
+         if (dst->wrmask == 0)
+            continue;
+         foreach_src_n (src, src_n, consumer) {
+            if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
+               continue;
+
+            unsigned src_delay = delay_calc_srcn_postra(
+               assigner, consumer, dst_n, src_n, soft, mergedregs);
+            new_delay = MAX2(new_delay, src_delay);
+         }
+      }
+
+      new_delay = new_delay > distance ? new_delay - distance : 0;
+      delay = MAX2(delay, new_delay);
+
+      if (count_instruction(assigner))
+         distance += 1 + assigner->repeat;
+   }
+
+   /* Note: this allows recursion into "block" if it has already been
+    * visited, but *not* recursion into its predecessors. We may have to
+    * visit the original block twice, for the loop case where we have to
+    * consider definitions in an earlier iteration of the same loop:
+    *
+    * while (...) {
+    *          mov.u32u32 ..., r0.x
+    *          ...
+    *          mov.u32u32 r0.x, ...
+    * }
+    *
+    * However any other recursion would be unnecessary.
+    */
+
+   if (pred && block->data != block) {
+      block->data = block;
+
+      for (unsigned i = 0; i < block->predecessors_count; i++) {
+         struct ir3_block *pred = block->predecessors[i];
+         unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
+                                                 soft, pred, mergedregs);
+         delay = MAX2(delay, pred_delay);
+      }
+
+      block->data = NULL;
+   }
+
+   return delay;
 }
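A rough sketch of the distance discount applied in the walk above, assuming the instructions sitting between assigner and consumer are ordinary single-cycle, non-repeated ones (names and values hypothetical):

   static inline unsigned
   postra_distance_sketch(unsigned needed_delay, unsigned instrs_in_between)
   {
      /* every counted instruction in between already hides one cycle */
      unsigned distance = instrs_in_between;
      return needed_delay > distance ? needed_delay - distance : 0;
   }

So an assigner/consumer pair needing 3 cycles with two instructions already between them costs a single extra nop.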
 
 /**
@@ -392,9 +389,9 @@ delay_calc_postra(struct ir3_block *block,
  */
 unsigned
 ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
-               bool soft, bool mergedregs)
+                      bool soft, bool mergedregs)
 {
-       return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
+   return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
 }
 
 /**
@@ -403,9 +400,9 @@ ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
  */
 unsigned
 ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
-               bool mergedregs)
+                     bool mergedregs)
 {
-       return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
+   return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
 }
 
 /**
@@ -419,12 +416,11 @@ ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
 void
 ir3_remove_nops(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list) {
-               foreach_instr_safe (instr, &block->instr_list) {
-                       if (instr->opc == OPC_NOP) {
-                               list_del(&instr->node);
-                       }
-               }
-       }
-
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->opc == OPC_NOP) {
+            list_del(&instr->node);
+         }
+      }
+   }
 }
index ec99c83..a3b4d01 100644 (file)
 void
 ir3_disk_cache_init(struct ir3_compiler *compiler)
 {
-       if (ir3_shader_debug & IR3_DBG_NOCACHE)
-               return;
+   if (ir3_shader_debug & IR3_DBG_NOCACHE)
+      return;
 
-       /* array length = print length + nul char + 1 extra to verify it's unused */
-       char renderer[7];
-       ASSERTED int len =
-                       snprintf(renderer, sizeof(renderer), "FD%03d", compiler->gpu_id);
-       assert(len == sizeof(renderer) - 2);
+   /* array length = print length + nul char + 1 extra to verify it's unused */
+   char renderer[7];
+   ASSERTED int len =
+      snprintf(renderer, sizeof(renderer), "FD%03d", compiler->gpu_id);
+   assert(len == sizeof(renderer) - 2);
 
-       const struct build_id_note *note =
-                       build_id_find_nhdr_for_addr(ir3_disk_cache_init);
-       assert(note && build_id_length(note) == 20); /* sha1 */
+   const struct build_id_note *note =
+      build_id_find_nhdr_for_addr(ir3_disk_cache_init);
+   assert(note && build_id_length(note) == 20); /* sha1 */
 
-       const uint8_t *id_sha1 = build_id_data(note);
-       assert(id_sha1);
+   const uint8_t *id_sha1 = build_id_data(note);
+   assert(id_sha1);
 
-       char timestamp[41];
-       _mesa_sha1_format(timestamp, id_sha1);
+   char timestamp[41];
+   _mesa_sha1_format(timestamp, id_sha1);
 
-       uint64_t driver_flags = ir3_shader_debug;
-       if (compiler->robust_ubo_access)
-               driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
-       compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
+   uint64_t driver_flags = ir3_shader_debug;
+   if (compiler->robust_ubo_access)
+      driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
+   compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
 }
 
 void
 ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
-               struct ir3_shader *shader)
+                               struct ir3_shader *shader)
 {
-       if (!compiler->disk_cache)
-               return;
-
-       struct mesa_sha1 ctx;
-
-       _mesa_sha1_init(&ctx);
-
-       /* Serialize the NIR to a binary blob that we can hash for the disk
-        * cache.  Drop unnecessary information (like variable names)
-        * so the serialized NIR is smaller, and also to let us detect more
-        * isomorphic shaders when hashing, increasing cache hits.
-        */
-       struct blob blob;
-       blob_init(&blob);
-       nir_serialize(&blob, shader->nir, true);
-       _mesa_sha1_update(&ctx, blob.data, blob.size);
-       blob_finish(&blob);
-
-       /* Note that on some gens stream-out is lowered in ir3 to stg.  For later
-        * gens we maybe don't need to include stream-out in the cache key.
-        */
-       _mesa_sha1_update(&ctx, &shader->stream_output, sizeof(shader->stream_output));
-
-       _mesa_sha1_final(&ctx, shader->cache_key);
+   if (!compiler->disk_cache)
+      return;
+
+   struct mesa_sha1 ctx;
+
+   _mesa_sha1_init(&ctx);
+
+   /* Serialize the NIR to a binary blob that we can hash for the disk
+    * cache.  Drop unnecessary information (like variable names)
+    * so the serialized NIR is smaller, and also to let us detect more
+    * isomorphic shaders when hashing, increasing cache hits.
+    */
+   struct blob blob;
+   blob_init(&blob);
+   nir_serialize(&blob, shader->nir, true);
+   _mesa_sha1_update(&ctx, blob.data, blob.size);
+   blob_finish(&blob);
+
+   /* Note that on some gens stream-out is lowered in ir3 to stg.  For later
+    * gens we maybe don't need to include stream-out in the cache key.
+    */
+   _mesa_sha1_update(&ctx, &shader->stream_output,
+                     sizeof(shader->stream_output));
+
+   _mesa_sha1_final(&ctx, shader->cache_key);
 }
 
 static void
-compute_variant_key(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *v, cache_key cache_key)
+compute_variant_key(struct ir3_compiler *compiler, struct ir3_shader_variant *v,
+                    cache_key cache_key)
 {
-       struct blob blob;
-       blob_init(&blob);
+   struct blob blob;
+   blob_init(&blob);
 
-       blob_write_bytes(&blob, &v->shader->cache_key, sizeof(v->shader->cache_key));
-       blob_write_bytes(&blob, &v->key, sizeof(v->key));
-       blob_write_uint8(&blob, v->binning_pass);
+   blob_write_bytes(&blob, &v->shader->cache_key, sizeof(v->shader->cache_key));
+   blob_write_bytes(&blob, &v->key, sizeof(v->key));
+   blob_write_uint8(&blob, v->binning_pass);
 
-       disk_cache_compute_key(compiler->disk_cache, blob.data, blob.size, cache_key);
+   disk_cache_compute_key(compiler->disk_cache, blob.data, blob.size,
+                          cache_key);
 
-       blob_finish(&blob);
+   blob_finish(&blob);
 }
 
 static void
 retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v)
 {
-       blob_copy_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
-
-       /*
-        * pointers need special handling:
-        */
-
-       v->bin = rzalloc_size(v, v->info.size);
-       blob_copy_bytes(blob, v->bin, v->info.size);
-
-       if (!v->binning_pass) {
-               blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
-               unsigned immeds_sz = v->const_state->immediates_size *
-                               sizeof(v->const_state->immediates[0]);
-               v->const_state->immediates = ralloc_size(v->const_state, immeds_sz);
-               blob_copy_bytes(blob, v->const_state->immediates, immeds_sz);
-       }
+   blob_copy_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
+
+   /*
+    * pointers need special handling:
+    */
+
+   v->bin = rzalloc_size(v, v->info.size);
+   blob_copy_bytes(blob, v->bin, v->info.size);
+
+   if (!v->binning_pass) {
+      blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
+      unsigned immeds_sz = v->const_state->immediates_size *
+                           sizeof(v->const_state->immediates[0]);
+      v->const_state->immediates = ralloc_size(v->const_state, immeds_sz);
+      blob_copy_bytes(blob, v->const_state->immediates, immeds_sz);
+   }
 }
 
 static void
 store_variant(struct blob *blob, struct ir3_shader_variant *v)
 {
-       blob_write_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
+   blob_write_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
 
-       /*
-        * pointers need special handling:
-        */
+   /*
+    * pointers need special handling:
+    */
 
-       blob_write_bytes(blob, v->bin, v->info.size);
+   blob_write_bytes(blob, v->bin, v->info.size);
 
-       /* No saving constant_data, it's already baked into bin at this point. */
+   /* No saving constant_data, it's already baked into bin at this point. */
 
-       if (!v->binning_pass) {
-               blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
-               unsigned immeds_sz = v->const_state->immediates_size *
-                               sizeof(v->const_state->immediates[0]);
-               blob_write_bytes(blob, v->const_state->immediates, immeds_sz);
-       }
+   if (!v->binning_pass) {
+      blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
+      unsigned immeds_sz = v->const_state->immediates_size *
+                           sizeof(v->const_state->immediates[0]);
+      blob_write_bytes(blob, v->const_state->immediates, immeds_sz);
+   }
 }
 
 bool
 ir3_disk_cache_retrieve(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *v)
+                        struct ir3_shader_variant *v)
 {
-       if (!compiler->disk_cache)
-               return false;
+   if (!compiler->disk_cache)
+      return false;
 
-       cache_key cache_key;
+   cache_key cache_key;
 
-       compute_variant_key(compiler, v, cache_key);
+   compute_variant_key(compiler, v, cache_key);
 
-       if (debug) {
-               char sha1[41];
-               _mesa_sha1_format(sha1, cache_key);
-               fprintf(stderr, "[mesa disk cache] retrieving variant %s: ", sha1);
-       }
+   if (debug) {
+      char sha1[41];
+      _mesa_sha1_format(sha1, cache_key);
+      fprintf(stderr, "[mesa disk cache] retrieving variant %s: ", sha1);
+   }
 
-       size_t size;
-       void *buffer = disk_cache_get(compiler->disk_cache, cache_key, &size);
+   size_t size;
+   void *buffer = disk_cache_get(compiler->disk_cache, cache_key, &size);
 
-       if (debug)
-               fprintf(stderr, "%s\n", buffer ? "found" : "missing");
+   if (debug)
+      fprintf(stderr, "%s\n", buffer ? "found" : "missing");
 
-       if (!buffer)
-               return false;
+   if (!buffer)
+      return false;
 
-       struct blob_reader blob;
-       blob_reader_init(&blob, buffer, size);
+   struct blob_reader blob;
+   blob_reader_init(&blob, buffer, size);
 
-       retrieve_variant(&blob, v);
+   retrieve_variant(&blob, v);
 
-       if (v->binning)
-               retrieve_variant(&blob, v->binning);
+   if (v->binning)
+      retrieve_variant(&blob, v->binning);
 
-       free(buffer);
+   free(buffer);
 
-       return true;
+   return true;
 }
 
 void
 ir3_disk_cache_store(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *v)
+                     struct ir3_shader_variant *v)
 {
-       if (!compiler->disk_cache)
-               return;
+   if (!compiler->disk_cache)
+      return;
 
-       cache_key cache_key;
+   cache_key cache_key;
 
-       compute_variant_key(compiler, v, cache_key);
+   compute_variant_key(compiler, v, cache_key);
 
-       if (debug) {
-               char sha1[41];
-               _mesa_sha1_format(sha1, cache_key);
-               fprintf(stderr, "[mesa disk cache] storing variant %s\n", sha1);
-       }
+   if (debug) {
+      char sha1[41];
+      _mesa_sha1_format(sha1, cache_key);
+      fprintf(stderr, "[mesa disk cache] storing variant %s\n", sha1);
+   }
 
-       struct blob blob;
-       blob_init(&blob);
+   struct blob blob;
+   blob_init(&blob);
 
-       store_variant(&blob, v);
+   store_variant(&blob, v);
 
-       if (v->binning)
-               store_variant(&blob, v->binning);
+   if (v->binning)
+      store_variant(&blob, v->binning);
 
-       disk_cache_put(compiler->disk_cache, cache_key, blob.data, blob.size, NULL);
-       blob_finish(&blob);
+   disk_cache_put(compiler->disk_cache, cache_key, blob.data, blob.size, NULL);
+   blob_finish(&blob);
 }
index 0b6062e..01f2da0 100644 (file)
 static struct ir3_block *
 intersect(struct ir3_block *b1, struct ir3_block *b2)
 {
-       while (b1 != b2) {
-               /*
-                * Note, the comparisons here are the opposite of what the paper says
-                * because we index blocks from beginning -> end (i.e. reverse
-                * post-order) instead of post-order like they assume.
-                */
-               while (b1->index > b2->index)
-                       b1 = b1->imm_dom;
-               while (b2->index > b1->index)
-                       b2 = b2->imm_dom;
-       }
-
-       return b1;
+   while (b1 != b2) {
+      /*
+       * Note, the comparisons here are the opposite of what the paper says
+       * because we index blocks from beginning -> end (i.e. reverse
+       * post-order) instead of post-order like they assume.
+       */
+      while (b1->index > b2->index)
+         b1 = b1->imm_dom;
+      while (b2->index > b1->index)
+         b2 = b2->imm_dom;
+   }
+
+   return b1;
 }
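A minimal sketch of the same walk expressed on plain reverse post-order indices, where a smaller index means closer to the start block and the idom array is a hypothetical stand-in for the imm_dom pointers:

   static inline unsigned
   intersect_by_index(unsigned i1, unsigned i2, const unsigned *idom)
   {
      while (i1 != i2) {
         while (i1 > i2)
            i1 = idom[i1]; /* walk the deeper block up toward the entry */
         while (i2 > i1)
            i2 = idom[i2];
      }
      return i1;
   }

For a diamond B0 -> {B1, B2} -> B3, intersecting B1 and B2 walks both up to B0, which then becomes B3's immediate dominator.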
 
-
 static bool
 calc_dominance(struct ir3_block *block)
 {
-       struct ir3_block *new_idom = NULL;
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               struct ir3_block *pred = block->predecessors[i];
-
-               if (pred->imm_dom) {
-                       if (new_idom)
-                               new_idom = intersect(pred, new_idom);
-                       else
-                               new_idom = pred;
-               }
-       }
-
-       if (block->imm_dom != new_idom) {
-               block->imm_dom = new_idom;
-               return true;
-       }
-
-       return false;
+   struct ir3_block *new_idom = NULL;
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+
+      if (pred->imm_dom) {
+         if (new_idom)
+            new_idom = intersect(pred, new_idom);
+         else
+            new_idom = pred;
+      }
+   }
+
+   if (block->imm_dom != new_idom) {
+      block->imm_dom = new_idom;
+      return true;
+   }
+
+   return false;
 }
 
 static unsigned
 calc_dfs_indices(struct ir3_block *block, unsigned index)
 {
-       block->dom_pre_index = index++;
-       for (unsigned i = 0; i < block->dom_children_count; i++)
-               index = calc_dfs_indices(block->dom_children[i], index);
-       block->dom_post_index = index++;
-       return index;
+   block->dom_pre_index = index++;
+   for (unsigned i = 0; i < block->dom_children_count; i++)
+      index = calc_dfs_indices(block->dom_children[i], index);
+   block->dom_post_index = index++;
+   return index;
 }
 
 void
 ir3_calc_dominance(struct ir3 *ir)
 {
-       unsigned i = 0;
-       foreach_block (block, &ir->block_list) {
-               block->index = i++;
-               if (block == ir3_start_block(ir))
-                       block->imm_dom = block;
-               else
-                       block->imm_dom = NULL;
-               block->dom_children = NULL;
-               block->dom_children_count = block->dom_children_sz = 0;
-       }
-
-       bool progress = true;
-       while (progress) {
-               progress = false;
-               foreach_block (block, &ir->block_list) {
-                       if (block != ir3_start_block(ir))
-                               progress |= calc_dominance(block);
-               }
-       }
-
-       ir3_start_block(ir)->imm_dom = NULL;
-
-       foreach_block (block, &ir->block_list) {
-               if (block->imm_dom)
-                       array_insert(block->imm_dom, block->imm_dom->dom_children, block);
-       }
-
-       calc_dfs_indices(ir3_start_block(ir), 0);
+   unsigned i = 0;
+   foreach_block (block, &ir->block_list) {
+      block->index = i++;
+      if (block == ir3_start_block(ir))
+         block->imm_dom = block;
+      else
+         block->imm_dom = NULL;
+      block->dom_children = NULL;
+      block->dom_children_count = block->dom_children_sz = 0;
+   }
+
+   bool progress = true;
+   while (progress) {
+      progress = false;
+      foreach_block (block, &ir->block_list) {
+         if (block != ir3_start_block(ir))
+            progress |= calc_dominance(block);
+      }
+   }
+
+   ir3_start_block(ir)->imm_dom = NULL;
+
+   foreach_block (block, &ir->block_list) {
+      if (block->imm_dom)
+         array_insert(block->imm_dom, block->imm_dom->dom_children, block);
+   }
+
+   calc_dfs_indices(ir3_start_block(ir), 0);
 }
 
 /* Return true if a dominates b. This includes if a == b. */
-bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b)
+bool
+ir3_block_dominates(struct ir3_block *a, struct ir3_block *b)
 {
-       return a->dom_pre_index <= b->dom_pre_index &&
-                  a->dom_post_index >= b->dom_post_index;
+   return a->dom_pre_index <= b->dom_pre_index &&
+          a->dom_post_index >= b->dom_post_index;
 }
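This is interval containment on the DFS indices computed by calc_dfs_indices() above; as a minimal restatement on plain integers (helper name invented):

   static inline bool
   interval_encloses(unsigned a_pre, unsigned a_post,
                     unsigned b_pre, unsigned b_post)
   {
      return a_pre <= b_pre && a_post >= b_post;
   }

For example, with root (pre 0, post 5) and its children a (pre 1, post 2) and b (pre 3, post 4), root encloses both children's intervals, while neither a nor b encloses the other's.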
-
index e2ee4e6..d4e64ed 100644 (file)
@@ -26,7 +26,6 @@
 
 #include "ir3_image.h"
 
-
 /*
  * SSBO/Image to/from IBO/tex hw mapping table:
  */
 void
 ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping, unsigned num_textures)
 {
-       memset(mapping, IBO_INVALID, sizeof(*mapping));
-       mapping->num_tex = 0;
-       mapping->tex_base = num_textures;
+   memset(mapping, IBO_INVALID, sizeof(*mapping));
+   mapping->num_tex = 0;
+   mapping->tex_base = num_textures;
 }
 
 struct ir3_instruction *
 ir3_ssbo_to_ibo(struct ir3_context *ctx, nir_src src)
 {
-       if (ir3_bindless_resource(src)) {
-               ctx->so->bindless_ibo = true;
-               return ir3_get_src(ctx, &src)[0];
-       } else {
-               /* can this be non-const buffer_index?  how do we handle that? */
-               int ssbo_idx = nir_src_as_uint(src);
-               return create_immed(ctx->block, ssbo_idx);
-       }
+   if (ir3_bindless_resource(src)) {
+      ctx->so->bindless_ibo = true;
+      return ir3_get_src(ctx, &src)[0];
+   } else {
+      /* can this be non-const buffer_index?  how do we handle that? */
+      int ssbo_idx = nir_src_as_uint(src);
+      return create_immed(ctx->block, ssbo_idx);
+   }
 }
 
 unsigned
 ir3_ssbo_to_tex(struct ir3_ibo_mapping *mapping, unsigned ssbo)
 {
-       if (mapping->ssbo_to_tex[ssbo] == IBO_INVALID) {
-               unsigned tex = mapping->num_tex++;
-               mapping->ssbo_to_tex[ssbo] = tex;
-               mapping->tex_to_image[tex] = IBO_SSBO | ssbo;
-       }
-       return mapping->ssbo_to_tex[ssbo] + mapping->tex_base;
+   if (mapping->ssbo_to_tex[ssbo] == IBO_INVALID) {
+      unsigned tex = mapping->num_tex++;
+      mapping->ssbo_to_tex[ssbo] = tex;
+      mapping->tex_to_image[tex] = IBO_SSBO | ssbo;
+   }
+   return mapping->ssbo_to_tex[ssbo] + mapping->tex_base;
 }
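A hypothetical usage sketch of the lazy slot assignment above; the SSBO indices are invented and the mapping is assumed to have been set up with ir3_ibo_mapping_init():

   static inline void
   ssbo_to_tex_usage_sketch(struct ir3_ibo_mapping *mapping)
   {
      unsigned a = ir3_ssbo_to_tex(mapping, 2); /* first use: tex_base + 0     */
      unsigned b = ir3_ssbo_to_tex(mapping, 5); /* next new SSBO: tex_base + 1 */
      unsigned c = ir3_ssbo_to_tex(mapping, 2); /* repeat query: same as 'a'   */
      (void)a; (void)b; (void)c;
   }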
 
 struct ir3_instruction *
 ir3_image_to_ibo(struct ir3_context *ctx, nir_src src)
 {
-       if (ir3_bindless_resource(src)) {
-               ctx->so->bindless_ibo = true;
-               return ir3_get_src(ctx, &src)[0];
-       } else {
-               /* can this be non-const buffer_index?  how do we handle that? */
-               int image_idx = nir_src_as_uint(src);
-               return create_immed(ctx->block, ctx->s->info.num_ssbos + image_idx);
-       }
+   if (ir3_bindless_resource(src)) {
+      ctx->so->bindless_ibo = true;
+      return ir3_get_src(ctx, &src)[0];
+   } else {
+      /* can this be non-const buffer_index?  how do we handle that? */
+      int image_idx = nir_src_as_uint(src);
+      return create_immed(ctx->block, ctx->s->info.num_ssbos + image_idx);
+   }
 }
 
 unsigned
 ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image)
 {
-       if (mapping->image_to_tex[image] == IBO_INVALID) {
-               unsigned tex = mapping->num_tex++;
-               mapping->image_to_tex[image] = tex;
-               mapping->tex_to_image[tex] = image;
-       }
-       return mapping->image_to_tex[image] + mapping->tex_base;
+   if (mapping->image_to_tex[image] == IBO_INVALID) {
+      unsigned tex = mapping->num_tex++;
+      mapping->image_to_tex[image] = tex;
+      mapping->tex_to_image[tex] = image;
+   }
+   return mapping->image_to_tex[image] + mapping->tex_base;
 }
 
 /* see tex_info() for equiv logic for texture instructions.. it would be
@@ -93,87 +92,87 @@ ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image)
 unsigned
 ir3_get_image_coords(const nir_intrinsic_instr *instr, unsigned *flagsp)
 {
-       unsigned coords = nir_image_intrinsic_coord_components(instr);
-       unsigned flags = 0;
+   unsigned coords = nir_image_intrinsic_coord_components(instr);
+   unsigned flags = 0;
 
-       if (coords == 3)
-               flags |= IR3_INSTR_3D;
+   if (coords == 3)
+      flags |= IR3_INSTR_3D;
 
-       if (nir_intrinsic_image_array(instr))
-               flags |= IR3_INSTR_A;
+   if (nir_intrinsic_image_array(instr))
+      flags |= IR3_INSTR_A;
 
-       if (flagsp)
-               *flagsp = flags;
+   if (flagsp)
+      *flagsp = flags;
 
-       return coords;
+   return coords;
 }
 
 type_t
 ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr)
 {
-       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-       int bit_size = info->has_dest ? nir_dest_bit_size(instr->dest) : 32;
-
-       nir_alu_type type = nir_type_uint;
-       switch (instr->intrinsic) {
-       case nir_intrinsic_image_load:
-       case nir_intrinsic_bindless_image_load:
-               type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr));
-               /* SpvOpAtomicLoad doesn't have dest type */
-               if (type == nir_type_invalid)
-                       type = nir_type_uint;
-               break;
-
-       case nir_intrinsic_image_store:
-       case nir_intrinsic_bindless_image_store:
-               type = nir_alu_type_get_base_type(nir_intrinsic_src_type(instr));
-               /* SpvOpAtomicStore doesn't have src type */
-               if (type == nir_type_invalid)
-                       type = nir_type_uint;
-               break;
-
-       case nir_intrinsic_image_atomic_add:
-       case nir_intrinsic_bindless_image_atomic_add:
-       case nir_intrinsic_image_atomic_umin:
-       case nir_intrinsic_bindless_image_atomic_umin:
-       case nir_intrinsic_image_atomic_umax:
-       case nir_intrinsic_bindless_image_atomic_umax:
-       case nir_intrinsic_image_atomic_and:
-       case nir_intrinsic_bindless_image_atomic_and:
-       case nir_intrinsic_image_atomic_or:
-       case nir_intrinsic_bindless_image_atomic_or:
-       case nir_intrinsic_image_atomic_xor:
-       case nir_intrinsic_bindless_image_atomic_xor:
-       case nir_intrinsic_image_atomic_exchange:
-       case nir_intrinsic_bindless_image_atomic_exchange:
-       case nir_intrinsic_image_atomic_comp_swap:
-       case nir_intrinsic_bindless_image_atomic_comp_swap:
-       case nir_intrinsic_image_atomic_inc_wrap:
-       case nir_intrinsic_bindless_image_atomic_inc_wrap:
-               type = nir_type_uint;
-               break;
-
-       case nir_intrinsic_image_atomic_imin:
-       case nir_intrinsic_bindless_image_atomic_imin:
-       case nir_intrinsic_image_atomic_imax:
-       case nir_intrinsic_bindless_image_atomic_imax:
-               type = nir_type_int;
-               break;
-
-       default:
-               unreachable("Unhandled NIR image intrinsic");
-       }
-
-       switch (type) {
-       case nir_type_uint:
-               return bit_size == 16 ? TYPE_U16 : TYPE_U32;
-       case nir_type_int:
-               return bit_size == 16 ? TYPE_S16 : TYPE_S32;
-       case nir_type_float:
-               return bit_size == 16 ? TYPE_F16 : TYPE_F32;
-       default:
-               unreachable("bad type");
-       }
+   const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+   int bit_size = info->has_dest ? nir_dest_bit_size(instr->dest) : 32;
+
+   nir_alu_type type = nir_type_uint;
+   switch (instr->intrinsic) {
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_bindless_image_load:
+      type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr));
+      /* SpvOpAtomicLoad doesn't have dest type */
+      if (type == nir_type_invalid)
+         type = nir_type_uint;
+      break;
+
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_bindless_image_store:
+      type = nir_alu_type_get_base_type(nir_intrinsic_src_type(instr));
+      /* SpvOpAtomicStore doesn't have src type */
+      if (type == nir_type_invalid)
+         type = nir_type_uint;
+      break;
+
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_bindless_image_atomic_add:
+   case nir_intrinsic_image_atomic_umin:
+   case nir_intrinsic_bindless_image_atomic_umin:
+   case nir_intrinsic_image_atomic_umax:
+   case nir_intrinsic_bindless_image_atomic_umax:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_bindless_image_atomic_and:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_bindless_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_bindless_image_atomic_xor:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_bindless_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_comp_swap:
+   case nir_intrinsic_bindless_image_atomic_comp_swap:
+   case nir_intrinsic_image_atomic_inc_wrap:
+   case nir_intrinsic_bindless_image_atomic_inc_wrap:
+      type = nir_type_uint;
+      break;
+
+   case nir_intrinsic_image_atomic_imin:
+   case nir_intrinsic_bindless_image_atomic_imin:
+   case nir_intrinsic_image_atomic_imax:
+   case nir_intrinsic_bindless_image_atomic_imax:
+      type = nir_type_int;
+      break;
+
+   default:
+      unreachable("Unhandled NIR image intrinsic");
+   }
+
+   switch (type) {
+   case nir_type_uint:
+      return bit_size == 16 ? TYPE_U16 : TYPE_U32;
+   case nir_type_int:
+      return bit_size == 16 ? TYPE_S16 : TYPE_S32;
+   case nir_type_float:
+      return bit_size == 16 ? TYPE_F16 : TYPE_F32;
+   default:
+      unreachable("bad type");
+   }
 }
 
 /* Returns the number of components for the different image formats
@@ -183,8 +182,8 @@ ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr)
 unsigned
 ir3_get_num_components_for_image_format(enum pipe_format format)
 {
-       if (format == PIPE_FORMAT_NONE)
-               return 4;
-       else
-               return util_format_get_nr_components(format);
+   if (format == PIPE_FORMAT_NONE)
+      return 4;
+   else
+      return util_format_get_nr_components(format);
 }
index d2c1f23..8499917 100644 (file)
 
 #include "ir3_context.h"
 
-
-void ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping, unsigned num_textures);
+void ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping,
+                          unsigned num_textures);
 struct ir3_instruction *ir3_ssbo_to_ibo(struct ir3_context *ctx, nir_src src);
 unsigned ir3_ssbo_to_tex(struct ir3_ibo_mapping *mapping, unsigned ssbo);
 struct ir3_instruction *ir3_image_to_ibo(struct ir3_context *ctx, nir_src src);
 unsigned ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image);
 
-unsigned ir3_get_image_coords(const nir_intrinsic_instr *instr, unsigned *flagsp);
+unsigned ir3_get_image_coords(const nir_intrinsic_instr *instr,
+                              unsigned *flagsp);
 type_t ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr);
 unsigned ir3_get_num_components_for_image_format(enum pipe_format);
 
index 003d439..1e64e10 100644 (file)
  */
 
 struct ir3_legalize_ctx {
-       struct ir3_compiler *compiler;
-       struct ir3_shader_variant *so;
-       gl_shader_stage type;
-       int max_bary;
-       bool early_input_release;
+   struct ir3_compiler *compiler;
+   struct ir3_shader_variant *so;
+   gl_shader_stage type;
+   int max_bary;
+   bool early_input_release;
 };
 
 struct ir3_legalize_state {
-       regmask_t needs_ss;
-       regmask_t needs_ss_war;       /* write after read */
-       regmask_t needs_sy;
+   regmask_t needs_ss;
+   regmask_t needs_ss_war; /* write after read */
+   regmask_t needs_sy;
 };
 
 struct ir3_legalize_block_data {
-       bool valid;
-       struct ir3_legalize_state state;
+   bool valid;
+   struct ir3_legalize_state state;
 };
 
 /* We want to evaluate each block from the position of any other
@@ -82,298 +82,297 @@ struct ir3_legalize_block_data {
 static bool
 legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 {
-       struct ir3_legalize_block_data *bd = block->data;
-
-       if (bd->valid)
-               return false;
-
-       struct ir3_instruction *last_rel = NULL;
-       struct ir3_instruction *last_n = NULL;
-       struct list_head instr_list;
-       struct ir3_legalize_state prev_state = bd->state;
-       struct ir3_legalize_state *state = &bd->state;
-       bool last_input_needs_ss = false;
-       bool has_tex_prefetch = false;
-       bool mergedregs = ctx->so->mergedregs;
-
-       /* our input state is the OR of all predecessor blocks' state: */
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               struct ir3_block *predecessor = block->predecessors[i];
-               struct ir3_legalize_block_data *pbd = predecessor->data;
-               struct ir3_legalize_state *pstate = &pbd->state;
-
-               /* Our input (ss)/(sy) state is based on OR'ing the output
-                * state of all our predecessor blocks
-                */
-               regmask_or(&state->needs_ss,
-                               &state->needs_ss, &pstate->needs_ss);
-               regmask_or(&state->needs_ss_war,
-                               &state->needs_ss_war, &pstate->needs_ss_war);
-               regmask_or(&state->needs_sy,
-                               &state->needs_sy, &pstate->needs_sy);
-       }
-
-       unsigned input_count = 0;
-
-       foreach_instr (n, &block->instr_list) {
-               if (is_input(n)) {
-                       input_count++;
-               }
-       }
-
-       unsigned inputs_remaining = input_count;
-
-       /* Either inputs are in the first block or we expect inputs to be released
-        * with the end of the program.
-        */
-       assert(input_count == 0 || !ctx->early_input_release ||
-                  block == ir3_start_block(block->shader));
-
-       /* remove all the instructions from the list, we'll be adding
-        * them back in as we go
-        */
-       list_replace(&block->instr_list, &instr_list);
-       list_inithead(&block->instr_list);
-
-       foreach_instr_safe (n, &instr_list) {
-               unsigned i;
-
-               n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
-
-               /* _meta::tex_prefetch instructions removed later in
-                * collect_tex_prefetches()
-                */
-               if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
-                       continue;
-
-               if (is_input(n)) {
-                       struct ir3_register *inloc = n->srcs[0];
-                       assert(inloc->flags & IR3_REG_IMMED);
-                       ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
-               }
-
-               if (last_n && is_barrier(last_n)) {
-                       n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-                       last_input_needs_ss = false;
-                       regmask_init(&state->needs_ss_war, mergedregs);
-                       regmask_init(&state->needs_ss, mergedregs);
-                       regmask_init(&state->needs_sy, mergedregs);
-               }
-
-               if (last_n && (last_n->opc == OPC_PREDT)) {
-                       n->flags |= IR3_INSTR_SS;
-                       regmask_init(&state->needs_ss_war, mergedregs);
-                       regmask_init(&state->needs_ss, mergedregs);
-               }
-
-               /* NOTE: consider dst register too.. it could happen that
-                * texture sample instruction (for example) writes some
-                * components which are unused.  A subsequent instruction
-                * that writes the same register can race w/ the sam instr
-                * resulting in undefined results:
-                */
-               for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
-                       struct ir3_register *reg;
-                       if (i < n->dsts_count)
-                               reg = n->dsts[i];
-                       else
-                               reg = n->srcs[i - n->dsts_count];
-
-                       if (reg_gpr(reg)) {
-
-                               /* TODO: we probably only need (ss) for alu
-                                * instr consuming sfu result.. need to make
-                                * some tests for both this and (sy)..
-                                */
-                               if (regmask_get(&state->needs_ss, reg)) {
-                                       n->flags |= IR3_INSTR_SS;
-                                       last_input_needs_ss = false;
-                                       regmask_init(&state->needs_ss_war, mergedregs);
-                                       regmask_init(&state->needs_ss, mergedregs);
-                               }
-
-                               if (regmask_get(&state->needs_sy, reg)) {
-                                       n->flags |= IR3_INSTR_SY;
-                                       regmask_init(&state->needs_sy, mergedregs);
-                               }
-                       }
-
-                       /* TODO: is it valid to have address reg loaded from a
-                        * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
-                        * last_rel check below should be moved ahead of this:
-                        */
-                       if (reg->flags & IR3_REG_RELATIV)
-                               last_rel = n;
-               }
-
-               foreach_dst (reg, n) {
-                       if (regmask_get(&state->needs_ss_war, reg)) {
-                               n->flags |= IR3_INSTR_SS;
-                               last_input_needs_ss = false;
-                               regmask_init(&state->needs_ss_war, mergedregs);
-                               regmask_init(&state->needs_ss, mergedregs);
-                       }
-
-                       if (last_rel && (reg->num == regid(REG_A0, 0))) {
-                               last_rel->flags |= IR3_INSTR_UL;
-                               last_rel = NULL;
-                       }
-               }
-
-               /* cat5+ does not have an (ss) bit, if needed we need to
-                * insert a nop to carry the sync flag.  Would be kinda
-                * clever if we were aware of this during scheduling, but
-                * this should be a pretty rare case:
-                */
-               if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
-                       struct ir3_instruction *nop;
-                       nop = ir3_NOP(block);
-                       nop->flags |= IR3_INSTR_SS;
-                       n->flags &= ~IR3_INSTR_SS;
-               }
-
-               /* need to be able to set (ss) on first instruction: */
-               if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
-                       ir3_NOP(block);
-
-               if (ctx->compiler->samgq_workaround &&
-                       ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
-                       struct ir3_instruction *samgp;
-
-                       list_delinit(&n->node);
-
-                       for (i = 0; i < 4; i++) {
-                               samgp = ir3_instr_clone(n);
-                               samgp->opc = OPC_SAMGP0 + i;
-                               if (i > 1)
-                                       samgp->flags |= IR3_INSTR_SY;
-                       }
-               } else {
-                       list_delinit(&n->node);
-                       list_addtail(&n->node, &block->instr_list);
-               }
-
-               if (is_sfu(n))
-                       regmask_set(&state->needs_ss, n->dsts[0]);
-
-               if (is_tex_or_prefetch(n)) {
-                       regmask_set(&state->needs_sy, n->dsts[0]);
-                       if (n->opc == OPC_META_TEX_PREFETCH)
-                               has_tex_prefetch = true;
-               } else if (n->opc == OPC_RESINFO) {
-                       regmask_set(&state->needs_ss, n->dsts[0]);
-                       ir3_NOP(block)->flags |= IR3_INSTR_SS;
-                       last_input_needs_ss = false;
-               } else if (is_load(n)) {
-                       /* seems like ldlv needs (ss) bit instead??  which is odd but
-                        * makes a bunch of flat-varying tests start working on a4xx.
-                        */
-                       if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) || (n->opc == OPC_LDLW))
-                               regmask_set(&state->needs_ss, n->dsts[0]);
-                       else
-                               regmask_set(&state->needs_sy, n->dsts[0]);
-               } else if (is_atomic(n->opc)) {
-                       if (n->flags & IR3_INSTR_G) {
-                               if (ctx->compiler->gpu_id >= 600) {
-                                       /* New encoding, returns  result via second src: */
-                                       regmask_set(&state->needs_sy, n->srcs[2]);
-                               } else {
-                                       regmask_set(&state->needs_sy, n->dsts[0]);
-                               }
-                       } else {
-                               regmask_set(&state->needs_ss, n->dsts[0]);
-                       }
-               }
-
-               if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
-                       ctx->so->has_ssbo = true;
-
-               /* both tex/sfu appear to not always immediately consume
-                * their src register(s):
-                */
-               if (is_tex(n) || is_sfu(n) || is_mem(n)) {
-                       foreach_src (reg, n) {
-                               if (reg_gpr(reg))
-                                       regmask_set(&state->needs_ss_war, reg);
-                       }
-               }
-
-               if (ctx->early_input_release && is_input(n)) {
-                       last_input_needs_ss |= (n->opc == OPC_LDLV);
-
-                       assert(inputs_remaining > 0);
-                       inputs_remaining--;
-                       if (inputs_remaining == 0) {
-                               /* This is the last input. We add the (ei) flag to release
-                                * varying memory after this executes. If it's an ldlv,
-                                * however, we need to insert a dummy bary.f on which we can
-                                * set the (ei) flag. We may also need to insert an (ss) to
-                                * guarantee that all ldlv's have finished fetching their
-                                * results before releasing the varying memory.
-                                */
-                               struct ir3_instruction *last_input = n;
-                               if (n->opc == OPC_LDLV) {
-                                       struct ir3_instruction *baryf;
-
-                                       /* (ss)bary.f (ei)r63.x, 0, r0.x */
-                                       baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
-                                       ir3_dst_create(baryf, regid(63, 0), 0);
-                                       ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
-                                       ir3_src_create(baryf, regid(0, 0), 0);
-
-                                       last_input = baryf;
-                               }
-
-                               last_input->dsts[0]->flags |= IR3_REG_EI;
-                               if (last_input_needs_ss) {
-                                       last_input->flags |= IR3_INSTR_SS;
-                                       regmask_init(&state->needs_ss_war, mergedregs);
-                                       regmask_init(&state->needs_ss, mergedregs);
-                               }
-                       }
-               }
-
-               last_n = n;
-       }
-
-       assert(inputs_remaining == 0 || !ctx->early_input_release);
-
-       if (has_tex_prefetch && input_count == 0) {
-               /* texture prefetch, but *no* inputs.. we need to insert a
-                * dummy bary.f at the top of the shader to unblock varying
-                * storage:
-                */
-               struct ir3_instruction *baryf;
-
-               /* (ss)bary.f (ei)r63.x, 0, r0.x */
-               baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
-               ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
-               ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
-               ir3_src_create(baryf, regid(0, 0), 0);
-
-               /* insert the dummy bary.f at head: */
-               list_delinit(&baryf->node);
-               list_add(&baryf->node, &block->instr_list);
-       }
-
-       if (last_rel)
-               last_rel->flags |= IR3_INSTR_UL;
-
-       bd->valid = true;
-
-       if (memcmp(&prev_state, state, sizeof(*state))) {
-               /* our output state changed, this invalidates all of our
-                * successors:
-                */
-               for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
-                       if (!block->successors[i])
-                               break;
-                       struct ir3_legalize_block_data *pbd = block->successors[i]->data;
-                       pbd->valid = false;
-               }
-       }
-
-       return true;
+   struct ir3_legalize_block_data *bd = block->data;
+
+   if (bd->valid)
+      return false;
+
+   struct ir3_instruction *last_rel = NULL;
+   struct ir3_instruction *last_n = NULL;
+   struct list_head instr_list;
+   struct ir3_legalize_state prev_state = bd->state;
+   struct ir3_legalize_state *state = &bd->state;
+   bool last_input_needs_ss = false;
+   bool has_tex_prefetch = false;
+   bool mergedregs = ctx->so->mergedregs;
+
+   /* our input state is the OR of all predecessor blocks' state: */
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *predecessor = block->predecessors[i];
+      struct ir3_legalize_block_data *pbd = predecessor->data;
+      struct ir3_legalize_state *pstate = &pbd->state;
+
+      /* Our input (ss)/(sy) state is based on OR'ing the output
+       * state of all our predecessor blocks
+       */
+      regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
+      regmask_or(&state->needs_ss_war, &state->needs_ss_war,
+                 &pstate->needs_ss_war);
+      regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
+   }
+
+   unsigned input_count = 0;
+
+   foreach_instr (n, &block->instr_list) {
+      if (is_input(n)) {
+         input_count++;
+      }
+   }
+
+   unsigned inputs_remaining = input_count;
+
+   /* Either inputs are in the first block or we expect inputs to be released
+    * with the end of the program.
+    */
+   assert(input_count == 0 || !ctx->early_input_release ||
+          block == ir3_start_block(block->shader));
+
+   /* remove all the instructions from the list, we'll be adding
+    * them back in as we go
+    */
+   list_replace(&block->instr_list, &instr_list);
+   list_inithead(&block->instr_list);
+
+   foreach_instr_safe (n, &instr_list) {
+      unsigned i;
+
+      n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
+
+      /* _meta::tex_prefetch instructions removed later in
+       * collect_tex_prefetches()
+       */
+      if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
+         continue;
+
+      if (is_input(n)) {
+         struct ir3_register *inloc = n->srcs[0];
+         assert(inloc->flags & IR3_REG_IMMED);
+         ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
+      }
+
+      if (last_n && is_barrier(last_n)) {
+         n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+         last_input_needs_ss = false;
+         regmask_init(&state->needs_ss_war, mergedregs);
+         regmask_init(&state->needs_ss, mergedregs);
+         regmask_init(&state->needs_sy, mergedregs);
+      }
+
+      if (last_n && (last_n->opc == OPC_PREDT)) {
+         n->flags |= IR3_INSTR_SS;
+         regmask_init(&state->needs_ss_war, mergedregs);
+         regmask_init(&state->needs_ss, mergedregs);
+      }
+
+      /* NOTE: consider dst register too.. it could happen that
+       * texture sample instruction (for example) writes some
+       * components which are unused.  A subsequent instruction
+       * that writes the same register can race w/ the sam instr
+       * resulting in undefined results:
+       */
+      for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
+         struct ir3_register *reg;
+         if (i < n->dsts_count)
+            reg = n->dsts[i];
+         else
+            reg = n->srcs[i - n->dsts_count];
+
+         if (reg_gpr(reg)) {
+
+            /* TODO: we probably only need (ss) for alu
+             * instr consuming sfu result.. need to make
+             * some tests for both this and (sy)..
+             */
+            if (regmask_get(&state->needs_ss, reg)) {
+               n->flags |= IR3_INSTR_SS;
+               last_input_needs_ss = false;
+               regmask_init(&state->needs_ss_war, mergedregs);
+               regmask_init(&state->needs_ss, mergedregs);
+            }
+
+            if (regmask_get(&state->needs_sy, reg)) {
+               n->flags |= IR3_INSTR_SY;
+               regmask_init(&state->needs_sy, mergedregs);
+            }
+         }
+
+         /* TODO: is it valid to have address reg loaded from a
+          * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
+          * last_rel check below should be moved ahead of this:
+          */
+         if (reg->flags & IR3_REG_RELATIV)
+            last_rel = n;
+      }
+
+      foreach_dst (reg, n) {
+         if (regmask_get(&state->needs_ss_war, reg)) {
+            n->flags |= IR3_INSTR_SS;
+            last_input_needs_ss = false;
+            regmask_init(&state->needs_ss_war, mergedregs);
+            regmask_init(&state->needs_ss, mergedregs);
+         }
+
+         if (last_rel && (reg->num == regid(REG_A0, 0))) {
+            last_rel->flags |= IR3_INSTR_UL;
+            last_rel = NULL;
+         }
+      }
+
+      /* cat5+ does not have an (ss) bit, if needed we need to
+       * insert a nop to carry the sync flag.  Would be kinda
+       * clever if we were aware of this during scheduling, but
+       * this should be a pretty rare case:
+       */
+      if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
+         struct ir3_instruction *nop;
+         nop = ir3_NOP(block);
+         nop->flags |= IR3_INSTR_SS;
+         n->flags &= ~IR3_INSTR_SS;
+      }
+
+      /* need to be able to set (ss) on first instruction: */
+      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
+         ir3_NOP(block);
+
+      if (ctx->compiler->samgq_workaround && ctx->type == MESA_SHADER_VERTEX &&
+          n->opc == OPC_SAMGQ) {
+         struct ir3_instruction *samgp;
+
+         list_delinit(&n->node);
+
+         for (i = 0; i < 4; i++) {
+            samgp = ir3_instr_clone(n);
+            samgp->opc = OPC_SAMGP0 + i;
+            if (i > 1)
+               samgp->flags |= IR3_INSTR_SY;
+         }
+      } else {
+         list_delinit(&n->node);
+         list_addtail(&n->node, &block->instr_list);
+      }
+
+      if (is_sfu(n))
+         regmask_set(&state->needs_ss, n->dsts[0]);
+
+      if (is_tex_or_prefetch(n)) {
+         regmask_set(&state->needs_sy, n->dsts[0]);
+         if (n->opc == OPC_META_TEX_PREFETCH)
+            has_tex_prefetch = true;
+      } else if (n->opc == OPC_RESINFO) {
+         regmask_set(&state->needs_ss, n->dsts[0]);
+         ir3_NOP(block)->flags |= IR3_INSTR_SS;
+         last_input_needs_ss = false;
+      } else if (is_load(n)) {
+         /* seems like ldlv needs (ss) bit instead??  which is odd but
+          * makes a bunch of flat-varying tests start working on a4xx.
+          */
+         if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) ||
+             (n->opc == OPC_LDLW))
+            regmask_set(&state->needs_ss, n->dsts[0]);
+         else
+            regmask_set(&state->needs_sy, n->dsts[0]);
+      } else if (is_atomic(n->opc)) {
+         if (n->flags & IR3_INSTR_G) {
+            if (ctx->compiler->gpu_id >= 600) {
+               /* New encoding, returns result via second src: */
+               regmask_set(&state->needs_sy, n->srcs[2]);
+            } else {
+               regmask_set(&state->needs_sy, n->dsts[0]);
+            }
+         } else {
+            regmask_set(&state->needs_ss, n->dsts[0]);
+         }
+      }
+
+      if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
+         ctx->so->has_ssbo = true;
+
+      /* both tex/sfu appear to not always immediately consume
+       * their src register(s):
+       */
+      if (is_tex(n) || is_sfu(n) || is_mem(n)) {
+         foreach_src (reg, n) {
+            if (reg_gpr(reg))
+               regmask_set(&state->needs_ss_war, reg);
+         }
+      }
+
+      if (ctx->early_input_release && is_input(n)) {
+         last_input_needs_ss |= (n->opc == OPC_LDLV);
+
+         assert(inputs_remaining > 0);
+         inputs_remaining--;
+         if (inputs_remaining == 0) {
+            /* This is the last input. We add the (ei) flag to release
+             * varying memory after this executes. If it's an ldlv,
+             * however, we need to insert a dummy bary.f on which we can
+             * set the (ei) flag. We may also need to insert an (ss) to
+             * guarantee that all ldlv's have finished fetching their
+             * results before releasing the varying memory.
+             */
+            struct ir3_instruction *last_input = n;
+            if (n->opc == OPC_LDLV) {
+               struct ir3_instruction *baryf;
+
+               /* (ss)bary.f (ei)r63.x, 0, r0.x */
+               baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
+               ir3_dst_create(baryf, regid(63, 0), 0);
+               ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
+               ir3_src_create(baryf, regid(0, 0), 0);
+
+               last_input = baryf;
+            }
+
+            last_input->dsts[0]->flags |= IR3_REG_EI;
+            if (last_input_needs_ss) {
+               last_input->flags |= IR3_INSTR_SS;
+               regmask_init(&state->needs_ss_war, mergedregs);
+               regmask_init(&state->needs_ss, mergedregs);
+            }
+         }
+      }
+
+      last_n = n;
+   }
+
+   assert(inputs_remaining == 0 || !ctx->early_input_release);
+
+   if (has_tex_prefetch && input_count == 0) {
+      /* texture prefetch, but *no* inputs.. we need to insert a
+       * dummy bary.f at the top of the shader to unblock varying
+       * storage:
+       */
+      struct ir3_instruction *baryf;
+
+      /* (ss)bary.f (ei)r63.x, 0, r0.x */
+      baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
+      ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
+      ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
+      ir3_src_create(baryf, regid(0, 0), 0);
+
+      /* insert the dummy bary.f at head: */
+      list_delinit(&baryf->node);
+      list_add(&baryf->node, &block->instr_list);
+   }
+
+   if (last_rel)
+      last_rel->flags |= IR3_INSTR_UL;
+
+   bd->valid = true;
+
+   if (memcmp(&prev_state, state, sizeof(*state))) {
+      /* our output state changed, this invalidates all of our
+       * successors:
+       */
+      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
+         if (!block->successors[i])
+            break;
+         struct ir3_legalize_block_data *pbd = block->successors[i]->data;
+         pbd->valid = false;
+      }
+   }
+
+   return true;
 }
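A reduced sketch of the (ss) bookkeeping in the loop above: a producer whose result needs syncing marks its destination in the regmask, and the first later instruction touching a flagged register inherits the flag and resets the state (the producer/consumer here are placeholders, not real pass code):

   static inline void
   needs_ss_sketch(struct ir3_legalize_state *state, bool mergedregs,
                   struct ir3_instruction *producer,
                   struct ir3_instruction *consumer)
   {
      /* e.g. producer is an sfu op writing r0.x */
      regmask_set(&state->needs_ss, producer->dsts[0]);

      /* e.g. consumer is an alu op reading r0.x */
      if (regmask_get(&state->needs_ss, consumer->srcs[0])) {
         consumer->flags |= IR3_INSTR_SS;
         regmask_init(&state->needs_ss, mergedregs);
         regmask_init(&state->needs_ss_war, mergedregs);
      }
   }

The (sy) mask works the same way for texture/memory results, except that only needs_sy is reset when it fires.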
 
 /* Expands dsxpp and dsypp macros to:
@@ -388,28 +387,28 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 static bool
 apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 {
-       struct list_head instr_list;
+   struct list_head instr_list;
 
-       /* remove all the instructions from the list, we'll be adding
-        * them back in as we go
-        */
-       list_replace(&block->instr_list, &instr_list);
-       list_inithead(&block->instr_list);
+   /* remove all the instructions from the list, we'll be adding
+    * them back in as we go
+    */
+   list_replace(&block->instr_list, &instr_list);
+   list_inithead(&block->instr_list);
 
-       foreach_instr_safe (n, &instr_list) {
-               list_addtail(&n->node, &block->instr_list);
+   foreach_instr_safe (n, &instr_list) {
+      list_addtail(&n->node, &block->instr_list);
 
-               if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
-                       n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;
+      if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
+         n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;
 
-                       struct ir3_instruction *op_p = ir3_instr_clone(n);
-                       op_p->flags = IR3_INSTR_P;
+         struct ir3_instruction *op_p = ir3_instr_clone(n);
+         op_p->flags = IR3_INSTR_P;
 
-                       ctx->so->need_fine_derivatives = true;
-               }
-       }
+         ctx->so->need_fine_derivatives = true;
+      }
+   }
 
-       return true;
+   return true;
 }
 
 /* NOTE: branch instructions are always the last instruction(s)
@@ -450,166 +449,165 @@ apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 static struct ir3_block *
 resolve_dest_block(struct ir3_block *block)
 {
-       /* special case for last block: */
-       if (!block->successors[0])
-               return block;
-
-       /* NOTE that we may or may not have inserted the jump
-        * in the target block yet, so conditions to resolve
-        * the dest to the dest block's successor are:
-        *
-        *   (1) successor[1] == NULL &&
-        *   (2) (block-is-empty || only-instr-is-jump)
-        */
-       if (block->successors[1] == NULL) {
-               if (list_is_empty(&block->instr_list)) {
-                       return block->successors[0];
-               } else if (list_length(&block->instr_list) == 1) {
-                       struct ir3_instruction *instr = list_first_entry(
-                                       &block->instr_list, struct ir3_instruction, node);
-                       if (instr->opc == OPC_JUMP) {
-                               /* If this jump is backwards, then we will probably convert
-                                * the jump being resolved to a backwards jump, which will
-                                * change a loop-with-continue or loop-with-if into a
-                                * doubly-nested loop and change the convergence behavior.
-                                * Disallow this here.
-                                */
-                               if (block->successors[0]->index <= block->index)
-                                       return block;
-                               return block->successors[0];
-                       }
-               }
-       }
-       return block;
+   /* special case for last block: */
+   if (!block->successors[0])
+      return block;
+
+   /* NOTE that we may or may not have inserted the jump
+    * in the target block yet, so conditions to resolve
+    * the dest to the dest block's successor are:
+    *
+    *   (1) successor[1] == NULL &&
+    *   (2) (block-is-empty || only-instr-is-jump)
+    */
+   if (block->successors[1] == NULL) {
+      if (list_is_empty(&block->instr_list)) {
+         return block->successors[0];
+      } else if (list_length(&block->instr_list) == 1) {
+         struct ir3_instruction *instr =
+            list_first_entry(&block->instr_list, struct ir3_instruction, node);
+         if (instr->opc == OPC_JUMP) {
+            /* If this jump is backwards, then we will probably convert
+             * the jump being resolved to a backwards jump, which will
+             * change a loop-with-continue or loop-with-if into a
+             * doubly-nested loop and change the convergence behavior.
+             * Disallow this here.
+             */
+            if (block->successors[0]->index <= block->index)
+               return block;
+            return block->successors[0];
+         }
+      }
+   }
+   return block;
 }
 
 static void
 remove_unused_block(struct ir3_block *old_target)
 {
-       list_delinit(&old_target->node);
-
-       /* cleanup dangling predecessors: */
-       for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
-               if (old_target->successors[i]) {
-                       struct ir3_block *succ = old_target->successors[i];
-                       ir3_block_remove_predecessor(succ, old_target);
-               }
-       }
+   list_delinit(&old_target->node);
+
+   /* cleanup dangling predecessors: */
+   for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
+      if (old_target->successors[i]) {
+         struct ir3_block *succ = old_target->successors[i];
+         ir3_block_remove_predecessor(succ, old_target);
+      }
+   }
 }
 
 static bool
 retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
 {
-       struct ir3_block *old_target = instr->cat0.target;
-       struct ir3_block *cur_block = instr->block;
+   struct ir3_block *old_target = instr->cat0.target;
+   struct ir3_block *cur_block = instr->block;
 
-       /* update current block's successors to reflect the retargeting: */
-       if (cur_block->successors[0] == old_target) {
-               cur_block->successors[0] = new_target;
-       } else {
-               debug_assert(cur_block->successors[1] == old_target);
-               cur_block->successors[1] = new_target;
-       }
+   /* update current block's successors to reflect the retargeting: */
+   if (cur_block->successors[0] == old_target) {
+      cur_block->successors[0] = new_target;
+   } else {
+      debug_assert(cur_block->successors[1] == old_target);
+      cur_block->successors[1] = new_target;
+   }
 
-       /* update new target's predecessors: */
-       ir3_block_add_predecessor(new_target, cur_block);
+   /* update new target's predecessors: */
+   ir3_block_add_predecessor(new_target, cur_block);
 
-       /* and remove old_target's predecessor: */
-       ir3_block_remove_predecessor(old_target, cur_block);
+   /* and remove old_target's predecessor: */
+   ir3_block_remove_predecessor(old_target, cur_block);
 
-       instr->cat0.target = new_target;
+   instr->cat0.target = new_target;
 
-       if (old_target->predecessors_count == 0) {
-               remove_unused_block(old_target);
-               return true;
-       }
+   if (old_target->predecessors_count == 0) {
+      remove_unused_block(old_target);
+      return true;
+   }
 
-       return false;
+   return false;
 }
 
 static bool
 opt_jump(struct ir3 *ir)
 {
-       bool progress = false;
-
-       unsigned index = 0;
-       foreach_block (block, &ir->block_list)
-               block->index = index++;
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (!is_flow(instr) || !instr->cat0.target)
-                               continue;
-
-                       struct ir3_block *tblock =
-                               resolve_dest_block(instr->cat0.target);
-                       if (tblock != instr->cat0.target) {
-                               progress = true;
-
-                               /* Exit early if we deleted a block to avoid iterator
-                                * weirdness/assert fails
-                                */
-                               if (retarget_jump(instr, tblock))
-                                       return true;
-                       }
-               }
-
-               /* Detect the case where the block ends either with:
-                * - A single unconditional jump to the next block.
-                * - Two jump instructions with opposite conditions, and one of
-                *   them jumps to the next block.
-                * We can remove the one that jumps to the next block in either case.
-                */
-               if (list_is_empty(&block->instr_list))
-                       continue;
-
-               struct ir3_instruction *jumps[2] = { NULL, NULL };
-               jumps[0] = list_last_entry(&block->instr_list, struct ir3_instruction, node);
-               if (!list_is_singular(&block->instr_list))
-                       jumps[1] = list_last_entry(&jumps[0]->node, struct ir3_instruction, node);
-
-               if (jumps[0]->opc == OPC_JUMP)
-                       jumps[1] = NULL;
-               else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B)
-                       continue;
-               
-               for (unsigned i = 0; i < 2; i++) {
-                       if (!jumps[i])
-                               continue;
-
-                       struct ir3_block *tblock = jumps[i]->cat0.target;
-                       if (&tblock->node == block->node.next) {
-                               list_delinit(&jumps[i]->node);
-                               progress = true;
-                               break;
-                       }
-               }
-       }
-
-       return progress;
+   bool progress = false;
+
+   unsigned index = 0;
+   foreach_block (block, &ir->block_list)
+      block->index = index++;
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (!is_flow(instr) || !instr->cat0.target)
+            continue;
+
+         struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
+         if (tblock != instr->cat0.target) {
+            progress = true;
+
+            /* Exit early if we deleted a block to avoid iterator
+             * weirdness/assert fails
+             */
+            if (retarget_jump(instr, tblock))
+               return true;
+         }
+      }
+
+      /* Detect the case where the block ends either with:
+       * - A single unconditional jump to the next block.
+       * - Two jump instructions with opposite conditions, and one of
+       *   them jumps to the next block.
+       * We can remove the one that jumps to the next block in either case.
+       */
+      if (list_is_empty(&block->instr_list))
+         continue;
+
+      struct ir3_instruction *jumps[2] = {NULL, NULL};
+      jumps[0] =
+         list_last_entry(&block->instr_list, struct ir3_instruction, node);
+      if (!list_is_singular(&block->instr_list))
+         jumps[1] =
+            list_last_entry(&jumps[0]->node, struct ir3_instruction, node);
+
+      if (jumps[0]->opc == OPC_JUMP)
+         jumps[1] = NULL;
+      else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B)
+         continue;
+
+      for (unsigned i = 0; i < 2; i++) {
+         if (!jumps[i])
+            continue;
+
+         struct ir3_block *tblock = jumps[i]->cat0.target;
+         if (&tblock->node == block->node.next) {
+            list_delinit(&jumps[i]->node);
+            progress = true;
+            break;
+         }
+      }
+   }
+
+   return progress;
 }
 
 static void
 resolve_jumps(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list)
-               foreach_instr (instr, &block->instr_list)
-                       if (is_flow(instr) && instr->cat0.target) {
-                               struct ir3_instruction *target =
-                                       list_first_entry(&instr->cat0.target->instr_list,
-                                                       struct ir3_instruction, node);
-
-                               instr->cat0.immed =
-                                       (int)target->ip - (int)instr->ip;
-                       }
-
+   foreach_block (block, &ir->block_list)
+      foreach_instr (instr, &block->instr_list)
+         if (is_flow(instr) && instr->cat0.target) {
+            struct ir3_instruction *target = list_first_entry(
+               &instr->cat0.target->instr_list, struct ir3_instruction, node);
+
+            instr->cat0.immed = (int)target->ip - (int)instr->ip;
+         }
 }
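An aside on the arithmetic in resolve_jumps(): cat0.immed is simply the signed instruction-count distance from the branch to the first instruction of its target block. A minimal sketch of that computation in plain C (not the ir3 structures), assuming the two ip values are the only inputs:

   /* Illustrative only: a branch at ip 10 whose target block starts at
    * ip 4 is encoded with immed = 4 - 10 = -6. */
   static inline int
   branch_offset(unsigned target_ip, unsigned branch_ip)
   {
      return (int)target_ip - (int)branch_ip;
   }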
 
-static void mark_jp(struct ir3_block *block)
+static void
+mark_jp(struct ir3_block *block)
 {
-       struct ir3_instruction *target = list_first_entry(&block->instr_list,
-                       struct ir3_instruction, node);
-       target->flags |= IR3_INSTR_JP;
+   struct ir3_instruction *target =
+      list_first_entry(&block->instr_list, struct ir3_instruction, node);
+   target->flags |= IR3_INSTR_JP;
 }
 
 /* Mark points where control flow converges or diverges.
@@ -622,24 +620,24 @@ static void mark_jp(struct ir3_block *block)
 static void
 mark_xvergence_points(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list) {
-               if (block->predecessors_count > 1) {
-                       /* if a block has more than one possible predecessor, then
-                        * the first instruction is a convergence point.
-                        */
-                       mark_jp(block);
-               } else if (block->predecessors_count == 1) {
-                       /* If a block has one predecessor, which has multiple possible
-                        * successors, it is a divergence point.
-                        */
-                       for (unsigned i = 0; i < block->predecessors_count; i++) {
-                               struct ir3_block *predecessor = block->predecessors[i];
-                               if (predecessor->successors[1]) {
-                                       mark_jp(block);
-                               }
-                       }
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      if (block->predecessors_count > 1) {
+         /* if a block has more than one possible predecessor, then
+          * the first instruction is a convergence point.
+          */
+         mark_jp(block);
+      } else if (block->predecessors_count == 1) {
+         /* If a block has one predecessor, which has multiple possible
+          * successors, it is a divergence point.
+          */
+         for (unsigned i = 0; i < block->predecessors_count; i++) {
+            struct ir3_block *predecessor = block->predecessors[i];
+            if (predecessor->successors[1]) {
+               mark_jp(block);
+            }
+         }
+      }
+   }
 }
 
 /* Insert the branch/jump instructions for flow control between blocks.
@@ -653,62 +651,64 @@ mark_xvergence_points(struct ir3 *ir)
 static void
 block_sched(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list) {
-               if (block->successors[1]) {
-                       /* if/else, conditional branches to "then" or "else": */
-                       struct ir3_instruction *br1, *br2;
-
-                       if (block->brtype == IR3_BRANCH_GETONE) {
-                               /* getone can't be inverted, and it wouldn't even make sense
-                                * to follow it with an inverted branch, so follow it by an
-                                * unconditional branch.
-                                */
-                               debug_assert(!block->condition);
-                               br1 = ir3_GETONE(block);
-                               br1->cat0.target = block->successors[1];
-
-                               br2 = ir3_JUMP(block);
-                               br2->cat0.target = block->successors[0];
-                       } else {
-                               debug_assert(block->condition);
-
-                               /* create "else" branch first (since "then" block should
-                                * frequently/always end up being a fall-thru):
-                                */
-                               br1 = ir3_instr_create(block, OPC_B, 0, 1);
-                               ir3_src_create(br1, regid(REG_P0, 0), 0)->def = block->condition->dsts[0];
-                               br1->cat0.inv1 = true;
-                               br1->cat0.target = block->successors[1];
-
-                               /* "then" branch: */
-                               br2 = ir3_instr_create(block, OPC_B, 0, 1);
-                               ir3_src_create(br2, regid(REG_P0, 0), 0)->def = block->condition->dsts[0];
-                               br2->cat0.target = block->successors[0];
-
-                               switch (block->brtype) {
-                               case IR3_BRANCH_COND:
-                                       br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN;
-                                       break;
-                               case IR3_BRANCH_ALL:
-                                       br1->cat0.brtype = BRANCH_ANY;
-                                       br2->cat0.brtype = BRANCH_ALL;
-                                       break;
-                               case IR3_BRANCH_ANY:
-                                       br1->cat0.brtype = BRANCH_ALL;
-                                       br2->cat0.brtype = BRANCH_ANY;
-                                       break;
-                               case IR3_BRANCH_GETONE:
-                                       unreachable("can't get here");
-                               }
-                       }
-               } else if (block->successors[0]) {
-                       /* otherwise unconditional jump to next block: */
-                       struct ir3_instruction *jmp;
-
-                       jmp = ir3_JUMP(block);
-                       jmp->cat0.target = block->successors[0];
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      if (block->successors[1]) {
+         /* if/else, conditional branches to "then" or "else": */
+         struct ir3_instruction *br1, *br2;
+
+         if (block->brtype == IR3_BRANCH_GETONE) {
+            /* getone can't be inverted, and it wouldn't even make sense
+             * to follow it with an inverted branch, so follow it by an
+             * unconditional branch.
+             */
+            debug_assert(!block->condition);
+            br1 = ir3_GETONE(block);
+            br1->cat0.target = block->successors[1];
+
+            br2 = ir3_JUMP(block);
+            br2->cat0.target = block->successors[0];
+         } else {
+            debug_assert(block->condition);
+
+            /* create "else" branch first (since "then" block should
+             * frequently/always end up being a fall-thru):
+             */
+            br1 = ir3_instr_create(block, OPC_B, 0, 1);
+            ir3_src_create(br1, regid(REG_P0, 0), 0)->def =
+               block->condition->dsts[0];
+            br1->cat0.inv1 = true;
+            br1->cat0.target = block->successors[1];
+
+            /* "then" branch: */
+            br2 = ir3_instr_create(block, OPC_B, 0, 1);
+            ir3_src_create(br2, regid(REG_P0, 0), 0)->def =
+               block->condition->dsts[0];
+            br2->cat0.target = block->successors[0];
+
+            switch (block->brtype) {
+            case IR3_BRANCH_COND:
+               br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN;
+               break;
+            case IR3_BRANCH_ALL:
+               br1->cat0.brtype = BRANCH_ANY;
+               br2->cat0.brtype = BRANCH_ALL;
+               break;
+            case IR3_BRANCH_ANY:
+               br1->cat0.brtype = BRANCH_ALL;
+               br2->cat0.brtype = BRANCH_ANY;
+               break;
+            case IR3_BRANCH_GETONE:
+               unreachable("can't get here");
+            }
+         }
+      } else if (block->successors[0]) {
+         /* otherwise unconditional jump to next block: */
+         struct ir3_instruction *jmp;
+
+         jmp = ir3_JUMP(block);
+         jmp->cat0.target = block->successors[0];
+      }
+   }
 }
 
 /* Here we workaround the fact that kill doesn't actually kill the thread as
@@ -731,176 +731,176 @@ block_sched(struct ir3 *ir)
 static void
 kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
 {
-       /* True if we know that this block will always eventually lead to the end
-        * block:
-        */
-       bool always_ends = true;
-       bool added = false;
-       struct ir3_block *last_block =
-               list_last_entry(&ir->block_list, struct ir3_block, node);
-
-       foreach_block_rev (block, &ir->block_list) {
-               for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
-                       if (block->successors[i]->start_ip <= block->end_ip)
-                               always_ends = false;
-               }
-
-               if (always_ends)
-                       continue;
-
-               foreach_instr_safe (instr, &block->instr_list) {
-                       if (instr->opc != OPC_KILL)
-                               continue;
-
-                       struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1);
-                       ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask = 1;
-                       br->cat0.target =
-                               list_last_entry(&ir->block_list, struct ir3_block, node);
-
-                       list_del(&br->node);
-                       list_add(&br->node, &instr->node);
-
-                       added = true;
-               }
-       }
-
-       if (added) {
-               /* I'm not entirely sure how the branchstack works, but we probably
-                * need to add at least one entry for the divergence which is resolved
-                * at the end:
-                */
-               so->branchstack++;
-
-               /* We don't update predecessors/successors, so we have to do this
-                * manually:
-                */
-               mark_jp(last_block);
-       }
+   /* True if we know that this block will always eventually lead to the end
+    * block:
+    */
+   bool always_ends = true;
+   bool added = false;
+   struct ir3_block *last_block =
+      list_last_entry(&ir->block_list, struct ir3_block, node);
+
+   foreach_block_rev (block, &ir->block_list) {
+      for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
+         if (block->successors[i]->start_ip <= block->end_ip)
+            always_ends = false;
+      }
+
+      if (always_ends)
+         continue;
+
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->opc != OPC_KILL)
+            continue;
+
+         struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1);
+         ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
+            1;
+         br->cat0.target =
+            list_last_entry(&ir->block_list, struct ir3_block, node);
+
+         list_del(&br->node);
+         list_add(&br->node, &instr->node);
+
+         added = true;
+      }
+   }
+
+   if (added) {
+      /* I'm not entirely sure how the branchstack works, but we probably
+       * need to add at least one entry for the divergence which is resolved
+       * at the end:
+       */
+      so->branchstack++;
+
+      /* We don't update predecessors/successors, so we have to do this
+       * manually:
+       */
+      mark_jp(last_block);
+   }
 }
 
 /* Insert nop's required to make this a legal/valid shader program: */
 static void
 nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
 {
-       foreach_block (block, &ir->block_list) {
-               struct ir3_instruction *last = NULL;
-               struct list_head instr_list;
-
-               /* remove all the instructions from the list, we'll be adding
-                * them back in as we go
-                */
-               list_replace(&block->instr_list, &instr_list);
-               list_inithead(&block->instr_list);
-
-               foreach_instr_safe (instr, &instr_list) {
-                       unsigned delay =
-                               ir3_delay_calc_exact(block, instr, so->mergedregs);
-
-                       /* NOTE: I think the nopN encoding works for a5xx and
-                        * probably a4xx, but not a3xx.  So far only tested on
-                        * a6xx.
-                        */
-
-                       if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
-                                       ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
-                                       (last->repeat == 0)) {
-                               /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
-                               unsigned transfer = MIN2(delay, 3 - last->nop);
-                               last->nop += transfer;
-                               delay -= transfer;
-                       }
-
-                       if ((delay > 0) && last && (last->opc == OPC_NOP)) {
-                               /* the previous nop can encode at most 5 repeats: */
-                               unsigned transfer = MIN2(delay, 5 - last->repeat);
-                               last->repeat += transfer;
-                               delay -= transfer;
-                       }
-
-                       if (delay > 0) {
-                               debug_assert(delay <= 6);
-                               ir3_NOP(block)->repeat = delay - 1;
-                       }
-
-                       list_addtail(&instr->node, &block->instr_list);
-                       last = instr;
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      struct ir3_instruction *last = NULL;
+      struct list_head instr_list;
+
+      /* remove all the instructions from the list, we'll be adding
+       * them back in as we go
+       */
+      list_replace(&block->instr_list, &instr_list);
+      list_inithead(&block->instr_list);
+
+      foreach_instr_safe (instr, &instr_list) {
+         unsigned delay = ir3_delay_calc_exact(block, instr, so->mergedregs);
+
+         /* NOTE: I think the nopN encoding works for a5xx and
+          * probably a4xx, but not a3xx.  So far only tested on
+          * a6xx.
+          */
+
+         if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
+             ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
+             (last->repeat == 0)) {
+            /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+            unsigned transfer = MIN2(delay, 3 - last->nop);
+            last->nop += transfer;
+            delay -= transfer;
+         }
+
+         if ((delay > 0) && last && (last->opc == OPC_NOP)) {
+            /* the previous nop can encode at most 5 repeats: */
+            unsigned transfer = MIN2(delay, 5 - last->repeat);
+            last->repeat += transfer;
+            delay -= transfer;
+         }
+
+         if (delay > 0) {
+            debug_assert(delay <= 6);
+            ir3_NOP(block)->repeat = delay - 1;
+         }
+
+         list_addtail(&instr->node, &block->instr_list);
+         last = instr;
+      }
+   }
 }
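The transfer pattern in nop_sched() is applied twice: fold as much of the required delay as fits into the (nopN) field of a preceding cat2/cat3 (at most 3), or into the (rptN) field of a preceding nop (at most 5), then emit a standalone nop with repeat = remaining - 1 for whatever is left. A minimal sketch of that pattern, using Mesa's MIN2 macro from u_math; absorb_delay is a hypothetical name, not ir3 API:

   /* Absorb as much of a required delay as still fits in an existing
    * encoding field with the given maximum, returning the leftover cycles.
    * E.g. a 6-cycle delay right after a fresh cat2/cat3 becomes (nop3) on
    * that instruction plus a new nop with repeat = 2 for the remaining 3. */
   static unsigned
   absorb_delay(unsigned delay, unsigned *field, unsigned field_max)
   {
      unsigned transfer = MIN2(delay, field_max - *field);
      *field += transfer;
      return delay - transfer;
   }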
 
 bool
 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 {
-       struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
-       bool mergedregs = so->mergedregs;
-       bool progress;
-
-       ctx->so = so;
-       ctx->max_bary = -1;
-       ctx->compiler = ir->compiler;
-       ctx->type = ir->type;
-
-       /* allocate per-block data: */
-       foreach_block (block, &ir->block_list) {
-               struct ir3_legalize_block_data *bd =
-                               rzalloc(ctx, struct ir3_legalize_block_data);
-
-               regmask_init(&bd->state.needs_ss_war, mergedregs);
-               regmask_init(&bd->state.needs_ss, mergedregs);
-               regmask_init(&bd->state.needs_sy, mergedregs);
-
-               block->data = bd;
-       }
-
-       ir3_remove_nops(ir);
-
-       /* We may have failed to pull all input loads into the first block.
-        * In that case, at the moment, we aren't able to find a better place
-        * for (ei) than the end of the program.
-        * a5xx and a6xx do automatically release varying storage at the end.
-        */
-       ctx->early_input_release = true;
-       struct ir3_block *start_block = ir3_start_block(ir);
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (is_input(instr) && block != start_block) {
-                               ctx->early_input_release = false;
-                               break;
-                       }
-               }
-       }
-
-       assert(ctx->early_input_release || ctx->compiler->gpu_id > 500);
-
-       /* process each block: */
-       do {
-               progress = false;
-               foreach_block (block, &ir->block_list) {
-                       progress |= legalize_block(ctx, block);
-               }
-       } while (progress);
-
-       *max_bary = ctx->max_bary;
-
-       block_sched(ir);
-       if (so->type == MESA_SHADER_FRAGMENT)
-               kill_sched(ir, so);
-
-       foreach_block (block, &ir->block_list) {
-               progress |= apply_fine_deriv_macro(ctx, block);
-       }
-
-       nop_sched(ir, so);
-
-       while (opt_jump(ir))
-               ;
-
-       ir3_count_instructions(ir);
-       resolve_jumps(ir);
-
-       mark_xvergence_points(ir);
-
-       ralloc_free(ctx);
-
-       return true;
+   struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
+   bool mergedregs = so->mergedregs;
+   bool progress;
+
+   ctx->so = so;
+   ctx->max_bary = -1;
+   ctx->compiler = ir->compiler;
+   ctx->type = ir->type;
+
+   /* allocate per-block data: */
+   foreach_block (block, &ir->block_list) {
+      struct ir3_legalize_block_data *bd =
+         rzalloc(ctx, struct ir3_legalize_block_data);
+
+      regmask_init(&bd->state.needs_ss_war, mergedregs);
+      regmask_init(&bd->state.needs_ss, mergedregs);
+      regmask_init(&bd->state.needs_sy, mergedregs);
+
+      block->data = bd;
+   }
+
+   ir3_remove_nops(ir);
+
+   /* We may have failed to pull all input loads into the first block.
+    * In that case, at the moment, we aren't able to find a better place
+    * for (ei) than the end of the program.
+    * a5xx and a6xx do automatically release varying storage at the end.
+    */
+   ctx->early_input_release = true;
+   struct ir3_block *start_block = ir3_start_block(ir);
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (is_input(instr) && block != start_block) {
+            ctx->early_input_release = false;
+            break;
+         }
+      }
+   }
+
+   assert(ctx->early_input_release || ctx->compiler->gpu_id > 500);
+
+   /* process each block: */
+   do {
+      progress = false;
+      foreach_block (block, &ir->block_list) {
+         progress |= legalize_block(ctx, block);
+      }
+   } while (progress);
+
+   *max_bary = ctx->max_bary;
+
+   block_sched(ir);
+   if (so->type == MESA_SHADER_FRAGMENT)
+      kill_sched(ir, so);
+
+   foreach_block (block, &ir->block_list) {
+      progress |= apply_fine_deriv_macro(ctx, block);
+   }
+
+   nop_sched(ir, so);
+
+   while (opt_jump(ir))
+      ;
+
+   ir3_count_instructions(ir);
+   resolve_jumps(ir);
+
+   mark_xvergence_points(ir);
+
+   ralloc_free(ctx);
+
+   return true;
 }
index 86e4a0f..4cdf5fb 100644 (file)
 
 static bool
 compute_block_liveness(struct ir3_liveness *live, struct ir3_block *block,
-                                          BITSET_WORD *tmp_live, unsigned bitset_words)
+                       BITSET_WORD *tmp_live, unsigned bitset_words)
 {
-       memcpy(tmp_live, live->live_out[block->index], bitset_words *
-                       sizeof(BITSET_WORD));
-
-       /* Process instructions */
-       foreach_instr_rev (instr, &block->instr_list) {
-               ra_foreach_dst(dst, instr) {
-                       if (BITSET_TEST(tmp_live, dst->name))
-                               dst->flags &= ~IR3_REG_UNUSED;
-                       else
-                               dst->flags |= IR3_REG_UNUSED;
-                       BITSET_CLEAR(tmp_live, dst->name);
-               }
-
-               /* Phi node uses occur after the predecessor block */
-               if (instr->opc != OPC_META_PHI) {
-                       ra_foreach_src(src, instr) {
-                               if (BITSET_TEST(tmp_live, src->def->name))
-                                       src->flags &= ~IR3_REG_KILL;
-                               else
-                                       src->flags |= IR3_REG_KILL;
-                       }
-
-                       ra_foreach_src(src, instr) {
-                               if (BITSET_TEST(tmp_live, src->def->name))
-                                       src->flags &= ~IR3_REG_FIRST_KILL;
-                               else
-                                       src->flags |= IR3_REG_FIRST_KILL;
-                               BITSET_SET(tmp_live, src->def->name);
-                       }
-               }
-       }
-
-       memcpy(live->live_in[block->index], tmp_live,
-                       bitset_words * sizeof(BITSET_WORD));
-
-       bool progress = false;
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               const struct ir3_block *pred = block->predecessors[i];
-               for (unsigned j = 0; j < bitset_words; j++) {
-                       if (tmp_live[j] & ~live->live_out[pred->index][j])
-                               progress = true;
-                       live->live_out[pred->index][j] |= tmp_live[j];
-               }
-
-               /* Process phi sources. */
-               foreach_instr (phi, &block->instr_list) {
-                       if (phi->opc != OPC_META_PHI)
-                               break;
-                       if (!phi->srcs[i]->def)
-                               continue;
-                       unsigned name = phi->srcs[i]->def->name;
-                       if (!BITSET_TEST(live->live_out[pred->index], name)) {
-                               progress = true;
-                               BITSET_SET(live->live_out[pred->index], name);
-                       }
-               }
-       }
-
-       for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
-               const struct ir3_block *pred = block->physical_predecessors[i];
-               unsigned name;
-               BITSET_FOREACH_SET(name, tmp_live, live->definitions_count) {
-                       struct ir3_register *reg = live->definitions[name];
-                       if (!(reg->flags & IR3_REG_SHARED))
-                               continue;
-                       if (!BITSET_TEST(live->live_out[pred->index], name)) {
-                               progress = true;
-                               BITSET_SET(live->live_out[pred->index], name);
-                       }
-               }
-       }
-       
-       return progress;
+   memcpy(tmp_live, live->live_out[block->index],
+          bitset_words * sizeof(BITSET_WORD));
+
+   /* Process instructions */
+   foreach_instr_rev (instr, &block->instr_list) {
+      ra_foreach_dst (dst, instr) {
+         if (BITSET_TEST(tmp_live, dst->name))
+            dst->flags &= ~IR3_REG_UNUSED;
+         else
+            dst->flags |= IR3_REG_UNUSED;
+         BITSET_CLEAR(tmp_live, dst->name);
+      }
+
+      /* Phi node uses occur after the predecessor block */
+      if (instr->opc != OPC_META_PHI) {
+         ra_foreach_src (src, instr) {
+            if (BITSET_TEST(tmp_live, src->def->name))
+               src->flags &= ~IR3_REG_KILL;
+            else
+               src->flags |= IR3_REG_KILL;
+         }
+
+         ra_foreach_src (src, instr) {
+            if (BITSET_TEST(tmp_live, src->def->name))
+               src->flags &= ~IR3_REG_FIRST_KILL;
+            else
+               src->flags |= IR3_REG_FIRST_KILL;
+            BITSET_SET(tmp_live, src->def->name);
+         }
+      }
+   }
+
+   memcpy(live->live_in[block->index], tmp_live,
+          bitset_words * sizeof(BITSET_WORD));
+
+   bool progress = false;
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      const struct ir3_block *pred = block->predecessors[i];
+      for (unsigned j = 0; j < bitset_words; j++) {
+         if (tmp_live[j] & ~live->live_out[pred->index][j])
+            progress = true;
+         live->live_out[pred->index][j] |= tmp_live[j];
+      }
+
+      /* Process phi sources. */
+      foreach_instr (phi, &block->instr_list) {
+         if (phi->opc != OPC_META_PHI)
+            break;
+         if (!phi->srcs[i]->def)
+            continue;
+         unsigned name = phi->srcs[i]->def->name;
+         if (!BITSET_TEST(live->live_out[pred->index], name)) {
+            progress = true;
+            BITSET_SET(live->live_out[pred->index], name);
+         }
+      }
+   }
+
+   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
+      const struct ir3_block *pred = block->physical_predecessors[i];
+      unsigned name;
+      BITSET_FOREACH_SET (name, tmp_live, live->definitions_count) {
+         struct ir3_register *reg = live->definitions[name];
+         if (!(reg->flags & IR3_REG_SHARED))
+            continue;
+         if (!BITSET_TEST(live->live_out[pred->index], name)) {
+            progress = true;
+            BITSET_SET(live->live_out[pred->index], name);
+         }
+      }
+   }
+
+   return progress;
 }
 
-struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v)
+struct ir3_liveness *
+ir3_calc_liveness(struct ir3_shader_variant *v)
 {
-       struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness);
-
-       /* Reserve name 0 to mean "doesn't have a name yet" to make the debug
-        * output nicer.
-        */
-       array_insert(live, live->definitions, NULL);
-
-       /* Build definition <-> name mapping */
-       unsigned block_count = 0;
-       foreach_block (block, &v->ir->block_list) {
-               block->index = block_count++;
-               foreach_instr (instr, &block->instr_list) {
-                       ra_foreach_dst(dst, instr) {
-                               dst->name = live->definitions_count;
-                               array_insert(live, live->definitions, dst);
-                       }
-               }
-       }
-
-       live->block_count = block_count;
-
-       unsigned bitset_words = BITSET_WORDS(live->definitions_count);
-       BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words);
-       live->live_in = ralloc_array(live, BITSET_WORD *, block_count);
-       live->live_out = ralloc_array(live, BITSET_WORD *, block_count);
-       unsigned i = 0;
-       foreach_block (block, &v->ir->block_list) {
-               block->index = i++;
-               live->live_in[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
-               live->live_out[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
-       }
-
-       bool progress = true;
-       while (progress) {
-               progress = false;
-               foreach_block_rev (block, &v->ir->block_list) {
-                       progress |=
-                               compute_block_liveness(live, block, tmp_live, bitset_words);
-               }
-       }
-
-       return live;
+   struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness);
+
+   /* Reserve name 0 to mean "doesn't have a name yet" to make the debug
+    * output nicer.
+    */
+   array_insert(live, live->definitions, NULL);
+
+   /* Build definition <-> name mapping */
+   unsigned block_count = 0;
+   foreach_block (block, &v->ir->block_list) {
+      block->index = block_count++;
+      foreach_instr (instr, &block->instr_list) {
+         ra_foreach_dst (dst, instr) {
+            dst->name = live->definitions_count;
+            array_insert(live, live->definitions, dst);
+         }
+      }
+   }
+
+   live->block_count = block_count;
+
+   unsigned bitset_words = BITSET_WORDS(live->definitions_count);
+   BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words);
+   live->live_in = ralloc_array(live, BITSET_WORD *, block_count);
+   live->live_out = ralloc_array(live, BITSET_WORD *, block_count);
+   unsigned i = 0;
+   foreach_block (block, &v->ir->block_list) {
+      block->index = i++;
+      live->live_in[block->index] =
+         rzalloc_array(live, BITSET_WORD, bitset_words);
+      live->live_out[block->index] =
+         rzalloc_array(live, BITSET_WORD, bitset_words);
+   }
+
+   bool progress = true;
+   while (progress) {
+      progress = false;
+      foreach_block_rev (block, &v->ir->block_list) {
+         progress |=
+            compute_block_liveness(live, block, tmp_live, bitset_words);
+      }
+   }
+
+   return live;
 }
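For context on the fixed-point loop in ir3_calc_liveness(): this is the textbook backward dataflow where a block's live-in set is its upward-exposed uses plus whatever is live-out and not redefined, and live-out is the union of successor (and phi / physical-edge) contributions. A minimal per-word sketch over plain bitset words, with hypothetical use/def arrays standing in for the actual reverse instruction walk:

   #include <stdbool.h>
   #include <stdint.h>

   /* One relaxation step for a single block: live_in = use | (live_out & ~def).
    * Returns true if any name newly became live, i.e. another iteration of
    * the outer fixed-point loop is needed. */
   static bool
   liveness_step(uint32_t *live_in, const uint32_t *live_out,
                 const uint32_t *use, const uint32_t *def, unsigned words)
   {
      bool progress = false;
      for (unsigned w = 0; w < words; w++) {
         uint32_t new_in = use[w] | (live_out[w] & ~def[w]);
         if (new_in & ~live_in[w])
            progress = true;
         live_in[w] = new_in;
      }
      return progress;
   }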
 
 /* Return true if "def" is live after "instr". It's assumed that "def"
@@ -165,32 +168,31 @@ struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v)
  */
 bool
 ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
-                                  struct ir3_instruction *instr)
+                   struct ir3_instruction *instr)
 {
-       /* If it's live out then it's definitely live at the instruction. */
-       if (BITSET_TEST(live->live_out[instr->block->index], def->name))
-               return true;
-
-       /* If it's not live in and not defined in the same block then the live
-        * range can't extend to the instruction.
-        */
-       if (def->instr->block != instr->block &&
-               !BITSET_TEST(live->live_in[instr->block->index], def->name))
-               return false;
-
-       /* Ok, now comes the tricky case, where "def" is killed somewhere in
-        * "instr"'s block and we have to check if it's before or after.
-        */
-       foreach_instr_rev (test_instr, &instr->block->instr_list) {
-               if (test_instr == instr)
-                       break;
-
-               for (unsigned i = 0; i < test_instr->srcs_count; i++) {
-                       if (test_instr->srcs[i]->def == def)
-                               return true;
-               }
-       }
-
-       return false;
+   /* If it's live out then it's definitely live at the instruction. */
+   if (BITSET_TEST(live->live_out[instr->block->index], def->name))
+      return true;
+
+   /* If it's not live in and not defined in the same block then the live
+    * range can't extend to the instruction.
+    */
+   if (def->instr->block != instr->block &&
+       !BITSET_TEST(live->live_in[instr->block->index], def->name))
+      return false;
+
+   /* Ok, now comes the tricky case, where "def" is killed somewhere in
+    * "instr"'s block and we have to check if it's before or after.
+    */
+   foreach_instr_rev (test_instr, &instr->block->instr_list) {
+      if (test_instr == instr)
+         break;
+
+      for (unsigned i = 0; i < test_instr->srcs_count; i++) {
+         if (test_instr->srcs[i]->def == def)
+            return true;
+      }
+   }
+
+   return false;
 }
-
index ef9a4ba..81087d6 100644 (file)
 #include "ir3_shader.h"
 
 struct copy_src {
-       unsigned flags;
-       union {
-               uint32_t imm;
-               physreg_t reg;
-               unsigned const_num;
-       };
+   unsigned flags;
+   union {
+      uint32_t imm;
+      physreg_t reg;
+      unsigned const_num;
+   };
 };
 
 struct copy_entry {
-       physreg_t dst;
-       unsigned flags;
-       bool done;
+   physreg_t dst;
+   unsigned flags;
+   bool done;
 
-       struct copy_src src;
+   struct copy_src src;
 };
 
 static unsigned
 copy_entry_size(const struct copy_entry *entry)
 {
-       return (entry->flags & IR3_REG_HALF) ? 1 : 2;
+   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
 }
 
 static struct copy_src
 get_copy_src(const struct ir3_register *reg, unsigned offset)
 {
-       if (reg->flags & IR3_REG_IMMED) {
-               return (struct copy_src) {
-                       .flags = IR3_REG_IMMED,
-                       .imm = reg->uim_val,
-               };
-       } else if (reg->flags & IR3_REG_CONST) {
-               return (struct copy_src) {
-                       .flags = IR3_REG_CONST,
-                       .const_num = reg->num,
-               };
-       } else {
-               return (struct copy_src) {
-                       .flags = 0,
-                       .reg = ra_reg_get_physreg(reg) + offset,
-               };
-       }
+   if (reg->flags & IR3_REG_IMMED) {
+      return (struct copy_src){
+         .flags = IR3_REG_IMMED,
+         .imm = reg->uim_val,
+      };
+   } else if (reg->flags & IR3_REG_CONST) {
+      return (struct copy_src){
+         .flags = IR3_REG_CONST,
+         .const_num = reg->num,
+      };
+   } else {
+      return (struct copy_src){
+         .flags = 0,
+         .reg = ra_reg_get_physreg(reg) + offset,
+      };
+   }
 }
 
 static void
-do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsigned src2_num, unsigned flags)
+do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
+       unsigned src2_num, unsigned flags)
 {
-       struct ir3_instruction *xor = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
-       ir3_dst_create(xor, dst_num, flags);
-       ir3_src_create(xor, src1_num, flags);
-       ir3_src_create(xor, src2_num, flags);
+   struct ir3_instruction * xor
+      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
+   ir3_dst_create(xor, dst_num, flags);
+   ir3_src_create(xor, src1_num, flags);
+   ir3_src_create(xor, src2_num, flags);
 
-       ir3_instr_move_before(xor, instr);
+   ir3_instr_move_before(xor, instr);
 }
 
 static void
 do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
-               const struct copy_entry *entry)
+        const struct copy_entry *entry)
 {
-       assert(!entry->src.flags);
-
-       if (entry->flags & IR3_REG_HALF) {
-               /* We currently make sure to never emit parallel copies where the
-                * source/destination is a half-reg above the range accessible to half
-                * registers. However, when a full-reg source overlaps a half-reg
-                * destination or vice versa, it can be very, very complicated to come
-                * up with a series of "legal" swaps and copies to resolve the
-                * parallel copy. So here we provide a fallback to implement the
-                * "illegal" swap instead. This may also be useful for implementing
-                * "spilling" half-regs to the inaccessable space.
-                * "spilling" half-regs to the inaccessible space.
-               if (entry->src.reg >= RA_HALF_SIZE) {
-                       /* Choose a temporary that doesn't overlap src or dst */
-                       physreg_t tmp = entry->dst < 2 ? 2 : 0;
-
-                       /* Swap src and the temporary */
-                       do_swap(compiler, instr, &(struct copy_entry) {
-                               .src = { .reg = entry->src.reg & ~1u },
-                               .dst = tmp,
-                               .flags = entry->flags & ~IR3_REG_HALF,
-                       });
-
-                       /* Do the original swap with src replaced with tmp */
-                       do_swap(compiler, instr, &(struct copy_entry) {
-                               .src = { .reg = tmp + (entry->src.reg & 1) },
-                               .dst = entry->dst,
-                               .flags = entry->flags,
-                       });
-
-                       /* Swap src and the temporary back */
-                       do_swap(compiler, instr, &(struct copy_entry) {
-                               .src = { .reg = entry->src.reg & ~1u },
-                               .dst = tmp,
-                               .flags = entry->flags & ~IR3_REG_HALF,
-                       });
-                       return;
-               }
-
-               /* If dst is not addressable, we only need to swap the arguments and
-                * let the case above handle it.
-                */
-               if (entry->dst >= RA_HALF_SIZE) {
-                       do_swap(compiler, instr, &(struct copy_entry) {
-                               .src = { .reg = entry->dst },
-                               .dst = entry->src.reg,
-                               .flags = entry->flags,
-                       });
-                       return;
-               }
-       }
-
-       unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
-       unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
-
-       /* a5xx+ is known to support swz, which enables us to swap two registers
-        * in-place. If unsupported we emulate it using the xor trick.
-        */
-       if (compiler->gpu_id < 500) {
-               /* Shared regs only exist since a5xx, so we don't have to provide a
-                * fallback path for them.
-                */
-               assert(!(entry->flags & IR3_REG_SHARED));
-               do_xor(instr, dst_num, dst_num, src_num, entry->flags);
-               do_xor(instr, src_num, src_num, dst_num, entry->flags);
-               do_xor(instr, dst_num, dst_num, src_num, entry->flags);
-       } else {
-               /* Use a macro for shared regs because any shared reg writes need to
-                * be wrapped in a getone block to work correctly. Writing shared regs
-                * with multiple threads active does not work, even if they all return
-                * the same value.
-                */
-               unsigned opc = (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
-               struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
-               ir3_dst_create(swz, dst_num, entry->flags);
-               ir3_dst_create(swz, src_num, entry->flags);
-               ir3_src_create(swz, src_num, entry->flags);
-               ir3_src_create(swz, dst_num, entry->flags);
-               swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-               swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-               swz->repeat = 1;
-               ir3_instr_move_before(swz, instr);
-       }
+   assert(!entry->src.flags);
+
+   if (entry->flags & IR3_REG_HALF) {
+      /* We currently make sure to never emit parallel copies where the
+       * source/destination is a half-reg above the range accessible to half
+       * registers. However, when a full-reg source overlaps a half-reg
+       * destination or vice versa, it can be very, very complicated to come
+       * up with a series of "legal" swaps and copies to resolve the
+       * parallel copy. So here we provide a fallback to implement the
+       * "illegal" swap instead. This may also be useful for implementing
+       * "spilling" half-regs to the inaccessible space.
+       */
+      if (entry->src.reg >= RA_HALF_SIZE) {
+         /* Choose a temporary that doesn't overlap src or dst */
+         physreg_t tmp = entry->dst < 2 ? 2 : 0;
+
+         /* Swap src and the temporary */
+         do_swap(compiler, instr,
+                 &(struct copy_entry){
+                    .src = {.reg = entry->src.reg & ~1u},
+                    .dst = tmp,
+                    .flags = entry->flags & ~IR3_REG_HALF,
+                 });
+
+         /* Do the original swap with src replaced with tmp */
+         do_swap(compiler, instr,
+                 &(struct copy_entry){
+                    .src = {.reg = tmp + (entry->src.reg & 1)},
+                    .dst = entry->dst,
+                    .flags = entry->flags,
+                 });
+
+         /* Swap src and the temporary back */
+         do_swap(compiler, instr,
+                 &(struct copy_entry){
+                    .src = {.reg = entry->src.reg & ~1u},
+                    .dst = tmp,
+                    .flags = entry->flags & ~IR3_REG_HALF,
+                 });
+         return;
+      }
+
+      /* If dst is not addressable, we only need to swap the arguments and
+       * let the case above handle it.
+       */
+      if (entry->dst >= RA_HALF_SIZE) {
+         do_swap(compiler, instr,
+                 &(struct copy_entry){
+                    .src = {.reg = entry->dst},
+                    .dst = entry->src.reg,
+                    .flags = entry->flags,
+                 });
+         return;
+      }
+   }
+
+   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
+   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
+
+   /* a5xx+ is known to support swz, which enables us to swap two registers
+    * in-place. If unsupported we emulate it using the xor trick.
+    */
+   if (compiler->gpu_id < 500) {
+      /* Shared regs only exist since a5xx, so we don't have to provide a
+       * fallback path for them.
+       */
+      assert(!(entry->flags & IR3_REG_SHARED));
+      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+      do_xor(instr, src_num, src_num, dst_num, entry->flags);
+      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
+   } else {
+      /* Use a macro for shared regs because any shared reg writes need to
+       * be wrapped in a getone block to work correctly. Writing shared regs
+       * with multiple threads active does not work, even if they all return
+       * the same value.
+       */
+      unsigned opc =
+         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
+      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
+      ir3_dst_create(swz, dst_num, entry->flags);
+      ir3_dst_create(swz, src_num, entry->flags);
+      ir3_src_create(swz, src_num, entry->flags);
+      ir3_src_create(swz, dst_num, entry->flags);
+      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+      swz->repeat = 1;
+      ir3_instr_move_before(swz, instr);
+   }
 }
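The pre-a5xx path above leans on the xor-swap identity instead of a scratch register. A standalone sketch of that identity in plain C (not the ir3 builders); note it only works for two distinct locations, which is the case for the two registers being swapped:

   #include <stdint.h>

   /* a ^= b; b ^= a; a ^= b; exchanges the two values, mirroring the three
    * do_xor() instructions emitted above.  If a and b aliased, both would
    * end up zero, so callers must pass distinct registers. */
   static inline void
   xor_swap_u32(uint32_t *a, uint32_t *b)
   {
      *a ^= *b;
      *b ^= *a;
      *a ^= *b;
   }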
 
 static void
 do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
-               const struct copy_entry *entry)
+        const struct copy_entry *entry)
 {
-       if (entry->flags & IR3_REG_HALF) {
-               /* See do_swap() for why this is here. */
-               if (entry->dst >= RA_HALF_SIZE) {
-                       /* TODO: is there a hw instruction we can use for this case? */
-                       physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
-
-                       do_swap(compiler, instr, &(struct copy_entry) {
-                               .src = { .reg = entry->dst & ~1u },
-                               .dst = tmp,
-                               .flags = entry->flags & ~IR3_REG_HALF,
-                       });
-
-                       do_copy(compiler, instr, &(struct copy_entry) {
-                               .src = entry->src,
-                               .dst = tmp + (entry->dst & 1),
-                               .flags = entry->flags,
-                       });
-
-                       do_swap(compiler, instr, &(struct copy_entry) {
-                               .src = { .reg = entry->dst & ~1u },
-                               .dst = tmp,
-                               .flags = entry->flags & ~IR3_REG_HALF,
-                       });
-                       return;
-               }
-
-               if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
-                       unsigned src_num =
-                               ra_physreg_to_num(entry->src.reg & ~1u, entry->flags & ~IR3_REG_HALF);
-                       unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
-                       
-                       if (entry->src.reg % 2 == 0) {
-                               /* cov.u32u16 dst, src */
-                               struct ir3_instruction *cov = ir3_instr_create(instr->block, OPC_MOV, 1, 1);
-                               ir3_dst_create(cov, dst_num, entry->flags);
-                               ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
-                               cov->cat1.dst_type = TYPE_U16;
-                               cov->cat1.src_type = TYPE_U32;
-                               ir3_instr_move_before(cov, instr);
-                       } else {
-                               /* shr.b dst, src, h(16) */
-                               struct ir3_instruction *shr = ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
-                               ir3_dst_create(shr, dst_num, entry->flags);
-                               ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
-                               ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
-                               ir3_instr_move_before(shr, instr);
-                       }
-                       return;
-               }
-       }
-
-       unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
-       unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
-
-       /* Similar to the swap case, we have to use a macro for shared regs. */
-       unsigned opc = (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
-       struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
-       ir3_dst_create(mov, dst_num, entry->flags);
-       ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
-       mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-       mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-       if (entry->src.flags & IR3_REG_IMMED)
-               mov->srcs[0]->uim_val = entry->src.imm;
-       else if (entry->src.flags & IR3_REG_CONST)
-               mov->srcs[0]->num = entry->src.const_num;
-       ir3_instr_move_before(mov, instr);
+   if (entry->flags & IR3_REG_HALF) {
+      /* See do_swap() for why this is here. */
+      if (entry->dst >= RA_HALF_SIZE) {
+         /* TODO: is there a hw instruction we can use for this case? */
+         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
+
+         do_swap(compiler, instr,
+                 &(struct copy_entry){
+                    .src = {.reg = entry->dst & ~1u},
+                    .dst = tmp,
+                    .flags = entry->flags & ~IR3_REG_HALF,
+                 });
+
+         do_copy(compiler, instr,
+                 &(struct copy_entry){
+                    .src = entry->src,
+                    .dst = tmp + (entry->dst & 1),
+                    .flags = entry->flags,
+                 });
+
+         do_swap(compiler, instr,
+                 &(struct copy_entry){
+                    .src = {.reg = entry->dst & ~1u},
+                    .dst = tmp,
+                    .flags = entry->flags & ~IR3_REG_HALF,
+                 });
+         return;
+      }
+
+      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
+         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
+                                              entry->flags & ~IR3_REG_HALF);
+         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
+
+         if (entry->src.reg % 2 == 0) {
+            /* cov.u32u16 dst, src */
+            struct ir3_instruction *cov =
+               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
+            ir3_dst_create(cov, dst_num, entry->flags);
+            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
+            cov->cat1.dst_type = TYPE_U16;
+            cov->cat1.src_type = TYPE_U32;
+            ir3_instr_move_before(cov, instr);
+         } else {
+            /* shr.b dst, src, h(16) */
+            struct ir3_instruction *shr =
+               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
+            ir3_dst_create(shr, dst_num, entry->flags);
+            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
+            ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
+            ir3_instr_move_before(shr, instr);
+         }
+         return;
+      }
+   }
+
+   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
+   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
+
+   /* Similar to the swap case, we have to use a macro for shared regs. */
+   unsigned opc =
+      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
+   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
+   ir3_dst_create(mov, dst_num, entry->flags);
+   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
+   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+   if (entry->src.flags & IR3_REG_IMMED)
+      mov->srcs[0]->uim_val = entry->src.imm;
+   else if (entry->src.flags & IR3_REG_CONST)
+      mov->srcs[0]->num = entry->src.const_num;
+   ir3_instr_move_before(mov, instr);
 }
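
For reference, the half-register branch above works around two gaps when half and full registers share one merged file: a half destination above the directly addressable range is reached by temporarily swapping its containing full register down to a scratch full register, and a half source above that range is read out of its containing full register, where the even half is the low 16 bits (the truncating cov.u32u16) and the odd half is the high 16 bits (the shr by 16). A minimal standalone sketch of that even/odd-half arithmetic, in plain C with invented names (not ir3 API and not part of this commit):

#include <stdint.h>
#include <stdio.h>

/* A full 32-bit register holding two packed 16-bit halves: the even half
 * is the low bits, so a truncating move is enough; the odd half needs a
 * shift by 16 first, which is the distinction do_copy() makes with
 * "entry->src.reg % 2".
 */
static uint16_t read_even_half(uint32_t full) { return (uint16_t)full; }
static uint16_t read_odd_half(uint32_t full)  { return (uint16_t)(full >> 16); }

int main(void)
{
   uint32_t full = 0xBEEF1234; /* odd half = 0xBEEF, even half = 0x1234 */
   printf("even: 0x%04x odd: 0x%04x\n", (unsigned)read_even_half(full),
          (unsigned)read_odd_half(full));
   return 0;
}
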
 
 struct copy_ctx {
-       /* For each physreg, the number of pending copy entries that use it as a
-        * source. Once this drops to zero, then the physreg is unblocked and can
-        * be moved to.
-        */
-       unsigned physreg_use_count[RA_MAX_FILE_SIZE];
+   /* For each physreg, the number of pending copy entries that use it as a
+    * source. Once this drops to zero, then the physreg is unblocked and can
+    * be moved to.
+    */
+   unsigned physreg_use_count[RA_MAX_FILE_SIZE];
 
-       /* For each physreg, the pending copy_entry that uses it as a dest. */
-       struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
+   /* For each physreg, the pending copy_entry that uses it as a dest. */
+   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
 
-       struct copy_entry entries[RA_MAX_FILE_SIZE];
-       unsigned entry_count;
+   struct copy_entry entries[RA_MAX_FILE_SIZE];
+   unsigned entry_count;
 };
 
 static bool
 entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
 {
-       for (unsigned i = 0; i < copy_entry_size(entry); i++) {
-               if (ctx->physreg_use_count[entry->dst + i] != 0)
-                       return true;
-       }
+   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
+      if (ctx->physreg_use_count[entry->dst + i] != 0)
+         return true;
+   }
 
-       return false;
+   return false;
 }
 
 static void
 split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
 {
-       assert(!entry->done);
-       assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
-       assert(copy_entry_size(entry) == 2);
-       struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
-
-       new_entry->dst = entry->dst + 1;
-       new_entry->src.flags = entry->src.flags;
-       new_entry->src.reg = entry->src.reg + 1;
-       new_entry->done = false;
-       entry->flags |= IR3_REG_HALF;
-       new_entry->flags = entry->flags;
-       ctx->physreg_dst[entry->dst + 1] = new_entry;
+   assert(!entry->done);
+   assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
+   assert(copy_entry_size(entry) == 2);
+   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
+
+   new_entry->dst = entry->dst + 1;
+   new_entry->src.flags = entry->src.flags;
+   new_entry->src.reg = entry->src.reg + 1;
+   new_entry->done = false;
+   entry->flags |= IR3_REG_HALF;
+   new_entry->flags = entry->flags;
+   ctx->physreg_dst[entry->dst + 1] = new_entry;
 }
 
 static void
 _handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
-                          struct copy_ctx *ctx)
+               struct copy_ctx *ctx)
 {
-       /* Set up the bookkeeping */
-       memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
-       memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
-
-       for (unsigned i = 0; i < ctx->entry_count; i++) {
-               struct copy_entry *entry = &ctx->entries[i];
-               for (unsigned j = 0; j < copy_entry_size(entry); j++) {
-                       if (!entry->src.flags)
-                               ctx->physreg_use_count[entry->src.reg + j]++;
-
-                       /* Copies should not have overlapping destinations. */
-                       assert(!ctx->physreg_dst[entry->dst + j]);
-                       ctx->physreg_dst[entry->dst + j] = entry;
-               }
-       }
-
-       bool progress = true;
-       while (progress) {
-               progress = false;
-
-               /* Step 1: resolve paths in the transfer graph. This means finding
-                * copies whose destination aren't blocked by something else and then
-                * emitting them, continuing this process until every copy is blocked
-                * and there are only cycles left.
-                *
-                * TODO: We should note that src is also available in dst to unblock
-                * cycles that src is involved in.
-                */
-
-               for (unsigned i = 0; i < ctx->entry_count; i++) {
-                       struct copy_entry *entry = &ctx->entries[i];
-                       if (!entry->done && !entry_blocked(entry, ctx)) {
-                               entry->done = true;
-                               progress = true;
-                               do_copy(compiler, instr, entry);
-                               for (unsigned j = 0; j < copy_entry_size(entry); j++) {
-                                       if (!entry->src.flags)
-                                               ctx->physreg_use_count[entry->src.reg + j]--;
-                                       ctx->physreg_dst[entry->dst + j] = NULL;
-                               }
-                       }
-               }
-
-               if (progress)
-                       continue;
-
-               /* Step 2: Find partially blocked copies and split them. In the
-                * mergedregs case, we can 32-bit copies which are only blocked on one
-                * 16-bit half, and splitting them helps get things moving.
-                *
-                * We can skip splitting copies if the source isn't a register,
-                * however, because it does not unblock anything and therefore doesn't
-                * contribute to making forward progress with step 1. These copies
-                * should still be resolved eventually in step 1 because they can't be
-                * part of a cycle.
-                */
-               for (unsigned i = 0; i < ctx->entry_count; i++) {
-                       struct copy_entry *entry = &ctx->entries[i];
-                       if (entry->done || entry->flags & IR3_REG_HALF)
-                               continue;
-
-                       if (((ctx->physreg_use_count[entry->dst] == 0 ||
-                                 ctx->physreg_use_count[entry->dst + 1] == 0)) &&
-                                !(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
-                               split_32bit_copy(ctx, entry);
-                               progress = true;
-                       }
-               }
-       }
-
-       /* Step 3: resolve cycles through swapping.
-        *
-        * At this point, the transfer graph should consist of only cycles.
-        * The reason is that, given any physreg n_1 that's the source of a
-        * remaining entry, it has a destination n_2, which (because every
-        * copy is blocked) is the source of some other copy whose destination
-        * is n_3, and so we can follow the chain until we get a cycle. If we
-        * reached some other node than n_1:
-        *
-        *  n_1 -> n_2 -> ... -> n_i
-        *          ^             |
-        *          |-------------|
-        *
-        *  then n_2 would be the destination of 2 copies, which is illegal
-        *  (checked above in an assert). So n_1 must be part of a cycle:
-        *
-        *  n_1 -> n_2 -> ... -> n_i
-        *  ^                     |
-        *  |---------------------|
-        *
-        *  and this must be only cycle n_1 is involved in, because any other
-        *  path starting from n_1 would also have to end in n_1, resulting in
-        *  a node somewhere along the way being the destination of 2 copies
-        *  when the 2 paths merge.
-        *
-        *  The way we resolve the cycle is through picking a copy (n_1, n_2)
-        *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
-        *  out of the cycle:
-        *
-        *  n_1 -> ... -> n_i
-        *  ^              |
-        *  |--------------|
-        *
-        *  and we can keep repeating this until the cycle is empty.
-        */
-
-       for (unsigned i = 0; i < ctx->entry_count; i++) {
-               struct copy_entry *entry = &ctx->entries[i];
-               if (entry->done)
-                       continue;
-
-               assert(!entry->src.flags);
-
-               /* catch trivial copies */
-               if (entry->dst == entry->src.reg) {
-                       entry->done = true;
-                       continue;
-               }
-
-               do_swap(compiler, instr, entry);
-
-               /* Split any blocking copies whose sources are only partially
-                * contained within our destination.
-                */
-               if (entry->flags & IR3_REG_HALF) {
-                       for (unsigned j = 0; j < ctx->entry_count; j++) {
-                               struct copy_entry *blocking = &ctx->entries[j];
-
-                               if (blocking->done)
-                                       continue;
-
-                               if (blocking->src.reg <= entry->dst &&
-                                       blocking->src.reg + 1 >= entry->dst &&
-                                       !(blocking->flags & IR3_REG_HALF)) {
-                                       split_32bit_copy(ctx, blocking);
-                               }
-                       }
-               }
-
-               /* Update sources of blocking copies.
-                *
-                * Note: at this point, every blocking copy's source should be
-                * contained within our destination.
-                */
-               for (unsigned j = 0; j < ctx->entry_count; j++) {
-                       struct copy_entry *blocking = &ctx->entries[j];
-                       if (blocking->src.reg >= entry->dst &&
-                               blocking->src.reg < entry->dst + copy_entry_size(entry)) {
-                               blocking->src.reg = entry->src.reg + (blocking->src.reg - entry->dst);
-                       }
-               }
-       }
+   /* Set up the bookkeeping */
+   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
+   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
+
+   for (unsigned i = 0; i < ctx->entry_count; i++) {
+      struct copy_entry *entry = &ctx->entries[i];
+      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
+         if (!entry->src.flags)
+            ctx->physreg_use_count[entry->src.reg + j]++;
+
+         /* Copies should not have overlapping destinations. */
+         assert(!ctx->physreg_dst[entry->dst + j]);
+         ctx->physreg_dst[entry->dst + j] = entry;
+      }
+   }
+
+   bool progress = true;
+   while (progress) {
+      progress = false;
+
+      /* Step 1: resolve paths in the transfer graph. This means finding
+       * copies whose destinations aren't blocked by something else and then
+       * emitting them, continuing this process until every copy is blocked
+       * and there are only cycles left.
+       *
+       * TODO: We should note that src is also available in dst to unblock
+       * cycles that src is involved in.
+       */
+
+      for (unsigned i = 0; i < ctx->entry_count; i++) {
+         struct copy_entry *entry = &ctx->entries[i];
+         if (!entry->done && !entry_blocked(entry, ctx)) {
+            entry->done = true;
+            progress = true;
+            do_copy(compiler, instr, entry);
+            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
+               if (!entry->src.flags)
+                  ctx->physreg_use_count[entry->src.reg + j]--;
+               ctx->physreg_dst[entry->dst + j] = NULL;
+            }
+         }
+      }
+
+      if (progress)
+         continue;
+
+      /* Step 2: Find partially blocked copies and split them. In the
+       * mergedregs case, we can have 32-bit copies which are only blocked on
+       * one 16-bit half, and splitting them helps get things moving.
+       *
+       * We can skip splitting copies if the source isn't a register,
+       * however, because it does not unblock anything and therefore doesn't
+       * contribute to making forward progress with step 1. These copies
+       * should still be resolved eventually in step 1 because they can't be
+       * part of a cycle.
+       */
+      for (unsigned i = 0; i < ctx->entry_count; i++) {
+         struct copy_entry *entry = &ctx->entries[i];
+         if (entry->done || entry->flags & IR3_REG_HALF)
+            continue;
+
+         if (((ctx->physreg_use_count[entry->dst] == 0 ||
+               ctx->physreg_use_count[entry->dst + 1] == 0)) &&
+             !(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
+            split_32bit_copy(ctx, entry);
+            progress = true;
+         }
+      }
+   }
+
+   /* Step 3: resolve cycles through swapping.
+    *
+    * At this point, the transfer graph should consist of only cycles.
+    * The reason is that, given any physreg n_1 that's the source of a
+    * remaining entry, it has a destination n_2, which (because every
+    * copy is blocked) is the source of some other copy whose destination
+    * is n_3, and so we can follow the chain until we get a cycle. If we
+    * reached some other node than n_1:
+    *
+    *  n_1 -> n_2 -> ... -> n_i
+    *          ^             |
+    *          |-------------|
+    *
+    *  then n_2 would be the destination of 2 copies, which is illegal
+    *  (checked above in an assert). So n_1 must be part of a cycle:
+    *
+    *  n_1 -> n_2 -> ... -> n_i
+    *  ^                     |
+    *  |---------------------|
+    *
+    *  and this must be the only cycle n_1 is involved in, because any other
+    *  path starting from n_1 would also have to end in n_1, resulting in
+    *  a node somewhere along the way being the destination of 2 copies
+    *  when the 2 paths merge.
+    *
+    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
+    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
+    *  out of the cycle:
+    *
+    *  n_1 -> ... -> n_i
+    *  ^              |
+    *  |--------------|
+    *
+    *  and we can keep repeating this until the cycle is empty.
+    */
+
+   for (unsigned i = 0; i < ctx->entry_count; i++) {
+      struct copy_entry *entry = &ctx->entries[i];
+      if (entry->done)
+         continue;
+
+      assert(!entry->src.flags);
+
+      /* catch trivial copies */
+      if (entry->dst == entry->src.reg) {
+         entry->done = true;
+         continue;
+      }
+
+      do_swap(compiler, instr, entry);
+
+      /* Split any blocking copies whose sources are only partially
+       * contained within our destination.
+       */
+      if (entry->flags & IR3_REG_HALF) {
+         for (unsigned j = 0; j < ctx->entry_count; j++) {
+            struct copy_entry *blocking = &ctx->entries[j];
+
+            if (blocking->done)
+               continue;
+
+            if (blocking->src.reg <= entry->dst &&
+                blocking->src.reg + 1 >= entry->dst &&
+                !(blocking->flags & IR3_REG_HALF)) {
+               split_32bit_copy(ctx, blocking);
+            }
+         }
+      }
+
+      /* Update sources of blocking copies.
+       *
+       * Note: at this point, every blocking copy's source should be
+       * contained within our destination.
+       */
+      for (unsigned j = 0; j < ctx->entry_count; j++) {
+         struct copy_entry *blocking = &ctx->entries[j];
+         if (blocking->src.reg >= entry->dst &&
+             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
+            blocking->src.reg =
+               entry->src.reg + (blocking->src.reg - entry->dst);
+         }
+      }
+   }
 }
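
As a rough standalone illustration of the scheme described in the step 1/2/3 comments above, the following plain-C sketch sequentializes a parallel copy over single scalar registers: it first emits every copy whose destination is no longer needed as a source, then breaks the remaining cycles with swaps, redirecting the sources of the copies that were blocked on the swapped register. All names are invented for the example; this is not ir3 code (no half registers, immediates or consts) and not part of this commit.

#include <stdbool.h>
#include <stdio.h>

#define NREGS 8

struct pcopy { int dst, src; bool done; };

static void
sequentialize(struct pcopy *e, int n)
{
   int uses[NREGS] = {0};
   for (int i = 0; i < n; i++)
      uses[e[i].src]++;

   /* Step 1: emit copies whose destination is not a pending source. */
   bool progress = true;
   while (progress) {
      progress = false;
      for (int i = 0; i < n; i++) {
         if (!e[i].done && uses[e[i].dst] == 0) {
            printf("mov r%d, r%d\n", e[i].dst, e[i].src);
            uses[e[i].src]--;
            e[i].done = progress = true;
         }
      }
   }

   /* Step 3: everything left forms cycles; a swap takes the destination
    * out of its cycle, and the remaining copies that wanted the old value
    * of dst now read it from src instead.
    */
   for (int i = 0; i < n; i++) {
      if (e[i].done || e[i].dst == e[i].src)
         continue;
      printf("swap r%d, r%d\n", e[i].dst, e[i].src);
      for (int j = 0; j < n; j++)
         if (!e[j].done && e[j].src == e[i].dst)
            e[j].src = e[i].src;
      e[i].done = true;
   }
}

int main(void)
{
   /* r1 <- r0 and r0 <- r1 form a cycle; r2 <- r1 is a simple path. */
   struct pcopy e[] = {{1, 0, false}, {0, 1, false}, {2, 1, false}};
   sequentialize(e, 3);
   return 0;
}
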
 
 static void
 handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
-                         struct copy_entry *entries, unsigned entry_count)
+              struct copy_entry *entries, unsigned entry_count)
 {
-       struct copy_ctx ctx;    
-
-       /* handle shared copies first */
-       ctx.entry_count = 0;
-       for (unsigned i = 0; i < entry_count; i++) {
-               if (entries[i].flags & IR3_REG_SHARED)
-                       ctx.entries[ctx.entry_count++] = entries[i];
-       }
-       _handle_copies(v->shader->compiler, instr, &ctx);
-
-       if (v->mergedregs) {
-               /* Half regs and full regs are in the same file, so handle everything
-                * at once.
-                */
-               ctx.entry_count = 0;
-               for (unsigned i = 0; i < entry_count; i++) {
-                       if (!(entries[i].flags & IR3_REG_SHARED))
-                               ctx.entries[ctx.entry_count++] = entries[i];
-               }
-               _handle_copies(v->shader->compiler, instr, &ctx);
-       } else {
-               /* There may be both half copies and full copies, so we have to split
-                * them up since they don't interfere.
-                */
-               ctx.entry_count = 0;
-               for (unsigned i = 0; i < entry_count; i++) {
-                       if (entries[i].flags & IR3_REG_HALF)
-                               ctx.entries[ctx.entry_count++] = entries[i];
-               }
-               _handle_copies(v->shader->compiler, instr, &ctx);
-
-               ctx.entry_count = 0;
-               for (unsigned i = 0; i < entry_count; i++) {
-                       if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
-                               ctx.entries[ctx.entry_count++] = entries[i];
-               }
-               _handle_copies(v->shader->compiler, instr, &ctx);
-       }
+   struct copy_ctx ctx;
+
+   /* handle shared copies first */
+   ctx.entry_count = 0;
+   for (unsigned i = 0; i < entry_count; i++) {
+      if (entries[i].flags & IR3_REG_SHARED)
+         ctx.entries[ctx.entry_count++] = entries[i];
+   }
+   _handle_copies(v->shader->compiler, instr, &ctx);
+
+   if (v->mergedregs) {
+      /* Half regs and full regs are in the same file, so handle everything
+       * at once.
+       */
+      ctx.entry_count = 0;
+      for (unsigned i = 0; i < entry_count; i++) {
+         if (!(entries[i].flags & IR3_REG_SHARED))
+            ctx.entries[ctx.entry_count++] = entries[i];
+      }
+      _handle_copies(v->shader->compiler, instr, &ctx);
+   } else {
+      /* There may be both half copies and full copies, so we have to split
+       * them up since they don't interfere.
+       */
+      ctx.entry_count = 0;
+      for (unsigned i = 0; i < entry_count; i++) {
+         if (entries[i].flags & IR3_REG_HALF)
+            ctx.entries[ctx.entry_count++] = entries[i];
+      }
+      _handle_copies(v->shader->compiler, instr, &ctx);
+
+      ctx.entry_count = 0;
+      for (unsigned i = 0; i < entry_count; i++) {
+         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
+            ctx.entries[ctx.entry_count++] = entries[i];
+      }
+      _handle_copies(v->shader->compiler, instr, &ctx);
+   }
 }
 
 void
 ir3_lower_copies(struct ir3_shader_variant *v)
 {
-       DECLARE_ARRAY(struct copy_entry, copies);
-       copies_count = copies_sz = 0;
-       copies = NULL;
-
-       foreach_block (block, &v->ir->block_list) {
-               foreach_instr_safe (instr, &block->instr_list) {
-                       if (instr->opc == OPC_META_PARALLEL_COPY) {
-                               copies_count = 0;
-                               for (unsigned i = 0; i < instr->dsts_count; i++) {
-                                       struct ir3_register *dst = instr->dsts[i];
-                                       struct ir3_register *src = instr->srcs[i];
-                                       unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
-                                       unsigned dst_physreg = ra_reg_get_physreg(dst);
-                                       for (unsigned j = 0; j < reg_elems(dst); j++) {
-                                               array_insert(NULL, copies, (struct copy_entry) {
-                                                       .dst = dst_physreg + j * reg_elem_size(dst),
-                                                       .src = get_copy_src(src, j * reg_elem_size(dst)),
-                                                       .flags = flags,
-                                               });
-                                       }
-                               }
-                               handle_copies(v, instr, copies, copies_count);
-                               list_del(&instr->node);
-                       } else if (instr->opc == OPC_META_COLLECT) {
-                               copies_count = 0;
-                               struct ir3_register *dst = instr->dsts[0];
-                               unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
-                               for (unsigned i = 0; i < instr->srcs_count; i++) {
-                                       struct ir3_register *src = instr->srcs[i];
-                                       array_insert(NULL, copies, (struct copy_entry) {
-                                               .dst = ra_num_to_physreg(dst->num + i, flags),
-                                               .src = get_copy_src(src, 0),
-                                               .flags = flags,
-                                       });
-                               }
-                               handle_copies(v, instr, copies, copies_count);
-                               list_del(&instr->node);
-                       } else if (instr->opc == OPC_META_SPLIT) {
-                               copies_count = 0;
-                               struct ir3_register *dst = instr->dsts[0];
-                               struct ir3_register *src = instr->srcs[0];
-                               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
-                               array_insert(NULL, copies, (struct copy_entry) {
-                                       .dst = ra_reg_get_physreg(dst),
-                                       .src = get_copy_src(src, instr->split.off * reg_elem_size(dst)),
-                                       .flags = flags,
-                               });
-                               handle_copies(v, instr, copies, copies_count);
-                               list_del(&instr->node);
-                       } else if (instr->opc == OPC_META_PHI) {
-                               list_del(&instr->node);
-                       }
-               }
-       }
-
-       if (copies)
-               ralloc_free(copies);
+   DECLARE_ARRAY(struct copy_entry, copies);
+   copies_count = copies_sz = 0;
+   copies = NULL;
+
+   foreach_block (block, &v->ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->opc == OPC_META_PARALLEL_COPY) {
+            copies_count = 0;
+            for (unsigned i = 0; i < instr->dsts_count; i++) {
+               struct ir3_register *dst = instr->dsts[i];
+               struct ir3_register *src = instr->srcs[i];
+               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
+               unsigned dst_physreg = ra_reg_get_physreg(dst);
+               for (unsigned j = 0; j < reg_elems(dst); j++) {
+                  array_insert(
+                     NULL, copies,
+                     (struct copy_entry){
+                        .dst = dst_physreg + j * reg_elem_size(dst),
+                        .src = get_copy_src(src, j * reg_elem_size(dst)),
+                        .flags = flags,
+                     });
+               }
+            }
+            handle_copies(v, instr, copies, copies_count);
+            list_del(&instr->node);
+         } else if (instr->opc == OPC_META_COLLECT) {
+            copies_count = 0;
+            struct ir3_register *dst = instr->dsts[0];
+            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
+            for (unsigned i = 0; i < instr->srcs_count; i++) {
+               struct ir3_register *src = instr->srcs[i];
+               array_insert(NULL, copies,
+                            (struct copy_entry){
+                               .dst = ra_num_to_physreg(dst->num + i, flags),
+                               .src = get_copy_src(src, 0),
+                               .flags = flags,
+                            });
+            }
+            handle_copies(v, instr, copies, copies_count);
+            list_del(&instr->node);
+         } else if (instr->opc == OPC_META_SPLIT) {
+            copies_count = 0;
+            struct ir3_register *dst = instr->dsts[0];
+            struct ir3_register *src = instr->srcs[0];
+            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
+            array_insert(NULL, copies,
+                         (struct copy_entry){
+                            .dst = ra_reg_get_physreg(dst),
+                            .src = get_copy_src(
+                               src, instr->split.off * reg_elem_size(dst)),
+                            .flags = flags,
+                         });
+            handle_copies(v, instr, copies, copies_count);
+            list_del(&instr->node);
+         } else if (instr->opc == OPC_META_PHI) {
+            list_del(&instr->node);
+         }
+      }
+   }
+
+   if (copies)
+      ralloc_free(copies);
 }
-
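
For context, ir3_lower_copies() above flattens each parallel-copy, collect and split meta instruction into a flat list of single-element copy entries (destination physreg, source, flags) and hands the whole list to handle_copies() at once. A tiny standalone sketch of the collect case, using invented types rather than the real ir3 structures:

#include <stdio.h>

struct entry { int dst, src; };

/* A collect gathers k scalar sources into k consecutive destination
 * registers, so it expands to one copy entry per element.
 */
static int
expand_collect(int dst_base, const int *srcs, int k, struct entry *out)
{
   for (int i = 0; i < k; i++) {
      out[i].dst = dst_base + i;
      out[i].src = srcs[i];
   }
   return k;
}

int main(void)
{
   int srcs[] = {7, 3, 5, 1};
   struct entry entries[4];
   int n = expand_collect(2, srcs, 4, entries);
   for (int i = 0; i < n; i++)
      printf("r%d <- r%d\n", entries[i].dst, entries[i].src);
   return 0;
}
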
index 2efdf09..84235ce 100644 (file)
 
 static void
 replace_pred(struct ir3_block *block, struct ir3_block *old_pred,
-                        struct ir3_block *new_pred)
+             struct ir3_block *new_pred)
 {
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               if (block->predecessors[i] == old_pred) {
-                       block->predecessors[i] = new_pred;
-                       return;
-               }
-       }
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      if (block->predecessors[i] == old_pred) {
+         block->predecessors[i] = new_pred;
+         return;
+      }
+   }
 }
 
 static void
 replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred,
-                                         struct ir3_block *new_pred)
+                      struct ir3_block *new_pred)
 {
-       for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
-               if (block->physical_predecessors[i] == old_pred) {
-                       block->physical_predecessors[i] = new_pred;
-                       return;
-               }
-       }
+   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
+      if (block->physical_predecessors[i] == old_pred) {
+         block->physical_predecessors[i] = new_pred;
+         return;
+      }
+   }
 }
 
 static void
 mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
 {
-       struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
-       struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
-       mov_dst->wrmask = dst->wrmask;
-       struct ir3_register *src =
-               ir3_src_create(mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
-       src->uim_val = immed;
-       mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-       mov->cat1.src_type = mov->cat1.dst_type;
-       mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
+   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
+   mov_dst->wrmask = dst->wrmask;
+   struct ir3_register *src = ir3_src_create(
+      mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
+   src->uim_val = immed;
+   mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+   mov->cat1.src_type = mov->cat1.dst_type;
+   mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
 }
 
 static struct ir3_block *
 split_block(struct ir3 *ir, struct ir3_block *before_block,
-                   struct ir3_instruction *instr, struct ir3_block **then)
+            struct ir3_instruction *instr, struct ir3_block **then)
 {
-       struct ir3_block *then_block = ir3_block_create(ir);
-       struct ir3_block *after_block = ir3_block_create(ir);
-       list_add(&then_block->node, &before_block->node);
-       list_add(&after_block->node, &then_block->node);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
-               after_block->successors[i] = before_block->successors[i];
-               if (after_block->successors[i])
-                       replace_pred(after_block->successors[i], before_block, after_block);
-       }
-
-       for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors); i++) {
-               after_block->physical_successors[i] = before_block->physical_successors[i];
-               if (after_block->physical_successors[i]) {
-                       replace_physical_pred(after_block->physical_successors[i],
-                                                                 before_block, after_block);
-               }
-       }
-
-       before_block->successors[0] = then_block;
-       before_block->successors[1] = after_block;
-       before_block->physical_successors[0] = then_block;
-       before_block->physical_successors[1] = after_block;
-       ir3_block_add_predecessor(then_block, before_block);
-       ir3_block_add_predecessor(after_block, before_block);
-       ir3_block_add_physical_predecessor(then_block, before_block);
-       ir3_block_add_physical_predecessor(after_block, before_block);
-
-       then_block->successors[0] = after_block;
-       then_block->physical_successors[0] = after_block;
-       ir3_block_add_predecessor(after_block, then_block);
-       ir3_block_add_physical_predecessor(after_block, then_block);
-       
-       foreach_instr_from_safe (rem_instr, &instr->node, &before_block->instr_list) {
-               list_del(&rem_instr->node);
-               list_addtail(&rem_instr->node, &after_block->instr_list);
-               rem_instr->block = after_block;
-       }
-
-       after_block->brtype = before_block->brtype;
-       after_block->condition = before_block->condition;
-
-       *then = then_block;
-       return after_block;
+   struct ir3_block *then_block = ir3_block_create(ir);
+   struct ir3_block *after_block = ir3_block_create(ir);
+   list_add(&then_block->node, &before_block->node);
+   list_add(&after_block->node, &then_block->node);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
+      after_block->successors[i] = before_block->successors[i];
+      if (after_block->successors[i])
+         replace_pred(after_block->successors[i], before_block, after_block);
+   }
+
+   for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors);
+        i++) {
+      after_block->physical_successors[i] =
+         before_block->physical_successors[i];
+      if (after_block->physical_successors[i]) {
+         replace_physical_pred(after_block->physical_successors[i],
+                               before_block, after_block);
+      }
+   }
+
+   before_block->successors[0] = then_block;
+   before_block->successors[1] = after_block;
+   before_block->physical_successors[0] = then_block;
+   before_block->physical_successors[1] = after_block;
+   ir3_block_add_predecessor(then_block, before_block);
+   ir3_block_add_predecessor(after_block, before_block);
+   ir3_block_add_physical_predecessor(then_block, before_block);
+   ir3_block_add_physical_predecessor(after_block, before_block);
+
+   then_block->successors[0] = after_block;
+   then_block->physical_successors[0] = after_block;
+   ir3_block_add_predecessor(after_block, then_block);
+   ir3_block_add_physical_predecessor(after_block, then_block);
+
+   foreach_instr_from_safe (rem_instr, &instr->node,
+                            &before_block->instr_list) {
+      list_del(&rem_instr->node);
+      list_addtail(&rem_instr->node, &after_block->instr_list);
+      rem_instr->block = after_block;
+   }
+
+   after_block->brtype = before_block->brtype;
+   after_block->condition = before_block->condition;
+
+   *then = then_block;
+   return after_block;
 }
 
 static bool
 lower_block(struct ir3 *ir, struct ir3_block **block)
 {
-       bool progress = false;
-
-       foreach_instr_safe (instr, &(*block)->instr_list) {
-               switch (instr->opc) {
-               case OPC_BALLOT_MACRO:
-               case OPC_ANY_MACRO:
-               case OPC_ALL_MACRO:
-               case OPC_ELECT_MACRO:
-               case OPC_READ_COND_MACRO:
-               case OPC_READ_FIRST_MACRO:
-               case OPC_SWZ_SHARED_MACRO:
-                       break;
-               default:
-                       continue;
-               }
-
-               struct ir3_block *before_block = *block;
-               struct ir3_block *then_block;
-               struct ir3_block *after_block =
-                       split_block(ir, before_block, instr, &then_block);
-
-               /* For ballot, the destination must be initialized to 0 before we do
-                * the movmsk because the condition may be 0 and then the movmsk will
-                * be skipped. Because it's a shared register we have to wrap the
-                * initialization in a getone block.
-                */
-               if (instr->opc == OPC_BALLOT_MACRO) {
-                       before_block->brtype = IR3_BRANCH_GETONE;
-                       before_block->condition = NULL;
-                       mov_immed(instr->dsts[0], then_block, 0);
-                       before_block = after_block;
-                       after_block = split_block(ir, before_block, instr, &then_block);
-               }
-
-               switch (instr->opc) {
-               case OPC_BALLOT_MACRO:
-               case OPC_READ_COND_MACRO:
-               case OPC_ANY_MACRO:
-               case OPC_ALL_MACRO:
-                       before_block->condition = instr->srcs[0]->def->instr;
-                       break;
-               default:
-                       before_block->condition = NULL;
-                       break;
-               }
-
-               switch (instr->opc) {
-               case OPC_BALLOT_MACRO:
-               case OPC_READ_COND_MACRO:
-                       before_block->brtype = IR3_BRANCH_COND;
-                       break;
-               case OPC_ANY_MACRO:
-                       before_block->brtype = IR3_BRANCH_ANY;
-                       break;
-               case OPC_ALL_MACRO:
-                       before_block->brtype = IR3_BRANCH_ALL;
-                       break;
-               case OPC_ELECT_MACRO:
-               case OPC_READ_FIRST_MACRO:
-               case OPC_SWZ_SHARED_MACRO:
-                       before_block->brtype = IR3_BRANCH_GETONE;
-                       break;
-               default:
-                       unreachable("bad opcode");
-               }
-
-               switch (instr->opc) {
-               case OPC_ALL_MACRO:
-               case OPC_ANY_MACRO:
-               case OPC_ELECT_MACRO:
-                       mov_immed(instr->dsts[0], then_block, 1);
-                       mov_immed(instr->dsts[0], before_block, 0);
-                       break;
-
-               case OPC_BALLOT_MACRO: {
-                       unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
-                       struct ir3_instruction *movmsk = ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
-                       ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
-                       movmsk->repeat = comp_count - 1;
-                       break;
-               }
-
-               case OPC_READ_COND_MACRO:
-               case OPC_READ_FIRST_MACRO: {
-                       struct ir3_instruction *mov = ir3_instr_create(then_block, OPC_MOV, 1, 1);
-                       unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
-                       ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
-                       struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
-                       *new_src = *instr->srcs[src];
-                       mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
-                       break;
-               }
-
-               case OPC_SWZ_SHARED_MACRO: {
-                       struct ir3_instruction *swz =
-                               ir3_instr_create(then_block, OPC_SWZ, 2, 2);
-                       ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
-                       ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
-                       ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
-                       ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
-                       swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
-                       swz->repeat = 1;
-                       break;
-               }
-
-               default:
-                       unreachable("bad opcode");
-               }
-
-               *block = after_block;
-               list_delinit(&instr->node);
-               progress = true;
-       }
-
-       return progress;
+   bool progress = false;
+
+   foreach_instr_safe (instr, &(*block)->instr_list) {
+      switch (instr->opc) {
+      case OPC_BALLOT_MACRO:
+      case OPC_ANY_MACRO:
+      case OPC_ALL_MACRO:
+      case OPC_ELECT_MACRO:
+      case OPC_READ_COND_MACRO:
+      case OPC_READ_FIRST_MACRO:
+      case OPC_SWZ_SHARED_MACRO:
+         break;
+      default:
+         continue;
+      }
+
+      struct ir3_block *before_block = *block;
+      struct ir3_block *then_block;
+      struct ir3_block *after_block =
+         split_block(ir, before_block, instr, &then_block);
+
+      /* For ballot, the destination must be initialized to 0 before we do
+       * the movmsk because the condition may be 0 and then the movmsk will
+       * be skipped. Because it's a shared register we have to wrap the
+       * initialization in a getone block.
+       */
+      if (instr->opc == OPC_BALLOT_MACRO) {
+         before_block->brtype = IR3_BRANCH_GETONE;
+         before_block->condition = NULL;
+         mov_immed(instr->dsts[0], then_block, 0);
+         before_block = after_block;
+         after_block = split_block(ir, before_block, instr, &then_block);
+      }
+
+      switch (instr->opc) {
+      case OPC_BALLOT_MACRO:
+      case OPC_READ_COND_MACRO:
+      case OPC_ANY_MACRO:
+      case OPC_ALL_MACRO:
+         before_block->condition = instr->srcs[0]->def->instr;
+         break;
+      default:
+         before_block->condition = NULL;
+         break;
+      }
+
+      switch (instr->opc) {
+      case OPC_BALLOT_MACRO:
+      case OPC_READ_COND_MACRO:
+         before_block->brtype = IR3_BRANCH_COND;
+         break;
+      case OPC_ANY_MACRO:
+         before_block->brtype = IR3_BRANCH_ANY;
+         break;
+      case OPC_ALL_MACRO:
+         before_block->brtype = IR3_BRANCH_ALL;
+         break;
+      case OPC_ELECT_MACRO:
+      case OPC_READ_FIRST_MACRO:
+      case OPC_SWZ_SHARED_MACRO:
+         before_block->brtype = IR3_BRANCH_GETONE;
+         break;
+      default:
+         unreachable("bad opcode");
+      }
+
+      switch (instr->opc) {
+      case OPC_ALL_MACRO:
+      case OPC_ANY_MACRO:
+      case OPC_ELECT_MACRO:
+         mov_immed(instr->dsts[0], then_block, 1);
+         mov_immed(instr->dsts[0], before_block, 0);
+         break;
+
+      case OPC_BALLOT_MACRO: {
+         unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
+         struct ir3_instruction *movmsk =
+            ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
+         ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
+         movmsk->repeat = comp_count - 1;
+         break;
+      }
+
+      case OPC_READ_COND_MACRO:
+      case OPC_READ_FIRST_MACRO: {
+         struct ir3_instruction *mov =
+            ir3_instr_create(then_block, OPC_MOV, 1, 1);
+         unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
+         ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
+         struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
+         *new_src = *instr->srcs[src];
+         mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
+         break;
+      }
+
+      case OPC_SWZ_SHARED_MACRO: {
+         struct ir3_instruction *swz =
+            ir3_instr_create(then_block, OPC_SWZ, 2, 2);
+         ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
+         ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
+         ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
+         ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
+         swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
+         swz->repeat = 1;
+         break;
+      }
+
+      default:
+         unreachable("bad opcode");
+      }
+
+      *block = after_block;
+      list_delinit(&instr->node);
+      progress = true;
+   }
+
+   return progress;
 }
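
The lowering above turns each subgroup macro into a small if/then construct: getone runs the then block for exactly one fiber (elect, read_first, the shared-register swizzle and the ballot initialization), any/all/cond branch on a reduced per-fiber condition, and ballot materializes the execution mask with movmsk. As a rough reference for the values these operations end up computing per subgroup, here is a standalone plain-C model over a lane mask (illustrative only, using GCC/Clang builtins; not ir3 or hardware API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* elect() is true for exactly one active lane (the lowest, in this
 * model), which is what a getone branch selects; any_cond()/all_cond()
 * reduce a per-lane condition over the active lanes; ballot() is the
 * mask of active lanes passing the condition.
 */
static bool elect(uint64_t active, unsigned lane)
{
   return active != 0 && lane == (unsigned)__builtin_ctzll(active);
}

static bool any_cond(uint64_t active, uint64_t cond)
{
   return (active & cond) != 0;
}

static bool all_cond(uint64_t active, uint64_t cond)
{
   return (active & cond) == active;
}

static uint64_t ballot(uint64_t active, uint64_t cond)
{
   return active & cond;
}

int main(void)
{
   uint64_t active = 0xb; /* lanes 0, 1, 3 */
   uint64_t cond = 0x3;   /* lanes 0, 1 */
   printf("elect(l0)=%d elect(l1)=%d any=%d all=%d ballot=0x%llx\n",
          elect(active, 0), elect(active, 1), any_cond(active, cond),
          all_cond(active, cond), (unsigned long long)ballot(active, cond));
   return 0;
}
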
 
 bool
 ir3_lower_subgroups(struct ir3 *ir)
 {
-       bool progress = false;
+   bool progress = false;
 
-       foreach_block (block, &ir->block_list)
-               progress |= lower_block(ir, &block);
+   foreach_block (block, &ir->block_list)
+      progress |= lower_block(ir, &block);
 
-       return progress;
+   return progress;
 }
-
index 30ed2b5..e5ba1bd 100644 (file)
@@ -21,8 +21,8 @@
  * SOFTWARE.
  */
 
-#include "ir3_ra.h"
 #include "ir3_compiler.h"
+#include "ir3_ra.h"
 #include "ralloc.h"
 
 /* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
@@ -49,7 +49,7 @@
  * try to allocate all the definitions in the same merge set to the
  * same/compatible registers. This helps us e.g. allocate sources of a collect
  * to contiguous registers without too much special code in RA.
- * 
+ *
  * In a "normal" register allocator, or when spilling, we'd just merge
  * registers in the same merge set to the same register, but with SSA-based
  * register allocation we may have to split the live interval.
 static unsigned
 index_instrs(struct ir3_block *block, unsigned index)
 {
-       foreach_instr (instr, &block->instr_list)
-               instr->ip = index++;
+   foreach_instr (instr, &block->instr_list)
+      instr->ip = index++;
 
-       for (unsigned i = 0; i < block->dom_children_count; i++)
-               index = index_instrs(block->dom_children[i], index);
+   for (unsigned i = 0; i < block->dom_children_count; i++)
+      index = index_instrs(block->dom_children[i], index);
 
-       return index;
+   return index;
 }
 
 /* Definitions within a merge set are ordered by instr->ip as set above: */
@@ -85,27 +85,27 @@ index_instrs(struct ir3_block *block, unsigned index)
 static bool
 def_after(struct ir3_register *a, struct ir3_register *b)
 {
-       return a->instr->ip > b->instr->ip;
+   return a->instr->ip > b->instr->ip;
 }
 
 static bool
 def_dominates(struct ir3_register *a, struct ir3_register *b)
 {
-       if (def_after(a, b)) {
-               return false;
-       } else if (a->instr->block == b->instr->block) {
-               return def_after(b, a);
-       } else {
-               return ir3_block_dominates(a->instr->block, b->instr->block);
-       }
+   if (def_after(a, b)) {
+      return false;
+   } else if (a->instr->block == b->instr->block) {
+      return def_after(b, a);
+   } else {
+      return ir3_block_dominates(a->instr->block, b->instr->block);
+   }
 }
 
 /* This represents a region inside a register. The offset is relative to the
  * start of the register, and offset + size <= size(reg).
  */
 struct def_value {
-       struct ir3_register *reg;
-       unsigned offset, size;
+   struct ir3_register *reg;
+   unsigned offset, size;
 };
 
 /* Chase any copies to get the source of a region inside a register. This is
@@ -114,456 +114,452 @@ struct def_value {
 static struct def_value
 chase_copies(struct def_value value)
 {
-       while (true) {
-               struct ir3_instruction *instr = value.reg->instr;
-               if (instr->opc == OPC_META_SPLIT) {
-                       value.offset += instr->split.off * reg_elem_size(value.reg);
-                       value.reg = instr->srcs[0]->def;
-               } else if (instr->opc == OPC_META_COLLECT) {
-                       if (value.offset % reg_elem_size(value.reg) != 0 ||
-                               value.size > reg_elem_size(value.reg) ||
-                               value.offset + value.size > reg_size(value.reg))
-                               break;
-                       struct ir3_register *src = instr->srcs[value.offset / reg_elem_size(value.reg)];
-                       if (!src->def)
-                               break;
-                       value.offset = 0;
-                       value.reg = src->def;
-               } else {
-                       /* TODO: parallelcopy */
-                       break;
-               }
-       }
-
-       return value;
+   while (true) {
+      struct ir3_instruction *instr = value.reg->instr;
+      if (instr->opc == OPC_META_SPLIT) {
+         value.offset += instr->split.off * reg_elem_size(value.reg);
+         value.reg = instr->srcs[0]->def;
+      } else if (instr->opc == OPC_META_COLLECT) {
+         if (value.offset % reg_elem_size(value.reg) != 0 ||
+             value.size > reg_elem_size(value.reg) ||
+             value.offset + value.size > reg_size(value.reg))
+            break;
+         struct ir3_register *src =
+            instr->srcs[value.offset / reg_elem_size(value.reg)];
+         if (!src->def)
+            break;
+         value.offset = 0;
+         value.reg = src->def;
+      } else {
+         /* TODO: parallelcopy */
+         break;
+      }
+   }
+
+   return value;
 }
 
 /* This represents an entry in the merge set, and consists of a register +
  * offset from the merge set base.
  */
 struct merge_def {
-       struct ir3_register *reg;
-       unsigned offset;
+   struct ir3_register *reg;
+   unsigned offset;
 };
 
 static bool
 can_skip_interference(const struct merge_def *a, const struct merge_def *b)
 {
-       unsigned a_start = a->offset;
-       unsigned b_start = b->offset;
-       unsigned a_end = a_start + reg_size(a->reg);
-       unsigned b_end = b_start + reg_size(b->reg);
-
-       /* Registers that don't overlap never interfere */
-       if (a_end <= b_start || b_end <= a_start)
-               return true;
-
-       /* Disallow skipping interference unless one definition contains the
-        * other. This restriction is important for register allocation, because
-        * it means that at any given point in the program, the live values in a
-        * given merge set will form a tree. If they didn't, then one live value
-        * would partially overlap another, and they would have overlapping live
-        * ranges because they're live at the same point. This simplifies register
-        * allocation and spilling.
-        */
-       if (!((a_start <= b_start && a_end >= b_end) ||
-                 (b_start <= a_start && b_end >= a_end)))
-               return false;
-
-       /* For each register, chase the intersection of a and b to find the
-        * ultimate source.
-        */
-       unsigned start = MAX2(a_start, b_start);
-       unsigned end = MIN2(a_end, b_end);
-       struct def_value a_value =
-               chase_copies((struct def_value) {
-                               .reg = a->reg,
-                               .offset = start - a_start,
-                               .size = end - start,
-               });
-       struct def_value b_value =
-               chase_copies((struct def_value) {
-                               .reg = b->reg,
-                               .offset = start - b_start,
-                               .size = end - start,
-               });
-       return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
+   unsigned a_start = a->offset;
+   unsigned b_start = b->offset;
+   unsigned a_end = a_start + reg_size(a->reg);
+   unsigned b_end = b_start + reg_size(b->reg);
+
+   /* Registers that don't overlap never interfere */
+   if (a_end <= b_start || b_end <= a_start)
+      return true;
+
+   /* Disallow skipping interference unless one definition contains the
+    * other. This restriction is important for register allocation, because
+    * it means that at any given point in the program, the live values in a
+    * given merge set will form a tree. If they didn't, then one live value
+    * would partially overlap another, and they would have overlapping live
+    * ranges because they're live at the same point. This simplifies register
+    * allocation and spilling.
+    */
+   if (!((a_start <= b_start && a_end >= b_end) ||
+         (b_start <= a_start && b_end >= a_end)))
+      return false;
+
+   /* For each register, chase the intersection of a and b to find the
+    * ultimate source.
+    */
+   unsigned start = MAX2(a_start, b_start);
+   unsigned end = MIN2(a_end, b_end);
+   struct def_value a_value = chase_copies((struct def_value){
+      .reg = a->reg,
+      .offset = start - a_start,
+      .size = end - start,
+   });
+   struct def_value b_value = chase_copies((struct def_value){
+      .reg = b->reg,
+      .offset = start - b_start,
+      .size = end - start,
+   });
+   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
 }
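
Note that can_skip_interference() above only skips the real liveness check in two situations: the regions are disjoint, or one fully contains the other and chasing copies shows both refer to the same underlying value; a partial overlap is always treated as interference so that the live values in a merge set stay nested. A trivial standalone sketch of that interval classification (invented names, not part of this commit):

#include <stdio.h>

enum overlap { DISJOINT, CONTAINED, PARTIAL };

/* Classify two half-open regions [a_start, a_end) and [b_start, b_end)
 * the same way the early-out tests above do.
 */
static enum overlap
classify(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
{
   if (a_end <= b_start || b_end <= a_start)
      return DISJOINT;
   if ((a_start <= b_start && a_end >= b_end) ||
       (b_start <= a_start && b_end >= a_end))
      return CONTAINED;
   return PARTIAL;
}

int main(void)
{
   printf("%d %d %d\n", classify(0, 4, 4, 6), /* DISJOINT  */
          classify(0, 4, 1, 2),               /* CONTAINED */
          classify(0, 4, 3, 6));              /* PARTIAL   */
   return 0;
}
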
 
 static struct ir3_merge_set *
 get_merge_set(struct ir3_register *def)
 {
-       if (def->merge_set)
-               return def->merge_set;
-
-       struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
-       set->preferred_reg = ~0;
-       set->interval_start = ~0;
-       set->size = reg_size(def);
-       set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
-       set->regs_count = 1;
-       set->regs = ralloc(set, struct ir3_register *);
-       set->regs[0] = def;
-
-       return set;
+   if (def->merge_set)
+      return def->merge_set;
+
+   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
+   set->preferred_reg = ~0;
+   set->interval_start = ~0;
+   set->size = reg_size(def);
+   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
+   set->regs_count = 1;
+   set->regs = ralloc(set, struct ir3_register *);
+   set->regs[0] = def;
+
+   return set;
 }
 
 /* Merges b into a */
 static struct ir3_merge_set *
-merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b,
-                            int b_offset)
+merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
 {
-       if (b_offset < 0)
-               return merge_merge_sets(b, a, -b_offset);
-
-       struct ir3_register **new_regs =
-               rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);
-
-       unsigned a_index = 0, b_index = 0, new_index = 0;
-       for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
-               if (b_index < b->regs_count &&
-                       (a_index == a->regs_count ||
-                        def_after(a->regs[a_index], b->regs[b_index]))) {
-                       new_regs[new_index] = b->regs[b_index++];
-                       new_regs[new_index]->merge_set_offset += b_offset;
-               } else {
-                       new_regs[new_index] = a->regs[a_index++];
-               }
-               new_regs[new_index]->merge_set = a;
-       }
-
-       assert(new_index == a->regs_count + b->regs_count);
-
-       /* Technically this should be the lcm, but because alignment is only 1 or
-        * 2 so far this should be ok.
-        */
-       a->alignment = MAX2(a->alignment, b->alignment);
-       a->regs_count += b->regs_count;
-       ralloc_free(a->regs);
-       a->regs = new_regs;
-       a->size = MAX2(a->size, b->size + b_offset);
-
-       return a;
+   if (b_offset < 0)
+      return merge_merge_sets(b, a, -b_offset);
+
+   struct ir3_register **new_regs =
+      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);
+
+   unsigned a_index = 0, b_index = 0, new_index = 0;
+   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
+      if (b_index < b->regs_count &&
+          (a_index == a->regs_count ||
+           def_after(a->regs[a_index], b->regs[b_index]))) {
+         new_regs[new_index] = b->regs[b_index++];
+         new_regs[new_index]->merge_set_offset += b_offset;
+      } else {
+         new_regs[new_index] = a->regs[a_index++];
+      }
+      new_regs[new_index]->merge_set = a;
+   }
+
+   assert(new_index == a->regs_count + b->regs_count);
+
+   /* Technically this should be the lcm, but because alignment is only 1 or
+    * 2 so far this should be ok.
+    */
+   a->alignment = MAX2(a->alignment, b->alignment);
+   a->regs_count += b->regs_count;
+   ralloc_free(a->regs);
+   a->regs = new_regs;
+   a->size = MAX2(a->size, b->size + b_offset);
+
+   return a;
 }
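
Since both merge sets keep their definitions ordered by instruction ip (see index_instrs() above), merge_merge_sets() can combine them with a single two-pointer pass that preserves that order. A standalone sketch of the same merge over plain integers (illustrative only, not part of this commit):

#include <stdio.h>

/* Merge two ascending arrays into one ascending array, taking from b
 * only when its next element is strictly smaller, mirroring the
 * def_after() comparison above.
 */
static void
merge_sorted(const int *a, int na, const int *b, int nb, int *out)
{
   int ai = 0, bi = 0, oi = 0;
   while (ai < na || bi < nb) {
      if (bi < nb && (ai == na || b[bi] < a[ai]))
         out[oi++] = b[bi++];
      else
         out[oi++] = a[ai++];
   }
}

int main(void)
{
   int a[] = {2, 5, 9}, b[] = {1, 5, 7}, out[6];
   merge_sorted(a, 3, b, 3, out);
   for (int i = 0; i < 6; i++)
      printf("%d ", out[i]); /* 1 2 5 5 7 9 */
   printf("\n");
   return 0;
}
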
 
 static bool
-merge_sets_interfere(struct ir3_liveness *live,
-                                        struct ir3_merge_set *a, struct ir3_merge_set *b,
-                                        int b_offset)
+merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
+                     struct ir3_merge_set *b, int b_offset)
 {
-       if (b_offset < 0)
-               return merge_sets_interfere(live, b, a, -b_offset);
-
-       struct merge_def dom[a->regs_count + b->regs_count];
-       unsigned a_index = 0, b_index = 0;
-       int dom_index = -1;
-
-       /* Reject trying to merge the sets if the alignment doesn't work out */
-       if (b_offset % a->alignment != 0)
-               return true;
-
-       while (a_index < a->regs_count || b_index < b->regs_count) {
-               struct merge_def current;
-               if (a_index == a->regs_count) {
-                       current.reg = b->regs[b_index];
-                       current.offset = current.reg->merge_set_offset + b_offset;
-                       b_index++;
-               } else if (b_index == b->regs_count) {
-                       current.reg = a->regs[a_index];
-                       current.offset = current.reg->merge_set_offset;
-                       a_index++;
-               } else {
-                       if (def_after(b->regs[b_index], a->regs[a_index])) {
-                               current.reg = a->regs[a_index];
-                               current.offset = current.reg->merge_set_offset;
-                               a_index++;
-                       } else {
-                               current.reg = b->regs[b_index];
-                               current.offset = current.reg->merge_set_offset + b_offset;
-                               b_index++;
-                       }
-               }
-
-               while (dom_index >= 0 &&
-                          !def_dominates(dom[dom_index].reg, current.reg)) {
-                       dom_index--;
-               }
-
-               /* TODO: in the original paper, just dom[dom_index] needs to be
-                * checked for interference. We implement the value-chasing extension
-                * as well as support for sub-registers, which complicates this
-                * significantly because it's no longer the case that if a dominates b
-                * dominates c and a and b don't interfere then we only need to check
-                * interference between b and c to be sure a and c don't interfere --
-                * this means we may have to check for interference against values
-                * higher in the stack than dom[dom_index]. In the paper there's a
-                * description of a way to do less interference tests with the
-                * value-chasing extension, but we'd have to come up with something
-                * ourselves for handling the similar problems that come up with
-                * allowing values to contain subregisters. For now we just test
-                * everything in the stack.
-                */
-               for (int i = 0; i <= dom_index; i++) {
-                       if (can_skip_interference(&current, &dom[i]))
-                               continue;
-
-                       /* Ok, now we actually have to check interference. Since we know
-                        * that dom[i] dominates current, this boils down to checking
-                        * whether dom[i] is live after current.
-                        */
-                       if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
-                               return true;
-               }
-
-               dom[++dom_index] = current;
-       }
-
-       return false;
+   if (b_offset < 0)
+      return merge_sets_interfere(live, b, a, -b_offset);
+
+   struct merge_def dom[a->regs_count + b->regs_count];
+   unsigned a_index = 0, b_index = 0;
+   int dom_index = -1;
+
+   /* Reject trying to merge the sets if the alignment doesn't work out */
+   if (b_offset % a->alignment != 0)
+      return true;
+
+   while (a_index < a->regs_count || b_index < b->regs_count) {
+      struct merge_def current;
+      if (a_index == a->regs_count) {
+         current.reg = b->regs[b_index];
+         current.offset = current.reg->merge_set_offset + b_offset;
+         b_index++;
+      } else if (b_index == b->regs_count) {
+         current.reg = a->regs[a_index];
+         current.offset = current.reg->merge_set_offset;
+         a_index++;
+      } else {
+         if (def_after(b->regs[b_index], a->regs[a_index])) {
+            current.reg = a->regs[a_index];
+            current.offset = current.reg->merge_set_offset;
+            a_index++;
+         } else {
+            current.reg = b->regs[b_index];
+            current.offset = current.reg->merge_set_offset + b_offset;
+            b_index++;
+         }
+      }
+
+      while (dom_index >= 0 &&
+             !def_dominates(dom[dom_index].reg, current.reg)) {
+         dom_index--;
+      }
+
+      /* TODO: in the original paper, just dom[dom_index] needs to be
+       * checked for interference. We implement the value-chasing extension
+       * as well as support for sub-registers, which complicates this
+       * significantly because it's no longer the case that if a dominates b
+       * dominates c and a and b don't interfere then we only need to check
+       * interference between b and c to be sure a and c don't interfere --
+       * this means we may have to check for interference against values
+       * higher in the stack than dom[dom_index]. In the paper there's a
+       * description of a way to do less interference tests with the
+       * value-chasing extension, but we'd have to come up with something
+       * ourselves for handling the similar problems that come up with
+       * allowing values to contain subregisters. For now we just test
+       * everything in the stack.
+       */
+      for (int i = 0; i <= dom_index; i++) {
+         if (can_skip_interference(&current, &dom[i]))
+            continue;
+
+         /* Ok, now we actually have to check interference. Since we know
+          * that dom[i] dominates current, this boils down to checking
+          * whether dom[i] is live after current.
+          */
+         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
+            return true;
+      }
+
+      dom[++dom_index] = current;
+   }
+
+   return false;
 }
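
dom[] above works as a stack of the defs that dominate the current one: entries that no longer dominate are popped, the survivors are checked for interference, and the current def is pushed. A small sketch of just that stack discipline, with dominance expressed as an invented immediate-dominator array rather than ir3's def_dominates():

#include <assert.h>
#include <stdbool.h>

/* idom[i] is i's immediate dominator, -1 for the root; nodes are numbered
 * in an order compatible with dominance (a dominator sorts before the
 * nodes it dominates), mirroring the def order used above. */
static bool
dominates(const int *idom, int a, int b)
{
   for (; b != -1; b = idom[b])
      if (b == a)
         return true;
   return false;
}

int
main(void)
{
   /*        0
    *       / \
    *      1   3
    *      |
    *      2          */
   const int idom[] = {-1, 0, 1, 0};
   int dom[4], dom_index = -1;

   for (int cur = 0; cur < 4; cur++) {
      /* Pop everything that does not dominate the current node... */
      while (dom_index >= 0 && !dominates(idom, dom[dom_index], cur))
         dom_index--;
      /* ...interference checks against dom[0..dom_index] would go here... */
      dom[++dom_index] = cur;
   }
   /* After visiting node 3, only its dominator chain {0, 3} remains. */
   assert(dom_index == 1 && dom[0] == 0 && dom[1] == 3);
   return 0;
}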
 
 static void
-try_merge_defs(struct ir3_liveness *live,
-                          struct ir3_register *a, struct ir3_register *b,
-                          unsigned b_offset)
+try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
+               struct ir3_register *b, unsigned b_offset)
 {
-       struct ir3_merge_set *a_set = get_merge_set(a);
-       struct ir3_merge_set *b_set = get_merge_set(b);
+   struct ir3_merge_set *a_set = get_merge_set(a);
+   struct ir3_merge_set *b_set = get_merge_set(b);
 
-       if (a_set == b_set) {
-               /* Note: Even in this case we may not always successfully be able to
-                * coalesce this copy, if the offsets don't line up. But in any
-                * case, we can't do anything.
-                */
-               return;
-       }
+   if (a_set == b_set) {
+      /* Note: Even in this case we may not always successfully be able to
+       * coalesce this copy, if the offsets don't line up. But in any
+       * case, we can't do anything.
+       */
+      return;
+   }
 
-       int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
+   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
 
-       if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
-               merge_merge_sets(a_set, b_set, b_set_offset);
+   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
+      merge_merge_sets(a_set, b_set, b_set_offset);
 }
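
The b_set_offset formula above places b's entire set so that b itself ends up b_offset units past a within a's set: a is at a->merge_set_offset, b should land at a->merge_set_offset + b_offset, and b already sits at b->merge_set_offset in its own set, so the whole set shifts by the difference. A worked numeric check with invented values:

#include <assert.h>

int
main(void)
{
   /* a sits at offset 1 in its set, b at offset 0 in its set, and the
    * copy wants b to live 2 units past a. */
   int a_merge_set_offset = 1, b_merge_set_offset = 0, b_offset = 2;
   int b_set_offset = a_merge_set_offset + b_offset - b_merge_set_offset;

   /* b's set gets placed 3 units into a's frame, so b ends up at 0 + 3 = 3,
    * which is indeed 2 past a's position of 1. */
   assert(b_set_offset == 3);
   assert(b_merge_set_offset + b_set_offset == a_merge_set_offset + b_offset);
   return 0;
}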
 
 static void
-coalesce_phi(struct ir3_liveness *live,
-                        struct ir3_instruction *phi)
+coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
 {
-       for (unsigned i = 0; i < phi->srcs_count; i++) {
-               if (phi->srcs[i]->def)
-                       try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
-       }
+   for (unsigned i = 0; i < phi->srcs_count; i++) {
+      if (phi->srcs[i]->def)
+         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
+   }
 }
 
 static void
 aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
-                                                                 struct ir3_instruction *pcopy)
+                                  struct ir3_instruction *pcopy)
 {
-       for (unsigned i = 0; i < pcopy->dsts_count; i++) {
-               if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
-                       continue;
-               try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
-       }
+   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
+      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
+         continue;
+      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
+   }
 }
 
 static void
 aggressive_coalesce_split(struct ir3_liveness *live,
-                                                struct ir3_instruction *split)
+                          struct ir3_instruction *split)
 {
-       try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
-                                  split->split.off * reg_elem_size(split->dsts[0]));
+   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
+                  split->split.off * reg_elem_size(split->dsts[0]));
 }
 
 static void
 aggressive_coalesce_collect(struct ir3_liveness *live,
-                                                  struct ir3_instruction *collect)
+                            struct ir3_instruction *collect)
 {
-       for (unsigned i = 0, offset = 0; i < collect->srcs_count;
-                offset += reg_elem_size(collect->srcs[i]), i++) {
-               if (!(collect->srcs[i]->flags & IR3_REG_SSA))
-                       continue;
-               try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
-       }
+   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
+        offset += reg_elem_size(collect->srcs[i]), i++) {
+      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
+         continue;
+      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
+   }
 }
 
 static void
 create_parallel_copy(struct ir3_block *block)
 {
-       for (unsigned i = 0; i < 2; i++) {
-               if (!block->successors[i])
-                       continue;
-
-               struct ir3_block *succ = block->successors[i];
-
-               unsigned pred_idx = ir3_block_get_pred_index(succ, block);
-
-               unsigned phi_count = 0;
-               foreach_instr (phi, &succ->instr_list) {
-                       if (phi->opc != OPC_META_PHI)
-                               break;
-
-                       /* Avoid undef */
-                       if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
-                                !phi->srcs[pred_idx]->def)
-                               continue;
-
-                       /* We don't support critical edges. If we were to support them,
-                        * we'd need to insert parallel copies after the phi node to solve
-                        * the lost-copy problem.
-                        */
-                       assert(i == 0 && !block->successors[1]);
-                       phi_count++;
-               }
-
-               if (phi_count == 0)
-                       continue;
-
-               struct ir3_register *src[phi_count];
-               unsigned j = 0;
-               foreach_instr (phi, &succ->instr_list) {
-                       if (phi->opc != OPC_META_PHI)
-                               break;
-                       if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
-                                !phi->srcs[pred_idx]->def)
-                               continue;
-                       src[j++] = phi->srcs[pred_idx];
-               }
-               assert(j == phi_count);
-
-               struct ir3_instruction *pcopy =
-                       ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);
-               
-               for (j = 0; j < phi_count; j++) {
-                       struct ir3_register *reg = __ssa_dst(pcopy);
-                       reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
-                       reg->size = reg_elems(src[j]);
-               }
-
-               for (j = 0; j < phi_count; j++) {
-                       pcopy->srcs[pcopy->srcs_count++] = ir3_reg_clone(block->shader, src[j]);
-               }
-
-               j = 0;
-               foreach_instr (phi, &succ->instr_list) {
-                       if (phi->opc != OPC_META_PHI)
-                               break;
-                       if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
-                                !phi->srcs[pred_idx]->def)
-                               continue;
-                       phi->srcs[pred_idx]->def = pcopy->dsts[j];
-                       phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
-                       j++;
-               }
-               assert(j == phi_count);
-       }
+   for (unsigned i = 0; i < 2; i++) {
+      if (!block->successors[i])
+         continue;
+
+      struct ir3_block *succ = block->successors[i];
+
+      unsigned pred_idx = ir3_block_get_pred_index(succ, block);
+
+      unsigned phi_count = 0;
+      foreach_instr (phi, &succ->instr_list) {
+         if (phi->opc != OPC_META_PHI)
+            break;
+
+         /* Avoid undef */
+         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
+             !phi->srcs[pred_idx]->def)
+            continue;
+
+         /* We don't support critical edges. If we were to support them,
+          * we'd need to insert parallel copies after the phi node to solve
+          * the lost-copy problem.
+          */
+         assert(i == 0 && !block->successors[1]);
+         phi_count++;
+      }
+
+      if (phi_count == 0)
+         continue;
+
+      struct ir3_register *src[phi_count];
+      unsigned j = 0;
+      foreach_instr (phi, &succ->instr_list) {
+         if (phi->opc != OPC_META_PHI)
+            break;
+         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
+             !phi->srcs[pred_idx]->def)
+            continue;
+         src[j++] = phi->srcs[pred_idx];
+      }
+      assert(j == phi_count);
+
+      struct ir3_instruction *pcopy =
+         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);
+
+      for (j = 0; j < phi_count; j++) {
+         struct ir3_register *reg = __ssa_dst(pcopy);
+         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+         reg->size = reg_elems(src[j]);
+      }
+
+      for (j = 0; j < phi_count; j++) {
+         pcopy->srcs[pcopy->srcs_count++] =
+            ir3_reg_clone(block->shader, src[j]);
+      }
+
+      j = 0;
+      foreach_instr (phi, &succ->instr_list) {
+         if (phi->opc != OPC_META_PHI)
+            break;
+         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
+             !phi->srcs[pred_idx]->def)
+            continue;
+         phi->srcs[pred_idx]->def = pcopy->dsts[j];
+         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
+         j++;
+      }
+      assert(j == phi_count);
+   }
 }
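
The lost-copy note above relies on parallel-copy semantics: all sources of OPC_META_PARALLEL_COPY are read before any destination is written, so one copy per edge can express even a swap of phi values. A tiny plain-C illustration of those semantics, independent of the ir3 structures:

#include <assert.h>

int
main(void)
{
   int x = 1, y = 2;

   /* Sequential copies would clobber: x = y; y = x; leaves both == 2.
    * A parallel copy conceptually snapshots every source first... */
   int srcs[2] = {y, x};

   /* ...and only then writes every destination, so a swap works. */
   x = srcs[0];
   y = srcs[1];

   assert(x == 2 && y == 1);
   return 0;
}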
 
 void
 ir3_create_parallel_copies(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list) {
-               create_parallel_copy(block);
-       }
+   foreach_block (block, &ir->block_list) {
+      create_parallel_copy(block);
+   }
 }
 
 static void
 index_merge_sets(struct ir3 *ir)
 {
-       unsigned offset = 0;
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       for (unsigned i = 0; i < instr->dsts_count; i++) {
-                               struct ir3_register *dst = instr->dsts[i];
-
-                               unsigned dst_offset;
-                               struct ir3_merge_set *merge_set = dst->merge_set;
-                               unsigned size = reg_size(dst);
-                               if (merge_set) {
-                                       if (merge_set->interval_start == ~0) {
-                                               merge_set->interval_start = offset;
-                                               offset += merge_set->size;
-                                       }
-                                       dst_offset = merge_set->interval_start + dst->merge_set_offset;
-                               } else {
-                                       dst_offset = offset;
-                                       offset += size;
-                               }
-
-                               dst->interval_start = dst_offset;
-                               dst->interval_end = dst_offset + size;
-                       }
-               }
-       }
+   unsigned offset = 0;
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         for (unsigned i = 0; i < instr->dsts_count; i++) {
+            struct ir3_register *dst = instr->dsts[i];
+
+            unsigned dst_offset;
+            struct ir3_merge_set *merge_set = dst->merge_set;
+            unsigned size = reg_size(dst);
+            if (merge_set) {
+               if (merge_set->interval_start == ~0) {
+                  merge_set->interval_start = offset;
+                  offset += merge_set->size;
+               }
+               dst_offset = merge_set->interval_start + dst->merge_set_offset;
+            } else {
+               dst_offset = offset;
+               offset += size;
+            }
+
+            dst->interval_start = dst_offset;
+            dst->interval_end = dst_offset + size;
+         }
+      }
+   }
 }
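
index_merge_sets() above hands out one linearly growing offset space: the first def of a merge set reserves the whole set and each member lands at interval_start + merge_set_offset, while a def outside any set just reserves its own size. A worked example of that bookkeeping with invented sizes:

#include <assert.h>

int
main(void)
{
   unsigned offset = 0;

   /* First def seen belongs to a merge set of size 4; the set reserves
    * [0, 4) and the def sits at its merge_set_offset inside it. */
   unsigned set_interval_start = offset;
   offset += 4;
   unsigned def0_start = set_interval_start + 2; /* merge_set_offset = 2 */

   /* Next def is in no set and is 2 units wide: it reserves [4, 6). */
   unsigned def1_start = offset;
   offset += 2;

   assert(set_interval_start == 0 && def0_start == 2);
   assert(def1_start == 4 && offset == 6);
   return 0;
}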
 
-#define RESET  "\x1b[0m"
-#define BLUE   "\x1b[0;34m"
-#define SYN_SSA(x)             BLUE x RESET
+#define RESET      "\x1b[0m"
+#define BLUE       "\x1b[0;34m"
+#define SYN_SSA(x) BLUE x RESET
 
 static void
 dump_merge_sets(struct ir3 *ir)
 {
-       printf("merge sets:\n");
-       struct set *merge_sets = _mesa_pointer_set_create(NULL);
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       for (unsigned i = 0; i < instr->dsts_count; i++) {
-                               struct ir3_register *dst = instr->dsts[i];
-
-                               struct ir3_merge_set *merge_set = dst->merge_set;
-                               if (!merge_set || _mesa_set_search(merge_sets, merge_set))
-                                       continue;
-
-                               printf("merge set, size %u, align %u:\n", merge_set->size, merge_set->alignment);
-                               for (unsigned j = 0; j < merge_set->regs_count; j++) {
-                                       struct ir3_register *reg = merge_set->regs[j];
-                                       printf("\t"SYN_SSA("ssa_%u")":%u, offset %u\n", reg->instr->serialno,
-                                                  reg->name, reg->merge_set_offset);
-                               }
-
-                               _mesa_set_add(merge_sets, merge_set);
-                       }
-               }
-       }
-
-       ralloc_free(merge_sets);
+   printf("merge sets:\n");
+   struct set *merge_sets = _mesa_pointer_set_create(NULL);
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         for (unsigned i = 0; i < instr->dsts_count; i++) {
+            struct ir3_register *dst = instr->dsts[i];
+
+            struct ir3_merge_set *merge_set = dst->merge_set;
+            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
+               continue;
+
+            printf("merge set, size %u, align %u:\n", merge_set->size,
+                   merge_set->alignment);
+            for (unsigned j = 0; j < merge_set->regs_count; j++) {
+               struct ir3_register *reg = merge_set->regs[j];
+               printf("\t" SYN_SSA("ssa_%u") ":%u, offset %u\n",
+                      reg->instr->serialno, reg->name, reg->merge_set_offset);
+            }
+
+            _mesa_set_add(merge_sets, merge_set);
+         }
+      }
+   }
+
+   ralloc_free(merge_sets);
 }
 
 void
 ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
 {
-       index_instrs(ir3_start_block(ir), 0);
-
-       /* First pass: coalesce phis, which must be together. */
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (instr->opc != OPC_META_PHI)
-                               break;
-
-                       coalesce_phi(live, instr);
-               }
-       }
-
-       /* Second pass: aggressively coalesce parallelcopy, split, collect */
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       switch (instr->opc) {
-                               case OPC_META_SPLIT:
-                                       aggressive_coalesce_split(live, instr);
-                                       break;
-                               case OPC_META_COLLECT:
-                                       aggressive_coalesce_collect(live, instr);
-                                       break;
-                               case OPC_META_PARALLEL_COPY:
-                                       aggressive_coalesce_parallel_copy(live, instr);
-                                       break;
-                               default:
-                                       break;
-                       }
-               }
-       }
-
-       index_merge_sets(ir);
-
-       if (ir3_shader_debug & IR3_DBG_RAMSGS)
-               dump_merge_sets(ir);
+   index_instrs(ir3_start_block(ir), 0);
+
+   /* First pass: coalesce phis, which must be together. */
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc != OPC_META_PHI)
+            break;
+
+         coalesce_phi(live, instr);
+      }
+   }
+
+   /* Second pass: aggressively coalesce parallelcopy, split, collect */
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         switch (instr->opc) {
+         case OPC_META_SPLIT:
+            aggressive_coalesce_split(live, instr);
+            break;
+         case OPC_META_COLLECT:
+            aggressive_coalesce_collect(live, instr);
+            break;
+         case OPC_META_PARALLEL_COPY:
+            aggressive_coalesce_parallel_copy(live, instr);
+            break;
+         default:
+            break;
+         }
+      }
+   }
+
+   index_merge_sets(ir);
+
+   if (ir3_shader_debug & IR3_DBG_RAMSGS)
+      dump_merge_sets(ir);
 }
-
index eec8208..2b65167 100644 (file)
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-
 #include "util/debug.h"
 #include "util/u_math.h"
 
-#include "ir3_nir.h"
 #include "ir3_compiler.h"
+#include "ir3_nir.h"
 #include "ir3_shader.h"
 
 static const nir_shader_compiler_options options = {
-               .lower_fpow = true,
-               .lower_scmp = true,
-               .lower_flrp16 = true,
-               .lower_flrp32 = true,
-               .lower_flrp64 = true,
-               .lower_ffract = true,
-               .lower_fmod = true,
-               .lower_fdiv = true,
-               .lower_isign = true,
-               .lower_ldexp = true,
-               .lower_uadd_carry = true,
-               .lower_usub_borrow = true,
-               .lower_mul_high = true,
-               .lower_mul_2x32_64 = true,
-               .fuse_ffma16 = true,
-               .fuse_ffma32 = true,
-               .fuse_ffma64 = true,
-               .vertex_id_zero_based = true,
-               .lower_extract_byte = true,
-               .lower_extract_word = true,
-               .lower_insert_byte = true,
-               .lower_insert_word = true,
-               .lower_helper_invocation = true,
-               .lower_bitfield_insert_to_shifts = true,
-               .lower_bitfield_extract_to_shifts = true,
-               .lower_pack_half_2x16 = true,
-               .lower_pack_snorm_4x8 = true,
-               .lower_pack_snorm_2x16 = true,
-               .lower_pack_unorm_4x8 = true,
-               .lower_pack_unorm_2x16 = true,
-               .lower_unpack_half_2x16 = true,
-               .lower_unpack_snorm_4x8 = true,
-               .lower_unpack_snorm_2x16 = true,
-               .lower_unpack_unorm_4x8 = true,
-               .lower_unpack_unorm_2x16 = true,
-               .lower_pack_split = true,
-               .use_interpolated_input_intrinsics = true,
-               .lower_rotate = true,
-               .lower_to_scalar = true,
-               .has_imul24 = true,
-               .has_fsub = true,
-               .has_isub = true,
-               .lower_wpos_pntc = true,
-               .lower_cs_local_index_from_id = true,
-
-               /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
-                * but that should be harmless for GL since 64b is not
-                * supported there.
-                */
-               .lower_int64_options = (nir_lower_int64_options)~0,
-               .lower_uniforms_to_ubo = true,
-               .use_scoped_barrier = true,
+   .lower_fpow = true,
+   .lower_scmp = true,
+   .lower_flrp16 = true,
+   .lower_flrp32 = true,
+   .lower_flrp64 = true,
+   .lower_ffract = true,
+   .lower_fmod = true,
+   .lower_fdiv = true,
+   .lower_isign = true,
+   .lower_ldexp = true,
+   .lower_uadd_carry = true,
+   .lower_usub_borrow = true,
+   .lower_mul_high = true,
+   .lower_mul_2x32_64 = true,
+   .fuse_ffma16 = true,
+   .fuse_ffma32 = true,
+   .fuse_ffma64 = true,
+   .vertex_id_zero_based = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .lower_insert_byte = true,
+   .lower_insert_word = true,
+   .lower_helper_invocation = true,
+   .lower_bitfield_insert_to_shifts = true,
+   .lower_bitfield_extract_to_shifts = true,
+   .lower_pack_half_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_half_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_4x8 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_pack_split = true,
+   .use_interpolated_input_intrinsics = true,
+   .lower_rotate = true,
+   .lower_to_scalar = true,
+   .has_imul24 = true,
+   .has_fsub = true,
+   .has_isub = true,
+   .lower_wpos_pntc = true,
+   .lower_cs_local_index_from_id = true,
+
+   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
+    * but that should be harmless for GL since 64b is not
+    * supported there.
+    */
+   .lower_int64_options = (nir_lower_int64_options)~0,
+   .lower_uniforms_to_ubo = true,
+   .use_scoped_barrier = true,
 };
 
 /* we don't want to lower vertex_id to _zero_based on newer gpus: */
 static const nir_shader_compiler_options options_a6xx = {
-               .lower_fpow = true,
-               .lower_scmp = true,
-               .lower_flrp16 = true,
-               .lower_flrp32 = true,
-               .lower_flrp64 = true,
-               .lower_ffract = true,
-               .lower_fmod = true,
-               .lower_fdiv = true,
-               .lower_isign = true,
-               .lower_ldexp = true,
-               .lower_uadd_carry = true,
-               .lower_usub_borrow = true,
-               .lower_mul_high = true,
-               .lower_mul_2x32_64 = true,
-               .fuse_ffma16 = true,
-               .fuse_ffma32 = true,
-               .fuse_ffma64 = true,
-               .vertex_id_zero_based = false,
-               .lower_extract_byte = true,
-               .lower_extract_word = true,
-               .lower_insert_byte = true,
-               .lower_insert_word = true,
-               .lower_helper_invocation = true,
-               .lower_bitfield_insert_to_shifts = true,
-               .lower_bitfield_extract_to_shifts = true,
-               .lower_pack_half_2x16 = true,
-               .lower_pack_snorm_4x8 = true,
-               .lower_pack_snorm_2x16 = true,
-               .lower_pack_unorm_4x8 = true,
-               .lower_pack_unorm_2x16 = true,
-               .lower_unpack_half_2x16 = true,
-               .lower_unpack_snorm_4x8 = true,
-               .lower_unpack_snorm_2x16 = true,
-               .lower_unpack_unorm_4x8 = true,
-               .lower_unpack_unorm_2x16 = true,
-               .lower_pack_split = true,
-               .use_interpolated_input_intrinsics = true,
-               .lower_rotate = true,
-               .vectorize_io = true,
-               .lower_to_scalar = true,
-               .has_imul24 = true,
-               .has_fsub = true,
-               .has_isub = true,
-               .max_unroll_iterations = 32,
-               .lower_wpos_pntc = true,
-               .lower_cs_local_index_from_id = true,
-
-               /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
-                * but that should be harmless for GL since 64b is not
-                * supported there.
-                */
-               .lower_int64_options = (nir_lower_int64_options)~0,
-               .lower_uniforms_to_ubo = true,
-               .lower_device_index_to_zero = true,
-               .use_scoped_barrier = true,
+   .lower_fpow = true,
+   .lower_scmp = true,
+   .lower_flrp16 = true,
+   .lower_flrp32 = true,
+   .lower_flrp64 = true,
+   .lower_ffract = true,
+   .lower_fmod = true,
+   .lower_fdiv = true,
+   .lower_isign = true,
+   .lower_ldexp = true,
+   .lower_uadd_carry = true,
+   .lower_usub_borrow = true,
+   .lower_mul_high = true,
+   .lower_mul_2x32_64 = true,
+   .fuse_ffma16 = true,
+   .fuse_ffma32 = true,
+   .fuse_ffma64 = true,
+   .vertex_id_zero_based = false,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .lower_insert_byte = true,
+   .lower_insert_word = true,
+   .lower_helper_invocation = true,
+   .lower_bitfield_insert_to_shifts = true,
+   .lower_bitfield_extract_to_shifts = true,
+   .lower_pack_half_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_half_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_4x8 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_pack_split = true,
+   .use_interpolated_input_intrinsics = true,
+   .lower_rotate = true,
+   .vectorize_io = true,
+   .lower_to_scalar = true,
+   .has_imul24 = true,
+   .has_fsub = true,
+   .has_isub = true,
+   .max_unroll_iterations = 32,
+   .lower_wpos_pntc = true,
+   .lower_cs_local_index_from_id = true,
+
+   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
+    * but that should be harmless for GL since 64b is not
+    * supported there.
+    */
+   .lower_int64_options = (nir_lower_int64_options)~0,
+   .lower_uniforms_to_ubo = true,
+   .lower_device_index_to_zero = true,
+   .use_scoped_barrier = true,
 };
 
 const nir_shader_compiler_options *
 ir3_get_compiler_options(struct ir3_compiler *compiler)
 {
-       if (compiler->gpu_id >= 600)
-               return &options_a6xx;
-       return &options;
+   if (compiler->gpu_id >= 600)
+      return &options_a6xx;
+   return &options;
 }
 
 static bool
 ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
-               unsigned bit_size,
-               unsigned num_components,
-               nir_intrinsic_instr *low,
-               nir_intrinsic_instr *high,
-               void *data)
+                             unsigned bit_size, unsigned num_components,
+                             nir_intrinsic_instr *low,
+                             nir_intrinsic_instr *high, void *data)
 {
-       assert(bit_size >= 8);
-       if (bit_size != 32)
-               return false;
-       unsigned byte_size = bit_size / 8;
+   assert(bit_size >= 8);
+   if (bit_size != 32)
+      return false;
+   unsigned byte_size = bit_size / 8;
 
-       int size = num_components * byte_size;
+   int size = num_components * byte_size;
 
-       /* Don't care about alignment past vec4. */
-       assert(util_is_power_of_two_nonzero(align_mul));
-       align_mul = MIN2(align_mul, 16);
-       align_offset &= 15;
+   /* Don't care about alignment past vec4. */
+   assert(util_is_power_of_two_nonzero(align_mul));
+   align_mul = MIN2(align_mul, 16);
+   align_offset &= 15;
 
-       /* Our offset alignment should always be at least 4 bytes */
-       if (align_mul < 4)
-               return false;
+   /* Our offset alignment should always be at least 4 bytes */
+   if (align_mul < 4)
+      return false;
 
-       unsigned worst_start_offset = 16 - align_mul + align_offset;
-       if (worst_start_offset + size > 16)
-               return false;
+   unsigned worst_start_offset = 16 - align_mul + align_offset;
+   if (worst_start_offset + size > 16)
+      return false;
 
-       return true;
+   return true;
 }
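
The window arithmetic above asks whether the combined access could ever straddle a 16-byte (vec4) boundary: worst_start_offset is the furthest into a 16-byte window the access might start given its alignment guarantee. A stand-alone check of the same arithmetic with a few invented cases (the bit_size == 32 requirement is left out here):

#include <assert.h>
#include <stdbool.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Same arithmetic as above: reject if the combined access could ever
 * straddle a 16-byte (vec4) boundary given its alignment guarantee. */
static bool
fits_in_vec4(unsigned align_mul, unsigned align_offset, unsigned size)
{
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;
   if (align_mul < 4)
      return false;
   return (16 - align_mul + align_offset) + size <= 16;
}

int
main(void)
{
   /* vec4-aligned vec4 load (16 bytes): always inside one window. */
   assert(fits_in_vec4(16, 0, 16));
   /* Only 4-byte alignment: an 8-byte access could start at offset 12
    * of a window and spill into the next one, so it is rejected. */
   assert(!fits_in_vec4(4, 0, 8));
   /* 8-byte alignment with no extra offset is fine for an 8-byte access. */
   assert(fits_in_vec4(8, 0, 8));
   return 0;
}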
 
-#define OPT(nir, pass, ...) ({                             \
-   bool this_progress = false;                             \
-   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
-   this_progress;                                          \
-})
+#define OPT(nir, pass, ...)                                                    \
+   ({                                                                          \
+      bool this_progress = false;                                              \
+      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
+      this_progress;                                                           \
+   })
 
 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 
 void
 ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
 {
-       bool progress;
-       unsigned lower_flrp =
-               (s->options->lower_flrp16 ? 16 : 0) |
-               (s->options->lower_flrp32 ? 32 : 0) |
-               (s->options->lower_flrp64 ? 64 : 0);
-
-       do {
-               progress = false;
-
-               OPT_V(s, nir_lower_vars_to_ssa);
-               progress |= OPT(s, nir_opt_copy_prop_vars);
-               progress |= OPT(s, nir_opt_dead_write_vars);
-               progress |= OPT(s, nir_lower_alu_to_scalar, NULL, NULL);
-               progress |= OPT(s, nir_lower_phis_to_scalar, false);
-
-               progress |= OPT(s, nir_copy_prop);
-               progress |= OPT(s, nir_opt_dce);
-               progress |= OPT(s, nir_opt_cse);
-               static int gcm = -1;
-               if (gcm == -1)
-                       gcm = env_var_as_unsigned("GCM", 0);
-               if (gcm == 1)
-                       progress |= OPT(s, nir_opt_gcm, true);
-               else if (gcm == 2)
-                       progress |= OPT(s, nir_opt_gcm, false);
-               progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
-               progress |= OPT(s, nir_opt_intrinsics);
-               /* NOTE: GS lowering inserts an output var with varying slot that
-                * is larger than VARYING_SLOT_MAX (ie. GS_VERTEX_FLAGS_IR3),
-                * which triggers asserts in nir_shader_gather_info().  To work
-                * around that skip lowering phi precision for GS.
-                *
-                * Calling nir_shader_gather_info() late also seems to cause
-                * problems for tess lowering; for now, since we only enable
-                * fp16/int16 for frag and compute, skip phi precision lowering
-                * for other stages.
-                */
-               if ((s->info.stage == MESA_SHADER_FRAGMENT) ||
-                               (s->info.stage == MESA_SHADER_COMPUTE)) {
-                       progress |= OPT(s, nir_opt_phi_precision);
-               }
-               progress |= OPT(s, nir_opt_algebraic);
-               progress |= OPT(s, nir_lower_alu);
-               progress |= OPT(s, nir_lower_pack);
-               progress |= OPT(s, nir_opt_constant_folding);
-
-               nir_load_store_vectorize_options vectorize_opts = {
-                  .modes = nir_var_mem_ubo,
-                  .callback = ir3_nir_should_vectorize_mem,
-                  .robust_modes = compiler->robust_ubo_access ? nir_var_mem_ubo : 0,
-               };
-               progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
-
-               if (lower_flrp != 0) {
-                       if (OPT(s, nir_lower_flrp,
-                                       lower_flrp,
-                                       false /* always_precise */)) {
-                               OPT(s, nir_opt_constant_folding);
-                               progress = true;
-                       }
-
-                       /* Nothing should rematerialize any flrps, so we only
-                        * need to do this lowering once.
-                        */
-                       lower_flrp = 0;
-               }
-
-               progress |= OPT(s, nir_opt_dead_cf);
-               if (OPT(s, nir_opt_trivial_continues)) {
-                       progress |= true;
-                       /* If nir_opt_trivial_continues makes progress, then we need to clean
-                        * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
-                        * to make progress.
-                        */
-                       OPT(s, nir_copy_prop);
-                       OPT(s, nir_opt_dce);
-               }
-               progress |= OPT(s, nir_opt_if, false);
-               progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
-               progress |= OPT(s, nir_opt_remove_phis);
-               progress |= OPT(s, nir_opt_undef);
-       } while (progress);
+   bool progress;
+   unsigned lower_flrp = (s->options->lower_flrp16 ? 16 : 0) |
+                         (s->options->lower_flrp32 ? 32 : 0) |
+                         (s->options->lower_flrp64 ? 64 : 0);
+
+   do {
+      progress = false;
+
+      OPT_V(s, nir_lower_vars_to_ssa);
+      progress |= OPT(s, nir_opt_copy_prop_vars);
+      progress |= OPT(s, nir_opt_dead_write_vars);
+      progress |= OPT(s, nir_lower_alu_to_scalar, NULL, NULL);
+      progress |= OPT(s, nir_lower_phis_to_scalar, false);
+
+      progress |= OPT(s, nir_copy_prop);
+      progress |= OPT(s, nir_opt_dce);
+      progress |= OPT(s, nir_opt_cse);
+      static int gcm = -1;
+      if (gcm == -1)
+         gcm = env_var_as_unsigned("GCM", 0);
+      if (gcm == 1)
+         progress |= OPT(s, nir_opt_gcm, true);
+      else if (gcm == 2)
+         progress |= OPT(s, nir_opt_gcm, false);
+      progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
+      progress |= OPT(s, nir_opt_intrinsics);
+      /* NOTE: GS lowering inserts an output var with varying slot that
+       * is larger than VARYING_SLOT_MAX (ie. GS_VERTEX_FLAGS_IR3),
+       * which triggers asserts in nir_shader_gather_info().  To work
+       * around that skip lowering phi precision for GS.
+       *
+       * Calling nir_shader_gather_info() late also seems to cause
+       * problems for tess lowering; for now, since we only enable
+       * fp16/int16 for frag and compute, skip phi precision lowering
+       * for other stages.
+       */
+      if ((s->info.stage == MESA_SHADER_FRAGMENT) ||
+          (s->info.stage == MESA_SHADER_COMPUTE)) {
+         progress |= OPT(s, nir_opt_phi_precision);
+      }
+      progress |= OPT(s, nir_opt_algebraic);
+      progress |= OPT(s, nir_lower_alu);
+      progress |= OPT(s, nir_lower_pack);
+      progress |= OPT(s, nir_opt_constant_folding);
+
+      nir_load_store_vectorize_options vectorize_opts = {
+         .modes = nir_var_mem_ubo,
+         .callback = ir3_nir_should_vectorize_mem,
+         .robust_modes = compiler->robust_ubo_access ? nir_var_mem_ubo : 0,
+      };
+      progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
+
+      if (lower_flrp != 0) {
+         if (OPT(s, nir_lower_flrp, lower_flrp, false /* always_precise */)) {
+            OPT(s, nir_opt_constant_folding);
+            progress = true;
+         }
+
+         /* Nothing should rematerialize any flrps, so we only
+          * need to do this lowering once.
+          */
+         lower_flrp = 0;
+      }
+
+      progress |= OPT(s, nir_opt_dead_cf);
+      if (OPT(s, nir_opt_trivial_continues)) {
+         progress |= true;
+         /* If nir_opt_trivial_continues makes progress, then we need to clean
+          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
+          * to make progress.
+          */
+         OPT(s, nir_copy_prop);
+         OPT(s, nir_opt_dce);
+      }
+      progress |= OPT(s, nir_opt_if, false);
+      progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
+      progress |= OPT(s, nir_opt_remove_phis);
+      progress |= OPT(s, nir_opt_undef);
+   } while (progress);
 }
 
 static bool
 should_split_wrmask(const nir_instr *instr, const void *data)
 {
-       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_store_ssbo:
-       case nir_intrinsic_store_shared:
-       case nir_intrinsic_store_global:
-       case nir_intrinsic_store_scratch:
-               return true;
-       default:
-               return false;
-       }
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_store_ssbo:
+   case nir_intrinsic_store_shared:
+   case nir_intrinsic_store_global:
+   case nir_intrinsic_store_scratch:
+      return true;
+   default:
+      return false;
+   }
 }
 
 void
 ir3_nir_lower_io_to_temporaries(nir_shader *s)
 {
-       /* Outputs consumed by the VPC, VS inputs, and FS outputs are all handled
-        * by the hardware pre-loading registers at the beginning and then reading
-        * them at the end, so we can't access them indirectly except through
-        * normal register-indirect accesses, and therefore ir3 doesn't support
-        * indirect accesses on those. Other i/o is lowered in ir3_nir_lower_tess,
-        * and indirects work just fine for those. GS outputs may be consumed by
-        * VPC, but have their own lowering in ir3_nir_lower_gs() which does
-        * something similar to nir_lower_io_to_temporaries so we shouldn't need
-        * to lower them.
-        *
-        * Note: this might be a little inefficient for VS or TES outputs
-        * when the next stage isn't an FS, but it probably doesn't make sense
-        * to depend on the next stage before variant creation.
-        *
-        * TODO: for gallium, mesa/st also does some redundant lowering, including
-        * running this pass for GS inputs/outputs which we don't want but not
-        * including TES outputs or FS inputs which we do need. We should probably
-        * stop doing that once we're sure all drivers are doing their own
-        * indirect i/o lowering.
-        */
-       bool lower_input = s->info.stage == MESA_SHADER_VERTEX || s->info.stage == MESA_SHADER_FRAGMENT;
-       bool lower_output = s->info.stage != MESA_SHADER_TESS_CTRL && s->info.stage != MESA_SHADER_GEOMETRY;
-       if (lower_input || lower_output) {
-               NIR_PASS_V(s, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(s),
-                                 lower_output, lower_input);
-
-               /* nir_lower_io_to_temporaries() creates global variables and copy
-                * instructions which need to be cleaned up.
-                */
-               NIR_PASS_V(s, nir_split_var_copies);
-               NIR_PASS_V(s, nir_lower_var_copies);
-               NIR_PASS_V(s, nir_lower_global_vars_to_local);
-       }
-
-       /* Regardless of the above, we need to lower indirect references to
-        * compact variables such as clip/cull distances because due to how
-        * TCS<->TES IO works we cannot handle indirect accesses that "straddle"
-        * vec4 components. nir_lower_indirect_derefs has a special case for
-        * compact variables, so it will actually lower them even though we pass
-        * in 0 modes.
-        *
-        * Using temporaries would be slightly better but
-        * nir_lower_io_to_temporaries currently doesn't support TCS i/o.
-        */
-       NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX);
+   /* Outputs consumed by the VPC, VS inputs, and FS outputs are all handled
+    * by the hardware pre-loading registers at the beginning and then reading
+    * them at the end, so we can't access them indirectly except through
+    * normal register-indirect accesses, and therefore ir3 doesn't support
+    * indirect accesses on those. Other i/o is lowered in ir3_nir_lower_tess,
+    * and indirects work just fine for those. GS outputs may be consumed by
+    * VPC, but have their own lowering in ir3_nir_lower_gs() which does
+    * something similar to nir_lower_io_to_temporaries so we shouldn't need
+    * to lower them.
+    *
+    * Note: this might be a little inefficient for VS or TES outputs
+    * when the next stage isn't an FS, but it probably doesn't make sense
+    * to depend on the next stage before variant creation.
+    *
+    * TODO: for gallium, mesa/st also does some redundant lowering, including
+    * running this pass for GS inputs/outputs which we don't want but not
+    * including TES outputs or FS inputs which we do need. We should probably
+    * stop doing that once we're sure all drivers are doing their own
+    * indirect i/o lowering.
+    */
+   bool lower_input = s->info.stage == MESA_SHADER_VERTEX ||
+                      s->info.stage == MESA_SHADER_FRAGMENT;
+   bool lower_output = s->info.stage != MESA_SHADER_TESS_CTRL &&
+                       s->info.stage != MESA_SHADER_GEOMETRY;
+   if (lower_input || lower_output) {
+      NIR_PASS_V(s, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(s),
+                 lower_output, lower_input);
+
+      /* nir_lower_io_to_temporaries() creates global variables and copy
+       * instructions which need to be cleaned up.
+       */
+      NIR_PASS_V(s, nir_split_var_copies);
+      NIR_PASS_V(s, nir_lower_var_copies);
+      NIR_PASS_V(s, nir_lower_global_vars_to_local);
+   }
+
+   /* Regardless of the above, we need to lower indirect references to
+    * compact variables such as clip/cull distances because due to how
+    * TCS<->TES IO works we cannot handle indirect accesses that "straddle"
+    * vec4 components. nir_lower_indirect_derefs has a special case for
+    * compact variables, so it will actually lower them even though we pass
+    * in 0 modes.
+    *
+    * Using temporaries would be slightly better but
+    * nir_lower_io_to_temporaries currently doesn't support TCS i/o.
+    */
+   NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX);
 }
 
 void
 ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
 {
-       struct nir_lower_tex_options tex_options = {
-                       .lower_rect = 0,
-                       .lower_tg4_offsets = true,
-       };
-
-       if (compiler->gpu_id >= 400) {
-               /* a4xx seems to have *no* sam.p */
-               tex_options.lower_txp = ~0;  /* lower all txp */
-       } else {
-               /* a3xx just needs to avoid sam.p for 3d tex */
-               tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-       }
-
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               mesa_logi("----------------------");
-               nir_log_shaderi(s);
-               mesa_logi("----------------------");
-       }
-
-       if (s->info.stage == MESA_SHADER_GEOMETRY)
-               NIR_PASS_V(s, ir3_nir_lower_gs);
-
-       NIR_PASS_V(s, nir_lower_amul, ir3_glsl_type_size);
-
-       OPT_V(s, nir_lower_regs_to_ssa);
-       OPT_V(s, nir_lower_wrmasks, should_split_wrmask, s);
-
-       OPT_V(s, nir_lower_tex, &tex_options);
-       OPT_V(s, nir_lower_load_const_to_scalar);
-       if (compiler->gpu_id < 500)
-               OPT_V(s, ir3_nir_lower_tg4_to_tex);
-
-       ir3_optimize_loop(compiler, s);
-
-       /* do idiv lowering after first opt loop to get a chance to propagate
-        * constants for divide by immed power-of-two:
-        */
-       nir_lower_idiv_options idiv_options = {
-               .imprecise_32bit_lowering = true,
-               .allow_fp16 = true,
-       };
-       const bool idiv_progress = OPT(s, nir_lower_idiv, &idiv_options);
-
-       if (idiv_progress)
-               ir3_optimize_loop(compiler, s);
-
-       OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
-
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               mesa_logi("----------------------");
-               nir_log_shaderi(s);
-               mesa_logi("----------------------");
-       }
-
-       /* st_program.c's parameter list optimization requires that future nir
-        * variants don't reallocate the uniform storage, so we have to remove
-        * uniforms that occupy storage.  But we don't want to remove samplers,
-        * because they're needed for YUV variant lowering.
-        */
-       nir_foreach_uniform_variable_safe(var, s) {
+   struct nir_lower_tex_options tex_options = {
+      .lower_rect = 0,
+      .lower_tg4_offsets = true,
+   };
+
+   if (compiler->gpu_id >= 400) {
+      /* a4xx seems to have *no* sam.p */
+      tex_options.lower_txp = ~0; /* lower all txp */
+   } else {
+      /* a3xx just needs to avoid sam.p for 3d tex */
+      tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+   }
+
+   if (ir3_shader_debug & IR3_DBG_DISASM) {
+      mesa_logi("----------------------");
+      nir_log_shaderi(s);
+      mesa_logi("----------------------");
+   }
+
+   if (s->info.stage == MESA_SHADER_GEOMETRY)
+      NIR_PASS_V(s, ir3_nir_lower_gs);
+
+   NIR_PASS_V(s, nir_lower_amul, ir3_glsl_type_size);
+
+   OPT_V(s, nir_lower_regs_to_ssa);
+   OPT_V(s, nir_lower_wrmasks, should_split_wrmask, s);
+
+   OPT_V(s, nir_lower_tex, &tex_options);
+   OPT_V(s, nir_lower_load_const_to_scalar);
+   if (compiler->gpu_id < 500)
+      OPT_V(s, ir3_nir_lower_tg4_to_tex);
+
+   ir3_optimize_loop(compiler, s);
+
+   /* do idiv lowering after first opt loop to get a chance to propagate
+    * constants for divide by immed power-of-two:
+    */
+   nir_lower_idiv_options idiv_options = {
+      .imprecise_32bit_lowering = true,
+      .allow_fp16 = true,
+   };
+   const bool idiv_progress = OPT(s, nir_lower_idiv, &idiv_options);
+
+   if (idiv_progress)
+      ir3_optimize_loop(compiler, s);
+
+   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
+
+   if (ir3_shader_debug & IR3_DBG_DISASM) {
+      mesa_logi("----------------------");
+      nir_log_shaderi(s);
+      mesa_logi("----------------------");
+   }
+
+   /* st_program.c's parameter list optimization requires that future nir
+    * variants don't reallocate the uniform storage, so we have to remove
+    * uniforms that occupy storage.  But we don't want to remove samplers,
+    * because they're needed for YUV variant lowering.
+    */
+   nir_foreach_uniform_variable_safe (var, s) {
       if (var->data.mode == nir_var_uniform &&
           (glsl_type_get_image_count(var->type) ||
            glsl_type_get_sampler_count(var->type)))
          continue;
 
-               exec_node_remove(&var->node);
-       }
-       nir_validate_shader(s, "after uniform var removal");
+      exec_node_remove(&var->node);
+   }
+   nir_validate_shader(s, "after uniform var removal");
 
-       nir_sweep(s);
+   nir_sweep(s);
 }
 
 static bool
 lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
 {
-       (void)unused;
+   (void)unused;
 
-       if (instr->type != nir_instr_type_intrinsic)
-               return false;
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
 
-       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-       return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
-                  intr->intrinsic == nir_intrinsic_load_subgroup_id ||
-                  intr->intrinsic == nir_intrinsic_load_num_subgroups;
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
+          intr->intrinsic == nir_intrinsic_load_subgroup_id ||
+          intr->intrinsic == nir_intrinsic_load_num_subgroups;
 }
 
 static nir_ssa_def *
 lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
 {
-       (void)instr;
-       (void)unused;
-
-       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-       if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
-               return nir_iand(b, nir_load_local_invocation_index(b),
-                                               nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1)));
-       } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
-               return nir_ishr(b, nir_load_local_invocation_index(b),
-                                               nir_load_subgroup_id_shift_ir3(b));
-       } else {
-               assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
-               /* If the workgroup size is constant,
-                * nir_lower_compute_system_values() will replace local_size with a
-                * constant so this can mostly be constant folded away.
-                */
-               nir_ssa_def *local_size = nir_load_workgroup_size(b);
-               nir_ssa_def *size =
-                       nir_imul24(b, nir_channel(b, local_size, 0),
-                                          nir_imul24(b, nir_channel(b, local_size, 1),
-                                                                 nir_channel(b, local_size, 2)));
-               nir_ssa_def *one = nir_imm_int(b, 1);
-               return nir_iadd(b, one,
-                                               nir_ishr(b, nir_isub(b, size, one),
-                                                            nir_load_subgroup_id_shift_ir3(b)));
-       }
+   (void)instr;
+   (void)unused;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
+      return nir_iand(
+         b, nir_load_local_invocation_index(b),
+         nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1)));
+   } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
+      return nir_ishr(b, nir_load_local_invocation_index(b),
+                      nir_load_subgroup_id_shift_ir3(b));
+   } else {
+      assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
+      /* If the workgroup size is constant,
+       * nir_lower_compute_system_values() will replace local_size with a
+       * constant so this can mostly be constant folded away.
+       */
+      nir_ssa_def *local_size = nir_load_workgroup_size(b);
+      nir_ssa_def *size =
+         nir_imul24(b, nir_channel(b, local_size, 0),
+                    nir_imul24(b, nir_channel(b, local_size, 1),
+                               nir_channel(b, local_size, 2)));
+      nir_ssa_def *one = nir_imm_int(b, 1);
+      return nir_iadd(b, one,
+                      nir_ishr(b, nir_isub(b, size, one),
+                               nir_load_subgroup_id_shift_ir3(b)));
+   }
 }
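
All three lowerings above are bit arithmetic on the flat local invocation index, assuming (as the intrinsic name suggests, though that is an assumption here) that load_subgroup_id_shift_ir3 yields log2 of the subgroup size: invocation = index & (size - 1), id = index >> shift, and num_subgroups = 1 + ((workgroup_size - 1) >> shift), which is a round-up division. A worked check with an invented subgroup size of 64 and a 200-invocation workgroup:

#include <assert.h>

int
main(void)
{
   /* Invented values: subgroup size 64, so shift = 6, and a workgroup of
    * 200 invocations. */
   unsigned subgroup_size = 64, shift = 6, workgroup_size = 200;
   unsigned index = 130; /* flat local_invocation_index */

   unsigned invocation = index & (subgroup_size - 1);            /* 130 % 64 */
   unsigned subgroup_id = index >> shift;                        /* 130 / 64 */
   unsigned num_subgroups = 1 + ((workgroup_size - 1) >> shift);

   assert(invocation == 2);
   assert(subgroup_id == 2);
   assert(num_subgroups == 4); /* ceil(200 / 64) */
   return 0;
}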
 
 static bool
 ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
 {
-       return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
-                                                                                lower_subgroup_id, NULL);
+   return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
+                                        lower_subgroup_id, NULL);
 }
 
 /**
@@ -479,333 +477,331 @@ ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
 void
 ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s)
 {
-       NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
-                          ir3_glsl_type_size, (nir_lower_io_options)0);
-
-       if (s->info.stage == MESA_SHADER_FRAGMENT) {
-               /* NOTE: lower load_barycentric_at_sample first, since it
-                * produces load_barycentric_at_offset:
-                */
-               NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_sample);
-               NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_offset);
-               NIR_PASS_V(s, ir3_nir_move_varying_inputs);
-               NIR_PASS_V(s, nir_lower_fb_read);
-       }
-
-       if (compiler->gpu_id >= 600 &&
-                       s->info.stage == MESA_SHADER_FRAGMENT &&
-                       !(ir3_shader_debug & IR3_DBG_NOFP16)) {
-               NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false);
-       }
-
-       if (s->info.stage == MESA_SHADER_COMPUTE) {
-               bool progress = false;
-               NIR_PASS(progress, s, nir_lower_subgroups, &(nir_lower_subgroups_options) {
-                                               .subgroup_size = 128,
-                                               .ballot_bit_size = 32,
-                                               .ballot_components = 4,
-                                               .lower_to_scalar = true,
-                                               .lower_vote_eq = true,
-                                               .lower_subgroup_masks = true,
-                                               .lower_read_invocation_to_cond = true,
-                                  });
-
-               progress = false;
-               NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs);
-
-               /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
-                * we need to lower again.
-                */
-               if (progress)
-                       NIR_PASS_V(s, nir_lower_compute_system_values, NULL);
-       }
-
-       /* we cannot ensure that ir3_finalize_nir() is only called once, so
-        * we also need to do trig workarounds here:
-        */
-       OPT_V(s, ir3_nir_apply_trig_workarounds);
-
-       ir3_optimize_loop(compiler, s);
+   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+              ir3_glsl_type_size, (nir_lower_io_options)0);
+
+   if (s->info.stage == MESA_SHADER_FRAGMENT) {
+      /* NOTE: lower load_barycentric_at_sample first, since it
+       * produces load_barycentric_at_offset:
+       */
+      NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_sample);
+      NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_offset);
+      NIR_PASS_V(s, ir3_nir_move_varying_inputs);
+      NIR_PASS_V(s, nir_lower_fb_read);
+   }
+
+   if (compiler->gpu_id >= 600 && s->info.stage == MESA_SHADER_FRAGMENT &&
+       !(ir3_shader_debug & IR3_DBG_NOFP16)) {
+      NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false);
+   }
+
+   if (s->info.stage == MESA_SHADER_COMPUTE) {
+      bool progress = false;
+      NIR_PASS(progress, s, nir_lower_subgroups,
+               &(nir_lower_subgroups_options){
+                  .subgroup_size = 128,
+                  .ballot_bit_size = 32,
+                  .ballot_components = 4,
+                  .lower_to_scalar = true,
+                  .lower_vote_eq = true,
+                  .lower_subgroup_masks = true,
+                  .lower_read_invocation_to_cond = true,
+               });
+
+      progress = false;
+      NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs);
+
+      /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
+       * we need to lower again.
+       */
+      if (progress)
+         NIR_PASS_V(s, nir_lower_compute_system_values, NULL);
+   }
+
+   /* we cannot ensure that ir3_finalize_nir() is only called once, so
+    * we also need to do trig workarounds here:
+    */
+   OPT_V(s, ir3_nir_apply_trig_workarounds);
+
+   ir3_optimize_loop(compiler, s);
 }
 
 static bool
 ir3_nir_lower_view_layer_id(nir_shader *nir, bool layer_zero, bool view_zero)
 {
-       unsigned layer_id_loc = ~0, view_id_loc = ~0;
-       nir_foreach_shader_in_variable(var, nir) {
-               if (var->data.location == VARYING_SLOT_LAYER)
-                       layer_id_loc = var->data.driver_location;
-               if (var->data.location == VARYING_SLOT_VIEWPORT)
-                       view_id_loc = var->data.driver_location;
-       }
-
-       assert(!layer_zero || layer_id_loc != ~0);
-       assert(!view_zero || view_id_loc != ~0);
-
-       bool progress = false;
-       nir_builder b;
-
-       nir_foreach_function(func, nir) {
-               nir_builder_init(&b, func->impl);
-
-               nir_foreach_block(block, func->impl) {
-                       nir_foreach_instr_safe(instr, block) {
-                               if (instr->type != nir_instr_type_intrinsic)
-                                       continue;
-
-                               nir_intrinsic_instr *intrin =
-                                       nir_instr_as_intrinsic(instr);
-
-                               if (intrin->intrinsic != nir_intrinsic_load_input)
-                                       continue;
-
-                               unsigned base = nir_intrinsic_base(intrin);
-                               if (base != layer_id_loc && base != view_id_loc)
-                                       continue;
-
-                               b.cursor = nir_before_instr(&intrin->instr);
-                               nir_ssa_def *zero = nir_imm_int(&b, 0);
-                               nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                                                                zero);
-                               nir_instr_remove(&intrin->instr);
-                               progress = true;
-                       }
-               }
-
-               if (progress) {
-                       nir_metadata_preserve(func->impl,
-                                                                 nir_metadata_block_index |
-                                                                 nir_metadata_dominance);
-               } else {
-                       nir_metadata_preserve(func->impl, nir_metadata_all);
-               }
-       }
-
-       return progress;
+   unsigned layer_id_loc = ~0, view_id_loc = ~0;
+   nir_foreach_shader_in_variable (var, nir) {
+      if (var->data.location == VARYING_SLOT_LAYER)
+         layer_id_loc = var->data.driver_location;
+      if (var->data.location == VARYING_SLOT_VIEWPORT)
+         view_id_loc = var->data.driver_location;
+   }
+
+   assert(!layer_zero || layer_id_loc != ~0);
+   assert(!view_zero || view_id_loc != ~0);
+
+   bool progress = false;
+   nir_builder b;
+
+   nir_foreach_function (func, nir) {
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block (block, func->impl) {
+         nir_foreach_instr_safe (instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+            if (intrin->intrinsic != nir_intrinsic_load_input)
+               continue;
+
+            unsigned base = nir_intrinsic_base(intrin);
+            if (base != layer_id_loc && base != view_id_loc)
+               continue;
+
+            b.cursor = nir_before_instr(&intrin->instr);
+            nir_ssa_def *zero = nir_imm_int(&b, 0);
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa, zero);
+            nir_instr_remove(&intrin->instr);
+            progress = true;
+         }
+      }
+
+      if (progress) {
+         nir_metadata_preserve(
+            func->impl, nir_metadata_block_index | nir_metadata_dominance);
+      } else {
+         nir_metadata_preserve(func->impl, nir_metadata_all);
+      }
+   }
+
+   return progress;
 }
 
 void
 ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
 {
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               mesa_logi("----------------------");
-               nir_log_shaderi(s);
-               mesa_logi("----------------------");
-       }
-
-       bool progress = false;
-
-       if (so->key.has_gs || so->key.tessellation) {
-               switch (so->shader->type) {
-               case MESA_SHADER_VERTEX:
-                       NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so, so->key.tessellation);
-                       progress = true;
-                       break;
-               case MESA_SHADER_TESS_CTRL:
-                       NIR_PASS_V(s, ir3_nir_lower_tess_ctrl, so, so->key.tessellation);
-                       NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
-                       progress = true;
-                       break;
-               case MESA_SHADER_TESS_EVAL:
-                       NIR_PASS_V(s, ir3_nir_lower_tess_eval, so, so->key.tessellation);
-                       if (so->key.has_gs)
-                               NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so, so->key.tessellation);
-                       progress = true;
-                       break;
-               case MESA_SHADER_GEOMETRY:
-                       NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
-                       progress = true;
-                       break;
-               default:
-                       break;
-               }
-       }
-
-       if (s->info.stage == MESA_SHADER_VERTEX) {
-               if (so->key.ucp_enables)
-                       progress |= OPT(s, nir_lower_clip_vs, so->key.ucp_enables, false, false, NULL);
-       } else if (s->info.stage == MESA_SHADER_FRAGMENT) {
-               bool layer_zero = so->key.layer_zero && (s->info.inputs_read & VARYING_BIT_LAYER);
-               bool view_zero = so->key.view_zero && (s->info.inputs_read & VARYING_BIT_VIEWPORT);
-
-               if (so->key.ucp_enables && !so->shader->compiler->has_clip_cull)
-                       progress |= OPT(s, nir_lower_clip_fs, so->key.ucp_enables, false);
-               if (layer_zero || view_zero)
-                       progress |= OPT(s, ir3_nir_lower_view_layer_id, layer_zero, view_zero);
-       }
-
-       /* Move large constant variables to the constants attached to the NIR
-        * shader, which we will upload in the immediates range.  This generates
-        * amuls, so we need to clean those up after.
-        *
-        * Passing no size_align, we would get packed values, which if we end up
-        * having to load with LDC would result in extra reads to unpack from
-        * straddling loads.  Align everything to vec4 to avoid that, though we
-        * could theoretically do better.
-        */
-       OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 32 /* bytes */);
-       OPT_V(s, ir3_nir_lower_load_constant, so);
-
-       if (!so->binning_pass)
-               OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
-
-       progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
-
-       /* Lower large temporaries to scratch, which in Qualcomm terms is private
-        * memory, to avoid excess register pressure. This should happen after
-        * nir_opt_large_constants, because loading from a UBO is much, much less
-        * expensive.
-        */
-       if (so->shader->compiler->has_pvtmem) {
-               progress |=
-                       OPT(s, nir_lower_vars_to_scratch, nir_var_function_temp,
-                               16 * 16 /* bytes */, glsl_get_natural_size_align_bytes);
-       }
-
-       /* Lower scratch writemasks */
-       progress |= OPT(s, nir_lower_wrmasks, should_split_wrmask, s);
-
-       OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
-
-       /* UBO offset lowering has to come after we've decided what will
-        * be left as load_ubo
-        */
-       if (so->shader->compiler->gpu_id >= 600)
-               progress |= OPT(s, nir_lower_ubo_vec4);
-
-       OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);
-
-       if (progress)
-               ir3_optimize_loop(so->shader->compiler, s);
-
-       /* Fixup indirect load_uniform's which end up with a const base offset
-        * which is too large to encode.  Do this late(ish) so we actually
-        * can differentiate indirect vs non-indirect.
-        */
-       if (OPT(s, ir3_nir_fixup_load_uniform))
-               ir3_optimize_loop(so->shader->compiler, s);
-
-       /* Do late algebraic optimization to turn add(a, neg(b)) back into
-       * subs, then the mandatory cleanup after algebraic.  Note that it may
-       * produce fnegs, and if so then we need to keep running to squash
-       * fneg(fneg(a)).
-       */
-       bool more_late_algebraic = true;
-       while (more_late_algebraic) {
-               more_late_algebraic = OPT(s, nir_opt_algebraic_late);
-               OPT_V(s, nir_opt_constant_folding);
-               OPT_V(s, nir_copy_prop);
-               OPT_V(s, nir_opt_dce);
-               OPT_V(s, nir_opt_cse);
-       }
-
-       OPT_V(s, nir_opt_sink, nir_move_const_undef);
-
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               mesa_logi("----------------------");
-               nir_log_shaderi(s);
-               mesa_logi("----------------------");
-       }
-
-       nir_sweep(s);
-
-       /* Binning pass variants re-use the const_state of the corresponding
-        * draw pass shader, so that same const emit can be re-used for both
-        * passes:
-        */
-       if (!so->binning_pass)
-               ir3_setup_const_state(s, so, ir3_const_state(so));
+   if (ir3_shader_debug & IR3_DBG_DISASM) {
+      mesa_logi("----------------------");
+      nir_log_shaderi(s);
+      mesa_logi("----------------------");
+   }
+
+   bool progress = false;
+
+   if (so->key.has_gs || so->key.tessellation) {
+      switch (so->shader->type) {
+      case MESA_SHADER_VERTEX:
+         NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so,
+                    so->key.tessellation);
+         progress = true;
+         break;
+      case MESA_SHADER_TESS_CTRL:
+         NIR_PASS_V(s, ir3_nir_lower_tess_ctrl, so, so->key.tessellation);
+         NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
+         progress = true;
+         break;
+      case MESA_SHADER_TESS_EVAL:
+         NIR_PASS_V(s, ir3_nir_lower_tess_eval, so, so->key.tessellation);
+         if (so->key.has_gs)
+            NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so,
+                       so->key.tessellation);
+         progress = true;
+         break;
+      case MESA_SHADER_GEOMETRY:
+         NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so);
+         progress = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   if (s->info.stage == MESA_SHADER_VERTEX) {
+      if (so->key.ucp_enables)
+         progress |=
+            OPT(s, nir_lower_clip_vs, so->key.ucp_enables, false, false, NULL);
+   } else if (s->info.stage == MESA_SHADER_FRAGMENT) {
+      bool layer_zero =
+         so->key.layer_zero && (s->info.inputs_read & VARYING_BIT_LAYER);
+      bool view_zero =
+         so->key.view_zero && (s->info.inputs_read & VARYING_BIT_VIEWPORT);
+
+      if (so->key.ucp_enables && !so->shader->compiler->has_clip_cull)
+         progress |= OPT(s, nir_lower_clip_fs, so->key.ucp_enables, false);
+      if (layer_zero || view_zero)
+         progress |= OPT(s, ir3_nir_lower_view_layer_id, layer_zero, view_zero);
+   }
+
+   /* Move large constant variables to the constants attached to the NIR
+    * shader, which we will upload in the immediates range.  This generates
+    * amuls, so we need to clean those up after.
+    *
+    * Passing no size_align, we would get packed values, which if we end up
+    * having to load with LDC would result in extra reads to unpack from
+    * straddling loads.  Align everything to vec4 to avoid that, though we
+    * could theoretically do better.
+    */
+   OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes,
+         32 /* bytes */);
+   OPT_V(s, ir3_nir_lower_load_constant, so);
+
+   if (!so->binning_pass)
+      OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
+
+   progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
+
+   /* Lower large temporaries to scratch, which in Qualcomm terms is private
+    * memory, to avoid excess register pressure. This should happen after
+    * nir_opt_large_constants, because loading from a UBO is much, much less
+    * expensive.
+    */
+   if (so->shader->compiler->has_pvtmem) {
+      progress |= OPT(s, nir_lower_vars_to_scratch, nir_var_function_temp,
+                      16 * 16 /* bytes */, glsl_get_natural_size_align_bytes);
+   }
+
+   /* Lower scratch writemasks */
+   progress |= OPT(s, nir_lower_wrmasks, should_split_wrmask, s);
+
+   OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
+
+   /* UBO offset lowering has to come after we've decided what will
+    * be left as load_ubo
+    */
+   if (so->shader->compiler->gpu_id >= 600)
+      progress |= OPT(s, nir_lower_ubo_vec4);
+
+   OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);
+
+   if (progress)
+      ir3_optimize_loop(so->shader->compiler, s);
+
+   /* Fixup indirect load_uniform's which end up with a const base offset
+    * which is too large to encode.  Do this late(ish) so we actually
+    * can differentiate indirect vs non-indirect.
+    */
+   if (OPT(s, ir3_nir_fixup_load_uniform))
+      ir3_optimize_loop(so->shader->compiler, s);
+
+   /* Do late algebraic optimization to turn add(a, neg(b)) back into
+    * subs, then the mandatory cleanup after algebraic.  Note that it may
+    * produce fnegs, and if so then we need to keep running to squash
+    * fneg(fneg(a)).
+    */
+   bool more_late_algebraic = true;
+   while (more_late_algebraic) {
+      more_late_algebraic = OPT(s, nir_opt_algebraic_late);
+      OPT_V(s, nir_opt_constant_folding);
+      OPT_V(s, nir_copy_prop);
+      OPT_V(s, nir_opt_dce);
+      OPT_V(s, nir_opt_cse);
+   }
+
+   OPT_V(s, nir_opt_sink, nir_move_const_undef);
+
+   if (ir3_shader_debug & IR3_DBG_DISASM) {
+      mesa_logi("----------------------");
+      nir_log_shaderi(s);
+      mesa_logi("----------------------");
+   }
+
+   nir_sweep(s);
+
+   /* Binning pass variants re-use the const_state of the corresponding
+    * draw pass shader, so that same const emit can be re-used for both
+    * passes:
+    */
+   if (!so->binning_pass)
+      ir3_setup_const_state(s, so, ir3_const_state(so));
 }
 
 static void
-ir3_nir_scan_driver_consts(nir_shader *shader,
-               struct ir3_const_state *layout)
+ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_const_state *layout)
 {
-       nir_foreach_function (function, shader) {
-               if (!function->impl)
-                       continue;
-
-               nir_foreach_block (block, function->impl) {
-                       nir_foreach_instr (instr, block) {
-                               if (instr->type != nir_instr_type_intrinsic)
-                                       continue;
-
-                               nir_intrinsic_instr *intr =
-                                       nir_instr_as_intrinsic(instr);
-                               unsigned idx;
-
-                               switch (intr->intrinsic) {
-                               case nir_intrinsic_get_ssbo_size:
-                                       if (ir3_bindless_resource(intr->src[0]))
-                                               break;
-                                       idx = nir_src_as_uint(intr->src[0]);
-                                       if (layout->ssbo_size.mask & (1 << idx))
-                                               break;
-                                       layout->ssbo_size.mask |= (1 << idx);
-                                       layout->ssbo_size.off[idx] =
-                                               layout->ssbo_size.count;
-                                       layout->ssbo_size.count += 1; /* one const per */
-                                       break;
-                               case nir_intrinsic_image_atomic_add:
-                               case nir_intrinsic_image_atomic_imin:
-                               case nir_intrinsic_image_atomic_umin:
-                               case nir_intrinsic_image_atomic_imax:
-                               case nir_intrinsic_image_atomic_umax:
-                               case nir_intrinsic_image_atomic_and:
-                               case nir_intrinsic_image_atomic_or:
-                               case nir_intrinsic_image_atomic_xor:
-                               case nir_intrinsic_image_atomic_exchange:
-                               case nir_intrinsic_image_atomic_comp_swap:
-                               case nir_intrinsic_image_store:
-                               case nir_intrinsic_image_size:
-                                       idx = nir_src_as_uint(intr->src[0]);
-                                       if (layout->image_dims.mask & (1 << idx))
-                                               break;
-                                       layout->image_dims.mask |= (1 << idx);
-                                       layout->image_dims.off[idx] =
-                                               layout->image_dims.count;
-                                       layout->image_dims.count += 3; /* three const per */
-                                       break;
-                               case nir_intrinsic_load_base_vertex:
-                               case nir_intrinsic_load_first_vertex:
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1);
-                                       break;
-                               case nir_intrinsic_load_base_instance:
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1);
-                                       break;
-                               case nir_intrinsic_load_user_clip_plane:
-                                       idx = nir_intrinsic_ucp_id(intr);
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_UCP0_X + (idx + 1) * 4);
-                                       break;
-                               case nir_intrinsic_load_num_workgroups:
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_NUM_WORK_GROUPS_Z + 1);
-                                       break;
-                               case nir_intrinsic_load_workgroup_size:
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_LOCAL_GROUP_SIZE_Z + 1);
-                                       break;
-                               case nir_intrinsic_load_base_workgroup_id:
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
-                                       break;
-                               case nir_intrinsic_load_subgroup_size:
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
-                                       break;
-                               case nir_intrinsic_load_subgroup_id_shift_ir3:
-                                       layout->num_driver_params =
-                                               MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
-                                       break;
-                               default:
-                                       break;
-                               }
-                       }
-               }
-       }
+   nir_foreach_function (function, shader) {
+      if (!function->impl)
+         continue;
+
+      nir_foreach_block (block, function->impl) {
+         nir_foreach_instr (instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+            unsigned idx;
+
+            switch (intr->intrinsic) {
+            case nir_intrinsic_get_ssbo_size:
+               if (ir3_bindless_resource(intr->src[0]))
+                  break;
+               idx = nir_src_as_uint(intr->src[0]);
+               if (layout->ssbo_size.mask & (1 << idx))
+                  break;
+               layout->ssbo_size.mask |= (1 << idx);
+               layout->ssbo_size.off[idx] = layout->ssbo_size.count;
+               layout->ssbo_size.count += 1; /* one const per */
+               break;
+            case nir_intrinsic_image_atomic_add:
+            case nir_intrinsic_image_atomic_imin:
+            case nir_intrinsic_image_atomic_umin:
+            case nir_intrinsic_image_atomic_imax:
+            case nir_intrinsic_image_atomic_umax:
+            case nir_intrinsic_image_atomic_and:
+            case nir_intrinsic_image_atomic_or:
+            case nir_intrinsic_image_atomic_xor:
+            case nir_intrinsic_image_atomic_exchange:
+            case nir_intrinsic_image_atomic_comp_swap:
+            case nir_intrinsic_image_store:
+            case nir_intrinsic_image_size:
+               idx = nir_src_as_uint(intr->src[0]);
+               if (layout->image_dims.mask & (1 << idx))
+                  break;
+               layout->image_dims.mask |= (1 << idx);
+               layout->image_dims.off[idx] = layout->image_dims.count;
+               layout->image_dims.count += 3; /* three const per */
+               break;
+            case nir_intrinsic_load_base_vertex:
+            case nir_intrinsic_load_first_vertex:
+               layout->num_driver_params =
+                  MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1);
+               break;
+            case nir_intrinsic_load_base_instance:
+               layout->num_driver_params =
+                  MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1);
+               break;
+            case nir_intrinsic_load_user_clip_plane:
+               idx = nir_intrinsic_ucp_id(intr);
+               layout->num_driver_params = MAX2(layout->num_driver_params,
+                                                IR3_DP_UCP0_X + (idx + 1) * 4);
+               break;
+            case nir_intrinsic_load_num_workgroups:
+               layout->num_driver_params =
+                  MAX2(layout->num_driver_params, IR3_DP_NUM_WORK_GROUPS_Z + 1);
+               break;
+            case nir_intrinsic_load_workgroup_size:
+               layout->num_driver_params = MAX2(layout->num_driver_params,
+                                                IR3_DP_LOCAL_GROUP_SIZE_Z + 1);
+               break;
+            case nir_intrinsic_load_base_workgroup_id:
+               layout->num_driver_params =
+                  MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
+               break;
+            case nir_intrinsic_load_subgroup_size:
+               layout->num_driver_params =
+                  MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
+               break;
+            case nir_intrinsic_load_subgroup_id_shift_ir3:
+               layout->num_driver_params =
+                  MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
+               break;
+            default:
+               break;
+            }
+         }
+      }
+   }
 }
 
 /* Sets up the variant-dependent constant state for the ir3_shader.  Note
@@ -815,83 +811,81 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
  */
 void
 ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
-               struct ir3_const_state *const_state)
+                      struct ir3_const_state *const_state)
 {
-       struct ir3_compiler *compiler = v->shader->compiler;
-
-       memset(&const_state->offsets, ~0, sizeof(const_state->offsets));
-
-       ir3_nir_scan_driver_consts(nir, const_state);
-
-       if ((compiler->gpu_id < 500) &&
-                       (v->shader->stream_output.num_outputs > 0)) {
-               const_state->num_driver_params =
-                       MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1);
-       }
-
-       const_state->num_ubos = nir->info.num_ubos;
-
-       /* num_driver_params is scalar, align to vec4: */
-       const_state->num_driver_params = align(const_state->num_driver_params, 4);
-
-       debug_assert((const_state->ubo_state.size % 16) == 0);
-       unsigned constoff = const_state->ubo_state.size / 16;
-       unsigned ptrsz = ir3_pointer_size(compiler);
-
-       if (const_state->num_ubos > 0) {
-               const_state->offsets.ubo = constoff;
-               constoff += align(const_state->num_ubos * ptrsz, 4) / 4;
-       }
-
-       if (const_state->ssbo_size.count > 0) {
-               unsigned cnt = const_state->ssbo_size.count;
-               const_state->offsets.ssbo_sizes = constoff;
-               constoff += align(cnt, 4) / 4;
-       }
-
-       if (const_state->image_dims.count > 0) {
-               unsigned cnt = const_state->image_dims.count;
-               const_state->offsets.image_dims = constoff;
-               constoff += align(cnt, 4) / 4;
-       }
-
-       if (const_state->num_driver_params > 0) {
-               /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
-               if (v->type == MESA_SHADER_VERTEX && compiler->gpu_id >= 600)
-                       constoff = MAX2(constoff, 1);
-               const_state->offsets.driver_param = constoff;
-       }
-       constoff += const_state->num_driver_params / 4;
-
-       if ((v->type == MESA_SHADER_VERTEX) &&
-                       (compiler->gpu_id < 500) &&
-                       v->shader->stream_output.num_outputs > 0) {
-               const_state->offsets.tfbo = constoff;
-               constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
-       }
-
-       switch (v->type) {
-       case MESA_SHADER_VERTEX:
-               const_state->offsets.primitive_param = constoff;
-               constoff += 1;
-               break;
-       case MESA_SHADER_TESS_CTRL:
-       case MESA_SHADER_TESS_EVAL:
-               constoff = align(constoff - 1, 4) + 3;
-               const_state->offsets.primitive_param = constoff;
-               const_state->offsets.primitive_map = constoff + 5;
-               constoff += 5 + DIV_ROUND_UP(v->input_size, 4);
-               break;
-       case MESA_SHADER_GEOMETRY:
-               const_state->offsets.primitive_param = constoff;
-               const_state->offsets.primitive_map = constoff + 1;
-               constoff += 1 + DIV_ROUND_UP(v->input_size, 4);
-               break;
-       default:
-               break;
-       }
-
-       const_state->offsets.immediate = constoff;
-
-       assert(constoff <= ir3_max_const(v));
+   struct ir3_compiler *compiler = v->shader->compiler;
+
+   memset(&const_state->offsets, ~0, sizeof(const_state->offsets));
+
+   ir3_nir_scan_driver_consts(nir, const_state);
+
+   if ((compiler->gpu_id < 500) && (v->shader->stream_output.num_outputs > 0)) {
+      const_state->num_driver_params =
+         MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1);
+   }
+
+   const_state->num_ubos = nir->info.num_ubos;
+
+   /* num_driver_params is scalar, align to vec4: */
+   const_state->num_driver_params = align(const_state->num_driver_params, 4);
+
+   debug_assert((const_state->ubo_state.size % 16) == 0);
+   unsigned constoff = const_state->ubo_state.size / 16;
+   unsigned ptrsz = ir3_pointer_size(compiler);
+
+   if (const_state->num_ubos > 0) {
+      const_state->offsets.ubo = constoff;
+      constoff += align(const_state->num_ubos * ptrsz, 4) / 4;
+   }
+
+   if (const_state->ssbo_size.count > 0) {
+      unsigned cnt = const_state->ssbo_size.count;
+      const_state->offsets.ssbo_sizes = constoff;
+      constoff += align(cnt, 4) / 4;
+   }
+
+   if (const_state->image_dims.count > 0) {
+      unsigned cnt = const_state->image_dims.count;
+      const_state->offsets.image_dims = constoff;
+      constoff += align(cnt, 4) / 4;
+   }
+
+   if (const_state->num_driver_params > 0) {
+      /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
+      if (v->type == MESA_SHADER_VERTEX && compiler->gpu_id >= 600)
+         constoff = MAX2(constoff, 1);
+      const_state->offsets.driver_param = constoff;
+   }
+   constoff += const_state->num_driver_params / 4;
+
+   if ((v->type == MESA_SHADER_VERTEX) && (compiler->gpu_id < 500) &&
+       v->shader->stream_output.num_outputs > 0) {
+      const_state->offsets.tfbo = constoff;
+      constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+   }
+
+   switch (v->type) {
+   case MESA_SHADER_VERTEX:
+      const_state->offsets.primitive_param = constoff;
+      constoff += 1;
+      break;
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
+      constoff = align(constoff - 1, 4) + 3;
+      const_state->offsets.primitive_param = constoff;
+      const_state->offsets.primitive_map = constoff + 5;
+      constoff += 5 + DIV_ROUND_UP(v->input_size, 4);
+      break;
+   case MESA_SHADER_GEOMETRY:
+      const_state->offsets.primitive_param = constoff;
+      const_state->offsets.primitive_map = constoff + 1;
+      constoff += 1 + DIV_ROUND_UP(v->input_size, 4);
+      break;
+   default:
+      break;
+   }
+
+   const_state->offsets.immediate = constoff;
+
+   assert(constoff <= ir3_max_const(v));
 }
index 03e847e..1e85667 100644 (file)
@@ -43,15 +43,19 @@ bool ir3_nir_move_varying_inputs(nir_shader *shader);
 int ir3_nir_coord_offset(nir_ssa_def *ssa);
 bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
 
-
 void ir3_nir_lower_to_explicit_output(nir_shader *shader,
-               struct ir3_shader_variant *v, unsigned topology);
-void ir3_nir_lower_to_explicit_input(nir_shader *shader, struct ir3_shader_variant *v);
-void ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology);
-void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology);
+                                      struct ir3_shader_variant *v,
+                                      unsigned topology);
+void ir3_nir_lower_to_explicit_input(nir_shader *shader,
+                                     struct ir3_shader_variant *v);
+void ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
+                             unsigned topology);
+void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
+                             unsigned topology);
 void ir3_nir_lower_gs(nir_shader *shader);
 
-const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
+const nir_shader_compiler_options *
+ir3_get_compiler_options(struct ir3_compiler *compiler);
 void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_nir_lower_io_to_temporaries(nir_shader *s);
 void ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s);
@@ -59,29 +63,30 @@ void ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
 
 void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
-               struct ir3_const_state *const_state);
+                           struct ir3_const_state *const_state);
 bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_fixup_load_uniform(nir_shader *nir);
 
-nir_ssa_def *
-ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift);
+nir_ssa_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
+                                             nir_ssa_def *offset,
+                                             int32_t shift);
 
 static inline nir_intrinsic_instr *
 ir3_bindless_resource(nir_src src)
 {
-       if (!src.is_ssa)
-               return NULL;
+   if (!src.is_ssa)
+      return NULL;
 
-       if (src.ssa->parent_instr->type != nir_instr_type_intrinsic)
-               return NULL;
+   if (src.ssa->parent_instr->type != nir_instr_type_intrinsic)
+      return NULL;
 
-       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
-       if (intrin->intrinsic != nir_intrinsic_bindless_resource_ir3)
-               return NULL;
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
+   if (intrin->intrinsic != nir_intrinsic_bindless_resource_ir3)
+      return NULL;
 
-       return intrin;
+   return intrin;
 }
 
 #endif /* IR3_NIR_H_ */
index c4d1cce..83b9a45 100644 (file)
  * SOFTWARE.
  */
 
-#include "ir3_nir.h"
-#include "ir3_compiler.h"
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_builder.h"
 #include "util/u_math.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
 
 static inline bool
-get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr, uint32_t alignment, struct ir3_ubo_range *r)
+get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr,
+                   uint32_t alignment, struct ir3_ubo_range *r)
 {
-       uint32_t offset = nir_intrinsic_range_base(instr);
-       uint32_t size = nir_intrinsic_range(instr);
+   uint32_t offset = nir_intrinsic_range_base(instr);
+   uint32_t size = nir_intrinsic_range(instr);
 
-       /* If the offset is constant, the range is trivial (and NIR may not have
-        * figured it out).
-        */
-       if (nir_src_is_const(instr->src[1])) {
-               offset = nir_src_as_uint(instr->src[1]);
-               size = nir_intrinsic_dest_components(instr) * 4;
-       }
+   /* If the offset is constant, the range is trivial (and NIR may not have
+    * figured it out).
+    */
+   if (nir_src_is_const(instr->src[1])) {
+      offset = nir_src_as_uint(instr->src[1]);
+      size = nir_intrinsic_dest_components(instr) * 4;
+   }
 
-       /* If we haven't figured out the range accessed in the UBO, bail. */
-       if (size == ~0)
-               return false;
+   /* If we haven't figured out the range accessed in the UBO, bail. */
+   if (size == ~0)
+      return false;
 
-       r->start = ROUND_DOWN_TO(offset, alignment * 16);
-       r->end = ALIGN(offset + size, alignment * 16);
+   r->start = ROUND_DOWN_TO(offset, alignment * 16);
+   r->end = ALIGN(offset + size, alignment * 16);
 
-       return true;
+   return true;
 }
 
 static bool
 get_ubo_info(nir_intrinsic_instr *instr, struct ir3_ubo_info *ubo)
 {
-       if (nir_src_is_const(instr->src[0])) {
-               ubo->block = nir_src_as_uint(instr->src[0]);
-               ubo->bindless_base = 0;
-               ubo->bindless = false;
-               return true;
-       } else {
-               nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
-               if (rsrc && nir_src_is_const(rsrc->src[0])) {
-                       ubo->block = nir_src_as_uint(rsrc->src[0]);
-                       ubo->bindless_base = nir_intrinsic_desc_set(rsrc);
-                       ubo->bindless = true;
-                       return true;
-               }
-       }
-       return false;
+   if (nir_src_is_const(instr->src[0])) {
+      ubo->block = nir_src_as_uint(instr->src[0]);
+      ubo->bindless_base = 0;
+      ubo->bindless = false;
+      return true;
+   } else {
+      nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
+      if (rsrc && nir_src_is_const(rsrc->src[0])) {
+         ubo->block = nir_src_as_uint(rsrc->src[0]);
+         ubo->bindless_base = nir_intrinsic_desc_set(rsrc);
+         ubo->bindless = true;
+         return true;
+      }
+   }
+   return false;
 }
 
 /**
@@ -76,24 +77,23 @@ get_ubo_info(nir_intrinsic_instr *instr, struct ir3_ubo_info *ubo)
  */
 static const struct ir3_ubo_range *
 get_existing_range(nir_intrinsic_instr *instr,
-               const struct ir3_ubo_analysis_state *state,
-               struct ir3_ubo_range *r)
+                   const struct ir3_ubo_analysis_state *state,
+                   struct ir3_ubo_range *r)
 {
-       struct ir3_ubo_info ubo = {};
+   struct ir3_ubo_info ubo = {};
 
-       if (!get_ubo_info(instr, &ubo))
-               return NULL;
+   if (!get_ubo_info(instr, &ubo))
+      return NULL;
 
-       for (int i = 0; i < state->num_enabled; i++) {
-               const struct ir3_ubo_range *range = &state->range[i];
-               if (!memcmp(&range->ubo, &ubo, sizeof(ubo)) &&
-                               r->start >= range->start &&
-                               r->end <= range->end) {
-                       return range;
-               }
-       }
+   for (int i = 0; i < state->num_enabled; i++) {
+      const struct ir3_ubo_range *range = &state->range[i];
+      if (!memcmp(&range->ubo, &ubo, sizeof(ubo)) && r->start >= range->start &&
+          r->end <= range->end) {
+         return range;
+      }
+   }
 
-       return NULL;
+   return NULL;
 }
 
 /**
@@ -103,26 +103,26 @@ get_existing_range(nir_intrinsic_instr *instr,
 static void
 merge_neighbors(struct ir3_ubo_analysis_state *state, int index)
 {
-       struct ir3_ubo_range *a = &state->range[index];
-
-       /* index is always the first slot that would have neighbored/overlapped with
-        * the new range.
-        */
-       for (int i = index + 1; i < state->num_enabled; i++) {
-               struct ir3_ubo_range *b = &state->range[i];
-               if (memcmp(&a->ubo, &b->ubo, sizeof(a->ubo)))
-                       continue;
-
-               if (a->start > b->end || a->end < b->start)
-                       continue;
-
-               /* Merge B into A. */
-               a->start = MIN2(a->start, b->start);
-               a->end = MAX2(a->end, b->end);
-
-               /* Swap the last enabled range into B's now unused slot */
-               *b = state->range[--state->num_enabled];
-       }
+   struct ir3_ubo_range *a = &state->range[index];
+
+   /* index is always the first slot that would have neighbored/overlapped with
+    * the new range.
+    */
+   for (int i = index + 1; i < state->num_enabled; i++) {
+      struct ir3_ubo_range *b = &state->range[i];
+      if (memcmp(&a->ubo, &b->ubo, sizeof(a->ubo)))
+         continue;
+
+      if (a->start > b->end || a->end < b->start)
+         continue;
+
+      /* Merge B into A. */
+      a->start = MIN2(a->start, b->start);
+      a->end = MAX2(a->end, b->end);
+
+      /* Swap the last enabled range into B's now unused slot */
+      *b = state->range[--state->num_enabled];
+   }
 }
 
 /**
@@ -134,59 +134,59 @@ merge_neighbors(struct ir3_ubo_analysis_state *state, int index)
  */
 static void
 gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
-               struct ir3_ubo_analysis_state *state, uint32_t alignment,
-               uint32_t *upload_remaining)
+                  struct ir3_ubo_analysis_state *state, uint32_t alignment,
+                  uint32_t *upload_remaining)
 {
-       if (ir3_shader_debug & IR3_DBG_NOUBOOPT)
-               return;
-
-       struct ir3_ubo_info ubo = {};
-       if (!get_ubo_info(instr, &ubo))
-               return;
-
-       struct ir3_ubo_range r;
-       if (!get_ubo_load_range(nir, instr, alignment, &r))
-               return;
-
-       /* See if there's an existing range for this UBO we want to merge into. */
-       for (int i = 0; i < state->num_enabled; i++) {
-               struct ir3_ubo_range *plan_r = &state->range[i];
-               if (memcmp(&plan_r->ubo, &ubo, sizeof(ubo)))
-                       continue;
-
-               /* Don't extend existing uploads unless they're
-                * neighboring/overlapping.
-                */
-               if (r.start > plan_r->end || r.end < plan_r->start)
-                       continue;
-
-               r.start = MIN2(r.start, plan_r->start);
-               r.end = MAX2(r.end, plan_r->end);
-
-               uint32_t added = (plan_r->start - r.start) + (r.end - plan_r->end);
-               if (added >= *upload_remaining)
-                       return;
-
-               plan_r->start = r.start;
-               plan_r->end = r.end;
-               *upload_remaining -= added;
-
-               merge_neighbors(state, i);
-               return;
-       }
-
-       if (state->num_enabled == ARRAY_SIZE(state->range))
-               return;
-
-       uint32_t added = r.end - r.start;
-       if (added >= *upload_remaining)
-               return;
-
-       struct ir3_ubo_range *plan_r = &state->range[state->num_enabled++];
-       plan_r->ubo = ubo;
-       plan_r->start = r.start;
-       plan_r->end = r.end;
-       *upload_remaining -= added;
+   if (ir3_shader_debug & IR3_DBG_NOUBOOPT)
+      return;
+
+   struct ir3_ubo_info ubo = {};
+   if (!get_ubo_info(instr, &ubo))
+      return;
+
+   struct ir3_ubo_range r;
+   if (!get_ubo_load_range(nir, instr, alignment, &r))
+      return;
+
+   /* See if there's an existing range for this UBO we want to merge into. */
+   for (int i = 0; i < state->num_enabled; i++) {
+      struct ir3_ubo_range *plan_r = &state->range[i];
+      if (memcmp(&plan_r->ubo, &ubo, sizeof(ubo)))
+         continue;
+
+      /* Don't extend existing uploads unless they're
+       * neighboring/overlapping.
+       */
+      if (r.start > plan_r->end || r.end < plan_r->start)
+         continue;
+
+      r.start = MIN2(r.start, plan_r->start);
+      r.end = MAX2(r.end, plan_r->end);
+
+      uint32_t added = (plan_r->start - r.start) + (r.end - plan_r->end);
+      if (added >= *upload_remaining)
+         return;
+
+      plan_r->start = r.start;
+      plan_r->end = r.end;
+      *upload_remaining -= added;
+
+      merge_neighbors(state, i);
+      return;
+   }
+
+   if (state->num_enabled == ARRAY_SIZE(state->range))
+      return;
+
+   uint32_t added = r.end - r.start;
+   if (added >= *upload_remaining)
+      return;
+
+   struct ir3_ubo_range *plan_r = &state->range[state->num_enabled++];
+   plan_r->ubo = ubo;
+   plan_r->start = r.start;
+   plan_r->end = r.end;
+   *upload_remaining -= added;
 }
 
 /* For indirect offset, it is common to see a pattern of multiple
@@ -197,7 +197,8 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
  *
  * Detect this, and peel out the const_offset part, to end up with:
  *
- *    vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset, 0, 0)
+ *    vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset,
+ * 0, 0)
  *
  * Or similarly:
  *
@@ -207,7 +208,8 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
  * Can be converted to:
  *
  *    vec1 32 ssa_base = imul24 a, b
- *    vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset, 0, 0)
+ *    vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset,
+ * 0, 0)
  *
  * This gives the other opt passes something much easier to work
  * with (ie. not requiring value range tracking)
@@ -215,38 +217,38 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
 static void
 handle_partial_const(nir_builder *b, nir_ssa_def **srcp, int *offp)
 {
-       if ((*srcp)->parent_instr->type != nir_instr_type_alu)
-               return;
-
-       nir_alu_instr *alu = nir_instr_as_alu((*srcp)->parent_instr);
-
-       if (alu->op == nir_op_imad24_ir3) {
-               /* This case is slightly more complicated as we need to
-                * replace the imad24_ir3 with an imul24:
-                */
-               if (!nir_src_is_const(alu->src[2].src))
-                       return;
-
-               *offp += nir_src_as_uint(alu->src[2].src);
-               *srcp = nir_imul24(b, nir_ssa_for_alu_src(b, alu, 0),
-                               nir_ssa_for_alu_src(b, alu, 1));
-
-               return;
-       }
-
-       if (alu->op != nir_op_iadd)
-               return;
-
-       if (!(alu->src[0].src.is_ssa && alu->src[1].src.is_ssa))
-               return;
-
-       if (nir_src_is_const(alu->src[0].src)) {
-               *offp += nir_src_as_uint(alu->src[0].src);
-               *srcp = alu->src[1].src.ssa;
-       } else if (nir_src_is_const(alu->src[1].src)) {
-               *srcp = alu->src[0].src.ssa;
-               *offp += nir_src_as_uint(alu->src[1].src);
-       }
+   if ((*srcp)->parent_instr->type != nir_instr_type_alu)
+      return;
+
+   nir_alu_instr *alu = nir_instr_as_alu((*srcp)->parent_instr);
+
+   if (alu->op == nir_op_imad24_ir3) {
+      /* This case is slightly more complicated as we need to
+       * replace the imad24_ir3 with an imul24:
+       */
+      if (!nir_src_is_const(alu->src[2].src))
+         return;
+
+      *offp += nir_src_as_uint(alu->src[2].src);
+      *srcp = nir_imul24(b, nir_ssa_for_alu_src(b, alu, 0),
+                         nir_ssa_for_alu_src(b, alu, 1));
+
+      return;
+   }
+
+   if (alu->op != nir_op_iadd)
+      return;
+
+   if (!(alu->src[0].src.is_ssa && alu->src[1].src.is_ssa))
+      return;
+
+   if (nir_src_is_const(alu->src[0].src)) {
+      *offp += nir_src_as_uint(alu->src[0].src);
+      *srcp = alu->src[1].src.ssa;
+   } else if (nir_src_is_const(alu->src[1].src)) {
+      *srcp = alu->src[0].src.ssa;
+      *offp += nir_src_as_uint(alu->src[1].src);
+   }
 }
 
 /* Tracks the maximum bindful UBO accessed so that we reduce the UBO
@@ -255,258 +257,256 @@ handle_partial_const(nir_builder *b, nir_ssa_def **srcp, int *offp)
 static void
 track_ubo_use(nir_intrinsic_instr *instr, nir_builder *b, int *num_ubos)
 {
-       if (ir3_bindless_resource(instr->src[0])) {
-               assert(!b->shader->info.first_ubo_is_default_ubo); /* only set for GL */
-               return;
-       }
-
-       if (nir_src_is_const(instr->src[0])) {
-               int block = nir_src_as_uint(instr->src[0]);
-               *num_ubos = MAX2(*num_ubos, block + 1);
-       } else {
-               *num_ubos = b->shader->info.num_ubos;
-       }
+   if (ir3_bindless_resource(instr->src[0])) {
+      assert(!b->shader->info.first_ubo_is_default_ubo); /* only set for GL */
+      return;
+   }
+
+   if (nir_src_is_const(instr->src[0])) {
+      int block = nir_src_as_uint(instr->src[0]);
+      *num_ubos = MAX2(*num_ubos, block + 1);
+   } else {
+      *num_ubos = b->shader->info.num_ubos;
+   }
 }
 
 static bool
 lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
-               const struct ir3_ubo_analysis_state *state,
-               int *num_ubos, uint32_t alignment)
+                          const struct ir3_ubo_analysis_state *state,
+                          int *num_ubos, uint32_t alignment)
 {
-       b->cursor = nir_before_instr(&instr->instr);
-
-       struct ir3_ubo_range r;
-       if (!get_ubo_load_range(b->shader, instr, alignment, &r)) {
-               track_ubo_use(instr, b, num_ubos);
-               return false;
-       }
-
-       /* We don't lower dynamic block index UBO loads to load_uniform, but we
-        * could probably with some effort determine a block stride in number of
-        * registers.
-        */
-       const struct ir3_ubo_range *range = get_existing_range(instr, state, &r);
-       if (!range) {
-               track_ubo_use(instr, b, num_ubos);
-               return false;
-       }
-
-       nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
-       int const_offset = 0;
-
-       handle_partial_const(b, &ubo_offset, &const_offset);
-
-       /* UBO offset is in bytes, but uniform offset is in units of
-        * dwords, so we need to divide by 4 (right-shift by 2). For ldc the
-        * offset is in units of 16 bytes, so we need to multiply by 4. And
-        * also the same for the constant part of the offset:
-        */
-       const int shift = -2;
-       nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
-       nir_ssa_def *uniform_offset = NULL;
-       if (new_offset) {
-               uniform_offset = new_offset;
-       } else {
-               uniform_offset = shift > 0 ?
-                       nir_ishl(b, ubo_offset, nir_imm_int(b,  shift)) :
-                       nir_ushr(b, ubo_offset, nir_imm_int(b, -shift));
-       }
-
-       debug_assert(!(const_offset & 0x3));
-       const_offset >>= 2;
-
-       const int range_offset = ((int)range->offset - (int)range->start) / 4;
-       const_offset += range_offset;
-
-       /* The range_offset could be negative: if only part of the UBO
-        * block is accessed, range->start can be greater than range->offset.
-        * But we can't underflow const_offset.  If necessary we need to
-        * insert nir instructions to compensate (which can hopefully be
-        * optimized away)
-        */
-       if (const_offset < 0) {
-               uniform_offset = nir_iadd_imm(b, uniform_offset, const_offset);
-               const_offset = 0;
-       }
-
-       nir_ssa_def *uniform =
-               nir_load_uniform(b, instr->num_components, instr->dest.ssa.bit_size, uniform_offset, .base = const_offset);
-
-       nir_ssa_def_rewrite_uses(&instr->dest.ssa,
-                                                        uniform);
-
-       nir_instr_remove(&instr->instr);
-
-       return true;
+   b->cursor = nir_before_instr(&instr->instr);
+
+   struct ir3_ubo_range r;
+   if (!get_ubo_load_range(b->shader, instr, alignment, &r)) {
+      track_ubo_use(instr, b, num_ubos);
+      return false;
+   }
+
+   /* We don't lower dynamic block index UBO loads to load_uniform, but we
+    * could probably with some effort determine a block stride in number of
+    * registers.
+    */
+   const struct ir3_ubo_range *range = get_existing_range(instr, state, &r);
+   if (!range) {
+      track_ubo_use(instr, b, num_ubos);
+      return false;
+   }
+
+   nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
+   int const_offset = 0;
+
+   handle_partial_const(b, &ubo_offset, &const_offset);
+
+   /* UBO offset is in bytes, but uniform offset is in units of
+    * dwords, so we need to divide by 4 (right-shift by 2). For ldc the
+    * offset is in units of 16 bytes, so we need to multiply by 4. And
+    * also the same for the constant part of the offset:
+    */
+   const int shift = -2;
+   nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
+   nir_ssa_def *uniform_offset = NULL;
+   if (new_offset) {
+      uniform_offset = new_offset;
+   } else {
+      uniform_offset = shift > 0
+                          ? nir_ishl(b, ubo_offset, nir_imm_int(b, shift))
+                          : nir_ushr(b, ubo_offset, nir_imm_int(b, -shift));
+   }
+
+   debug_assert(!(const_offset & 0x3));
+   const_offset >>= 2;
+
+   const int range_offset = ((int)range->offset - (int)range->start) / 4;
+   const_offset += range_offset;
+
+   /* The range_offset could be negative: if only part of the UBO
+    * block is accessed, range->start can be greater than range->offset.
+    * But we can't underflow const_offset.  If necessary we need to
+    * insert nir instructions to compensate (which can hopefully be
+    * optimized away)
+    */
+   if (const_offset < 0) {
+      uniform_offset = nir_iadd_imm(b, uniform_offset, const_offset);
+      const_offset = 0;
+   }
+
+   nir_ssa_def *uniform =
+      nir_load_uniform(b, instr->num_components, instr->dest.ssa.bit_size,
+                       uniform_offset, .base = const_offset);
+
+   nir_ssa_def_rewrite_uses(&instr->dest.ssa, uniform);
+
+   nir_instr_remove(&instr->instr);
+
+   return true;
 }
 
 static bool
 instr_is_load_ubo(nir_instr *instr)
 {
-       if (instr->type != nir_instr_type_intrinsic)
-               return false;
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
 
-       nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;
+   nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;
 
-       /* nir_lower_ubo_vec4 happens after this pass. */
-       assert(op != nir_intrinsic_load_ubo_vec4);
+   /* nir_lower_ubo_vec4 happens after this pass. */
+   assert(op != nir_intrinsic_load_ubo_vec4);
 
-       return op == nir_intrinsic_load_ubo;
+   return op == nir_intrinsic_load_ubo;
 }
 
 void
 ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
 {
-       struct ir3_const_state *const_state = ir3_const_state(v);
-       struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
-       struct ir3_compiler *compiler = v->shader->compiler;
-
-       /* Limit our uploads to the amount of constant buffer space available in
-        * the hardware, minus what the shader compiler may need for various
-        * driver params.  We do this UBO-to-push-constant before the real
-        * allocation of the driver params' const space, because UBO pointers can
-        * be driver params but this pass usually eliminates them.
-        */
-       struct ir3_const_state worst_case_const_state = { };
-       ir3_setup_const_state(nir, v, &worst_case_const_state);
-       const uint32_t max_upload = (ir3_max_const(v) -
-                       worst_case_const_state.offsets.immediate) * 16;
-
-       memset(state, 0, sizeof(*state));
-
-       uint32_t upload_remaining = max_upload;
-       nir_foreach_function (function, nir) {
-               if (function->impl) {
-                       nir_foreach_block (block, function->impl) {
-                               nir_foreach_instr (instr, block) {
-                                       if (instr_is_load_ubo(instr))
-                                               gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr),
-                                                               state, compiler->const_upload_unit,
-                                                               &upload_remaining);
-                               }
-                       }
-               }
-       }
-
-       /* For now, everything we upload is accessed statically and thus will be
-        * used by the shader. Once we can upload dynamically indexed data, we may
-        * upload sparsely accessed arrays, at which point we probably want to
-        * give priority to smaller UBOs, on the assumption that big UBOs will be
-        * accessed dynamically.  Alternatively, we can track statically and
-        * dynamically accessed ranges separately and upload static ranges
-        * first.
-        */
-
-       uint32_t offset = v->shader->num_reserved_user_consts * 16;
-       for (uint32_t i = 0; i < state->num_enabled; i++) {
-               uint32_t range_size = state->range[i].end - state->range[i].start;
-
-               debug_assert(offset <= max_upload);
-               state->range[i].offset = offset;
-               assert(offset <= max_upload);
-               offset += range_size;
-
-       }
-       state->size = offset;
+   struct ir3_const_state *const_state = ir3_const_state(v);
+   struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+   struct ir3_compiler *compiler = v->shader->compiler;
+
+   /* Limit our uploads to the amount of constant buffer space available in
+    * the hardware, minus what the shader compiler may need for various
+    * driver params.  We do this UBO-to-push-constant lowering before the
+    * real allocation of the driver params' const space, because UBO
+    * pointers can be driver params but this pass usually eliminates them.
+    */
+   struct ir3_const_state worst_case_const_state = {};
+   ir3_setup_const_state(nir, v, &worst_case_const_state);
+   const uint32_t max_upload =
+      (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 16;
+
+   memset(state, 0, sizeof(*state));
+
+   uint32_t upload_remaining = max_upload;
+   nir_foreach_function (function, nir) {
+      if (function->impl) {
+         nir_foreach_block (block, function->impl) {
+            nir_foreach_instr (instr, block) {
+               if (instr_is_load_ubo(instr))
+                  gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), state,
+                                    compiler->const_upload_unit,
+                                    &upload_remaining);
+            }
+         }
+      }
+   }
+
+   /* For now, everything we upload is accessed statically and thus will be
+    * used by the shader. Once we can upload dynamically indexed data, we may
+    * upload sparsely accessed arrays, at which point we probably want to
+    * give priority to smaller UBOs, on the assumption that big UBOs will be
+    * accessed dynamically.  Alternatively, we can track statically and
+    * dynamically accessed ranges separately and upload static ranges
+    * first.
+    */
+
+   uint32_t offset = v->shader->num_reserved_user_consts * 16;
+   for (uint32_t i = 0; i < state->num_enabled; i++) {
+      uint32_t range_size = state->range[i].end - state->range[i].start;
+
+      debug_assert(offset <= max_upload);
+      state->range[i].offset = offset;
+      assert(offset <= max_upload);
+      offset += range_size;
+   }
+   state->size = offset;
 }
 
 bool
 ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
 {
-       struct ir3_compiler *compiler = v->shader->compiler;
-       /* For the binning pass variant, we re-use the corresponding draw-pass
-        * variant's const_state and ubo state.  To make this clear, in this
-        * pass it is const (read-only).
-        */
-       const struct ir3_const_state *const_state = ir3_const_state(v);
-       const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
-
-       int num_ubos = 0;
-       bool progress = false;
-       nir_foreach_function (function, nir) {
-               if (function->impl) {
-                       nir_builder builder;
-                       nir_builder_init(&builder, function->impl);
-                       nir_foreach_block (block, function->impl) {
-                               nir_foreach_instr_safe (instr, block) {
-                                       if (!instr_is_load_ubo(instr))
-                                               continue;
-                                       progress |=
-                                               lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr),
-                                                               &builder, state, &num_ubos,
-                                                               compiler->const_upload_unit);
-                               }
-                       }
-
-                       nir_metadata_preserve(function->impl, nir_metadata_block_index |
-                                                                 nir_metadata_dominance);
-               }
-       }
-       /* Update the num_ubos field for GL (first_ubo_is_default_ubo).  With
-        * Vulkan's bindless, we don't use the num_ubos field, so we can leave it
-        * incremented.
-        */
-       if (nir->info.first_ubo_is_default_ubo)
-           nir->info.num_ubos = num_ubos;
-
-       return progress;
+   struct ir3_compiler *compiler = v->shader->compiler;
+   /* For the binning pass variant, we re-use the corresponding draw-pass
+    * variant's const_state and ubo state.  To make this clear, in this
+    * pass it is const (read-only).
+    */
+   const struct ir3_const_state *const_state = ir3_const_state(v);
+   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+
+   int num_ubos = 0;
+   bool progress = false;
+   nir_foreach_function (function, nir) {
+      if (function->impl) {
+         nir_builder builder;
+         nir_builder_init(&builder, function->impl);
+         nir_foreach_block (block, function->impl) {
+            nir_foreach_instr_safe (instr, block) {
+               if (!instr_is_load_ubo(instr))
+                  continue;
+               progress |= lower_ubo_load_to_uniform(
+                  nir_instr_as_intrinsic(instr), &builder, state, &num_ubos,
+                  compiler->const_upload_unit);
+            }
+         }
+
+         nir_metadata_preserve(
+            function->impl, nir_metadata_block_index | nir_metadata_dominance);
+      }
+   }
+   /* Update the num_ubos field for GL (first_ubo_is_default_ubo).  With
+    * Vulkan's bindless, we don't use the num_ubos field, so we can leave it
+    * incremented.
+    */
+   if (nir->info.first_ubo_is_default_ubo)
+      nir->info.num_ubos = num_ubos;
+
+   return progress;
 }
 
-
 static bool
 fixup_load_uniform_filter(const nir_instr *instr, const void *arg)
 {
-       if (instr->type != nir_instr_type_intrinsic)
-               return false;
-       return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_uniform;
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+   return nir_instr_as_intrinsic(instr)->intrinsic ==
+          nir_intrinsic_load_uniform;
 }
 
 static nir_ssa_def *
 fixup_load_uniform_instr(struct nir_builder *b, nir_instr *instr, void *arg)
 {
-       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-       /* We don't need to worry about the non-indirect case: */
-       if (nir_src_is_const(intr->src[0]))
-               return NULL;
-
-       const unsigned base_offset_limit = (1 << 9);  /* 9 bits */
-       unsigned base_offset = nir_intrinsic_base(intr);
-
-       /* Or cases where the base offset is lower than the hw limit: */
-       if (base_offset < base_offset_limit)
-               return NULL;
-
-       b->cursor = nir_before_instr(instr);
-
-       nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[0], 1);
-
-       /* We'd like to avoid a sequence like:
-        *
-        *   vec4 32 ssa_18 = intrinsic load_uniform (ssa_4) (1024, 0, 0)
-        *   vec4 32 ssa_19 = intrinsic load_uniform (ssa_4) (1072, 0, 0)
-        *   vec4 32 ssa_20 = intrinsic load_uniform (ssa_4) (1120, 0, 0)
-        *
-        * From turning into a unique offset value (which requires reloading
-        * a0.x for each instruction).  So instead of just adding the constant
-        * base_offset to the non-const offset, be a bit more clever and only
-        * extract the part that cannot be encoded.  Afterwards CSE should
-        * turn the result into:
-        *
-        *   vec1 32 ssa_5 = load_const (1024)
-        *   vec4 32 ssa_6  = iadd ssa_4, ssa_5
-        *   vec4 32 ssa_18 = intrinsic load_uniform (ssa_6) (0, 0, 0)
-        *   vec4 32 ssa_19 = intrinsic load_uniform (ssa_6) (48, 0, 0)
-        *   vec4 32 ssa_20 = intrinsic load_uniform (ssa_6) (96, 0, 0)
-        */
-       unsigned new_base_offset = base_offset % base_offset_limit;
-
-       nir_intrinsic_set_base(intr, new_base_offset);
-       offset = nir_iadd_imm(b, offset, base_offset - new_base_offset);
-
-       nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(offset));
-
-       return NIR_LOWER_INSTR_PROGRESS;
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   /* We don't need to worry about the non-indirect case: */
+   if (nir_src_is_const(intr->src[0]))
+      return NULL;
+
+   const unsigned base_offset_limit = (1 << 9); /* 9 bits */
+   unsigned base_offset = nir_intrinsic_base(intr);
+
+   /* Or cases where the base offset is lower than the hw limit: */
+   if (base_offset < base_offset_limit)
+      return NULL;
+
+   b->cursor = nir_before_instr(instr);
+
+   nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[0], 1);
+
+   /* We'd like to avoid a sequence like:
+    *
+    *   vec4 32 ssa_18 = intrinsic load_uniform (ssa_4) (1024, 0, 0)
+    *   vec4 32 ssa_19 = intrinsic load_uniform (ssa_4) (1072, 0, 0)
+    *   vec4 32 ssa_20 = intrinsic load_uniform (ssa_4) (1120, 0, 0)
+    *
+    * From turning into a unique offset value (which requires reloading
+    * a0.x for each instruction).  So instead of just adding the constant
+    * base_offset to the non-const offset, be a bit more clever and only
+    * extract the part that cannot be encoded.  Afterwards CSE should
+    * turn the result into:
+    *
+    *   vec1 32 ssa_5 = load_const (1024)
+    *   vec4 32 ssa_6  = iadd ssa_4, ssa_5
+    *   vec4 32 ssa_18 = intrinsic load_uniform (ssa_6) (0, 0, 0)
+    *   vec4 32 ssa_19 = intrinsic load_uniform (ssa_6) (48, 0, 0)
+    *   vec4 32 ssa_20 = intrinsic load_uniform (ssa_6) (96, 0, 0)
+    */
+   unsigned new_base_offset = base_offset % base_offset_limit;
+
+   nir_intrinsic_set_base(intr, new_base_offset);
+   offset = nir_iadd_imm(b, offset, base_offset - new_base_offset);
+
+   nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(offset));
+
+   return NIR_LOWER_INSTR_PROGRESS;
 }
 
 /**
@@ -520,59 +520,59 @@ fixup_load_uniform_instr(struct nir_builder *b, nir_instr *instr, void *arg)
 bool
 ir3_nir_fixup_load_uniform(nir_shader *nir)
 {
-       return nir_shader_lower_instructions(nir,
-                       fixup_load_uniform_filter, fixup_load_uniform_instr,
-                       NULL);
+   return nir_shader_lower_instructions(nir, fixup_load_uniform_filter,
+                                        fixup_load_uniform_instr, NULL);
 }
 static nir_ssa_def *
 ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
 {
-       struct ir3_const_state *const_state = data;
-       nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
-
-       /* Pick a UBO index to use as our constant data.  Skip UBO 0 since that's
-        * reserved for gallium's cb0.
-        */
-       if (const_state->constant_data_ubo == -1) {
-               if (b->shader->info.num_ubos == 0)
-                       b->shader->info.num_ubos++;
-               const_state->constant_data_ubo = b->shader->info.num_ubos++;
-       }
-
-       unsigned num_components = instr->num_components;
-       if (nir_dest_bit_size(instr->dest) == 16) {
-               /* We can't do 16b loads -- either from LDC (32-bit only in any of our
-                * traces, and the disasm doesn't look like it really supports it) or
-                * from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
-                * automatic 32b-to-16b conversions when we ask for 16b from it).
-                * Instead, we'll load 32b from a UBO and unpack from there.
-                */
-               num_components = DIV_ROUND_UP(num_components, 2);
-       }
-       unsigned base = nir_intrinsic_base(instr);
-       nir_ssa_def *index = nir_imm_int(b, const_state->constant_data_ubo);
-       nir_ssa_def *offset = nir_iadd_imm(b, nir_ssa_for_src(b, instr->src[0], 1), base);
-
-       nir_ssa_def *result =
-               nir_load_ubo(b, num_components, 32, index, offset,
-                                        .align_mul = nir_intrinsic_align_mul(instr),
-                                        .align_offset = nir_intrinsic_align_offset(instr),
-                                        .range_base = base,
-                                        .range = nir_intrinsic_range(instr));
-
-       if (nir_dest_bit_size(instr->dest) == 16) {
-               result = nir_bitcast_vector(b, result, 16);
-               result = nir_channels(b, result, BITSET_MASK(instr->num_components));
-       }
-
-       return result;
+   struct ir3_const_state *const_state = data;
+   nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
+
+   /* Pick a UBO index to use as our constant data.  Skip UBO 0 since that's
+    * reserved for gallium's cb0.
+    */
+   if (const_state->constant_data_ubo == -1) {
+      if (b->shader->info.num_ubos == 0)
+         b->shader->info.num_ubos++;
+      const_state->constant_data_ubo = b->shader->info.num_ubos++;
+   }
+
+   unsigned num_components = instr->num_components;
+   if (nir_dest_bit_size(instr->dest) == 16) {
+      /* We can't do 16b loads -- either from LDC (32-bit only in any of our
+       * traces, and the disasm doesn't look like it really supports it) or
+       * from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
+       * automatic 32b-to-16b conversions when we ask for 16b from it).
+       * Instead, we'll load 32b from a UBO and unpack from there.
+       */
+      num_components = DIV_ROUND_UP(num_components, 2);
+   }
+   unsigned base = nir_intrinsic_base(instr);
+   nir_ssa_def *index = nir_imm_int(b, const_state->constant_data_ubo);
+   nir_ssa_def *offset =
+      nir_iadd_imm(b, nir_ssa_for_src(b, instr->src[0], 1), base);
+
+   nir_ssa_def *result =
+      nir_load_ubo(b, num_components, 32, index, offset,
+                   .align_mul = nir_intrinsic_align_mul(instr),
+                   .align_offset = nir_intrinsic_align_offset(instr),
+                   .range_base = base, .range = nir_intrinsic_range(instr));
+
+   if (nir_dest_bit_size(instr->dest) == 16) {
+      result = nir_bitcast_vector(b, result, 16);
+      result = nir_channels(b, result, BITSET_MASK(instr->num_components));
+   }
+
+   return result;
 }
 
 static bool
 ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
 {
-        return (instr->type == nir_instr_type_intrinsic &&
-                nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
+   return (instr->type == nir_instr_type_intrinsic &&
+           nir_instr_as_intrinsic(instr)->intrinsic ==
+              nir_intrinsic_load_constant);
 }
 
 /* Lowers load_constant intrinsics to UBO accesses so we can run them through
@@ -581,26 +581,26 @@ ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
 bool
 ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
 {
-       struct ir3_const_state *const_state = ir3_const_state(v);
+   struct ir3_const_state *const_state = ir3_const_state(v);
 
-       const_state->constant_data_ubo = -1;
+   const_state->constant_data_ubo = -1;
 
-       bool progress = nir_shader_lower_instructions(nir,
-                       ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
-                       const_state);
+   bool progress = nir_shader_lower_instructions(
+      nir, ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
+      const_state);
 
-       if (progress) {
-               struct ir3_compiler *compiler = v->shader->compiler;
+   if (progress) {
+      struct ir3_compiler *compiler = v->shader->compiler;
 
-               /* Save a copy of the NIR constant data to the variant for
-                       * inclusion in the final assembly.
-                       */
-               v->constant_data_size = align(nir->constant_data_size,
-                               compiler->const_upload_unit * 4 * sizeof(uint32_t));
-               v->constant_data = rzalloc_size(v, v->constant_data_size);
-               memcpy(v->constant_data, nir->constant_data,
-                               nir->constant_data_size);
-       }
+      /* Save a copy of the NIR constant data to the variant for
+       * inclusion in the final assembly.
+       */
+      v->constant_data_size =
+         align(nir->constant_data_size,
+               compiler->const_upload_unit * 4 * sizeof(uint32_t));
+      v->constant_data = rzalloc_size(v, v->constant_data_size);
+      memcpy(v->constant_data, nir->constant_data, nir->constant_data_size);
+   }
 
-       return progress;
+   return progress;
 }
index 596b7e1..f28e9ec 100644 (file)
@@ -21,8 +21,8 @@
  * IN THE SOFTWARE.
  */
 
-#include "ir3_nir.h"
 #include "compiler/nir/nir_builder.h"
+#include "ir3_nir.h"
 
 /**
  * This pass moves to NIR certain offset computations for different I/O
@@ -34,7 +34,6 @@
  *   holds the result of the original byte-offset source divided by 4.
  */
 
-
 /* Returns the ir3-specific intrinsic opcode corresponding to an SSBO
  * instruction that is handled by this pass. It also conveniently returns
  * the offset source index in @offset_src_idx.
  */
 static int
 get_ir3_intrinsic_for_ssbo_intrinsic(unsigned intrinsic,
-                                                                        uint8_t *offset_src_idx)
+                                     uint8_t *offset_src_idx)
 {
-       debug_assert(offset_src_idx);
-
-       *offset_src_idx = 1;
-
-       switch (intrinsic) {
-       case nir_intrinsic_store_ssbo:
-               *offset_src_idx = 2;
-               return nir_intrinsic_store_ssbo_ir3;
-       case nir_intrinsic_load_ssbo:
-               return nir_intrinsic_load_ssbo_ir3;
-       case nir_intrinsic_ssbo_atomic_add:
-               return nir_intrinsic_ssbo_atomic_add_ir3;
-       case nir_intrinsic_ssbo_atomic_imin:
-               return nir_intrinsic_ssbo_atomic_imin_ir3;
-       case nir_intrinsic_ssbo_atomic_umin:
-               return nir_intrinsic_ssbo_atomic_umin_ir3;
-       case nir_intrinsic_ssbo_atomic_imax:
-               return nir_intrinsic_ssbo_atomic_imax_ir3;
-       case nir_intrinsic_ssbo_atomic_umax:
-               return nir_intrinsic_ssbo_atomic_umax_ir3;
-       case nir_intrinsic_ssbo_atomic_and:
-               return nir_intrinsic_ssbo_atomic_and_ir3;
-       case nir_intrinsic_ssbo_atomic_or:
-               return nir_intrinsic_ssbo_atomic_or_ir3;
-       case nir_intrinsic_ssbo_atomic_xor:
-               return nir_intrinsic_ssbo_atomic_xor_ir3;
-       case nir_intrinsic_ssbo_atomic_exchange:
-               return nir_intrinsic_ssbo_atomic_exchange_ir3;
-       case nir_intrinsic_ssbo_atomic_comp_swap:
-               return nir_intrinsic_ssbo_atomic_comp_swap_ir3;
-       default:
-               break;
-       }
-
-       return -1;
+   debug_assert(offset_src_idx);
+
+   *offset_src_idx = 1;
+
+   switch (intrinsic) {
+   case nir_intrinsic_store_ssbo:
+      *offset_src_idx = 2;
+      return nir_intrinsic_store_ssbo_ir3;
+   case nir_intrinsic_load_ssbo:
+      return nir_intrinsic_load_ssbo_ir3;
+   case nir_intrinsic_ssbo_atomic_add:
+      return nir_intrinsic_ssbo_atomic_add_ir3;
+   case nir_intrinsic_ssbo_atomic_imin:
+      return nir_intrinsic_ssbo_atomic_imin_ir3;
+   case nir_intrinsic_ssbo_atomic_umin:
+      return nir_intrinsic_ssbo_atomic_umin_ir3;
+   case nir_intrinsic_ssbo_atomic_imax:
+      return nir_intrinsic_ssbo_atomic_imax_ir3;
+   case nir_intrinsic_ssbo_atomic_umax:
+      return nir_intrinsic_ssbo_atomic_umax_ir3;
+   case nir_intrinsic_ssbo_atomic_and:
+      return nir_intrinsic_ssbo_atomic_and_ir3;
+   case nir_intrinsic_ssbo_atomic_or:
+      return nir_intrinsic_ssbo_atomic_or_ir3;
+   case nir_intrinsic_ssbo_atomic_xor:
+      return nir_intrinsic_ssbo_atomic_xor_ir3;
+   case nir_intrinsic_ssbo_atomic_exchange:
+      return nir_intrinsic_ssbo_atomic_exchange_ir3;
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      return nir_intrinsic_ssbo_atomic_comp_swap_ir3;
+   default:
+      break;
+   }
+
+   return -1;
 }
 
 static nir_ssa_def *
 check_and_propagate_bit_shift32(nir_builder *b, nir_alu_instr *alu_instr,
-                                                               int32_t direction, int32_t shift)
+                                int32_t direction, int32_t shift)
 {
-       debug_assert(alu_instr->src[1].src.is_ssa);
-       nir_ssa_def *shift_ssa = alu_instr->src[1].src.ssa;
-
-       /* Only propagate if the shift is a const value so we can check value range
-        * statically.
-        */
-       nir_const_value *const_val = nir_src_as_const_value(alu_instr->src[1].src);
-       if (!const_val)
-               return NULL;
-
-       int32_t current_shift = const_val[0].i32 * direction;
-       int32_t new_shift = current_shift + shift;
-
-       /* If the merge would reverse the direction, bail out.
-        * e.g., 'x << 2' followed by 'x >> 4' is not 'x >> 2'.
-        */
-       if (current_shift * new_shift < 0)
-               return NULL;
-
-       /* If the propagation would overflow an int32_t, bail out too to be on the
-        * safe side.
-        */
-       if (new_shift < -31 || new_shift > 31)
-               return NULL;
-
-       /* Add or subtract shift depending on the final direction (SHR vs. SHL). */
-       if (shift * direction < 0)
-               shift_ssa = nir_isub(b, shift_ssa, nir_imm_int(b, abs(shift)));
-       else
-               shift_ssa = nir_iadd(b, shift_ssa, nir_imm_int(b, abs(shift)));
-
-       return shift_ssa;
+   debug_assert(alu_instr->src[1].src.is_ssa);
+   nir_ssa_def *shift_ssa = alu_instr->src[1].src.ssa;
+
+   /* Only propagate if the shift is a const value so we can check value range
+    * statically.
+    */
+   nir_const_value *const_val = nir_src_as_const_value(alu_instr->src[1].src);
+   if (!const_val)
+      return NULL;
+
+   int32_t current_shift = const_val[0].i32 * direction;
+   int32_t new_shift = current_shift + shift;
+
+   /* If the merge would reverse the direction, bail out.
+    * e.g., 'x << 2' followed by 'x >> 4' is not 'x >> 2'.
+    */
+   if (current_shift * new_shift < 0)
+      return NULL;
+
+   /* If the propagation would overflow an int32_t, bail out too to be on the
+    * safe side.
+    */
+   if (new_shift < -31 || new_shift > 31)
+      return NULL;
+
+   /* Add or subtract shift depending on the final direction (SHR vs. SHL). */
+   if (shift * direction < 0)
+      shift_ssa = nir_isub(b, shift_ssa, nir_imm_int(b, abs(shift)));
+   else
+      shift_ssa = nir_iadd(b, shift_ssa, nir_imm_int(b, abs(shift)));
+
+   return shift_ssa;
 }
 
 nir_ssa_def *
-ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift)
+ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset,
+                                int32_t shift)
 {
-       nir_instr *offset_instr = offset->parent_instr;
-       if (offset_instr->type != nir_instr_type_alu)
-               return NULL;
-
-       nir_alu_instr *alu = nir_instr_as_alu(offset_instr);
-       nir_ssa_def *shift_ssa;
-       nir_ssa_def *new_offset = NULL;
-
-       /* the first src could be something like ssa_18.x, but we only want
-        * the single component.  Otherwise the ishl/ishr/ushr could turn
-        * into a vec4 operation:
-        */
-       nir_ssa_def *src0 = nir_mov_alu(b, alu->src[0], 1);
-
-       switch (alu->op) {
-       case nir_op_ishl:
-               shift_ssa = check_and_propagate_bit_shift32(b, alu, 1, shift);
-               if (shift_ssa)
-                       new_offset = nir_ishl(b, src0, shift_ssa);
-               break;
-       case nir_op_ishr:
-               shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
-               if (shift_ssa)
-                       new_offset = nir_ishr(b, src0, shift_ssa);
-               break;
-       case nir_op_ushr:
-               shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
-               if (shift_ssa)
-                       new_offset = nir_ushr(b, src0, shift_ssa);
-               break;
-       default:
-               return NULL;
-       }
-
-       return new_offset;
+   nir_instr *offset_instr = offset->parent_instr;
+   if (offset_instr->type != nir_instr_type_alu)
+      return NULL;
+
+   nir_alu_instr *alu = nir_instr_as_alu(offset_instr);
+   nir_ssa_def *shift_ssa;
+   nir_ssa_def *new_offset = NULL;
+
+   /* the first src could be something like ssa_18.x, but we only want
+    * the single component.  Otherwise the ishl/ishr/ushr could turn
+    * into a vec4 operation:
+    */
+   nir_ssa_def *src0 = nir_mov_alu(b, alu->src[0], 1);
+
+   switch (alu->op) {
+   case nir_op_ishl:
+      shift_ssa = check_and_propagate_bit_shift32(b, alu, 1, shift);
+      if (shift_ssa)
+         new_offset = nir_ishl(b, src0, shift_ssa);
+      break;
+   case nir_op_ishr:
+      shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
+      if (shift_ssa)
+         new_offset = nir_ishr(b, src0, shift_ssa);
+      break;
+   case nir_op_ushr:
+      shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
+      if (shift_ssa)
+         new_offset = nir_ushr(b, src0, shift_ssa);
+      break;
+   default:
+      return NULL;
+   }
+
+   return new_offset;
 }
 
 static bool
 lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
-                                         unsigned ir3_ssbo_opcode, uint8_t offset_src_idx)
+                      unsigned ir3_ssbo_opcode, uint8_t offset_src_idx)
 {
-       unsigned num_srcs = nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
-       int shift = 2;
-
-       bool has_dest = nir_intrinsic_infos[intrinsic->intrinsic].has_dest;
-       nir_ssa_def *new_dest = NULL;
-
-       /* for 16-bit ssbo access, offset is in 16-bit words instead of dwords */
-       if ((has_dest && intrinsic->dest.ssa.bit_size == 16) ||
-               (!has_dest && intrinsic->src[0].ssa->bit_size == 16))
-               shift = 1;
-
-       /* Here we create a new intrinsic and copy over all contents from the old one. */
-
-       nir_intrinsic_instr *new_intrinsic;
-       nir_src *target_src;
-
-       b->cursor = nir_before_instr(&intrinsic->instr);
-
-       /* 'offset_src_idx' holds the index of the source that represents the offset. */
-       new_intrinsic =
-               nir_intrinsic_instr_create(b->shader, ir3_ssbo_opcode);
-
-       debug_assert(intrinsic->src[offset_src_idx].is_ssa);
-       nir_ssa_def *offset = intrinsic->src[offset_src_idx].ssa;
-
-       /* Since we don't have value range checking, we first try to propagate
-        * the division by 4 ('offset >> 2') into another bit-shift instruction that
-        * possibly defines the offset. If that's the case, we emit a similar
-        * instruction adjusting (merging) the shift value.
-        *
-        * Here we use the convention that shifting right is negative while shifting
-        * left is positive. So 'x / 4' ~ 'x >> 2' or 'x << -2'.
-        */
-       nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -shift);
-
-       /* The new source that will hold the dword-offset is always the last
-        * one for every intrinsic.
-        */
-       target_src = &new_intrinsic->src[num_srcs];
-       *target_src = nir_src_for_ssa(offset);
-
-       if (has_dest) {
-               debug_assert(intrinsic->dest.is_ssa);
-               nir_ssa_def *dest = &intrinsic->dest.ssa;
-               nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
-                                                 dest->num_components, dest->bit_size, NULL);
-               new_dest = &new_intrinsic->dest.ssa;
-       }
-
-       for (unsigned i = 0; i < num_srcs; i++)
-               new_intrinsic->src[i] = nir_src_for_ssa(intrinsic->src[i].ssa);
-
-       nir_intrinsic_copy_const_indices(new_intrinsic, intrinsic);
-
-       new_intrinsic->num_components = intrinsic->num_components;
-
-       /* If we managed to propagate the division by 4, just use the new offset
-        * register and don't emit the SHR.
-        */
-       if (new_offset)
-               offset = new_offset;
-       else
-               offset = nir_ushr(b, offset, nir_imm_int(b, shift));
-
-       /* Insert the new intrinsic right before the old one. */
-       nir_builder_instr_insert(b, &new_intrinsic->instr);
-
-       /* Replace the last source of the new intrinsic by the result of
-        * the offset divided by 4.
-        */
-       nir_instr_rewrite_src(&new_intrinsic->instr,
-                                                 target_src,
-                                                 nir_src_for_ssa(offset));
-
-       if (has_dest) {
-               /* Replace the uses of the original destination by that
-                * of the new intrinsic.
-                */
-               nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
-                                                                new_dest);
-       }
-
-       /* Finally remove the original intrinsic. */
-       nir_instr_remove(&intrinsic->instr);
-
-       return true;
+   unsigned num_srcs = nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
+   int shift = 2;
+
+   bool has_dest = nir_intrinsic_infos[intrinsic->intrinsic].has_dest;
+   nir_ssa_def *new_dest = NULL;
+
+   /* for 16-bit ssbo access, offset is in 16-bit words instead of dwords */
+   if ((has_dest && intrinsic->dest.ssa.bit_size == 16) ||
+       (!has_dest && intrinsic->src[0].ssa->bit_size == 16))
+      shift = 1;
+
+   /* Here we create a new intrinsic and copy over all contents from the old
+    * one. */
+
+   nir_intrinsic_instr *new_intrinsic;
+   nir_src *target_src;
+
+   b->cursor = nir_before_instr(&intrinsic->instr);
+
+   /* 'offset_src_idx' holds the index of the source that represents the offset. */
+   new_intrinsic = nir_intrinsic_instr_create(b->shader, ir3_ssbo_opcode);
+
+   debug_assert(intrinsic->src[offset_src_idx].is_ssa);
+   nir_ssa_def *offset = intrinsic->src[offset_src_idx].ssa;
+
+   /* Since we don't have value range checking, we first try to propagate
+    * the division by 4 ('offset >> 2') into another bit-shift instruction that
+    * possibly defines the offset. If that's the case, we emit a similar
+    * instruction adjusting (merging) the shift value.
+    *
+    * Here we use the convention that shifting right is negative while shifting
+    * left is positive. So 'x / 4' ~ 'x >> 2' or 'x << -2'.
+    */
+   nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -shift);
+
+   /* The new source that will hold the dword-offset is always the last
+    * one for every intrinsic.
+    */
+   target_src = &new_intrinsic->src[num_srcs];
+   *target_src = nir_src_for_ssa(offset);
+
+   if (has_dest) {
+      debug_assert(intrinsic->dest.is_ssa);
+      nir_ssa_def *dest = &intrinsic->dest.ssa;
+      nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
+                        dest->num_components, dest->bit_size, NULL);
+      new_dest = &new_intrinsic->dest.ssa;
+   }
+
+   for (unsigned i = 0; i < num_srcs; i++)
+      new_intrinsic->src[i] = nir_src_for_ssa(intrinsic->src[i].ssa);
+
+   nir_intrinsic_copy_const_indices(new_intrinsic, intrinsic);
+
+   new_intrinsic->num_components = intrinsic->num_components;
+
+   /* If we managed to propagate the division by 4, just use the new offset
+    * register and don't emit the SHR.
+    */
+   if (new_offset)
+      offset = new_offset;
+   else
+      offset = nir_ushr(b, offset, nir_imm_int(b, shift));
+
+   /* Insert the new intrinsic right before the old one. */
+   nir_builder_instr_insert(b, &new_intrinsic->instr);
+
+   /* Replace the last source of the new intrinsic by the result of
+    * the offset divided by 4.
+    */
+   nir_instr_rewrite_src(&new_intrinsic->instr, target_src,
+                         nir_src_for_ssa(offset));
+
+   if (has_dest) {
+      /* Replace the uses of the original destination by that
+       * of the new intrinsic.
+       */
+      nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, new_dest);
+   }
+
+   /* Finally remove the original intrinsic. */
+   nir_instr_remove(&intrinsic->instr);
+
+   return true;
 }
 
 static bool
-lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id)
+lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx,
+                       int gpu_id)
 {
-       bool progress = false;
-
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
-
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-               /* SSBO */
-               int ir3_intrinsic;
-               uint8_t offset_src_idx;
-               ir3_intrinsic = get_ir3_intrinsic_for_ssbo_intrinsic(intr->intrinsic,
-                                                                                                                        &offset_src_idx);
-               if (ir3_intrinsic != -1) {
-                       progress |= lower_offset_for_ssbo(intr, b, (unsigned) ir3_intrinsic,
-                                                                                         offset_src_idx);
-               }
-       }
-
-       return progress;
+   bool progress = false;
+
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      /* SSBO */
+      int ir3_intrinsic;
+      uint8_t offset_src_idx;
+      ir3_intrinsic =
+         get_ir3_intrinsic_for_ssbo_intrinsic(intr->intrinsic, &offset_src_idx);
+      if (ir3_intrinsic != -1) {
+         progress |= lower_offset_for_ssbo(intr, b, (unsigned)ir3_intrinsic,
+                                           offset_src_idx);
+      }
+   }
+
+   return progress;
 }
 
 static bool
 lower_io_offsets_func(nir_function_impl *impl, int gpu_id)
 {
-       void *mem_ctx = ralloc_parent(impl);
-       nir_builder b;
-       nir_builder_init(&b, impl);
+   void *mem_ctx = ralloc_parent(impl);
+   nir_builder b;
+   nir_builder_init(&b, impl);
 
-       bool progress = false;
-       nir_foreach_block_safe (block, impl) {
-               progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id);
-       }
+   bool progress = false;
+   nir_foreach_block_safe (block, impl) {
+      progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id);
+   }
 
-       if (progress) {
-               nir_metadata_preserve(impl, nir_metadata_block_index |
-                                                                       nir_metadata_dominance);
-       }
+   if (progress) {
+      nir_metadata_preserve(impl,
+                            nir_metadata_block_index | nir_metadata_dominance);
+   }
 
-       return progress;
+   return progress;
 }
 
 bool
 ir3_nir_lower_io_offsets(nir_shader *shader, int gpu_id)
 {
-       bool progress = false;
+   bool progress = false;
 
-       nir_foreach_function (function, shader) {
-               if (function->impl)
-                       progress |= lower_io_offsets_func(function->impl, gpu_id);
-       }
+   nir_foreach_function (function, shader) {
+      if (function->impl)
+         progress |= lower_io_offsets_func(function->impl, gpu_id);
+   }
 
-       return progress;
+   return progress;
 }
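As a rough standalone illustration of the shift-merging rules used by check_and_propagate_bit_shift32 and ir3_nir_try_propagate_bit_shift above (the helper below is hypothetical and mirrors only the sign/range bookkeeping, not the NIR rewriting): right shifts are counted as negative, left shifts as positive, and a merge is rejected if it would flip direction or leave the range a 32-bit shift can express.

#include <assert.h>
#include <stdbool.h>

/* Hypothetical helper: negative = SHR, positive = SHL. */
static bool
try_merge_shift(int current, int extra, int *merged)
{
   int m = current + extra;

   /* Reject a merge that would reverse direction, e.g. (x << 2) >> 4. */
   if (current * m < 0)
      return false;

   /* Stay within what a 32-bit shift can express. */
   if (m < -31 || m > 31)
      return false;

   *merged = m;
   return true;
}

int
main(void)
{
   int m;
   assert(try_merge_shift(4, -2, &m) && m == 2); /* (x << 4) >> 2 -> x << 2 */
   assert(!try_merge_shift(2, -4, &m));          /* direction would flip */
   return 0;
}

With current = 4 and extra = -2 the divide-by-4 folds into the existing shift, which is the case lower_offset_for_ssbo relies on to avoid emitting an extra ushr; when the merge is rejected, the explicit ushr is emitted instead.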
index c783483..b446111 100644 (file)
@@ -21,8 +21,8 @@
  * IN THE SOFTWARE.
  */
 
-#include "ir3_nir.h"
 #include "compiler/nir/nir_builder.h"
+#include "ir3_nir.h"
 
 /**
  * This pass lowers load_barycentric_at_offset to dsx.3d/dsy.3d and alu
 static nir_ssa_def *
 load(nir_builder *b, unsigned ncomp, nir_intrinsic_op op)
 {
-       nir_intrinsic_instr *load_size = nir_intrinsic_instr_create(b->shader, op);
-       nir_ssa_dest_init(&load_size->instr, &load_size->dest, ncomp, 32, NULL);
-       nir_builder_instr_insert(b, &load_size->instr);
+   nir_intrinsic_instr *load_size = nir_intrinsic_instr_create(b->shader, op);
+   nir_ssa_dest_init(&load_size->instr, &load_size->dest, ncomp, 32, NULL);
+   nir_builder_instr_insert(b, &load_size->instr);
 
-       return &load_size->dest.ssa;
+   return &load_size->dest.ssa;
 }
 
 static nir_ssa_def *
-ir3_nir_lower_load_barycentric_at_offset_instr(nir_builder *b,
-          nir_instr *instr, void *data)
+ir3_nir_lower_load_barycentric_at_offset_instr(nir_builder *b, nir_instr *instr,
+                                               void *data)
 {
-       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
 #define chan(var, c) nir_channel(b, var, c)
 
-       nir_ssa_def *off = intr->src[0].ssa;
-       nir_ssa_def *ij = load(b, 2, nir_intrinsic_load_barycentric_pixel);
-       nir_ssa_def *s  = load(b, 1, nir_intrinsic_load_size_ir3);
+   nir_ssa_def *off = intr->src[0].ssa;
+   nir_ssa_def *ij = load(b, 2, nir_intrinsic_load_barycentric_pixel);
+   nir_ssa_def *s = load(b, 1, nir_intrinsic_load_size_ir3);
 
-       s = nir_frcp(b, s);
+   s = nir_frcp(b, s);
 
-       /* scaled ij with s as 3rd component: */
-       nir_ssa_def *sij = nir_vec3(b,
-                       nir_fmul(b, chan(ij, 0), s),
-                       nir_fmul(b, chan(ij, 1), s),
-                       s);
+   /* scaled ij with s as 3rd component: */
+   nir_ssa_def *sij =
+      nir_vec3(b, nir_fmul(b, chan(ij, 0), s), nir_fmul(b, chan(ij, 1), s), s);
 
-       nir_ssa_def *foo = nir_fddx(b, sij);
-       nir_ssa_def *bar = nir_fddy(b, sij);
+   nir_ssa_def *foo = nir_fddx(b, sij);
+   nir_ssa_def *bar = nir_fddy(b, sij);
 
-       if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
-               b->shader->info.fs.needs_quad_helper_invocations = true;
+   if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
+      b->shader->info.fs.needs_quad_helper_invocations = true;
 
-       nir_ssa_def *x, *y, *z, *i, *j;
+   nir_ssa_def *x, *y, *z, *i, *j;
 
-       x = nir_ffma(b, chan(off, 0), chan(foo, 0), chan(sij, 0));
-       y = nir_ffma(b, chan(off, 0), chan(foo, 1), chan(sij, 1));
-       z = nir_ffma(b, chan(off, 0), chan(foo, 2), chan(sij, 2));
+   x = nir_ffma(b, chan(off, 0), chan(foo, 0), chan(sij, 0));
+   y = nir_ffma(b, chan(off, 0), chan(foo, 1), chan(sij, 1));
+   z = nir_ffma(b, chan(off, 0), chan(foo, 2), chan(sij, 2));
 
-       x = nir_ffma(b, chan(off, 1), chan(bar, 0), x);
-       y = nir_ffma(b, chan(off, 1), chan(bar, 1), y);
-       z = nir_ffma(b, chan(off, 1), chan(bar, 2), z);
+   x = nir_ffma(b, chan(off, 1), chan(bar, 0), x);
+   y = nir_ffma(b, chan(off, 1), chan(bar, 1), y);
+   z = nir_ffma(b, chan(off, 1), chan(bar, 2), z);
 
-       /* convert back into primitive space: */
-       z = nir_frcp(b, z);
-       i = nir_fmul(b, z, x);
-       j = nir_fmul(b, z, y);
+   /* convert back into primitive space: */
+   z = nir_frcp(b, z);
+   i = nir_fmul(b, z, x);
+   j = nir_fmul(b, z, y);
 
-       ij = nir_vec2(b, i, j);
+   ij = nir_vec2(b, i, j);
 
-       return ij;
+   return ij;
 }
 
 static bool
 ir3_nir_lower_load_barycentric_at_offset_filter(const nir_instr *instr,
-               const void *data)
+                                                const void *data)
 {
-       return (instr->type == nir_instr_type_intrinsic &&
-                       nir_instr_as_intrinsic(instr)->intrinsic ==
-                       nir_intrinsic_load_barycentric_at_offset);
+   return (instr->type == nir_instr_type_intrinsic &&
+           nir_instr_as_intrinsic(instr)->intrinsic ==
+              nir_intrinsic_load_barycentric_at_offset);
 }
 
 bool
 ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader)
 {
-       debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
+   debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
 
-       return nir_shader_lower_instructions(shader,
-                       ir3_nir_lower_load_barycentric_at_offset_filter,
-                       ir3_nir_lower_load_barycentric_at_offset_instr,
-                       NULL);
+   return nir_shader_lower_instructions(
+      shader, ir3_nir_lower_load_barycentric_at_offset_filter,
+      ir3_nir_lower_load_barycentric_at_offset_instr, NULL);
 }
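In equation form, the at_offset lowering above computes the following (a summary of the ffma chain, where s is the scalar returned by load_size_ir3, off is the requested offset, and the partial derivatives correspond to the fddx/fddy of the scaled vector):

\[
P = \left(\tfrac{i}{s},\ \tfrac{j}{s},\ \tfrac{1}{s}\right), \qquad
P' = P + \mathit{off}_x\,\frac{\partial P}{\partial x}
       + \mathit{off}_y\,\frac{\partial P}{\partial y}, \qquad
(i', j') = \left(\tfrac{P'_x}{P'_z},\ \tfrac{P'_y}{P'_z}\right)
\]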
index f705099..40e0721 100644 (file)
@@ -21,8 +21,8 @@
  * IN THE SOFTWARE.
  */
 
-#include "ir3_nir.h"
 #include "compiler/nir/nir_builder.h"
+#include "ir3_nir.h"
 
 /**
  * This pass lowers load_barycentric_at_sample to load_sample_pos_from_id
 static nir_ssa_def *
 load_sample_pos(nir_builder *b, nir_ssa_def *samp_id)
 {
-       return nir_load_sample_pos_from_id(b, 32, samp_id);
+   return nir_load_sample_pos_from_id(b, 32, samp_id);
 }
 
 static nir_ssa_def *
 lower_load_barycentric_at_sample(nir_builder *b, nir_intrinsic_instr *intr)
 {
-       nir_ssa_def *pos = load_sample_pos(b, intr->src[0].ssa);
+   nir_ssa_def *pos = load_sample_pos(b, intr->src[0].ssa);
 
-       return nir_load_barycentric_at_offset(b, 32, pos);
+   return nir_load_barycentric_at_offset(b, 32, pos);
 }
 
 static nir_ssa_def *
 lower_load_sample_pos(nir_builder *b, nir_intrinsic_instr *intr)
 {
-       nir_ssa_def *pos = load_sample_pos(b, nir_load_sample_id(b));
+   nir_ssa_def *pos = load_sample_pos(b, nir_load_sample_id(b));
 
-       /* Note that gl_SamplePosition is offset by +vec2(0.5, 0.5) vs the
-        * offset passed to interpolateAtOffset().   See
-        * dEQP-GLES31.functional.shaders.multisample_interpolation.interpolate_at_offset.at_sample_position.default_framebuffer
-        * for example.
-        */
-       nir_ssa_def *half = nir_imm_float(b, 0.5);
-       return nir_fadd(b, pos, nir_vec2(b, half, half));
+   /* Note that gl_SamplePosition is offset by +vec2(0.5, 0.5) vs the
+    * offset passed to interpolateAtOffset().   See
+    * dEQP-GLES31.functional.shaders.multisample_interpolation.interpolate_at_offset.at_sample_position.default_framebuffer
+    * for example.
+    */
+   nir_ssa_def *half = nir_imm_float(b, 0.5);
+   return nir_fadd(b, pos, nir_vec2(b, half, half));
 }
 
 static nir_ssa_def *
-ir3_nir_lower_load_barycentric_at_sample_instr(nir_builder *b,
-               nir_instr *instr, void *data)
+ir3_nir_lower_load_barycentric_at_sample_instr(nir_builder *b, nir_instr *instr,
+                                               void *data)
 {
-       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-       if (intr->intrinsic == nir_intrinsic_load_sample_pos)
-               return lower_load_sample_pos(b, intr);
-       else
-               return lower_load_barycentric_at_sample(b, intr);
+   if (intr->intrinsic == nir_intrinsic_load_sample_pos)
+      return lower_load_sample_pos(b, intr);
+   else
+      return lower_load_barycentric_at_sample(b, intr);
 }
 
 static bool
 ir3_nir_lower_load_barycentric_at_sample_filter(const nir_instr *instr,
-               const void *data)
+                                                const void *data)
 {
-       if (instr->type != nir_instr_type_intrinsic)
-               return false;
-       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-       return (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample ||
-                       intr->intrinsic == nir_intrinsic_load_sample_pos);
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   return (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample ||
+           intr->intrinsic == nir_intrinsic_load_sample_pos);
 }
 
 bool
 ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader)
 {
-       debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
+   debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
 
-       return nir_shader_lower_instructions(shader,
-                       ir3_nir_lower_load_barycentric_at_sample_filter,
-                       ir3_nir_lower_load_barycentric_at_sample_instr,
-                       NULL);
+   return nir_shader_lower_instructions(
+      shader, ir3_nir_lower_load_barycentric_at_sample_filter,
+      ir3_nir_lower_load_barycentric_at_sample_instr, NULL);
 }
index 0112b19..296a6f5 100644 (file)
  * SOFTWARE.
  */
 
-#include "ir3_nir.h"
-#include "ir3_compiler.h"
 #include "compiler/nir/nir_builder.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
 
 struct state {
-       uint32_t topology;
+   uint32_t topology;
 
-       struct primitive_map {
-               unsigned loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */
-               unsigned stride;
-       } map;
+   struct primitive_map {
+      unsigned loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */
+      unsigned stride;
+   } map;
 
-       nir_ssa_def *header;
+   nir_ssa_def *header;
 
-       nir_variable *vertex_count_var;
-       nir_variable *emitted_vertex_var;
-       nir_variable *vertex_flags_out;
+   nir_variable *vertex_count_var;
+   nir_variable *emitted_vertex_var;
+   nir_variable *vertex_flags_out;
 
-       struct exec_list old_outputs;
-       struct exec_list new_outputs;
-       struct exec_list emit_outputs;
+   struct exec_list old_outputs;
+   struct exec_list new_outputs;
+   struct exec_list emit_outputs;
 
-       /* tess ctrl shader on a650 gets the local primitive id at different bits: */
-       unsigned local_primitive_id_start;
+   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
+   unsigned local_primitive_id_start;
 };
 
 static nir_ssa_def *
 bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
 {
-       return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
-                       nir_imm_int(b, mask));
+   return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
+                   nir_imm_int(b, mask));
 }
 
 static nir_ssa_def *
 build_invocation_id(nir_builder *b, struct state *state)
 {
-       return bitfield_extract(b, state->header, 11, 31);
+   return bitfield_extract(b, state->header, 11, 31);
 }
 
 static nir_ssa_def *
 build_vertex_id(nir_builder *b, struct state *state)
 {
-       return bitfield_extract(b, state->header, 6, 31);
+   return bitfield_extract(b, state->header, 6, 31);
 }
 
 static nir_ssa_def *
 build_local_primitive_id(nir_builder *b, struct state *state)
 {
-       return bitfield_extract(b, state->header, state->local_primitive_id_start, 63);
+   return bitfield_extract(b, state->header, state->local_primitive_id_start,
+                           63);
 }
 
 static bool
 is_tess_levels(gl_varying_slot slot)
 {
-       return (slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
-                       slot == VARYING_SLOT_TESS_LEVEL_INNER);
+   return (slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
+           slot == VARYING_SLOT_TESS_LEVEL_INNER);
 }
 
 /* Return a deterministic index for varyings. We can't rely on driver_location
@@ -96,112 +97,113 @@ is_tess_levels(gl_varying_slot slot)
 static unsigned
 shader_io_get_unique_index(gl_varying_slot slot)
 {
-       if (slot == VARYING_SLOT_POS)
-               return 0;
-       if (slot == VARYING_SLOT_PSIZ)
-               return 1;
-       if (slot == VARYING_SLOT_CLIP_DIST0)
-               return 2;
-       if (slot == VARYING_SLOT_CLIP_DIST1)
-               return 3;
-       if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
-               return 4 + (slot - VARYING_SLOT_VAR0);
-       unreachable("illegal slot in get unique index\n");
+   if (slot == VARYING_SLOT_POS)
+      return 0;
+   if (slot == VARYING_SLOT_PSIZ)
+      return 1;
+   if (slot == VARYING_SLOT_CLIP_DIST0)
+      return 2;
+   if (slot == VARYING_SLOT_CLIP_DIST1)
+      return 3;
+   if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
+      return 4 + (slot - VARYING_SLOT_VAR0);
+   unreachable("illegal slot in get unique index\n");
 }
 
 static nir_ssa_def *
-build_local_offset(nir_builder *b, struct state *state,
-               nir_ssa_def *vertex, uint32_t location, uint32_t comp, nir_ssa_def *offset)
+build_local_offset(nir_builder *b, struct state *state, nir_ssa_def *vertex,
+                   uint32_t location, uint32_t comp, nir_ssa_def *offset)
 {
-       nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
-       nir_ssa_def *primitive_offset =
-               nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
-       nir_ssa_def *attr_offset;
-       nir_ssa_def *vertex_stride;
-       unsigned index = shader_io_get_unique_index(location);
-
-       switch (b->shader->info.stage) {
-       case MESA_SHADER_VERTEX:
-       case MESA_SHADER_TESS_EVAL:
-               vertex_stride = nir_imm_int(b, state->map.stride * 4);
-               attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
-               break;
-       case MESA_SHADER_TESS_CTRL:
-       case MESA_SHADER_GEOMETRY:
-               vertex_stride = nir_load_vs_vertex_stride_ir3(b);
-               attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
-                                                          nir_imm_int(b, comp * 4));
-               break;
-       default:
-               unreachable("bad shader stage");
-       }
-
-       nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);
-
-       return nir_iadd(b, nir_iadd(b, primitive_offset, vertex_offset),
-                       nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
+   nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
+   nir_ssa_def *primitive_offset =
+      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
+   nir_ssa_def *attr_offset;
+   nir_ssa_def *vertex_stride;
+   unsigned index = shader_io_get_unique_index(location);
+
+   switch (b->shader->info.stage) {
+   case MESA_SHADER_VERTEX:
+   case MESA_SHADER_TESS_EVAL:
+      vertex_stride = nir_imm_int(b, state->map.stride * 4);
+      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
+      break;
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_GEOMETRY:
+      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
+      attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
+                             nir_imm_int(b, comp * 4));
+      break;
+   default:
+      unreachable("bad shader stage");
+   }
+
+   nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);
+
+   return nir_iadd(
+      b, nir_iadd(b, primitive_offset, vertex_offset),
+      nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
 }
 
 static nir_intrinsic_instr *
 replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
-               nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1, nir_ssa_def *src2)
+                  nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1,
+                  nir_ssa_def *src2)
 {
-       nir_intrinsic_instr *new_intr =
-               nir_intrinsic_instr_create(b->shader, op);
+   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);
 
-       new_intr->src[0] = nir_src_for_ssa(src0);
-       if (src1)
-               new_intr->src[1] = nir_src_for_ssa(src1);
-       if (src2)
-               new_intr->src[2] = nir_src_for_ssa(src2);
+   new_intr->src[0] = nir_src_for_ssa(src0);
+   if (src1)
+      new_intr->src[1] = nir_src_for_ssa(src1);
+   if (src2)
+      new_intr->src[2] = nir_src_for_ssa(src2);
 
-       new_intr->num_components = intr->num_components;
+   new_intr->num_components = intr->num_components;
 
-       if (nir_intrinsic_infos[op].has_dest)
-               nir_ssa_dest_init(&new_intr->instr, &new_intr->dest,
-                                                 intr->num_components, 32, NULL);
+   if (nir_intrinsic_infos[op].has_dest)
+      nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components,
+                        32, NULL);
 
-       nir_builder_instr_insert(b, &new_intr->instr);
+   nir_builder_instr_insert(b, &new_intr->instr);
 
-       if (nir_intrinsic_infos[op].has_dest)
-               nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);
+   if (nir_intrinsic_infos[op].has_dest)
+      nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);
 
-       nir_instr_remove(&intr->instr);
+   nir_instr_remove(&intr->instr);
 
-       return new_intr;
+   return new_intr;
 }
 
 static void
 build_primitive_map(nir_shader *shader, struct primitive_map *map)
 {
-       /* All interfaces except the TCS <-> TES interface use ldlw, which takes
-        * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
-        * ldg, which takes an offset in dwords, but each per-vertex slot has
-        * space for every vertex, and there's space at the beginning for
-        * per-patch varyings.
-        */
-       unsigned slot_size = 16, start = 0;
-       if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
-               slot_size = shader->info.tess.tcs_vertices_out * 4;
-               start = util_last_bit(shader->info.patch_outputs_written) * 4;
-       }
-
-       uint64_t mask = shader->info.outputs_written;
-       unsigned loc = start;
-       while (mask) {
-               int location = u_bit_scan64(&mask);
-               if (is_tess_levels(location))
-                       continue;
-
-               unsigned index = shader_io_get_unique_index(location);
-               map->loc[index] = loc;
-               loc += slot_size;
-       }
-
-       map->stride = loc;
-       /* Use units of dwords for the stride. */
-       if (shader->info.stage != MESA_SHADER_TESS_CTRL)
-               map->stride /= 4;
+   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
+    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
+    * ldg, which takes an offset in dwords, but each per-vertex slot has
+    * space for every vertex, and there's space at the beginning for
+    * per-patch varyings.
+    */
+   unsigned slot_size = 16, start = 0;
+   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
+      slot_size = shader->info.tess.tcs_vertices_out * 4;
+      start = util_last_bit(shader->info.patch_outputs_written) * 4;
+   }
+
+   uint64_t mask = shader->info.outputs_written;
+   unsigned loc = start;
+   while (mask) {
+      int location = u_bit_scan64(&mask);
+      if (is_tess_levels(location))
+         continue;
+
+      unsigned index = shader_io_get_unique_index(location);
+      map->loc[index] = loc;
+      loc += slot_size;
+   }
+
+   map->stride = loc;
+   /* Use units of dwords for the stride. */
+   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
+      map->stride /= 4;
 }
 
 /* For shader stages that receive a primitive map, calculate how big it should
@@ -211,770 +213,782 @@ build_primitive_map(nir_shader *shader, struct primitive_map *map)
 static unsigned
 calc_primitive_map_size(nir_shader *shader)
 {
-       uint64_t mask = shader->info.inputs_read;
-       unsigned max_index = 0;
-       while (mask) {
-               int location = u_bit_scan64(&mask);
-
-               if (is_tess_levels(location))
-                       continue;
-
-               unsigned index = shader_io_get_unique_index(location);
-               max_index = MAX2(max_index, index + 1);
-       }
-       
-       return max_index;
+   uint64_t mask = shader->info.inputs_read;
+   unsigned max_index = 0;
+   while (mask) {
+      int location = u_bit_scan64(&mask);
+
+      if (is_tess_levels(location))
+         continue;
+
+      unsigned index = shader_io_get_unique_index(location);
+      max_index = MAX2(max_index, index + 1);
+   }
+
+   return max_index;
 }
 
 static void
-lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *state)
+lower_block_to_explicit_output(nir_block *block, nir_builder *b,
+                               struct state *state)
 {
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
-
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-               switch (intr->intrinsic) {
-               case nir_intrinsic_store_output: {
-                       // src[] = { value, offset }.
-
-                       /* nir_lower_io_to_temporaries replaces all access to output
-                        * variables with temp variables and then emits a nir_copy_var at
-                        * the end of the shader.  Thus, we should always get a full wrmask
-                        * here.
-                        */
-                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
-
-                       b->cursor = nir_instr_remove(&intr->instr);
-
-                       nir_ssa_def *vertex_id = build_vertex_id(b, state);
-                       nir_ssa_def *offset = build_local_offset(b, state, vertex_id,
-                                       nir_intrinsic_io_semantics(intr).location,
-                                       nir_intrinsic_component(intr),
-                                       intr->src[1].ssa);
-
-                       nir_store_shared_ir3(b, intr->src[0].ssa, offset);
-                       break;
-               }
-
-               default:
-                       break;
-               }
-       }
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_store_output: {
+         // src[] = { value, offset }.
+
+         /* nir_lower_io_to_temporaries replaces all access to output
+          * variables with temp variables and then emits a nir_copy_var at
+          * the end of the shader.  Thus, we should always get a full wrmask
+          * here.
+          */
+         assert(
+            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+         b->cursor = nir_instr_remove(&intr->instr);
+
+         nir_ssa_def *vertex_id = build_vertex_id(b, state);
+         nir_ssa_def *offset = build_local_offset(
+            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
+            nir_intrinsic_component(intr), intr->src[1].ssa);
+
+         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
 }
 
 static nir_ssa_def *
 local_thread_id(nir_builder *b)
 {
-       return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
+   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
 }
 
 void
-ir3_nir_lower_to_explicit_output(nir_shader *shader, struct ir3_shader_variant *v,
-               unsigned topology)
+ir3_nir_lower_to_explicit_output(nir_shader *shader,
+                                 struct ir3_shader_variant *v,
+                                 unsigned topology)
 {
-       struct state state = { };
+   struct state state = {};
 
-       build_primitive_map(shader, &state.map);
-       memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
+   build_primitive_map(shader, &state.map);
+   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
 
-       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
-       assert(impl);
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   assert(impl);
 
-       nir_builder b;
-       nir_builder_init(&b, impl);
-       b.cursor = nir_before_cf_list(&impl->body);
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   b.cursor = nir_before_cf_list(&impl->body);
 
-       if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
-               state.header = nir_load_tcs_header_ir3(&b);
-       else
-               state.header = nir_load_gs_header_ir3(&b);
+   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
+      state.header = nir_load_tcs_header_ir3(&b);
+   else
+      state.header = nir_load_gs_header_ir3(&b);
 
-       nir_foreach_block_safe (block, impl)
-               lower_block_to_explicit_output(block, &b, &state);
+   nir_foreach_block_safe (block, impl)
+      lower_block_to_explicit_output(block, &b, &state);
 
-       nir_metadata_preserve(impl, nir_metadata_block_index |
-                       nir_metadata_dominance);
+   nir_metadata_preserve(impl,
+                         nir_metadata_block_index | nir_metadata_dominance);
 
-       v->output_size = state.map.stride;
+   v->output_size = state.map.stride;
 }
 
-
 static void
-lower_block_to_explicit_input(nir_block *block, nir_builder *b, struct state *state)
+lower_block_to_explicit_input(nir_block *block, nir_builder *b,
+                              struct state *state)
 {
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
-
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-               switch (intr->intrinsic) {
-               case nir_intrinsic_load_per_vertex_input: {
-                       // src[] = { vertex, offset }.
-
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       nir_ssa_def *offset = build_local_offset(b, state,
-                                       intr->src[0].ssa, // this is typically gl_InvocationID
-                                       nir_intrinsic_io_semantics(intr).location,
-                                       nir_intrinsic_component(intr),
-                                       intr->src[1].ssa);
-
-                       replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
-                       break;
-               }
-
-               case nir_intrinsic_load_invocation_id: {
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       nir_ssa_def *iid = build_invocation_id(b, state);
-                       nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
-                       nir_instr_remove(&intr->instr);
-                       break;
-               }
-
-               default:
-                       break;
-               }
-       }
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_per_vertex_input: {
+         // src[] = { vertex, offset }.
+
+         b->cursor = nir_before_instr(&intr->instr);
+
+         nir_ssa_def *offset = build_local_offset(
+            b, state,
+            intr->src[0].ssa, // this is typically gl_InvocationID
+            nir_intrinsic_io_semantics(intr).location,
+            nir_intrinsic_component(intr), intr->src[1].ssa);
+
+         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
+                           NULL);
+         break;
+      }
+
+      case nir_intrinsic_load_invocation_id: {
+         b->cursor = nir_before_instr(&intr->instr);
+
+         nir_ssa_def *iid = build_invocation_id(b, state);
+         nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
+         nir_instr_remove(&intr->instr);
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
 }
 
 void
-ir3_nir_lower_to_explicit_input(nir_shader *shader, struct ir3_shader_variant *v)
+ir3_nir_lower_to_explicit_input(nir_shader *shader,
+                                struct ir3_shader_variant *v)
 {
-       struct state state = { };
+   struct state state = {};
 
-       /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
-        * HS uses a different primitive id, which starts at bit 16 in the header
-        */
-       if (shader->info.stage == MESA_SHADER_TESS_CTRL && v->shader->compiler->tess_use_shared)
-               state.local_primitive_id_start = 16;
+   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
+    * HS uses a different primitive id, which starts at bit 16 in the header
+    */
+   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
+       v->shader->compiler->tess_use_shared)
+      state.local_primitive_id_start = 16;
 
-       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
-       assert(impl);
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   assert(impl);
 
-       nir_builder b;
-       nir_builder_init(&b, impl);
-       b.cursor = nir_before_cf_list(&impl->body);
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   b.cursor = nir_before_cf_list(&impl->body);
 
-       if (shader->info.stage == MESA_SHADER_GEOMETRY)
-               state.header = nir_load_gs_header_ir3(&b);
-       else
-               state.header = nir_load_tcs_header_ir3(&b);
+   if (shader->info.stage == MESA_SHADER_GEOMETRY)
+      state.header = nir_load_gs_header_ir3(&b);
+   else
+      state.header = nir_load_tcs_header_ir3(&b);
 
-       nir_foreach_block_safe (block, impl)
-               lower_block_to_explicit_input(block, &b, &state);
+   nir_foreach_block_safe (block, impl)
+      lower_block_to_explicit_input(block, &b, &state);
 
-       v->input_size = calc_primitive_map_size(shader);
+   v->input_size = calc_primitive_map_size(shader);
 }
 
 static nir_ssa_def *
 build_tcs_out_vertices(nir_builder *b)
 {
-       if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
-               return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
-       else
-               return nir_load_patch_vertices_in(b);
+   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
+      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
+   else
+      return nir_load_patch_vertices_in(b);
 }
 
 static nir_ssa_def *
 build_per_vertex_offset(nir_builder *b, struct state *state,
-               nir_ssa_def *vertex, uint32_t location, uint32_t comp, nir_ssa_def *offset)
+                        nir_ssa_def *vertex, uint32_t location, uint32_t comp,
+                        nir_ssa_def *offset)
 {
-       nir_ssa_def *primitive_id = nir_load_primitive_id(b);
-       nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
-       nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, patch_stride);
-       nir_ssa_def *attr_offset;
-
-       if (nir_src_is_const(nir_src_for_ssa(offset))) {
-               location += nir_src_as_uint(nir_src_for_ssa(offset));
-               offset = nir_imm_int(b, 0);
-       } else {
-               /* Offset is in vec4's, but we need it in unit of components for the
-                * load/store_global_ir3 offset.
-                */
-               offset = nir_ishl(b, offset, nir_imm_int(b, 2));
-       }
-
-       nir_ssa_def *vertex_offset;
-       if (vertex) {
-               unsigned index = shader_io_get_unique_index(location);
-               switch (b->shader->info.stage) {
-               case MESA_SHADER_TESS_CTRL:
-                       attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
-                       break;
-               case MESA_SHADER_TESS_EVAL:
-                       attr_offset =
-                               nir_iadd(b, nir_load_primitive_location_ir3(b, index),
-                                                nir_imm_int(b, comp));
-                       break;
-               default:
-                       unreachable("bad shader state");
-               }
-
-               attr_offset = nir_iadd(b, attr_offset,
-                                                          nir_imul24(b, offset,
-                                                                                 build_tcs_out_vertices(b)));
-               vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
-       } else {
-               assert(location >= VARYING_SLOT_PATCH0 &&
-                          location <= VARYING_SLOT_TESS_MAX);
-               unsigned index = location - VARYING_SLOT_PATCH0;
-               attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
-               vertex_offset = nir_imm_int(b, 0);
-       }
-
-       return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
+   nir_ssa_def *primitive_id = nir_load_primitive_id(b);
+   nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
+   nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, patch_stride);
+   nir_ssa_def *attr_offset;
+
+   if (nir_src_is_const(nir_src_for_ssa(offset))) {
+      location += nir_src_as_uint(nir_src_for_ssa(offset));
+      offset = nir_imm_int(b, 0);
+   } else {
+      /* Offset is in vec4s, but we need it in units of components for the
+       * load/store_global_ir3 offset.
+       */
+      offset = nir_ishl(b, offset, nir_imm_int(b, 2));
+   }
+
+   nir_ssa_def *vertex_offset;
+   if (vertex) {
+      unsigned index = shader_io_get_unique_index(location);
+      switch (b->shader->info.stage) {
+      case MESA_SHADER_TESS_CTRL:
+         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
+         break;
+      case MESA_SHADER_TESS_EVAL:
+         attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
+                                nir_imm_int(b, comp));
+         break;
+      default:
+         unreachable("bad shader state");
+      }
+
+      attr_offset = nir_iadd(b, attr_offset,
+                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
+      vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
+   } else {
+      assert(location >= VARYING_SLOT_PATCH0 &&
+             location <= VARYING_SLOT_TESS_MAX);
+      unsigned index = location - VARYING_SLOT_PATCH0;
+      attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
+      vertex_offset = nir_imm_int(b, 0);
+   }
+
+   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
 }
 
 static nir_ssa_def *
-build_patch_offset(nir_builder *b, struct state *state,
-               uint32_t base, uint32_t comp, nir_ssa_def *offset)
+build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
+                   uint32_t comp, nir_ssa_def *offset)
 {
-       return build_per_vertex_offset(b, state, NULL, base, comp, offset);
+   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
 }
 
 static void
 tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
 {
-       switch (state->topology) {
-       case IR3_TESS_TRIANGLES:
-               *inner = 1;
-               *outer = 3;
-               break;
-       case IR3_TESS_QUADS:
-               *inner = 2;
-               *outer = 4;
-               break;
-       case IR3_TESS_ISOLINES:
-               *inner = 0;
-               *outer = 2;
-               break;
-       default:
-               unreachable("bad");
-       }
+   switch (state->topology) {
+   case IR3_TESS_TRIANGLES:
+      *inner = 1;
+      *outer = 3;
+      break;
+   case IR3_TESS_QUADS:
+      *inner = 2;
+      *outer = 4;
+      break;
+   case IR3_TESS_ISOLINES:
+      *inner = 0;
+      *outer = 2;
+      break;
+   default:
+      unreachable("bad");
+   }
 }
 
 static nir_ssa_def *
 build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
 {
-       uint32_t inner_levels, outer_levels;
-       tess_level_components(state, &inner_levels, &outer_levels);
-
-       const uint32_t patch_stride = 1 + inner_levels + outer_levels;
-
-       nir_ssa_def *primitive_id = nir_load_primitive_id(b);
-
-       nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, nir_imm_int(b, patch_stride));
-
-       uint32_t offset;
-       switch (slot) {
-       case VARYING_SLOT_TESS_LEVEL_OUTER:
-               /* There's some kind of header dword, tess levels start at index 1. */
-               offset = 1;
-               break;
-       case VARYING_SLOT_TESS_LEVEL_INNER:
-               offset = 1 + outer_levels;
-               break;
-       default:
-               unreachable("bad");
-       }
-
-       return nir_iadd(b, patch_offset, nir_imm_int(b, offset));
+   uint32_t inner_levels, outer_levels;
+   tess_level_components(state, &inner_levels, &outer_levels);
+
+   const uint32_t patch_stride = 1 + inner_levels + outer_levels;
+
+   nir_ssa_def *primitive_id = nir_load_primitive_id(b);
+
+   nir_ssa_def *patch_offset =
+      nir_imul24(b, primitive_id, nir_imm_int(b, patch_stride));
+
+   uint32_t offset;
+   switch (slot) {
+   case VARYING_SLOT_TESS_LEVEL_OUTER:
+      /* There's some kind of header dword; tess levels start at index 1. */
+      offset = 1;
+      break;
+   case VARYING_SLOT_TESS_LEVEL_INNER:
+      offset = 1 + outer_levels;
+      break;
+   default:
+      unreachable("bad");
+   }
+
+   return nir_iadd(b, patch_offset, nir_imm_int(b, offset));
 }
 
 static void
 lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 {
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
-
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-               switch (intr->intrinsic) {
-               case nir_intrinsic_load_per_vertex_output: {
-                       // src[] = { vertex, offset }.
-
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_ssa_def *offset = build_per_vertex_offset(b, state,
-                                       intr->src[0].ssa,
-                                       nir_intrinsic_io_semantics(intr).location,
-                                       nir_intrinsic_component(intr),
-                                       intr->src[1].ssa);
-
-                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
-                       break;
-               }
-
-               case nir_intrinsic_store_per_vertex_output: {
-                       // src[] = { value, vertex, offset }.
-
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       /* sparse writemask not supported */
-                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
-
-                       nir_ssa_def *value = intr->src[0].ssa;
-                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_ssa_def *offset = build_per_vertex_offset(b, state,
-                                       intr->src[1].ssa,
-                                       nir_intrinsic_io_semantics(intr).location,
-                                       nir_intrinsic_component(intr),
-                                       intr->src[2].ssa);
-
-                       replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address, offset);
-
-                       break;
-               }
-
-               case nir_intrinsic_load_output: {
-                       // src[] = { offset }.
-
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       nir_ssa_def *address, *offset;
-
-                       /* note if vectorization of the tess level loads ever happens:
-                        * "ldg" across 16-byte boundaries can behave incorrectly if results
-                        * are never used. most likely some issue with (sy) not properly
-                        * syncing with values coming from a second memory transaction.
-                        */
-                       gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
-                       if (is_tess_levels(location)) {
-                               assert(intr->dest.ssa.num_components == 1);
-                               address = nir_load_tess_factor_base_ir3(b);
-                               offset = build_tessfactor_base(b, location, state);
-                       } else {
-                               address = nir_load_tess_param_base_ir3(b);
-                               offset = build_patch_offset(b, state,
-                                                                                       location,
-                                                                                       nir_intrinsic_component(intr),
-                                                                                       intr->src[0].ssa);
-                       }
-
-                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
-                       break;
-               }
-
-               case nir_intrinsic_store_output: {
-                       // src[] = { value, offset }.
-
-                       /* write patch output to bo */
-
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       /* sparse writemask not supported */
-                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
-
-                       gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
-                       if (is_tess_levels(location)) {
-                               /* with tess levels are defined as float[4] and float[2],
-                                * but tess factor BO has smaller sizes for tris/isolines,
-                                * so we have to discard any writes beyond the number of
-                                * components for inner/outer levels */
-                               uint32_t inner_levels, outer_levels, levels;
-                               tess_level_components(state, &inner_levels, &outer_levels);
-
-                               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
-                                       levels = outer_levels;
-                               else
-                                       levels = inner_levels;
-
-                               assert(intr->src[0].ssa->num_components == 1);
-
-                               nir_ssa_def *offset =
-                                       nir_iadd_imm(b, intr->src[1].ssa, nir_intrinsic_component(intr));
-
-                               nir_if *nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
-
-                               replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
-                                               intr->src[0].ssa,
-                                               nir_load_tess_factor_base_ir3(b),
-                                               nir_iadd(b, offset, build_tessfactor_base(b, location, state)));
-
-                               nir_pop_if(b, nif);
-                       } else {
-                               nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                               nir_ssa_def *offset = build_patch_offset(b, state, 
-                                                                                                                location,
-                                                                                                                nir_intrinsic_component(intr),
-                                                                                                                intr->src[1].ssa);
-
-                               debug_assert(nir_intrinsic_component(intr) == 0);
-
-                               replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
-                                               intr->src[0].ssa, address, offset);
-                       }
-                       break;
-               }
-
-               default:
-                       break;
-               }
-       }
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_per_vertex_output: {
+         // src[] = { vertex, offset }.
+
+         b->cursor = nir_before_instr(&intr->instr);
+
+         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+         nir_ssa_def *offset = build_per_vertex_offset(
+            b, state, intr->src[0].ssa,
+            nir_intrinsic_io_semantics(intr).location,
+            nir_intrinsic_component(intr), intr->src[1].ssa);
+
+         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
+                           offset, NULL);
+         break;
+      }
+
+      case nir_intrinsic_store_per_vertex_output: {
+         // src[] = { value, vertex, offset }.
+
+         b->cursor = nir_before_instr(&intr->instr);
+
+         /* sparse writemask not supported */
+         assert(
+            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+         nir_ssa_def *value = intr->src[0].ssa;
+         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+         nir_ssa_def *offset = build_per_vertex_offset(
+            b, state, intr->src[1].ssa,
+            nir_intrinsic_io_semantics(intr).location,
+            nir_intrinsic_component(intr), intr->src[2].ssa);
+
+         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
+                           address, offset);
+
+         break;
+      }
+
+      case nir_intrinsic_load_output: {
+         // src[] = { offset }.
+
+         b->cursor = nir_before_instr(&intr->instr);
+
+         nir_ssa_def *address, *offset;
+
+         /* note if vectorization of the tess level loads ever happens:
+          * "ldg" across 16-byte boundaries can behave incorrectly if results
+          * are never used. most likely some issue with (sy) not properly
+          * syncing with values coming from a second memory transaction.
+          */
+         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
+         if (is_tess_levels(location)) {
+            assert(intr->dest.ssa.num_components == 1);
+            address = nir_load_tess_factor_base_ir3(b);
+            offset = build_tessfactor_base(b, location, state);
+         } else {
+            address = nir_load_tess_param_base_ir3(b);
+            offset = build_patch_offset(b, state, location,
+                                        nir_intrinsic_component(intr),
+                                        intr->src[0].ssa);
+         }
+
+         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
+                           offset, NULL);
+         break;
+      }
+
+      case nir_intrinsic_store_output: {
+         // src[] = { value, offset }.
+
+         /* write patch output to bo */
+
+         b->cursor = nir_before_instr(&intr->instr);
+
+         /* sparse writemask not supported */
+         assert(
+            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
+         if (is_tess_levels(location)) {
+            /* tess levels are defined as float[4] and float[2], but the
+             * tess factor BO has smaller sizes for tris/isolines, so we
+             * have to discard any writes beyond the number of components
+             * for inner/outer levels */
+            uint32_t inner_levels, outer_levels, levels;
+            tess_level_components(state, &inner_levels, &outer_levels);
+
+            if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
+               levels = outer_levels;
+            else
+               levels = inner_levels;
+
+            assert(intr->src[0].ssa->num_components == 1);
+
+            nir_ssa_def *offset =
+               nir_iadd_imm(b, intr->src[1].ssa, nir_intrinsic_component(intr));
+
+            nir_if *nif =
+               nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
+
+            replace_intrinsic(
+               b, intr, nir_intrinsic_store_global_ir3, intr->src[0].ssa,
+               nir_load_tess_factor_base_ir3(b),
+               nir_iadd(b, offset, build_tessfactor_base(b, location, state)));
+
+            nir_pop_if(b, nif);
+         } else {
+            nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+            nir_ssa_def *offset = build_patch_offset(
+               b, state, location, nir_intrinsic_component(intr),
+               intr->src[1].ssa);
+
+            debug_assert(nir_intrinsic_component(intr) == 0);
+
+            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+                              intr->src[0].ssa, address, offset);
+         }
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
 }
 
 static void
 emit_tess_epilouge(nir_builder *b, struct state *state)
 {
-       /* Insert endpatch instruction:
-        *
-        * TODO we should re-work this to use normal flow control.
-        */
+   /* Insert endpatch instruction:
+    *
+    * TODO we should re-work this to use normal flow control.
+    */
 
-       nir_end_patch_ir3(b);
+   nir_end_patch_ir3(b);
 }
 
 void
 ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
-               unsigned topology)
+                        unsigned topology)
 {
-       struct state state = { .topology = topology };
+   struct state state = {.topology = topology};
 
-       if (shader_debug_enabled(shader->info.stage)) {
-               mesa_logi("NIR (before tess lowering) for %s shader:",
-                                 _mesa_shader_stage_to_string(shader->info.stage));
-               nir_log_shaderi(shader);
-       }
+   if (shader_debug_enabled(shader->info.stage)) {
+      mesa_logi("NIR (before tess lowering) for %s shader:",
+                _mesa_shader_stage_to_string(shader->info.stage));
+      nir_log_shaderi(shader);
+   }
 
-       build_primitive_map(shader, &state.map);
-       memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
-       v->output_size = state.map.stride;
+   build_primitive_map(shader, &state.map);
+   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
+   v->output_size = state.map.stride;
 
-       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
-       assert(impl);
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   assert(impl);
 
-       nir_builder b;
-       nir_builder_init(&b, impl);
-       b.cursor = nir_before_cf_list(&impl->body);
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   b.cursor = nir_before_cf_list(&impl->body);
 
-       state.header = nir_load_tcs_header_ir3(&b);
+   state.header = nir_load_tcs_header_ir3(&b);
 
-       nir_foreach_block_safe (block, impl)
-               lower_tess_ctrl_block(block, &b, &state);
+   nir_foreach_block_safe (block, impl)
+      lower_tess_ctrl_block(block, &b, &state);
 
-       /* Now move the body of the TCS into a conditional:
-        *
-        *   if (gl_InvocationID < num_vertices)
-        *     // body
-        *
-        */
+   /* Now move the body of the TCS into a conditional:
+    *
+    *   if (gl_InvocationID < num_vertices)
+    *     // body
+    *
+    */
 
-       nir_cf_list body;
-       nir_cf_extract(&body, nir_before_cf_list(&impl->body),
-                                  nir_after_cf_list(&impl->body));
+   nir_cf_list body;
+   nir_cf_extract(&body, nir_before_cf_list(&impl->body),
+                  nir_after_cf_list(&impl->body));
 
-       b.cursor = nir_after_cf_list(&impl->body);
+   b.cursor = nir_after_cf_list(&impl->body);
 
-       /* Re-emit the header, since the old one got moved into the if branch */
-       state.header = nir_load_tcs_header_ir3(&b);
-       nir_ssa_def *iid = build_invocation_id(&b, &state);
+   /* Re-emit the header, since the old one got moved into the if branch */
+   state.header = nir_load_tcs_header_ir3(&b);
+   nir_ssa_def *iid = build_invocation_id(&b, &state);
 
-       const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
-       nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));
+   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
+   nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));
 
-       nir_if *nif = nir_push_if(&b, cond);
+   nir_if *nif = nir_push_if(&b, cond);
 
-       nir_cf_reinsert(&body, b.cursor);
+   nir_cf_reinsert(&body, b.cursor);
 
-       b.cursor = nir_after_cf_list(&nif->then_list);
+   b.cursor = nir_after_cf_list(&nif->then_list);
 
-       /* Insert conditional exit for threads invocation id != 0 */
-       nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
-       nir_cond_end_ir3(&b, iid0_cond);
+   /* Insert conditional exit for threads invocation id != 0 */
+   nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
+   nir_cond_end_ir3(&b, iid0_cond);
 
-       emit_tess_epilouge(&b, &state);
+   emit_tess_epilouge(&b, &state);
 
-       nir_pop_if(&b, nif);
+   nir_pop_if(&b, nif);
 
-       nir_metadata_preserve(impl, 0);
+   nir_metadata_preserve(impl, 0);
 }
 
-
 static void
 lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
 {
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
-
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-               switch (intr->intrinsic) {
-               case nir_intrinsic_load_tess_coord: {
-                       b->cursor = nir_after_instr(&intr->instr);
-                       nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
-                       nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
-                       nir_ssa_def *z;
-
-                       if (state->topology == IR3_TESS_TRIANGLES)
-                               z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
-                       else
-                               z = nir_imm_float(b, 0.0f);
-
-                       nir_ssa_def *coord = nir_vec3(b, x, y, z);
-
-                       nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
-                                       coord,
-                                       b->cursor.instr);
-                       break;
-               }
-
-               case nir_intrinsic_load_per_vertex_input: {
-                       // src[] = { vertex, offset }.
-
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_ssa_def *offset = build_per_vertex_offset(b, state,
-                                       intr->src[0].ssa,
-                                       nir_intrinsic_io_semantics(intr).location,
-                                       nir_intrinsic_component(intr),
-                                       intr->src[1].ssa);
-
-                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
-                       break;
-               }
-
-               case nir_intrinsic_load_input: {
-                       // src[] = { offset }.
-
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       nir_ssa_def *address, *offset;
-
-                       /* note if vectorization of the tess level loads ever happens:
-                        * "ldg" across 16-byte boundaries can behave incorrectly if results
-                        * are never used. most likely some issue with (sy) not properly
-                        * syncing with values coming from a second memory transaction.
-                        */
-                       gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
-                       if (is_tess_levels(location)) {
-                               assert(intr->dest.ssa.num_components == 1);
-                               address = nir_load_tess_factor_base_ir3(b);
-                               offset = build_tessfactor_base(b, location, state);
-                       } else {
-                               address = nir_load_tess_param_base_ir3(b);
-                               offset = build_patch_offset(b, state,
-                                                                                       location,
-                                                                                       nir_intrinsic_component(intr),
-                                                                                       intr->src[0].ssa);
-                       }
-
-                       offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr)));
-
-                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
-                       break;
-               }
-
-               default:
-                       break;
-               }
-       }
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_tess_coord: {
+         b->cursor = nir_after_instr(&intr->instr);
+         nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
+         nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
+         nir_ssa_def *z;
+
+         if (state->topology == IR3_TESS_TRIANGLES)
+            z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
+         else
+            z = nir_imm_float(b, 0.0f);
+
+         nir_ssa_def *coord = nir_vec3(b, x, y, z);
+
+         nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, coord,
+                                        b->cursor.instr);
+         break;
+      }
+
+      case nir_intrinsic_load_per_vertex_input: {
+         // src[] = { vertex, offset }.
+
+         b->cursor = nir_before_instr(&intr->instr);
+
+         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+         nir_ssa_def *offset = build_per_vertex_offset(
+            b, state, intr->src[0].ssa,
+            nir_intrinsic_io_semantics(intr).location,
+            nir_intrinsic_component(intr), intr->src[1].ssa);
+
+         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
+                           offset, NULL);
+         break;
+      }
+
+      case nir_intrinsic_load_input: {
+         // src[] = { offset }.
+
+         b->cursor = nir_before_instr(&intr->instr);
+
+         nir_ssa_def *address, *offset;
+
+         /* note if vectorization of the tess level loads ever happens:
+          * "ldg" across 16-byte boundaries can behave incorrectly if results
+          * are never used. most likely some issue with (sy) not properly
+          * syncing with values coming from a second memory transaction.
+          */
+         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
+         if (is_tess_levels(location)) {
+            assert(intr->dest.ssa.num_components == 1);
+            address = nir_load_tess_factor_base_ir3(b);
+            offset = build_tessfactor_base(b, location, state);
+         } else {
+            address = nir_load_tess_param_base_ir3(b);
+            offset = build_patch_offset(b, state, location,
+                                        nir_intrinsic_component(intr),
+                                        intr->src[0].ssa);
+         }
+
+         offset =
+            nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr)));
+
+         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
+                           offset, NULL);
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
 }
 
 void
-ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology)
+ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
+                        unsigned topology)
 {
-       struct state state = { .topology = topology };
+   struct state state = {.topology = topology};
 
-       if (shader_debug_enabled(shader->info.stage)) {
-               mesa_logi("NIR (before tess lowering) for %s shader:",
-                                 _mesa_shader_stage_to_string(shader->info.stage));
-               nir_log_shaderi(shader);
-       }
+   if (shader_debug_enabled(shader->info.stage)) {
+      mesa_logi("NIR (before tess lowering) for %s shader:",
+                _mesa_shader_stage_to_string(shader->info.stage));
+      nir_log_shaderi(shader);
+   }
 
-       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
-       assert(impl);
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   assert(impl);
 
-       nir_builder b;
-       nir_builder_init(&b, impl);
+   nir_builder b;
+   nir_builder_init(&b, impl);
 
-       nir_foreach_block_safe (block, impl)
-               lower_tess_eval_block(block, &b, &state);
+   nir_foreach_block_safe (block, impl)
+      lower_tess_eval_block(block, &b, &state);
 
-       v->input_size = calc_primitive_map_size(shader);
+   v->input_size = calc_primitive_map_size(shader);
 
-       nir_metadata_preserve(impl, 0);
+   nir_metadata_preserve(impl, 0);
 }
 
 static void
 lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
 {
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
-
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-               switch (intr->intrinsic) {
-               case nir_intrinsic_end_primitive: {
-                       /* Note: This ignores the stream, which seems to match the blob
-                        * behavior. I'm guessing the HW ignores any extraneous cut
-                        * signals from an EndPrimitive() that doesn't correspond to the
-                        * rasterized stream.
-                        */
-                       b->cursor = nir_before_instr(&intr->instr);
-                       nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
-                       nir_instr_remove(&intr->instr);
-                       break;
-               }
-
-               case nir_intrinsic_emit_vertex: {
-                       /* Load the vertex count */
-                       b->cursor = nir_before_instr(&intr->instr);
-                       nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
-
-                       nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));
-
-                       unsigned stream = nir_intrinsic_stream_id(intr);
-                       /* vertex_flags_out |= stream */
-                       nir_store_var(b, state->vertex_flags_out,
-                                                 nir_ior(b, nir_load_var(b, state->vertex_flags_out),
-                                                                 nir_imm_int(b, stream)), 0x1 /* .x */);
-
-                       foreach_two_lists(dest_node, &state->emit_outputs, src_node, &state->old_outputs) {
-                               nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
-                               nir_variable *src = exec_node_data(nir_variable, src_node, node);
-                               nir_copy_var(b, dest, src);
-                       }
-
-                       nir_instr_remove(&intr->instr);
-
-                       nir_store_var(b, state->emitted_vertex_var,
-                                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var), nir_imm_int(b, 1)), 0x1);
-
-                       nir_pop_if(b, NULL);
-
-                       /* Increment the vertex count by 1 */
-                       nir_store_var(b, state->vertex_count_var,
-                                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
-                       nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);
-
-                       break;
-               }
-
-               default:
-                       break;
-               }
-       }
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_end_primitive: {
+         /* Note: This ignores the stream, which seems to match the blob
+          * behavior. I'm guessing the HW ignores any extraneous cut
+          * signals from an EndPrimitive() that doesn't correspond to the
+          * rasterized stream.
+          */
+         b->cursor = nir_before_instr(&intr->instr);
+         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
+         nir_instr_remove(&intr->instr);
+         break;
+      }
+
+      case nir_intrinsic_emit_vertex: {
+         /* Load the vertex count */
+         b->cursor = nir_before_instr(&intr->instr);
+         nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
+
+         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));
+
+         unsigned stream = nir_intrinsic_stream_id(intr);
+         /* vertex_flags_out |= stream */
+         nir_store_var(b, state->vertex_flags_out,
+                       nir_ior(b, nir_load_var(b, state->vertex_flags_out),
+                               nir_imm_int(b, stream)),
+                       0x1 /* .x */);
+
+         foreach_two_lists (dest_node, &state->emit_outputs, src_node,
+                            &state->old_outputs) {
+            nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
+            nir_variable *src = exec_node_data(nir_variable, src_node, node);
+            nir_copy_var(b, dest, src);
+         }
+
+         nir_instr_remove(&intr->instr);
+
+         nir_store_var(b, state->emitted_vertex_var,
+                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var),
+                                nir_imm_int(b, 1)),
+                       0x1);
+
+         nir_pop_if(b, NULL);
+
+         /* Increment the vertex count by 1 */
+         nir_store_var(b, state->vertex_count_var,
+                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
+         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);
+
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
 }
 
 void
 ir3_nir_lower_gs(nir_shader *shader)
 {
-       struct state state = { };
-
-       if (shader_debug_enabled(shader->info.stage)) {
-               mesa_logi("NIR (before gs lowering):");
-               nir_log_shaderi(shader);
-       }
-
-       /* Create an output var for vertex_flags. This will be shadowed below,
-        * same way regular outputs get shadowed, and this variable will become a
-        * temporary.
-        */
-       state.vertex_flags_out = nir_variable_create(shader, nir_var_shader_out,
-                       glsl_uint_type(), "vertex_flags");
-       state.vertex_flags_out->data.driver_location = shader->num_outputs++;
-       state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
-       state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;
-
-       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
-       assert(impl);
-
-       nir_builder b;
-       nir_builder_init(&b, impl);
-       b.cursor = nir_before_cf_list(&impl->body);
-
-       state.header = nir_load_gs_header_ir3(&b);
-
-       /* Generate two set of shadow vars for the output variables.  The first
-        * set replaces the real outputs and the second set (emit_outputs) we'll
-        * assign in the emit_vertex conditionals.  Then at the end of the shader
-        * we copy the emit_outputs to the real outputs, so that we get
-        * store_output in uniform control flow.
-        */
-       exec_list_make_empty(&state.old_outputs);
-       nir_foreach_shader_out_variable_safe(var, shader) {
-               exec_node_remove(&var->node);
-               exec_list_push_tail(&state.old_outputs, &var->node);
-       }
-       exec_list_make_empty(&state.new_outputs);
-       exec_list_make_empty(&state.emit_outputs);
-       nir_foreach_variable_in_list(var, &state.old_outputs) {
-               /* Create a new output var by cloning the original output var and
-                * stealing the name.
-                */
-               nir_variable *output = nir_variable_clone(var, shader);
-               exec_list_push_tail(&state.new_outputs, &output->node);
-
-               /* Rewrite the original output to be a shadow variable. */
-               var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
-               var->data.mode = nir_var_shader_temp;
-
-               /* Clone the shadow variable to create the emit shadow variable that
-                * we'll assign in the emit conditionals.
-                */
-               nir_variable *emit_output = nir_variable_clone(var, shader);
-               emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
-               exec_list_push_tail(&state.emit_outputs, &emit_output->node);
-       }
-
-       /* During the shader we'll keep track of which vertex we're currently
-        * emitting for the EmitVertex test and how many vertices we emitted so we
-        * know to discard if didn't emit any.  In most simple shaders, this can
-        * all be statically determined and gets optimized away.
-        */
-       state.vertex_count_var =
-               nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
-       state.emitted_vertex_var =
-               nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");
-
-       /* Initialize to 0. */
-       b.cursor = nir_before_cf_list(&impl->body);
-       nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
-       nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
-       nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
-
-       nir_foreach_block_safe (block, impl)
-               lower_gs_block(block, &b, &state);
-
-       set_foreach(impl->end_block->predecessors, block_entry) {
-               struct nir_block *block = (void *)block_entry->key;
-               b.cursor = nir_after_block_before_jump(block);
-
-               nir_ssa_def *cond = nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
-
-               nir_discard_if(&b, cond);
-
-               foreach_two_lists(dest_node, &state.new_outputs, src_node, &state.emit_outputs) {
-                       nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
-                       nir_variable *src = exec_node_data(nir_variable, src_node, node);
-                       nir_copy_var(&b, dest, src);
-               }
-       }
-
-       exec_list_append(&shader->variables, &state.old_outputs);
-       exec_list_append(&shader->variables, &state.emit_outputs);
-       exec_list_append(&shader->variables, &state.new_outputs);
-
-       nir_metadata_preserve(impl, 0);
-
-       nir_lower_global_vars_to_local(shader);
-       nir_split_var_copies(shader);
-       nir_lower_var_copies(shader);
-
-       nir_fixup_deref_modes(shader);
-
-       if (shader_debug_enabled(shader->info.stage)) {
-               mesa_logi("NIR (after gs lowering):");
-               nir_log_shaderi(shader);
-       }
+   struct state state = {};
+
+   if (shader_debug_enabled(shader->info.stage)) {
+      mesa_logi("NIR (before gs lowering):");
+      nir_log_shaderi(shader);
+   }
+
+   /* Create an output var for vertex_flags. This will be shadowed below,
+    * same way regular outputs get shadowed, and this variable will become a
+    * temporary.
+    */
+   state.vertex_flags_out = nir_variable_create(
+      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
+   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
+   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
+   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   assert(impl);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   b.cursor = nir_before_cf_list(&impl->body);
+
+   state.header = nir_load_gs_header_ir3(&b);
+
+   /* Generate two sets of shadow vars for the output variables.  The first
+    * set replaces the real outputs and the second set (emit_outputs) we'll
+    * assign in the emit_vertex conditionals.  Then at the end of the shader
+    * we copy the emit_outputs to the real outputs, so that we get
+    * store_output in uniform control flow.
+    */
+   exec_list_make_empty(&state.old_outputs);
+   nir_foreach_shader_out_variable_safe (var, shader) {
+      exec_node_remove(&var->node);
+      exec_list_push_tail(&state.old_outputs, &var->node);
+   }
+   exec_list_make_empty(&state.new_outputs);
+   exec_list_make_empty(&state.emit_outputs);
+   nir_foreach_variable_in_list (var, &state.old_outputs) {
+      /* Create a new output var by cloning the original output var and
+       * stealing the name.
+       */
+      nir_variable *output = nir_variable_clone(var, shader);
+      exec_list_push_tail(&state.new_outputs, &output->node);
+
+      /* Rewrite the original output to be a shadow variable. */
+      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
+      var->data.mode = nir_var_shader_temp;
+
+      /* Clone the shadow variable to create the emit shadow variable that
+       * we'll assign in the emit conditionals.
+       */
+      nir_variable *emit_output = nir_variable_clone(var, shader);
+      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
+      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
+   }
+
+   /* During the shader we'll keep track of which vertex we're currently
+    * emitting for the EmitVertex test and how many vertices we emitted so we
+    * know to discard if we didn't emit any.  In most simple shaders, this can
+    * all be statically determined and gets optimized away.
+    */
+   state.vertex_count_var =
+      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
+   state.emitted_vertex_var =
+      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");
+
+   /* Initialize to 0. */
+   b.cursor = nir_before_cf_list(&impl->body);
+   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
+   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
+   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
+
+   nir_foreach_block_safe (block, impl)
+      lower_gs_block(block, &b, &state);
+
+   set_foreach (impl->end_block->predecessors, block_entry) {
+      struct nir_block *block = (void *)block_entry->key;
+      b.cursor = nir_after_block_before_jump(block);
+
+      nir_ssa_def *cond =
+         nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
+
+      nir_discard_if(&b, cond);
+
+      foreach_two_lists (dest_node, &state.new_outputs, src_node,
+                         &state.emit_outputs) {
+         nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
+         nir_variable *src = exec_node_data(nir_variable, src_node, node);
+         nir_copy_var(&b, dest, src);
+      }
+   }
+
+   exec_list_append(&shader->variables, &state.old_outputs);
+   exec_list_append(&shader->variables, &state.emit_outputs);
+   exec_list_append(&shader->variables, &state.new_outputs);
+
+   nir_metadata_preserve(impl, 0);
+
+   nir_lower_global_vars_to_local(shader);
+   nir_split_var_copies(shader);
+   nir_lower_var_copies(shader);
+
+   nir_fixup_deref_modes(shader);
+
+   if (shader_debug_enabled(shader->info.stage)) {
+      mesa_logi("NIR (after gs lowering):");
+      nir_log_shaderi(shader);
+   }
 }
-
index 8bcccfe..a8691f5 100644 (file)
 static int
 coord_offset(nir_ssa_def *ssa)
 {
-       nir_instr *parent_instr = ssa->parent_instr;
+   nir_instr *parent_instr = ssa->parent_instr;
 
-       /* The coordinate of a texture sampling instruction eligible for
-        * pre-fetch is either going to be a load_interpolated_input/
-        * load_input, or a vec2 assembling non-swizzled components of
-        * a load_interpolated_input/load_input (due to varying packing)
-        */
+   /* The coordinate of a texture sampling instruction eligible for
+    * pre-fetch is either going to be a load_interpolated_input/
+    * load_input, or a vec2 assembling non-swizzled components of
+    * a load_interpolated_input/load_input (due to varying packing)
+    */
 
-       if (parent_instr->type == nir_instr_type_alu) {
-               nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
+   if (parent_instr->type == nir_instr_type_alu) {
+      nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
 
-               if (alu->op != nir_op_vec2)
-                       return -1;
+      if (alu->op != nir_op_vec2)
+         return -1;
 
-               if (!alu->src[0].src.is_ssa)
-                       return -1;
+      if (!alu->src[0].src.is_ssa)
+         return -1;
 
-               int base_offset = coord_offset(alu->src[0].src.ssa) +
-                               alu->src[0].swizzle[0];
+      int base_offset =
+         coord_offset(alu->src[0].src.ssa) + alu->src[0].swizzle[0];
 
-               /* NOTE it might be possible to support more than 2D? */
-               for (int i = 1; i < 2; i++) {
-                       if (!alu->src[i].src.is_ssa)
-                               return -1;
+      /* NOTE it might be possible to support more than 2D? */
+      for (int i = 1; i < 2; i++) {
+         if (!alu->src[i].src.is_ssa)
+            return -1;
 
-                       int nth_offset = coord_offset(alu->src[i].src.ssa) +
-                                       alu->src[i].swizzle[0];
+         int nth_offset =
+            coord_offset(alu->src[i].src.ssa) + alu->src[i].swizzle[0];
 
-                       if (nth_offset != (base_offset + i))
-                               return -1;
-               }
+         if (nth_offset != (base_offset + i))
+            return -1;
+      }
 
-               return base_offset;
-       }
+      return base_offset;
+   }
 
-       if (parent_instr->type != nir_instr_type_intrinsic)
-               return -1;
+   if (parent_instr->type != nir_instr_type_intrinsic)
+      return -1;
 
-       nir_intrinsic_instr *input = nir_instr_as_intrinsic(parent_instr);
+   nir_intrinsic_instr *input = nir_instr_as_intrinsic(parent_instr);
 
-       if (input->intrinsic != nir_intrinsic_load_interpolated_input)
-               return -1;
+   if (input->intrinsic != nir_intrinsic_load_interpolated_input)
+      return -1;
 
-       /* limit to load_barycentric_pixel, other interpolation modes don't seem
-        * to be supported:
-        */
-       if (!input->src[0].is_ssa)
-               return -1;
+   /* limit to load_barycentric_pixel, other interpolation modes don't seem
+    * to be supported:
+    */
+   if (!input->src[0].is_ssa)
+      return -1;
 
-       nir_intrinsic_instr *interp =
-               nir_instr_as_intrinsic(input->src[0].ssa->parent_instr);
+   nir_intrinsic_instr *interp =
+      nir_instr_as_intrinsic(input->src[0].ssa->parent_instr);
 
-       if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel)
-               return -1;
+   if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel)
+      return -1;
 
-       /* we also need a const input offset: */
-       if (!nir_src_is_const(input->src[1]))
-               return -1;
+   /* we also need a const input offset: */
+   if (!nir_src_is_const(input->src[1]))
+      return -1;
 
-       unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input);
-       unsigned comp = nir_intrinsic_component(input);
+   unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input);
+   unsigned comp = nir_intrinsic_component(input);
 
-       return (4 * base) + comp;
+   return (4 * base) + comp;
 }
 
 int
 ir3_nir_coord_offset(nir_ssa_def *ssa)
 {
 
-       assert (ssa->num_components == 2);
-       return coord_offset(ssa);
+   assert(ssa->num_components == 2);
+   return coord_offset(ssa);
 }
 
 static bool
 has_src(nir_tex_instr *tex, nir_tex_src_type type)
 {
-       return nir_tex_instr_src_index(tex, type) >= 0;
+   return nir_tex_instr_src_index(tex, type) >= 0;
 }
 
 static bool
 ok_bindless_src(nir_tex_instr *tex, nir_tex_src_type type)
 {
-       int idx = nir_tex_instr_src_index(tex, type);
-       assert(idx >= 0);
-       nir_intrinsic_instr *bindless = ir3_bindless_resource(tex->src[idx].src);
-
-       /* TODO from SP_FS_BINDLESS_PREFETCH[n] it looks like this limit should
-        * be 1<<8 ?
-        */
-       return nir_src_is_const(bindless->src[0]) &&
-                       (nir_src_as_uint(bindless->src[0]) < (1 << 16));
+   int idx = nir_tex_instr_src_index(tex, type);
+   assert(idx >= 0);
+   nir_intrinsic_instr *bindless = ir3_bindless_resource(tex->src[idx].src);
+
+   /* TODO from SP_FS_BINDLESS_PREFETCH[n] it looks like this limit should
+    * be 1<<8 ?
+    */
+   return nir_src_is_const(bindless->src[0]) &&
+          (nir_src_as_uint(bindless->src[0]) < (1 << 16));
 }
 
 /**
@@ -134,107 +134,103 @@ ok_bindless_src(nir_tex_instr *tex, nir_tex_src_type type)
 static bool
 ok_tex_samp(nir_tex_instr *tex)
 {
-       if (has_src(tex, nir_tex_src_texture_handle)) {
-               /* bindless case: */
+   if (has_src(tex, nir_tex_src_texture_handle)) {
+      /* bindless case: */
 
-               assert(has_src(tex, nir_tex_src_sampler_handle));
+      assert(has_src(tex, nir_tex_src_sampler_handle));
 
-               return ok_bindless_src(tex, nir_tex_src_texture_handle) &&
-                               ok_bindless_src(tex, nir_tex_src_sampler_handle);
-       } else {
-               assert(!has_src(tex, nir_tex_src_texture_offset));
-               assert(!has_src(tex, nir_tex_src_sampler_offset));
+      return ok_bindless_src(tex, nir_tex_src_texture_handle) &&
+             ok_bindless_src(tex, nir_tex_src_sampler_handle);
+   } else {
+      assert(!has_src(tex, nir_tex_src_texture_offset));
+      assert(!has_src(tex, nir_tex_src_sampler_offset));
 
-               return (tex->texture_index <= 0x1f) &&
-                               (tex->sampler_index <= 0xf);
-       }
+      return (tex->texture_index <= 0x1f) && (tex->sampler_index <= 0xf);
+   }
 }
 
 static bool
 lower_tex_prefetch_block(nir_block *block)
 {
-       bool progress = false;
-
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_tex)
-                       continue;
-
-               nir_tex_instr *tex = nir_instr_as_tex(instr);
-               if (tex->op != nir_texop_tex)
-                       continue;
-
-               if (has_src(tex, nir_tex_src_bias) ||
-                               has_src(tex, nir_tex_src_lod) ||
-                               has_src(tex, nir_tex_src_comparator) ||
-                               has_src(tex, nir_tex_src_projector) ||
-                               has_src(tex, nir_tex_src_offset) ||
-                               has_src(tex, nir_tex_src_ddx) ||
-                               has_src(tex, nir_tex_src_ddy) ||
-                               has_src(tex, nir_tex_src_ms_index) ||
-                               has_src(tex, nir_tex_src_texture_offset) ||
-                               has_src(tex, nir_tex_src_sampler_offset))
-                       continue;
-
-               /* only prefetch for simple 2d tex fetch case */
-               if (tex->sampler_dim != GLSL_SAMPLER_DIM_2D || tex->is_array)
-                       continue;
-
-               if (!ok_tex_samp(tex))
-                       continue;
-
-               int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
-               /* First source should be the sampling coordinate. */
-               nir_tex_src *coord = &tex->src[idx];
-               debug_assert(coord->src.is_ssa);
-
-               if (ir3_nir_coord_offset(coord->src.ssa) >= 0) {
-                       tex->op = nir_texop_tex_prefetch;
-
-                       progress |= true;
-               }
-       }
-
-       return progress;
+   bool progress = false;
+
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_tex)
+         continue;
+
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+      if (tex->op != nir_texop_tex)
+         continue;
+
+      if (has_src(tex, nir_tex_src_bias) || has_src(tex, nir_tex_src_lod) ||
+          has_src(tex, nir_tex_src_comparator) ||
+          has_src(tex, nir_tex_src_projector) ||
+          has_src(tex, nir_tex_src_offset) || has_src(tex, nir_tex_src_ddx) ||
+          has_src(tex, nir_tex_src_ddy) || has_src(tex, nir_tex_src_ms_index) ||
+          has_src(tex, nir_tex_src_texture_offset) ||
+          has_src(tex, nir_tex_src_sampler_offset))
+         continue;
+
+      /* only prefetch for simple 2d tex fetch case */
+      if (tex->sampler_dim != GLSL_SAMPLER_DIM_2D || tex->is_array)
+         continue;
+
+      if (!ok_tex_samp(tex))
+         continue;
+
+      int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
+      /* First source should be the sampling coordinate. */
+      nir_tex_src *coord = &tex->src[idx];
+      debug_assert(coord->src.is_ssa);
+
+      if (ir3_nir_coord_offset(coord->src.ssa) >= 0) {
+         tex->op = nir_texop_tex_prefetch;
+
+         progress |= true;
+      }
+   }
+
+   return progress;
 }
 
 static bool
 lower_tex_prefetch_func(nir_function_impl *impl)
 {
-       /* Only instructions in the the outer-most block are considered
-        * eligible for pre-dispatch, because they need to be move-able
-        * to the beginning of the shader to avoid locking down the
-        * register holding the pre-fetched result for too long.
-        */
-       nir_block *block = nir_start_block(impl);
-       if (!block)
-               return false;
-
-       bool progress = lower_tex_prefetch_block(block);
-
-       if (progress) {
-               nir_metadata_preserve(impl, nir_metadata_block_index |
-                               nir_metadata_dominance);
-       }
-
-       return progress;
+   /* Only instructions in the outer-most block are considered
+    * eligible for pre-dispatch, because they need to be move-able
+    * to the beginning of the shader to avoid locking down the
+    * register holding the pre-fetched result for too long.
+    */
+   nir_block *block = nir_start_block(impl);
+   if (!block)
+      return false;
+
+   bool progress = lower_tex_prefetch_block(block);
+
+   if (progress) {
+      nir_metadata_preserve(impl,
+                            nir_metadata_block_index | nir_metadata_dominance);
+   }
+
+   return progress;
 }
 
 bool
 ir3_nir_lower_tex_prefetch(nir_shader *shader)
 {
-       bool progress = false;
+   bool progress = false;
 
-       assert(shader->info.stage == MESA_SHADER_FRAGMENT);
+   assert(shader->info.stage == MESA_SHADER_FRAGMENT);
 
-       nir_foreach_function (function, shader) {
-               /* Only texture sampling instructions inside the main function
-                * are eligible for pre-dispatch.
-                */
-               if (!function->impl || !function->is_entrypoint)
-                       continue;
+   nir_foreach_function (function, shader) {
+      /* Only texture sampling instructions inside the main function
+       * are eligible for pre-dispatch.
+       */
+      if (!function->impl || !function->is_entrypoint)
+         continue;
 
-               progress |= lower_tex_prefetch_func(function->impl);
-       }
+      progress |= lower_tex_prefetch_func(function->impl);
+   }
 
-       return progress;
+   return progress;
 }
index 6650a28..4ca9aaa 100644 (file)
@@ -21,8 +21,8 @@
  * IN THE SOFTWARE.
  */
 
-#include "ir3_nir.h"
 #include "compiler/nir/nir_builder.h"
+#include "ir3_nir.h"
 
 /* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
  * gather results, rather than before. As a result, it must be emulated with
 static nir_ssa_def *
 ir3_nir_lower_tg4_to_tex_instr(nir_builder *b, nir_instr *instr, void *data)
 {
-       nir_tex_instr *tg4 = nir_instr_as_tex(instr);
-       static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
+   nir_tex_instr *tg4 = nir_instr_as_tex(instr);
+   static const int offsets[3][2] = {{0, 1}, {1, 1}, {1, 0}};
 
-       nir_ssa_def *results[4];
-       int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
-       for (int i = 0; i < 4; i++) {
-               int num_srcs = tg4->num_srcs + 1 /* lod */;
-               if (offset_index < 0 && i < 3)
-                       num_srcs++;
+   nir_ssa_def *results[4];
+   int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
+   for (int i = 0; i < 4; i++) {
+      int num_srcs = tg4->num_srcs + 1 /* lod */;
+      if (offset_index < 0 && i < 3)
+         num_srcs++;
 
-               nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
-               tex->op = nir_texop_txl;
-               tex->sampler_dim = tg4->sampler_dim;
-               tex->coord_components = tg4->coord_components;
-               tex->is_array = tg4->is_array;
-               tex->is_shadow = tg4->is_shadow;
-               tex->is_new_style_shadow = tg4->is_new_style_shadow;
-               tex->texture_index = tg4->texture_index;
-               tex->sampler_index = tg4->sampler_index;
-               tex->dest_type = tg4->dest_type;
+      nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
+      tex->op = nir_texop_txl;
+      tex->sampler_dim = tg4->sampler_dim;
+      tex->coord_components = tg4->coord_components;
+      tex->is_array = tg4->is_array;
+      tex->is_shadow = tg4->is_shadow;
+      tex->is_new_style_shadow = tg4->is_new_style_shadow;
+      tex->texture_index = tg4->texture_index;
+      tex->sampler_index = tg4->sampler_index;
+      tex->dest_type = tg4->dest_type;
 
-               for (int j = 0; j < tg4->num_srcs; j++) {
-                       nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
-                       tex->src[j].src_type = tg4->src[j].src_type;
-               }
-               if (i != 3) {
-                       nir_ssa_def *offset =
-                               nir_vec2(b, nir_imm_int(b, offsets[i][0]),
-                                               nir_imm_int(b, offsets[i][1]));
-                       if (offset_index < 0) {
-                               tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
-                               tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
-                       } else {
-                               assert(nir_tex_instr_src_size(tex, offset_index) == 2);
-                               nir_ssa_def *orig = nir_ssa_for_src(
-                                       b, tex->src[offset_index].src, 2);
-                               tex->src[offset_index].src =
-                                       nir_src_for_ssa(nir_iadd(b, orig, offset));
-                       }
-               }
-               tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
-               tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
+      for (int j = 0; j < tg4->num_srcs; j++) {
+         nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
+         tex->src[j].src_type = tg4->src[j].src_type;
+      }
+      if (i != 3) {
+         nir_ssa_def *offset = nir_vec2(b, nir_imm_int(b, offsets[i][0]),
+                                        nir_imm_int(b, offsets[i][1]));
+         if (offset_index < 0) {
+            tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
+            tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
+         } else {
+            assert(nir_tex_instr_src_size(tex, offset_index) == 2);
+            nir_ssa_def *orig =
+               nir_ssa_for_src(b, tex->src[offset_index].src, 2);
+            tex->src[offset_index].src =
+               nir_src_for_ssa(nir_iadd(b, orig, offset));
+         }
+      }
+      tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
+      tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
 
-               nir_ssa_dest_init(&tex->instr, &tex->dest,
-                               nir_tex_instr_dest_size(tex), 32, NULL);
-               nir_builder_instr_insert(b, &tex->instr);
+      nir_ssa_dest_init(&tex->instr, &tex->dest, nir_tex_instr_dest_size(tex),
+                        32, NULL);
+      nir_builder_instr_insert(b, &tex->instr);
 
-               results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
-       }
+      results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
+   }
 
-       return nir_vec(b, results, 4);
+   return nir_vec(b, results, 4);
 }
 
 static bool
 ir3_nir_lower_tg4_to_tex_filter(const nir_instr *instr, const void *data)
 {
-       return (instr->type == nir_instr_type_tex &&
-                       nir_instr_as_tex(instr)->op == nir_texop_tg4);
+   return (instr->type == nir_instr_type_tex &&
+           nir_instr_as_tex(instr)->op == nir_texop_tg4);
 }
 
 bool
 ir3_nir_lower_tg4_to_tex(nir_shader *shader)
 {
-       return nir_shader_lower_instructions(shader,
-                       ir3_nir_lower_tg4_to_tex_filter,
-                       ir3_nir_lower_tg4_to_tex_instr, NULL);
+   return nir_shader_lower_instructions(shader, ir3_nir_lower_tg4_to_tex_filter,
+                                        ir3_nir_lower_tg4_to_tex_instr, NULL);
 }
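
An illustrative aside (not part of the patch): the offsets table above assigns a texel offset to gather components 0..2, while component 3 (i == 3) reuses the original, un-offset coordinate. A tiny stand-alone sketch of that mapping:

#include <stdio.h>

int
main(void)
{
   /* Same table as in ir3_nir_lower_tg4_to_tex_instr(); i == 3 gets no
    * offset added to its txl fetch.
    */
   static const int offsets[3][2] = {{0, 1}, {1, 1}, {1, 0}};

   for (int i = 0; i < 4; i++) {
      int dx = (i != 3) ? offsets[i][0] : 0;
      int dy = (i != 3) ? offsets[i][1] : 0;
      printf("gather component %d -> txl at coord + (%d, %d)\n", i, dx, dy);
   }
   return 0;
}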
index 9c4ec75..b1534f0 100644 (file)
@@ -21,8 +21,8 @@
  * IN THE SOFTWARE.
  */
 
-#include "ir3_nir.h"
 #include "compiler/nir/nir_builder.h"
+#include "ir3_nir.h"
 
 /**
  * This pass moves varying fetches (and the instructions they depend on
  */
 
 typedef struct {
-       nir_block *start_block;
-       bool precondition_failed;
+   nir_block *start_block;
+   bool precondition_failed;
 } precond_state;
 
 typedef struct {
-       nir_shader *shader;
-       nir_block *start_block;
+   nir_shader *shader;
+   nir_block *start_block;
 } state;
 
-
-
 static void check_precondition_instr(precond_state *state, nir_instr *instr);
 static void move_instruction_to_start_block(state *state, nir_instr *instr);
 
 static bool
 check_precondition_src(nir_src *src, void *state)
 {
-       check_precondition_instr(state, src->ssa->parent_instr);
-       return true;
+   check_precondition_instr(state, src->ssa->parent_instr);
+   return true;
 }
 
 /* Recursively check if there is even a single dependency which
@@ -73,163 +71,163 @@ check_precondition_src(nir_src *src, void *state)
 static void
 check_precondition_instr(precond_state *state, nir_instr *instr)
 {
-       if (instr->block == state->start_block)
-               return;
-
-       switch (instr->type) {
-               case nir_instr_type_alu:
-               case nir_instr_type_deref:
-               case nir_instr_type_load_const:
-               case nir_instr_type_ssa_undef:
-                       /* These could be safely moved around */
-                       break;
-               case nir_instr_type_intrinsic: {
-                       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-                       if (!nir_intrinsic_can_reorder(intr)) {
-                               state->precondition_failed = true;
-                               return;
-                       }
-                       break;
-               }
-               default:
-                       state->precondition_failed = true;
-                       return;
-       }
-
-       nir_foreach_src(instr, check_precondition_src, state);
+   if (instr->block == state->start_block)
+      return;
+
+   switch (instr->type) {
+   case nir_instr_type_alu:
+   case nir_instr_type_deref:
+   case nir_instr_type_load_const:
+   case nir_instr_type_ssa_undef:
+      /* These could be safely moved around */
+      break;
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+      if (!nir_intrinsic_can_reorder(intr)) {
+         state->precondition_failed = true;
+         return;
+      }
+      break;
+   }
+   default:
+      state->precondition_failed = true;
+      return;
+   }
+
+   nir_foreach_src(instr, check_precondition_src, state);
 }
 
 static void
 check_precondition_block(precond_state *state, nir_block *block)
 {
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
 
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-               switch (intr->intrinsic) {
-               case nir_intrinsic_load_interpolated_input:
-               case nir_intrinsic_load_input:
-                       break;
-               default:
-                       continue;
-               }
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_interpolated_input:
+      case nir_intrinsic_load_input:
+         break;
+      default:
+         continue;
+      }
 
-               check_precondition_instr(state, instr);
+      check_precondition_instr(state, instr);
 
-               if (state->precondition_failed)
-                       return;
-       }
+      if (state->precondition_failed)
+         return;
+   }
 }
 
 static bool
 move_src(nir_src *src, void *state)
 {
-       /* At this point we shouldn't have any non-ssa src: */
-       debug_assert(src->is_ssa);
-       move_instruction_to_start_block(state, src->ssa->parent_instr);
-       return true;
+   /* At this point we shouldn't have any non-ssa src: */
+   debug_assert(src->is_ssa);
+   move_instruction_to_start_block(state, src->ssa->parent_instr);
+   return true;
 }
 
 static void
 move_instruction_to_start_block(state *state, nir_instr *instr)
 {
-       /* nothing to do if the instruction is already in the start block */
-       if (instr->block == state->start_block)
-               return;
-
-       /* first move (recursively) all src's to ensure they appear before
-        * load*_input that we are trying to move:
-        */
-       nir_foreach_src(instr, move_src, state);
-
-       /* and then move the instruction itself:
-        */
-       exec_node_remove(&instr->node);
-       exec_list_push_tail(&state->start_block->instr_list, &instr->node);
-       instr->block = state->start_block;
+   /* nothing to do if the instruction is already in the start block */
+   if (instr->block == state->start_block)
+      return;
+
+   /* first move (recursively) all src's to ensure they appear before
+    * load*_input that we are trying to move:
+    */
+   nir_foreach_src(instr, move_src, state);
+
+   /* and then move the instruction itself:
+    */
+   exec_node_remove(&instr->node);
+   exec_list_push_tail(&state->start_block->instr_list, &instr->node);
+   instr->block = state->start_block;
 }
 
 static bool
 move_varying_inputs_block(state *state, nir_block *block)
 {
-       bool progress = false;
+   bool progress = false;
 
-       nir_foreach_instr_safe (instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                       continue;
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
 
-               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-               switch (intr->intrinsic) {
-               case nir_intrinsic_load_interpolated_input:
-               case nir_intrinsic_load_input:
-                       /* TODO any others to handle? */
-                       break;
-               default:
-                       continue;
-               }
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_interpolated_input:
+      case nir_intrinsic_load_input:
+         /* TODO any others to handle? */
+         break;
+      default:
+         continue;
+      }
 
-               debug_assert(intr->dest.is_ssa);
+      debug_assert(intr->dest.is_ssa);
 
-               move_instruction_to_start_block(state, instr);
+      move_instruction_to_start_block(state, instr);
 
-               progress = true;
-       }
+      progress = true;
+   }
 
-       return progress;
+   return progress;
 }
 
 bool
 ir3_nir_move_varying_inputs(nir_shader *shader)
 {
-       bool progress = false;
+   bool progress = false;
 
-       debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
+   debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
 
-       nir_foreach_function (function, shader) {
-               precond_state state;
+   nir_foreach_function (function, shader) {
+      precond_state state;
 
-               if (!function->impl)
-                       continue;
+      if (!function->impl)
+         continue;
 
-               state.precondition_failed = false;
-               state.start_block = nir_start_block(function->impl);
+      state.precondition_failed = false;
+      state.start_block = nir_start_block(function->impl);
 
-               nir_foreach_block (block, function->impl) {
-                       if (block == state.start_block)
-                               continue;
+      nir_foreach_block (block, function->impl) {
+         if (block == state.start_block)
+            continue;
 
-                       check_precondition_block(&state, block);
+         check_precondition_block(&state, block);
 
-                       if (state.precondition_failed)
-                               return false;
-               }
-       }
+         if (state.precondition_failed)
+            return false;
+      }
+   }
 
-       nir_foreach_function (function, shader) {
-               state state;
+   nir_foreach_function (function, shader) {
+      state state;
 
-               if (!function->impl)
-                       continue;
+      if (!function->impl)
+         continue;
 
-               state.shader = shader;
-               state.start_block = nir_start_block(function->impl);
+      state.shader = shader;
+      state.start_block = nir_start_block(function->impl);
 
-               bool progress = false;
-               nir_foreach_block (block, function->impl) {
-                       /* don't need to move anything that is already in the first block */
-                       if (block == state.start_block)
-                               continue;
-                       progress |= move_varying_inputs_block(&state, block);
-               }
+      bool progress = false;
+      nir_foreach_block (block, function->impl) {
+         /* don't need to move anything that is already in the first block */
+         if (block == state.start_block)
+            continue;
+         progress |= move_varying_inputs_block(&state, block);
+      }
 
-               if (progress) {
-                       nir_metadata_preserve(function->impl,
-                               nir_metadata_block_index | nir_metadata_dominance);
-               }
-       }
+      if (progress) {
+         nir_metadata_preserve(
+            function->impl, nir_metadata_block_index | nir_metadata_dominance);
+      }
+   }
 
-       return progress;
+   return progress;
 }
index 50c223b..b3a6a9d 100644 (file)
@@ -24,7 +24,6 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-
 #include "util/dag.h"
 #include "util/u_math.h"
 
 #else
 #define SCHED_DEBUG 0
 #endif
-#define d(fmt, ...) do { if (SCHED_DEBUG) { \
-       printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
-} } while (0)
-
-#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
-       printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
-       ir3_print_instr(instr); \
-} } while (0)
+#define d(fmt, ...)                                                            \
+   do {                                                                        \
+      if (SCHED_DEBUG) {                                                       \
+         printf("PSCHED: " fmt "\n", ##__VA_ARGS__);                           \
+      }                                                                        \
+   } while (0)
+
+#define di(instr, fmt, ...)                                                    \
+   do {                                                                        \
+      if (SCHED_DEBUG) {                                                       \
+         printf("PSCHED: " fmt ": ", ##__VA_ARGS__);                           \
+         ir3_print_instr(instr);                                               \
+      }                                                                        \
+   } while (0)
 
 /*
  * Post RA Instruction Scheduling
  */
 
 struct ir3_postsched_ctx {
-       struct ir3 *ir;
+   struct ir3 *ir;
 
-       struct ir3_shader_variant *v;
+   struct ir3_shader_variant *v;
 
-       void *mem_ctx;
-       struct ir3_block *block;           /* the current block */
-       struct dag *dag;
+   void *mem_ctx;
+   struct ir3_block *block; /* the current block */
+   struct dag *dag;
 
-       struct list_head unscheduled_list; /* unscheduled instructions */
+   struct list_head unscheduled_list; /* unscheduled instructions */
 
-       int sfu_delay;
-       int tex_delay;
+   int sfu_delay;
+   int tex_delay;
 };
 
 struct ir3_postsched_node {
-       struct dag_node dag;     /* must be first for util_dynarray_foreach */
-       struct ir3_instruction *instr;
-       bool partially_evaluated_path;
+   struct dag_node dag; /* must be first for util_dynarray_foreach */
+   struct ir3_instruction *instr;
+   bool partially_evaluated_path;
 
-       bool has_tex_src, has_sfu_src;
+   bool has_tex_src, has_sfu_src;
 
-       unsigned delay;
-       unsigned max_delay;
+   unsigned delay;
+   unsigned max_delay;
 };
 
-#define foreach_sched_node(__n, __list) \
-       list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)
+#define foreach_sched_node(__n, __list)                                        \
+   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)
 
 static bool
 has_tex_src(struct ir3_instruction *instr)
 {
-       struct ir3_postsched_node *node = instr->data;
-       return node->has_tex_src;
+   struct ir3_postsched_node *node = instr->data;
+   return node->has_tex_src;
 }
 
 static bool
 has_sfu_src(struct ir3_instruction *instr)
 {
-       struct ir3_postsched_node *node = instr->data;
-       return node->has_sfu_src;
+   struct ir3_postsched_node *node = instr->data;
+   return node->has_sfu_src;
 }
 
 static void
 schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 {
-       debug_assert(ctx->block == instr->block);
+   debug_assert(ctx->block == instr->block);
 
-       /* remove from unscheduled_list:
-        */
-       list_delinit(&instr->node);
+   /* remove from unscheduled_list:
+    */
+   list_delinit(&instr->node);
 
-       di(instr, "schedule");
+   di(instr, "schedule");
 
-       list_addtail(&instr->node, &instr->block->instr_list);
+   list_addtail(&instr->node, &instr->block->instr_list);
 
-       struct ir3_postsched_node *n = instr->data;
-       dag_prune_head(ctx->dag, &n->dag);
+   struct ir3_postsched_node *n = instr->data;
+   dag_prune_head(ctx->dag, &n->dag);
 
-       if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
-               return;
+   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
+      return;
 
-       if (is_sfu(instr)) {
-               ctx->sfu_delay = 8;
-       } else if (has_sfu_src(instr)) {
-               ctx->sfu_delay = 0;
-       } else if (ctx->sfu_delay > 0) {
-               ctx->sfu_delay--;
-       }
+   if (is_sfu(instr)) {
+      ctx->sfu_delay = 8;
+   } else if (has_sfu_src(instr)) {
+      ctx->sfu_delay = 0;
+   } else if (ctx->sfu_delay > 0) {
+      ctx->sfu_delay--;
+   }
 
-       if (is_tex_or_prefetch(instr)) {
-               ctx->tex_delay = 10;
-       } else if (has_tex_src(instr)) {
-               ctx->tex_delay = 0;
-       } else if (ctx->tex_delay > 0) {
-               ctx->tex_delay--;
-       }
+   if (is_tex_or_prefetch(instr)) {
+      ctx->tex_delay = 10;
+   } else if (has_tex_src(instr)) {
+      ctx->tex_delay = 0;
+   } else if (ctx->tex_delay > 0) {
+      ctx->tex_delay--;
+   }
 }
 
 static void
 dump_state(struct ir3_postsched_ctx *ctx)
 {
-       if (!SCHED_DEBUG)
-               return;
+   if (!SCHED_DEBUG)
+      return;
 
-       foreach_sched_node (n, &ctx->dag->heads) {
-               di(n->instr, "maxdel=%3d    ", n->max_delay);
+   foreach_sched_node (n, &ctx->dag->heads) {
+      di(n->instr, "maxdel=%3d    ", n->max_delay);
 
-               util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
-                       struct ir3_postsched_node *child =
-                               (struct ir3_postsched_node *)edge->child;
+      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
+         struct ir3_postsched_node *child =
+            (struct ir3_postsched_node *)edge->child;
 
-                       di(child->instr, " -> (%d parents) ", child->dag.parent_count);
-               }
-       }
+         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
+      }
+   }
 }
 
 /* Determine if this is an instruction that we'd prefer not to schedule
@@ -155,257 +160,257 @@ dump_state(struct ir3_postsched_ctx *ctx)
 static bool
 would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (ctx->sfu_delay) {
-               if (has_sfu_src(instr))
-                       return true;
-       }
+   if (ctx->sfu_delay) {
+      if (has_sfu_src(instr))
+         return true;
+   }
 
-       if (ctx->tex_delay) {
-               if (has_tex_src(instr))
-                       return true;
-       }
+   if (ctx->tex_delay) {
+      if (has_tex_src(instr))
+         return true;
+   }
 
-       return false;
+   return false;
 }
 
 /* find instruction to schedule: */
 static struct ir3_instruction *
 choose_instr(struct ir3_postsched_ctx *ctx)
 {
-       struct ir3_postsched_node *chosen = NULL;
-
-       dump_state(ctx);
-
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (!is_meta(n->instr))
-                       continue;
-
-               if (!chosen || (chosen->max_delay < n->max_delay))
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "prio: chose (meta)");
-               return chosen->instr;
-       }
-
-       /* Try to schedule inputs with a higher priority, if possible, as
-        * the last bary.f unlocks varying storage to unblock more VS
-        * warps.
-        */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (!is_input(n->instr))
-                       continue;
-
-               if (!chosen || (chosen->max_delay < n->max_delay))
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "prio: chose (input)");
-               return chosen->instr;
-       }
-
-       /* Next prioritize discards: */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               unsigned d =
-                       ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
-
-               if (d > 0)
-                       continue;
-
-               if (!is_kill_or_demote(n->instr))
-                       continue;
-
-               if (!chosen || (chosen->max_delay < n->max_delay))
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "csp: chose (kill, hard ready)");
-               return chosen->instr;
-       }
-
-       /* Next prioritize expensive instructions: */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               unsigned d =
-                       ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
-
-               if (d > 0)
-                       continue;
-
-               if (!(is_sfu(n->instr) || is_tex(n->instr)))
-                       continue;
-
-               if (!chosen || (chosen->max_delay < n->max_delay))
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
-               return chosen->instr;
-       }
-
-       /*
-        * Sometimes be better to take a nop, rather than scheduling an
-        * instruction that would require an (ss) shortly after another
-        * SFU..  ie. if last SFU was just one or two instr ago, and we
-        * could choose between taking a nop and then scheduling
-        * something else, vs scheduling the immed avail instruction that
-        * would require (ss), we are better with the nop.
-        */
-       for (unsigned delay = 0; delay < 4; delay++) {
-               foreach_sched_node (n, &ctx->dag->heads) {
-                       if (would_sync(ctx, n->instr))
-                               continue;
-
-                       unsigned d =
-                               ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
-
-                       if (d > delay)
-                               continue;
-
-                       if (!chosen || (chosen->max_delay < n->max_delay))
-                               chosen = n;
-               }
-
-               if (chosen) {
-                       di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
-                       return chosen->instr;
-               }
-       }
-
-       /* Next try to find a ready leader w/ soft delay (ie. including extra
-        * delay for things like tex fetch which can be synchronized w/ sync
-        * bit (but we probably do want to schedule some other instructions
-        * while we wait)
-        */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               unsigned d =
-                       ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
-
-               if (d > 0)
-                       continue;
-
-               if (!chosen || (chosen->max_delay < n->max_delay))
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "csp: chose (soft ready)");
-               return chosen->instr;
-       }
-
-       /* Next try to find a ready leader that can be scheduled without nop's,
-        * which in the case of things that need (sy)/(ss) could result in
-        * stalls.. but we've already decided there is not a better option.
-        */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               unsigned d =
-                       ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
-
-               if (d > 0)
-                       continue;
-
-               if (!chosen || (chosen->max_delay < n->max_delay))
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "csp: chose (hard ready)");
-               return chosen->instr;
-       }
-
-       /* Otherwise choose leader with maximum cost:
-        *
-        * TODO should we try to balance cost and delays?  I guess it is
-        * a balance between now-nop's and future-nop's?
-        */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (!chosen || chosen->max_delay < n->max_delay)
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "csp: chose (leader)");
-               return chosen->instr;
-       }
-
-       return NULL;
+   struct ir3_postsched_node *chosen = NULL;
+
+   dump_state(ctx);
+
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (!is_meta(n->instr))
+         continue;
+
+      if (!chosen || (chosen->max_delay < n->max_delay))
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "prio: chose (meta)");
+      return chosen->instr;
+   }
+
+   /* Try to schedule inputs with a higher priority, if possible, as
+    * the last bary.f unlocks varying storage to unblock more VS
+    * warps.
+    */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (!is_input(n->instr))
+         continue;
+
+      if (!chosen || (chosen->max_delay < n->max_delay))
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "prio: chose (input)");
+      return chosen->instr;
+   }
+
+   /* Next prioritize discards: */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      unsigned d =
+         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+
+      if (d > 0)
+         continue;
+
+      if (!is_kill_or_demote(n->instr))
+         continue;
+
+      if (!chosen || (chosen->max_delay < n->max_delay))
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "csp: chose (kill, hard ready)");
+      return chosen->instr;
+   }
+
+   /* Next prioritize expensive instructions: */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      unsigned d =
+         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+
+      if (d > 0)
+         continue;
+
+      if (!(is_sfu(n->instr) || is_tex(n->instr)))
+         continue;
+
+      if (!chosen || (chosen->max_delay < n->max_delay))
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
+      return chosen->instr;
+   }
+
+   /*
+    * Sometimes it is better to take a nop, rather than scheduling an
+    * instruction that would require an (ss) shortly after another
+    * SFU..  ie. if last SFU was just one or two instr ago, and we
+    * could choose between taking a nop and then scheduling
+    * something else, vs scheduling the immed avail instruction that
+    * would require (ss), we are better with the nop.
+    */
+   for (unsigned delay = 0; delay < 4; delay++) {
+      foreach_sched_node (n, &ctx->dag->heads) {
+         if (would_sync(ctx, n->instr))
+            continue;
+
+         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
+                                            ctx->v->mergedregs);
+
+         if (d > delay)
+            continue;
+
+         if (!chosen || (chosen->max_delay < n->max_delay))
+            chosen = n;
+      }
+
+      if (chosen) {
+         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
+         return chosen->instr;
+      }
+   }
+
+   /* Next try to find a ready leader w/ soft delay (ie. including extra
+    * delay for things like tex fetch which can be synchronized w/ sync
+    * bit (but we probably do want to schedule some other instructions
+    * while we wait)
+    */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      unsigned d =
+         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
+
+      if (d > 0)
+         continue;
+
+      if (!chosen || (chosen->max_delay < n->max_delay))
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "csp: chose (soft ready)");
+      return chosen->instr;
+   }
+
+   /* Next try to find a ready leader that can be scheduled without nop's,
+    * which in the case of things that need (sy)/(ss) could result in
+    * stalls.. but we've already decided there is not a better option.
+    */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      unsigned d =
+         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
+
+      if (d > 0)
+         continue;
+
+      if (!chosen || (chosen->max_delay < n->max_delay))
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "csp: chose (hard ready)");
+      return chosen->instr;
+   }
+
+   /* Otherwise choose leader with maximum cost:
+    *
+    * TODO should we try to balance cost and delays?  I guess it is
+    * a balance between now-nop's and future-nop's?
+    */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (!chosen || chosen->max_delay < n->max_delay)
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "csp: chose (leader)");
+      return chosen->instr;
+   }
+
+   return NULL;
 }
 
 struct ir3_postsched_deps_state {
-       struct ir3_postsched_ctx *ctx;
-
-       enum { F, R } direction;
-
-       bool merged;
-
-       /* Track the mapping between sched node (instruction) that last
-        * wrote a given register (in whichever direction we are iterating
-        * the block)
-        *
-        * Note, this table is twice as big as the # of regs, to deal with
-        * half-precision regs.  The approach differs depending on whether
-        * the half and full precision register files are "merged" (conflict,
-        * ie. a6xx+) in which case we consider each full precision dep
-        * as two half-precision dependencies, vs older separate (non-
-        * conflicting) in which case the first half of the table is used
-        * for full precision and 2nd half for half-precision.
-        */
-       struct ir3_postsched_node *regs[2 * 256];
+   struct ir3_postsched_ctx *ctx;
+
+   enum { F, R } direction;
+
+   bool merged;
+
+   /* Track the mapping between sched node (instruction) that last
+    * wrote a given register (in whichever direction we are iterating
+    * the block)
+    *
+    * Note, this table is twice as big as the # of regs, to deal with
+    * half-precision regs.  The approach differs depending on whether
+    * the half and full precision register files are "merged" (conflict,
+    * ie. a6xx+) in which case we consider each full precision dep
+    * as two half-precision dependencies, vs older separate (non-
+    * conflicting) in which case the first half of the table is used
+    * for full precision and 2nd half for half-precision.
+    */
+   struct ir3_postsched_node *regs[2 * 256];
 };
 
 /* bounds checking read/write accessors, since OoB access to stuff on
  * the stack is gonna cause a bad day.
  */
-#define dep_reg(state, idx) *({ \
-               assert((idx) < ARRAY_SIZE((state)->regs)); \
-               &(state)->regs[(idx)]; \
-       })
+#define dep_reg(state, idx)                                                    \
+   *({                                                                         \
+      assert((idx) < ARRAY_SIZE((state)->regs));                               \
+      &(state)->regs[(idx)];                                                   \
+   })
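
A short aside (not part of the patch): dep_reg() is a GNU C statement expression that asserts the index and then evaluates to a pointer, which the leading * dereferences, so the macro is an lvalue and can be used on either side of an assignment. The same pattern reduced to a stand-alone sketch, with hypothetical names:

#include <assert.h>
#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

static int table[4];

/* same shape as dep_reg(): bounds-check, then yield an lvalue */
#define slot(idx)                                                              \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE(table));                                       \
      &table[(idx)];                                                           \
   })

int
main(void)
{
   slot(2) = 7;             /* write through the macro */
   printf("%d\n", slot(2)); /* read back through the macro: prints 7 */
   return 0;
}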
 
 static void
 add_dep(struct ir3_postsched_deps_state *state,
-               struct ir3_postsched_node *before,
-               struct ir3_postsched_node *after)
+        struct ir3_postsched_node *before, struct ir3_postsched_node *after)
 {
-       if (!before || !after)
-               return;
+   if (!before || !after)
+      return;
 
-       assert(before != after);
+   assert(before != after);
 
-       if (state->direction == F) {
-               dag_add_edge(&before->dag, &after->dag, NULL);
-       } else {
-               dag_add_edge(&after->dag, &before->dag, NULL);
-       }
+   if (state->direction == F) {
+      dag_add_edge(&before->dag, &after->dag, NULL);
+   } else {
+      dag_add_edge(&after->dag, &before->dag, NULL);
+   }
 }
 
 static void
 add_single_reg_dep(struct ir3_postsched_deps_state *state,
-               struct ir3_postsched_node *node, unsigned num, int src_n)
+                   struct ir3_postsched_node *node, unsigned num, int src_n)
 {
-       struct ir3_postsched_node *dep = dep_reg(state, num);
-
-       if (src_n >= 0 && dep && state->direction == F) {
-               unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
-               node->delay = MAX2(node->delay, d);
-               if (is_tex_or_prefetch(dep->instr))
-                       node->has_tex_src = true;
-               if (is_tex_or_prefetch(dep->instr))
-                       node->has_sfu_src = true;
-       }
-
-       add_dep(state, dep, node);
-       if (src_n < 0) {
-               dep_reg(state, num) = node;
-       }
+   struct ir3_postsched_node *dep = dep_reg(state, num);
+
+   if (src_n >= 0 && dep && state->direction == F) {
+      unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
+      node->delay = MAX2(node->delay, d);
+      if (is_tex_or_prefetch(dep->instr))
+         node->has_tex_src = true;
+      if (is_sfu(dep->instr))
+         node->has_sfu_src = true;
+   }
+
+   add_dep(state, dep, node);
+   if (src_n < 0) {
+      dep_reg(state, num) = node;
+   }
 }
 
 /* This is where we handled full vs half-precision, and potential conflicts
  * between half and full precision that result in additional dependencies.
  * The 'reg' arg is really just to know half vs full precision.
- * 
+ *
  * If non-negative, then this adds a dependency on a source register, and
  * src_n is the index passed into ir3_delayslots() for calculating the delay:
  * If positive, corresponds to node->instr->regs[src_n]. If negative, then
@@ -413,302 +418,303 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state,
  */
 static void
 add_reg_dep(struct ir3_postsched_deps_state *state,
-               struct ir3_postsched_node *node, const struct ir3_register *reg,
-               unsigned num, int src_n)
+            struct ir3_postsched_node *node, const struct ir3_register *reg,
+            unsigned num, int src_n)
 {
-       if (state->merged) {
-               /* Make sure that special registers like a0.x that are written as
-                * half-registers don't alias random full registers by pretending that
-                * they're full registers:
-                */
-               if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
-                       /* single conflict in half-reg space: */
-                       add_single_reg_dep(state, node, num, src_n);
-               } else {
-                       /* two conflicts in half-reg space: */
-                       add_single_reg_dep(state, node, 2 * num + 0, src_n);
-                       add_single_reg_dep(state, node, 2 * num + 1, src_n);
-               }
-       } else {
-               if (reg->flags & IR3_REG_HALF)
-                       num += ARRAY_SIZE(state->regs) / 2;
-               add_single_reg_dep(state, node, num, src_n);
-       }
+   if (state->merged) {
+      /* Make sure that special registers like a0.x that are written as
+       * half-registers don't alias random full registers by pretending that
+       * they're full registers:
+       */
+      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
+         /* single conflict in half-reg space: */
+         add_single_reg_dep(state, node, num, src_n);
+      } else {
+         /* two conflicts in half-reg space: */
+         add_single_reg_dep(state, node, 2 * num + 0, src_n);
+         add_single_reg_dep(state, node, 2 * num + 1, src_n);
+      }
+   } else {
+      if (reg->flags & IR3_REG_HALF)
+         num += ARRAY_SIZE(state->regs) / 2;
+      add_single_reg_dep(state, node, num, src_n);
+   }
 }
 
 static void
 calculate_deps(struct ir3_postsched_deps_state *state,
-               struct ir3_postsched_node *node)
+               struct ir3_postsched_node *node)
 {
-       /* Add dependencies on instructions that previously (or next,
-        * in the reverse direction) wrote any of our src registers:
-        */
-       foreach_src_n (reg, i, node->instr) {
-               if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-                       continue;
-
-               if (reg->flags & IR3_REG_RELATIV) {
-                       /* mark entire array as read: */
-                       for (unsigned j = 0; j < reg->size; j++) {
-                               add_reg_dep(state, node, reg, reg->array.base + j, i);
-                       }
-               } else {
-                       assert(reg->wrmask >= 1);
-                       u_foreach_bit (b, reg->wrmask) {
-                               add_reg_dep(state, node, reg, reg->num + b, i);
-                       }
-               }
-       }
-
-       /* And then after we update the state for what this instruction
-        * wrote:
-        */
-       foreach_dst (reg, node->instr) {
-               if (reg->wrmask == 0)
-                       continue;
-               if (reg->flags & IR3_REG_RELATIV) {
-                       /* mark the entire array as written: */
-                       for (unsigned i = 0; i < reg->size; i++) {
-                               add_reg_dep(state, node, reg, reg->array.base + i, -1);
-                       }
-               } else {
-                       assert(reg->wrmask >= 1);
-                       u_foreach_bit (b, reg->wrmask) {
-                               add_reg_dep(state, node, reg, reg->num + b, -1);
-                       }
-               }
-       }
+   /* Add dependencies on instructions that previously (or next,
+    * in the reverse direction) wrote any of our src registers:
+    */
+   foreach_src_n (reg, i, node->instr) {
+      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+         continue;
+
+      if (reg->flags & IR3_REG_RELATIV) {
+         /* mark entire array as read: */
+         for (unsigned j = 0; j < reg->size; j++) {
+            add_reg_dep(state, node, reg, reg->array.base + j, i);
+         }
+      } else {
+         assert(reg->wrmask >= 1);
+         u_foreach_bit (b, reg->wrmask) {
+            add_reg_dep(state, node, reg, reg->num + b, i);
+         }
+      }
+   }
+
+   /* And then after we update the state for what this instruction
+    * wrote:
+    */
+   foreach_dst (reg, node->instr) {
+      if (reg->wrmask == 0)
+         continue;
+      if (reg->flags & IR3_REG_RELATIV) {
+         /* mark the entire array as written: */
+         for (unsigned i = 0; i < reg->size; i++) {
+            add_reg_dep(state, node, reg, reg->array.base + i, -1);
+         }
+      } else {
+         assert(reg->wrmask >= 1);
+         u_foreach_bit (b, reg->wrmask) {
+            add_reg_dep(state, node, reg, reg->num + b, -1);
+         }
+      }
+   }
 }
 
 static void
 calculate_forward_deps(struct ir3_postsched_ctx *ctx)
 {
-       struct ir3_postsched_deps_state state = {
-                       .ctx = ctx,
-                       .direction = F,
-                       .merged = ctx->v->mergedregs,
-       };
-
-       foreach_instr (instr, &ctx->unscheduled_list) {
-               calculate_deps(&state, instr->data);
-       }
+   struct ir3_postsched_deps_state state = {
+      .ctx = ctx,
+      .direction = F,
+      .merged = ctx->v->mergedregs,
+   };
+
+   foreach_instr (instr, &ctx->unscheduled_list) {
+      calculate_deps(&state, instr->data);
+   }
 }
 
 static void
 calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
 {
-       struct ir3_postsched_deps_state state = {
-                       .ctx = ctx,
-                       .direction = R,
-                       .merged = ctx->v->mergedregs,
-       };
-
-       foreach_instr_rev (instr, &ctx->unscheduled_list) {
-               calculate_deps(&state, instr->data);
-       }
+   struct ir3_postsched_deps_state state = {
+      .ctx = ctx,
+      .direction = R,
+      .merged = ctx->v->mergedregs,
+   };
+
+   foreach_instr_rev (instr, &ctx->unscheduled_list) {
+      calculate_deps(&state, instr->data);
+   }
 }
 
 static void
 sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 {
-       struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);
+   struct ir3_postsched_node *n =
+      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);
 
-       dag_init_node(ctx->dag, &n->dag);
+   dag_init_node(ctx->dag, &n->dag);
 
-       n->instr = instr;
-       instr->data = n;
+   n->instr = instr;
+   instr->data = n;
 }
 
 static void
 sched_dag_max_delay_cb(struct dag_node *node, void *state)
 {
-       struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
-       uint32_t max_delay = 0;
+   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
+   uint32_t max_delay = 0;
 
-       util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
-               struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
-               max_delay = MAX2(child->max_delay, max_delay);
-       }
+   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
+      struct ir3_postsched_node *child =
+         (struct ir3_postsched_node *)edge->child;
+      max_delay = MAX2(child->max_delay, max_delay);
+   }
 
-       n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
+   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
 }
 
 static void
 sched_dag_init(struct ir3_postsched_ctx *ctx)
 {
-       ctx->mem_ctx = ralloc_context(NULL);
-
-       ctx->dag = dag_create(ctx->mem_ctx);
-
-       foreach_instr (instr, &ctx->unscheduled_list)
-               sched_node_init(ctx, instr);
-
-       calculate_forward_deps(ctx);
-       calculate_reverse_deps(ctx);
-
-       /*
-        * To avoid expensive texture fetches, etc, from being moved ahead
-        * of kills, track the kills we've seen so far, so we can add an
-        * extra dependency on them for tex/mem instructions
-        */
-       struct util_dynarray kills;
-       util_dynarray_init(&kills, ctx->mem_ctx);
-
-       /* The last bary.f with the (ei) flag must be scheduled before any kills,
-        * or the hw gets angry. Keep track of inputs here so we can add the
-        * false dep on the kill instruction.
-        */
-       struct util_dynarray inputs;
-       util_dynarray_init(&inputs, ctx->mem_ctx);
-
-       /*
-        * Normal srcs won't be in SSA at this point, those are dealt with in
-        * calculate_forward_deps() and calculate_reverse_deps().  But we still
-        * have the false-dep information in SSA form, so go ahead and add
-        * dependencies for that here:
-        */
-       foreach_instr (instr, &ctx->unscheduled_list) {
-               struct ir3_postsched_node *n = instr->data;
-
-               foreach_ssa_src_n (src, i, instr) {
-                       if (src->block != instr->block)
-                               continue;
-
-                       /* we can end up with unused false-deps.. just skip them: */
-                       if (src->flags & IR3_INSTR_UNUSED)
-                               continue;
-
-                       struct ir3_postsched_node *sn = src->data;
-
-                       /* don't consider dependencies in other blocks: */
-                       if (src->block != instr->block)
-                               continue;
-
-                       dag_add_edge(&sn->dag, &n->dag, NULL);
-               }
-
-               if (is_input(instr)) {
-                       util_dynarray_append(&inputs, struct ir3_instruction *, instr);
-               } else if (is_kill_or_demote(instr)) {
-                       util_dynarray_foreach(&inputs, struct ir3_instruction *, instrp) {
-                               struct ir3_instruction *input = *instrp;
-                               struct ir3_postsched_node *in = input->data;
-                               dag_add_edge(&in->dag, &n->dag, NULL);
-                       }
-                       util_dynarray_append(&kills, struct ir3_instruction *, instr);
-               } else if (is_tex(instr) || is_mem(instr)) {
-                       util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
-                               struct ir3_instruction *kill = *instrp;
-                               struct ir3_postsched_node *kn = kill->data;
-                               dag_add_edge(&kn->dag, &n->dag, NULL);
-                       }
-               }
-       }
-
-       // TODO do we want to do this after reverse-dependencies?
-       dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
+   ctx->mem_ctx = ralloc_context(NULL);
+
+   ctx->dag = dag_create(ctx->mem_ctx);
+
+   foreach_instr (instr, &ctx->unscheduled_list)
+      sched_node_init(ctx, instr);
+
+   calculate_forward_deps(ctx);
+   calculate_reverse_deps(ctx);
+
+   /*
+    * To avoid expensive texture fetches, etc, from being moved ahead
+    * of kills, track the kills we've seen so far, so we can add an
+    * extra dependency on them for tex/mem instructions
+    */
+   struct util_dynarray kills;
+   util_dynarray_init(&kills, ctx->mem_ctx);
+
+   /* The last bary.f with the (ei) flag must be scheduled before any kills,
+    * or the hw gets angry. Keep track of inputs here so we can add the
+    * false dep on the kill instruction.
+    */
+   struct util_dynarray inputs;
+   util_dynarray_init(&inputs, ctx->mem_ctx);
+
+   /*
+    * Normal srcs won't be in SSA at this point, those are dealt with in
+    * calculate_forward_deps() and calculate_reverse_deps().  But we still
+    * have the false-dep information in SSA form, so go ahead and add
+    * dependencies for that here:
+    */
+   foreach_instr (instr, &ctx->unscheduled_list) {
+      struct ir3_postsched_node *n = instr->data;
+
+      foreach_ssa_src_n (src, i, instr) {
+         if (src->block != instr->block)
+            continue;
+
+         /* we can end up with unused false-deps.. just skip them: */
+         if (src->flags & IR3_INSTR_UNUSED)
+            continue;
+
+         struct ir3_postsched_node *sn = src->data;
+
+         /* don't consider dependencies in other blocks: */
+         if (src->block != instr->block)
+            continue;
+
+         dag_add_edge(&sn->dag, &n->dag, NULL);
+      }
+
+      if (is_input(instr)) {
+         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
+      } else if (is_kill_or_demote(instr)) {
+         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
+            struct ir3_instruction *input = *instrp;
+            struct ir3_postsched_node *in = input->data;
+            dag_add_edge(&in->dag, &n->dag, NULL);
+         }
+         util_dynarray_append(&kills, struct ir3_instruction *, instr);
+      } else if (is_tex(instr) || is_mem(instr)) {
+         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
+            struct ir3_instruction *kill = *instrp;
+            struct ir3_postsched_node *kn = kill->data;
+            dag_add_edge(&kn->dag, &n->dag, NULL);
+         }
+      }
+   }
+
+   // TODO do we want to do this after reverse-dependencies?
+   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
 }
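
sched_dag_init() finishes by handing sched_dag_max_delay_cb to dag_traverse_bottom_up(), which gives every node the length of the longest delay-weighted path from that node down to a leaf of the dependency DAG. A minimal standalone sketch of the same recurrence follows; it uses a hypothetical node type with a plain child-pointer array instead of dag_node/dag_edge/util_dynarray, and arbitrary example delays, so it is an illustration of the idea rather than code from this commit.

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Hypothetical stand-in for ir3_postsched_node: each node knows its own
 * issue delay and the nodes that consume its result (its DAG children).
 */
struct node {
   unsigned delay;
   unsigned max_delay;
   struct node *children[4];
   unsigned num_children;
};

/* Same recurrence as sched_dag_max_delay_cb(): children are visited first
 * (bottom-up), so max_delay becomes the longest delay-weighted path from
 * this node to a leaf.
 */
static void
max_delay_cb(struct node *n)
{
   unsigned max_delay = 0;
   for (unsigned i = 0; i < n->num_children; i++)
      max_delay = MAX2(n->children[i]->max_delay, max_delay);
   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

int
main(void)
{
   /* chain a -> b -> c with arbitrary example delays */
   struct node c = {.delay = 0};
   struct node b = {.delay = 3, .children = {&c}, .num_children = 1};
   struct node a = {.delay = 6, .children = {&b}, .num_children = 1};

   max_delay_cb(&c); /* bottom-up visit order, leaves first */
   max_delay_cb(&b);
   max_delay_cb(&a);

   printf("%u %u %u\n", c.max_delay, b.max_delay, a.max_delay); /* 0 3 9 */
   return 0;
}
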
 
 static void
 sched_dag_destroy(struct ir3_postsched_ctx *ctx)
 {
-       ralloc_free(ctx->mem_ctx);
-       ctx->mem_ctx = NULL;
-       ctx->dag = NULL;
+   ralloc_free(ctx->mem_ctx);
+   ctx->mem_ctx = NULL;
+   ctx->dag = NULL;
 }
 
 static void
 sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 {
-       ctx->block = block;
-       ctx->tex_delay = 0;
-       ctx->sfu_delay = 0;
-
-       /* move all instructions to the unscheduled list, and
-        * empty the block's instruction list (to which we will
-        * be inserting).
-        */
-       list_replace(&block->instr_list, &ctx->unscheduled_list);
-       list_inithead(&block->instr_list);
-
-       // TODO once we are using post-sched for everything we can
-       // just not stick in NOP's prior to post-sched, and drop this.
-       // for now keep this, since it makes post-sched optional:
-       foreach_instr_safe (instr, &ctx->unscheduled_list) {
-               switch (instr->opc) {
-               case OPC_NOP:
-               case OPC_B:
-               case OPC_JUMP:
-                       list_delinit(&instr->node);
-                       break;
-               default:
-                       break;
-               }
-       }
-
-       sched_dag_init(ctx);
-
-       /* First schedule all meta:input instructions, followed by
-        * tex-prefetch.  We want all of the instructions that load
-        * values into registers before the shader starts to go
-        * before any other instructions.  But in particular we
-        * want inputs to come before prefetches.  This is because
-        * a FS's bary_ij input may not actually be live in the
-        * shader, but it should not be scheduled on top of any
-        * other input (but can be overwritten by a tex prefetch)
-        */
-       foreach_instr_safe (instr, &ctx->unscheduled_list)
-               if (instr->opc == OPC_META_INPUT)
-                       schedule(ctx, instr);
-
-       foreach_instr_safe (instr, &ctx->unscheduled_list)
-               if (instr->opc == OPC_META_TEX_PREFETCH)
-                       schedule(ctx, instr);
-
-       while (!list_is_empty(&ctx->unscheduled_list)) {
-               struct ir3_instruction *instr = choose_instr(ctx);
-
-               unsigned delay =
-                       ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
-               d("delay=%u", delay);
-
-               /* and if we run out of instructions that can be scheduled,
-                * then it is time for nop's:
-                */
-               debug_assert(delay <= 6);
-               while (delay > 0) {
-                       ir3_NOP(block);
-                       delay--;
-               }
-
-               schedule(ctx, instr);
-       }
-
-       sched_dag_destroy(ctx);
+   ctx->block = block;
+   ctx->tex_delay = 0;
+   ctx->sfu_delay = 0;
+
+   /* move all instructions to the unscheduled list, and
+    * empty the block's instruction list (to which we will
+    * be inserting).
+    */
+   list_replace(&block->instr_list, &ctx->unscheduled_list);
+   list_inithead(&block->instr_list);
+
+   // TODO once we are using post-sched for everything we can
+   // just not stick in NOP's prior to post-sched, and drop this.
+   // for now keep this, since it makes post-sched optional:
+   foreach_instr_safe (instr, &ctx->unscheduled_list) {
+      switch (instr->opc) {
+      case OPC_NOP:
+      case OPC_B:
+      case OPC_JUMP:
+         list_delinit(&instr->node);
+         break;
+      default:
+         break;
+      }
+   }
+
+   sched_dag_init(ctx);
+
+   /* First schedule all meta:input instructions, followed by
+    * tex-prefetch.  We want all of the instructions that load
+    * values into registers before the shader starts to go
+    * before any other instructions.  But in particular we
+    * want inputs to come before prefetches.  This is because
+    * a FS's bary_ij input may not actually be live in the
+    * shader, but it should not be scheduled on top of any
+    * other input (but can be overwritten by a tex prefetch)
+    */
+   foreach_instr_safe (instr, &ctx->unscheduled_list)
+      if (instr->opc == OPC_META_INPUT)
+         schedule(ctx, instr);
+
+   foreach_instr_safe (instr, &ctx->unscheduled_list)
+      if (instr->opc == OPC_META_TEX_PREFETCH)
+         schedule(ctx, instr);
+
+   while (!list_is_empty(&ctx->unscheduled_list)) {
+      struct ir3_instruction *instr = choose_instr(ctx);
+
+      unsigned delay =
+         ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
+      d("delay=%u", delay);
+
+      /* and if we run out of instructions that can be scheduled,
+       * then it is time for nop's:
+       */
+      debug_assert(delay <= 6);
+      while (delay > 0) {
+         ir3_NOP(block);
+         delay--;
+      }
+
+      schedule(ctx, instr);
+   }
+
+   sched_dag_destroy(ctx);
 }
 
-
 static bool
 is_self_mov(struct ir3_instruction *instr)
 {
-       if (!is_same_type_mov(instr))
-               return false;
+   if (!is_same_type_mov(instr))
+      return false;
 
-       if (instr->dsts[0]->num != instr->srcs[0]->num)
-               return false;
+   if (instr->dsts[0]->num != instr->srcs[0]->num)
+      return false;
 
-       if (instr->dsts[0]->flags & IR3_REG_RELATIV)
-               return false;
+   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
+      return false;
 
-       if (instr->cat1.round != ROUND_ZERO)
-               return false;
+   if (instr->cat1.round != ROUND_ZERO)
+      return false;
 
-       if (instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
-                       IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
-                       IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
-               return false;
+   if (instr->srcs[0]->flags &
+       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
+        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
+      return false;
 
-       return true;
+   return true;
 }
 
 /* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
@@ -720,34 +726,34 @@ is_self_mov(struct ir3_instruction *instr)
 static void
 cleanup_self_movs(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list) {
-               foreach_instr_safe (instr, &block->instr_list) {
-                       for (unsigned i = 0; i < instr->deps_count; i++) {
-                               if (instr->deps[i] && is_self_mov(instr->deps[i])) {
-                                       instr->deps[i] = NULL;
-                               }
-                       }
-
-                       if (is_self_mov(instr))
-                               list_delinit(&instr->node);
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         for (unsigned i = 0; i < instr->deps_count; i++) {
+            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
+               instr->deps[i] = NULL;
+            }
+         }
+
+         if (is_self_mov(instr))
+            list_delinit(&instr->node);
+      }
+   }
 }
 
 bool
 ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
 {
-       struct ir3_postsched_ctx ctx = {
-                       .ir = ir,
-                       .v  = v,
-       };
+   struct ir3_postsched_ctx ctx = {
+      .ir = ir,
+      .v = v,
+   };
 
-       ir3_remove_nops(ir);
-       cleanup_self_movs(ir);
+   ir3_remove_nops(ir);
+   cleanup_self_movs(ir);
 
-       foreach_block (block, &ir->block_list) {
-               sched_block(&ctx, block);
-       }
+   foreach_block (block, &ir->block_list) {
+      sched_block(&ctx, block);
+   }
 
-       return true;
+   return true;
 }
index 917a7f4..0f92003 100644 (file)
 #define PTRID(x) ((unsigned long)(x))
 
 /* ansi escape sequences: */
-#define RESET  "\x1b[0m"
-#define RED            "\x1b[0;31m"
-#define GREEN  "\x1b[0;32m"
-#define BLUE   "\x1b[0;34m"
-#define MAGENTA        "\x1b[0;35m"
+#define RESET   "\x1b[0m"
+#define RED     "\x1b[0;31m"
+#define GREEN   "\x1b[0;32m"
+#define BLUE    "\x1b[0;34m"
+#define MAGENTA "\x1b[0;35m"
 
 /* syntax coloring, mostly to make it easier to see different sorts of
  * srcs (immediate, constant, ssa, array, ...)
  */
-#define SYN_REG(x)             RED x RESET
-#define SYN_IMMED(x)   GREEN x RESET
-#define SYN_CONST(x)   GREEN x RESET
-#define SYN_SSA(x)             BLUE x RESET
-#define SYN_ARRAY(x)   MAGENTA x RESET
+#define SYN_REG(x)   RED x RESET
+#define SYN_IMMED(x) GREEN x RESET
+#define SYN_CONST(x) GREEN x RESET
+#define SYN_SSA(x)   BLUE x RESET
+#define SYN_ARRAY(x) MAGENTA x RESET
 
 static const char *
 type_name(type_t type)
 {
-       static const char *type_names[] = {
-                       [TYPE_F16] = "f16",
-                       [TYPE_F32] = "f32",
-                       [TYPE_U16] = "u16",
-                       [TYPE_U32] = "u32",
-                       [TYPE_S16] = "s16",
-                       [TYPE_S32] = "s32",
-                       [TYPE_U8]  = "u8",
-                       [TYPE_S8]  = "s8",
-       };
-       return type_names[type];
+   static const char *type_names[] = {
+      [TYPE_F16] = "f16", [TYPE_F32] = "f32", [TYPE_U16] = "u16",
+      [TYPE_U32] = "u32", [TYPE_S16] = "s16", [TYPE_S32] = "s32",
+      [TYPE_U8] = "u8",   [TYPE_S8] = "s8",
+   };
+   return type_names[type];
 }
 
-static void print_instr_name(struct log_stream *stream, struct ir3_instruction *instr, bool flags)
+static void
+print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
+                 bool flags)
 {
-       if (!instr)
-               return;
+   if (!instr)
+      return;
 #ifdef DEBUG
-       mesa_log_stream_printf(stream, "%04u:", instr->serialno);
+   mesa_log_stream_printf(stream, "%04u:", instr->serialno);
 #endif
-       mesa_log_stream_printf(stream, "%04u:", instr->name);
-       mesa_log_stream_printf(stream, "%04u:", instr->ip);
-       if (instr->flags & IR3_INSTR_UNUSED) {
-               mesa_log_stream_printf(stream, "XXX: ");
-       } else {
-               mesa_log_stream_printf(stream, "%03u: ", instr->use_count);
-       }
-
-       if (flags) {
-               mesa_log_stream_printf(stream, "\t");
-               if (instr->flags & IR3_INSTR_SY)
-                       mesa_log_stream_printf(stream, "(sy)");
-               if (instr->flags & IR3_INSTR_SS)
-                       mesa_log_stream_printf(stream, "(ss)");
-               if (instr->flags & IR3_INSTR_JP)
-                       mesa_log_stream_printf(stream, "(jp)");
-               if (instr->repeat)
-                       mesa_log_stream_printf(stream, "(rpt%d)", instr->repeat);
-               if (instr->nop)
-                       mesa_log_stream_printf(stream, "(nop%d)", instr->nop);
-               if (instr->flags & IR3_INSTR_UL)
-                       mesa_log_stream_printf(stream, "(ul)");
-       } else {
-               mesa_log_stream_printf(stream, " ");
-       }
-
-       if (is_meta(instr)) {
-               switch (instr->opc) {
-               case OPC_META_INPUT:  mesa_log_stream_printf(stream, "_meta:in");   break;
-               case OPC_META_SPLIT:                    mesa_log_stream_printf(stream, "_meta:split");        break;
-               case OPC_META_COLLECT:                  mesa_log_stream_printf(stream, "_meta:collect");      break;
-               case OPC_META_TEX_PREFETCH:             mesa_log_stream_printf(stream, "_meta:tex_prefetch"); break;
-               case OPC_META_PARALLEL_COPY:    mesa_log_stream_printf(stream, "_meta:parallel_copy"); break;
-               case OPC_META_PHI:                              mesa_log_stream_printf(stream, "_meta:phi");          break;
-
-               /* shouldn't hit here.. just for debugging: */
-               default: mesa_log_stream_printf(stream, "_meta:%d", instr->opc);    break;
-               }
-       } else if (opc_cat(instr->opc) == 1) {
-               if (instr->opc == OPC_MOV) {
-                       if (instr->cat1.src_type == instr->cat1.dst_type)
-                               mesa_log_stream_printf(stream, "mov");
-                       else
-                               mesa_log_stream_printf(stream, "cov");
-               } else {
-                       mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
-               }
-
-               if (instr->opc != OPC_MOVMSK) {
-                       mesa_log_stream_printf(stream, ".%s%s", type_name(instr->cat1.src_type),
-                                       type_name(instr->cat1.dst_type));
-               }
-       } else if (instr->opc == OPC_B) {
-               const char *name[8] = {
-                       [BRANCH_PLAIN] = "br",
-                       [BRANCH_OR]    = "brao",
-                       [BRANCH_AND]   = "braa",
-                       [BRANCH_CONST] = "brac",
-                       [BRANCH_ANY]   = "bany",
-                       [BRANCH_ALL]   = "ball",
-                       [BRANCH_X]     = "brax",
-               };
-               mesa_log_stream_printf(stream, "%s", name[instr->cat0.brtype]);
-       } else {
-               mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
-               if (instr->flags & IR3_INSTR_3D)
-                       mesa_log_stream_printf(stream, ".3d");
-               if (instr->flags & IR3_INSTR_A)
-                       mesa_log_stream_printf(stream, ".a");
-               if (instr->flags & IR3_INSTR_O)
-                       mesa_log_stream_printf(stream, ".o");
-               if (instr->flags & IR3_INSTR_P)
-                       mesa_log_stream_printf(stream, ".p");
-               if (instr->flags & IR3_INSTR_S)
-                       mesa_log_stream_printf(stream, ".s");
-               if (instr->flags & IR3_INSTR_A1EN)
-                       mesa_log_stream_printf(stream, ".a1en");
-               if (instr->opc == OPC_LDC)
-                       mesa_log_stream_printf(stream, ".offset%d", instr->cat6.d);
-               if (instr->flags & IR3_INSTR_B) {
-                       mesa_log_stream_printf(stream, ".base%d",
-                                  is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
-               }
-               if (instr->flags & IR3_INSTR_S2EN)
-                       mesa_log_stream_printf(stream, ".s2en");
-
-               static const char *cond[0x7] = {
-                               "lt",
-                               "le",
-                               "gt",
-                               "ge",
-                               "eq",
-                               "ne",
-               };
-
-               switch (instr->opc) {
-               case OPC_CMPS_F:
-               case OPC_CMPS_U:
-               case OPC_CMPS_S:
-               case OPC_CMPV_F:
-               case OPC_CMPV_U:
-               case OPC_CMPV_S:
-                       mesa_log_stream_printf(stream, ".%s", cond[instr->cat2.condition & 0x7]);
-                       break;
-               default:
-                       break;
-               }
-       }
+   mesa_log_stream_printf(stream, "%04u:", instr->name);
+   mesa_log_stream_printf(stream, "%04u:", instr->ip);
+   if (instr->flags & IR3_INSTR_UNUSED) {
+      mesa_log_stream_printf(stream, "XXX: ");
+   } else {
+      mesa_log_stream_printf(stream, "%03u: ", instr->use_count);
+   }
+
+   if (flags) {
+      mesa_log_stream_printf(stream, "\t");
+      if (instr->flags & IR3_INSTR_SY)
+         mesa_log_stream_printf(stream, "(sy)");
+      if (instr->flags & IR3_INSTR_SS)
+         mesa_log_stream_printf(stream, "(ss)");
+      if (instr->flags & IR3_INSTR_JP)
+         mesa_log_stream_printf(stream, "(jp)");
+      if (instr->repeat)
+         mesa_log_stream_printf(stream, "(rpt%d)", instr->repeat);
+      if (instr->nop)
+         mesa_log_stream_printf(stream, "(nop%d)", instr->nop);
+      if (instr->flags & IR3_INSTR_UL)
+         mesa_log_stream_printf(stream, "(ul)");
+   } else {
+      mesa_log_stream_printf(stream, " ");
+   }
+
+   if (is_meta(instr)) {
+      switch (instr->opc) {
+      case OPC_META_INPUT:
+         mesa_log_stream_printf(stream, "_meta:in");
+         break;
+      case OPC_META_SPLIT:
+         mesa_log_stream_printf(stream, "_meta:split");
+         break;
+      case OPC_META_COLLECT:
+         mesa_log_stream_printf(stream, "_meta:collect");
+         break;
+      case OPC_META_TEX_PREFETCH:
+         mesa_log_stream_printf(stream, "_meta:tex_prefetch");
+         break;
+      case OPC_META_PARALLEL_COPY:
+         mesa_log_stream_printf(stream, "_meta:parallel_copy");
+         break;
+      case OPC_META_PHI:
+         mesa_log_stream_printf(stream, "_meta:phi");
+         break;
+
+      /* shouldn't hit here.. just for debugging: */
+      default:
+         mesa_log_stream_printf(stream, "_meta:%d", instr->opc);
+         break;
+      }
+   } else if (opc_cat(instr->opc) == 1) {
+      if (instr->opc == OPC_MOV) {
+         if (instr->cat1.src_type == instr->cat1.dst_type)
+            mesa_log_stream_printf(stream, "mov");
+         else
+            mesa_log_stream_printf(stream, "cov");
+      } else {
+         mesa_log_stream_printf(stream, "%s",
+                                disasm_a3xx_instr_name(instr->opc));
+      }
+
+      if (instr->opc != OPC_MOVMSK) {
+         mesa_log_stream_printf(stream, ".%s%s",
+                                type_name(instr->cat1.src_type),
+                                type_name(instr->cat1.dst_type));
+      }
+   } else if (instr->opc == OPC_B) {
+      const char *name[8] = {
+         [BRANCH_PLAIN] = "br",   [BRANCH_OR] = "brao",  [BRANCH_AND] = "braa",
+         [BRANCH_CONST] = "brac", [BRANCH_ANY] = "bany", [BRANCH_ALL] = "ball",
+         [BRANCH_X] = "brax",
+      };
+      mesa_log_stream_printf(stream, "%s", name[instr->cat0.brtype]);
+   } else {
+      mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
+      if (instr->flags & IR3_INSTR_3D)
+         mesa_log_stream_printf(stream, ".3d");
+      if (instr->flags & IR3_INSTR_A)
+         mesa_log_stream_printf(stream, ".a");
+      if (instr->flags & IR3_INSTR_O)
+         mesa_log_stream_printf(stream, ".o");
+      if (instr->flags & IR3_INSTR_P)
+         mesa_log_stream_printf(stream, ".p");
+      if (instr->flags & IR3_INSTR_S)
+         mesa_log_stream_printf(stream, ".s");
+      if (instr->flags & IR3_INSTR_A1EN)
+         mesa_log_stream_printf(stream, ".a1en");
+      if (instr->opc == OPC_LDC)
+         mesa_log_stream_printf(stream, ".offset%d", instr->cat6.d);
+      if (instr->flags & IR3_INSTR_B) {
+         mesa_log_stream_printf(
+            stream, ".base%d",
+            is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
+      }
+      if (instr->flags & IR3_INSTR_S2EN)
+         mesa_log_stream_printf(stream, ".s2en");
+
+      static const char *cond[0x7] = {
+         "lt", "le", "gt", "ge", "eq", "ne",
+      };
+
+      switch (instr->opc) {
+      case OPC_CMPS_F:
+      case OPC_CMPS_U:
+      case OPC_CMPS_S:
+      case OPC_CMPV_F:
+      case OPC_CMPV_U:
+      case OPC_CMPV_S:
+         mesa_log_stream_printf(stream, ".%s",
+                                cond[instr->cat2.condition & 0x7]);
+         break;
+      default:
+         break;
+      }
+   }
 }
 
-static void print_ssa_def_name(struct log_stream *stream, struct ir3_register *reg)
+static void
+print_ssa_def_name(struct log_stream *stream, struct ir3_register *reg)
 {
-       mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), reg->instr->serialno);
-               if (reg->name != 0)
-                       mesa_log_stream_printf(stream, ":%u", reg->name);
+   mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), reg->instr->serialno);
+   if (reg->name != 0)
+      mesa_log_stream_printf(stream, ":%u", reg->name);
 }
 
-static void print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst)
+static void
+print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst)
 {
-       if (!dst) {
-               if (!reg->def)
-                       mesa_log_stream_printf(stream, SYN_SSA("undef"));
-               else
-                       print_ssa_def_name(stream, reg->def);
-       } else {
-               print_ssa_def_name(stream, reg);
-       }
-
-       if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY))
-               mesa_log_stream_printf(stream, "("SYN_REG("r%u.%c")")", reg_num(reg), "xyzw"[reg_comp(reg)]);
+   if (!dst) {
+      if (!reg->def)
+         mesa_log_stream_printf(stream, SYN_SSA("undef"));
+      else
+         print_ssa_def_name(stream, reg->def);
+   } else {
+      print_ssa_def_name(stream, reg);
+   }
+
+   if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY))
+      mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")", reg_num(reg),
+                             "xyzw"[reg_comp(reg)]);
 }
 
-static void print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
-                                                  struct ir3_register *reg, bool dest)
+static void
+print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
+               struct ir3_register *reg, bool dest)
 {
-       if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
-                       (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
-               mesa_log_stream_printf(stream, "(absneg)");
-       else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
-               mesa_log_stream_printf(stream, "(neg)");
-       else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
-               mesa_log_stream_printf(stream, "(abs)");
-
-       if (reg->flags & IR3_REG_FIRST_KILL)
-               mesa_log_stream_printf(stream, "(kill)");
-       if (reg->flags & IR3_REG_UNUSED)
-               mesa_log_stream_printf(stream, "(unused)");
-
-       if (reg->flags & IR3_REG_R)
-               mesa_log_stream_printf(stream, "(r)");
-
-       /* Right now all instructions that use tied registers only have one
-        * destination register, so we can just print (tied) as if it's a flag,
-        * although it's more convenient for RA if it's a pointer.
-        */
-       if (reg->tied)
-               printf("(tied)");
-
-       if (reg->flags & IR3_REG_SHARED)
-               mesa_log_stream_printf(stream, "s");
-       if (reg->flags & IR3_REG_HALF)
-               mesa_log_stream_printf(stream, "h");
-
-       if (reg->flags & IR3_REG_IMMED) {
-               mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val, reg->iim_val, reg->iim_val);
-       } else if (reg->flags & IR3_REG_ARRAY) {
-               if (reg->flags & IR3_REG_SSA) {
-                       print_ssa_name(stream, reg, dest);
-                       mesa_log_stream_printf(stream, ":");
-               }
-               mesa_log_stream_printf(stream, SYN_ARRAY("arr[id=%u, offset=%d, size=%u]"), reg->array.id,
-                               reg->array.offset, reg->size);
-               if (reg->array.base != INVALID_REG)
-                       mesa_log_stream_printf(stream, "("SYN_REG("r%u.%c")")", reg->array.base >> 2,
-                                  "xyzw"[reg->array.base & 0x3]);
-       } else if (reg->flags & IR3_REG_SSA) {
-               print_ssa_name(stream, reg, dest);
-       } else if (reg->flags & IR3_REG_RELATIV) {
-               if (reg->flags & IR3_REG_CONST)
-                       mesa_log_stream_printf(stream, SYN_CONST("c<a0.x + %d>"), reg->array.offset);
-               else
-                       mesa_log_stream_printf(stream, SYN_REG("r<a0.x + %d>")" (%u)", reg->array.offset, reg->size);
-       } else {
-               if (reg->flags & IR3_REG_CONST)
-                       mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]);
-               else
-                       mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]);
-       }
-
-       if (reg->wrmask > 0x1)
-               mesa_log_stream_printf(stream, " (wrmask=0x%x)", reg->wrmask);
+   if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
+       (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
+      mesa_log_stream_printf(stream, "(absneg)");
+   else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
+      mesa_log_stream_printf(stream, "(neg)");
+   else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
+      mesa_log_stream_printf(stream, "(abs)");
+
+   if (reg->flags & IR3_REG_FIRST_KILL)
+      mesa_log_stream_printf(stream, "(kill)");
+   if (reg->flags & IR3_REG_UNUSED)
+      mesa_log_stream_printf(stream, "(unused)");
+
+   if (reg->flags & IR3_REG_R)
+      mesa_log_stream_printf(stream, "(r)");
+
+   /* Right now all instructions that use tied registers only have one
+    * destination register, so we can just print (tied) as if it's a flag,
+    * although it's more convenient for RA if it's a pointer.
+    */
+   if (reg->tied)
+      printf("(tied)");
+
+   if (reg->flags & IR3_REG_SHARED)
+      mesa_log_stream_printf(stream, "s");
+   if (reg->flags & IR3_REG_HALF)
+      mesa_log_stream_printf(stream, "h");
+
+   if (reg->flags & IR3_REG_IMMED) {
+      mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val,
+                             reg->iim_val, reg->iim_val);
+   } else if (reg->flags & IR3_REG_ARRAY) {
+      if (reg->flags & IR3_REG_SSA) {
+         print_ssa_name(stream, reg, dest);
+         mesa_log_stream_printf(stream, ":");
+      }
+      mesa_log_stream_printf(stream,
+                             SYN_ARRAY("arr[id=%u, offset=%d, size=%u]"),
+                             reg->array.id, reg->array.offset, reg->size);
+      if (reg->array.base != INVALID_REG)
+         mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")",
+                                reg->array.base >> 2,
+                                "xyzw"[reg->array.base & 0x3]);
+   } else if (reg->flags & IR3_REG_SSA) {
+      print_ssa_name(stream, reg, dest);
+   } else if (reg->flags & IR3_REG_RELATIV) {
+      if (reg->flags & IR3_REG_CONST)
+         mesa_log_stream_printf(stream, SYN_CONST("c<a0.x + %d>"),
+                                reg->array.offset);
+      else
+         mesa_log_stream_printf(stream, SYN_REG("r<a0.x + %d>") " (%u)",
+                                reg->array.offset, reg->size);
+   } else {
+      if (reg->flags & IR3_REG_CONST)
+         mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg),
+                                "xyzw"[reg_comp(reg)]);
+      else
+         mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg),
+                                "xyzw"[reg_comp(reg)]);
+   }
+
+   if (reg->wrmask > 0x1)
+      mesa_log_stream_printf(stream, " (wrmask=0x%x)", reg->wrmask);
 }
 
 static void
 tab(struct log_stream *stream, int lvl)
 {
-       for (int i = 0; i < lvl; i++)
-               mesa_log_stream_printf(stream, "\t");
+   for (int i = 0; i < lvl; i++)
+      mesa_log_stream_printf(stream, "\t");
 }
 
 static void
 print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl)
 {
-       tab(stream, lvl);
-
-       print_instr_name(stream, instr, true);
-
-       if (is_tex(instr)) {
-               mesa_log_stream_printf(stream, " (%s)(", type_name(instr->cat5.type));
-               for (unsigned i = 0; i < 4; i++)
-                       if (instr->dsts[0]->wrmask & (1 << i))
-                               mesa_log_stream_printf(stream, "%c", "xyzw"[i]);
-               mesa_log_stream_printf(stream, ")");
-       } else if ((instr->srcs_count > 0 || instr->dsts_count > 0) && (instr->opc != OPC_B)) {
-               /* NOTE the b(ranch) instruction has a suffix, which is
-                * handled below
-                */
-               mesa_log_stream_printf(stream, " ");
-       }
-
-       if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
-               bool first = true;
-               foreach_dst (reg, instr) {
-                       if (reg->wrmask == 0)
-                               continue;
-                       if (!first)
-                               mesa_log_stream_printf(stream, ", ");
-                       print_reg_name(stream, instr, reg, true);
-                       first = false;
-               }
-               foreach_src (reg, instr) {
-                       if (!first)
-                               mesa_log_stream_printf(stream, ", ");
-                       print_reg_name(stream, instr, reg, false);
-                       first = false;
-               }
-       }
-
-       if (is_tex(instr) && !(instr->flags & IR3_INSTR_S2EN)) {
-               if (!!(instr->flags & IR3_INSTR_B)) {
-                       if (!!(instr->flags & IR3_INSTR_A1EN)) {
-                               mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp);
-                       } else {
-                               mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp & 0xf,
-                                          instr->cat5.samp >> 4);
-                       }
-               } else {
-                       mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp, instr->cat5.tex);
-               }
-       }
-
-       if (instr->opc == OPC_META_SPLIT) {
-               mesa_log_stream_printf(stream, ", off=%d", instr->split.off);
-       } else if (instr->opc == OPC_META_TEX_PREFETCH) {
-               mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d", instr->prefetch.tex,
-                               instr->prefetch.samp, instr->prefetch.input_offset);
-       }
-
-       if (is_flow(instr) && instr->cat0.target) {
-               /* the predicate register src is implied: */
-               if (instr->opc == OPC_B) {
-                       static const struct {
-                               const char *suffix;
-                               int nsrc;
-                               bool idx;
-                       } brinfo[7] = {
-                               [BRANCH_PLAIN] = { "r",   1, false },
-                               [BRANCH_OR]    = { "rao", 2, false },
-                               [BRANCH_AND]   = { "raa", 2, false },
-                               [BRANCH_CONST] = { "rac", 0, true  },
-                               [BRANCH_ANY]   = { "any", 1, false },
-                               [BRANCH_ALL]   = { "all", 1, false },
-                               [BRANCH_X]     = { "rax", 0, false },
-                       };
-
-                       mesa_log_stream_printf(stream, "%s", brinfo[instr->cat0.brtype].suffix);
-                       if (brinfo[instr->cat0.brtype].idx) {
-                               mesa_log_stream_printf(stream, ".%u", instr->cat0.idx);
-                       }
-                       if (brinfo[instr->cat0.brtype].nsrc >= 1) {
-                               mesa_log_stream_printf(stream, " %sp0.%c (",
-                                               instr->cat0.inv1 ? "!" : "",
-                                               "xyzw"[instr->cat0.comp1 & 0x3]);
-                               print_reg_name(stream, instr, instr->srcs[0], false);
-                               mesa_log_stream_printf(stream, "), ");
-                       }
-                       if (brinfo[instr->cat0.brtype].nsrc >= 2) {
-                               mesa_log_stream_printf(stream, " %sp0.%c (",
-                                               instr->cat0.inv2 ? "!" : "",
-                                               "xyzw"[instr->cat0.comp2 & 0x3]);
-                               print_reg_name(stream, instr, instr->srcs[1], false);
-                               mesa_log_stream_printf(stream, "), ");
-                       }
-               }
-               mesa_log_stream_printf(stream, " target=block%u", block_id(instr->cat0.target));
-       }
-
-       if (instr->deps_count) {
-               mesa_log_stream_printf(stream, ", false-deps:");
-               unsigned n = 0;
-               for (unsigned i = 0; i < instr->deps_count; i++) {
-                       if (!instr->deps[i])
-                               continue;
-                       if (n++ > 0)
-                               mesa_log_stream_printf(stream, ", ");
-                       mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), instr->deps[i]->serialno);
-               }
-       }
-
-       mesa_log_stream_printf(stream, "\n");
+   tab(stream, lvl);
+
+   print_instr_name(stream, instr, true);
+
+   if (is_tex(instr)) {
+      mesa_log_stream_printf(stream, " (%s)(", type_name(instr->cat5.type));
+      for (unsigned i = 0; i < 4; i++)
+         if (instr->dsts[0]->wrmask & (1 << i))
+            mesa_log_stream_printf(stream, "%c", "xyzw"[i]);
+      mesa_log_stream_printf(stream, ")");
+   } else if ((instr->srcs_count > 0 || instr->dsts_count > 0) &&
+              (instr->opc != OPC_B)) {
+      /* NOTE the b(ranch) instruction has a suffix, which is
+       * handled below
+       */
+      mesa_log_stream_printf(stream, " ");
+   }
+
+   if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
+      bool first = true;
+      foreach_dst (reg, instr) {
+         if (reg->wrmask == 0)
+            continue;
+         if (!first)
+            mesa_log_stream_printf(stream, ", ");
+         print_reg_name(stream, instr, reg, true);
+         first = false;
+      }
+      foreach_src (reg, instr) {
+         if (!first)
+            mesa_log_stream_printf(stream, ", ");
+         print_reg_name(stream, instr, reg, false);
+         first = false;
+      }
+   }
+
+   if (is_tex(instr) && !(instr->flags & IR3_INSTR_S2EN)) {
+      if (!!(instr->flags & IR3_INSTR_B)) {
+         if (!!(instr->flags & IR3_INSTR_A1EN)) {
+            mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp);
+         } else {
+            mesa_log_stream_printf(stream, ", s#%d, t#%d",
+                                   instr->cat5.samp & 0xf,
+                                   instr->cat5.samp >> 4);
+         }
+      } else {
+         mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp,
+                                instr->cat5.tex);
+      }
+   }
+
+   if (instr->opc == OPC_META_SPLIT) {
+      mesa_log_stream_printf(stream, ", off=%d", instr->split.off);
+   } else if (instr->opc == OPC_META_TEX_PREFETCH) {
+      mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d",
+                             instr->prefetch.tex, instr->prefetch.samp,
+                             instr->prefetch.input_offset);
+   }
+
+   if (is_flow(instr) && instr->cat0.target) {
+      /* the predicate register src is implied: */
+      if (instr->opc == OPC_B) {
+         static const struct {
+            const char *suffix;
+            int nsrc;
+            bool idx;
+         } brinfo[7] = {
+            [BRANCH_PLAIN] = {"r", 1, false}, [BRANCH_OR] = {"rao", 2, false},
+            [BRANCH_AND] = {"raa", 2, false}, [BRANCH_CONST] = {"rac", 0, true},
+            [BRANCH_ANY] = {"any", 1, false}, [BRANCH_ALL] = {"all", 1, false},
+            [BRANCH_X] = {"rax", 0, false},
+         };
+
+         mesa_log_stream_printf(stream, "%s",
+                                brinfo[instr->cat0.brtype].suffix);
+         if (brinfo[instr->cat0.brtype].idx) {
+            mesa_log_stream_printf(stream, ".%u", instr->cat0.idx);
+         }
+         if (brinfo[instr->cat0.brtype].nsrc >= 1) {
+            mesa_log_stream_printf(stream, " %sp0.%c (",
+                                   instr->cat0.inv1 ? "!" : "",
+                                   "xyzw"[instr->cat0.comp1 & 0x3]);
+            print_reg_name(stream, instr, instr->srcs[0], false);
+            mesa_log_stream_printf(stream, "), ");
+         }
+         if (brinfo[instr->cat0.brtype].nsrc >= 2) {
+            mesa_log_stream_printf(stream, " %sp0.%c (",
+                                   instr->cat0.inv2 ? "!" : "",
+                                   "xyzw"[instr->cat0.comp2 & 0x3]);
+            print_reg_name(stream, instr, instr->srcs[1], false);
+            mesa_log_stream_printf(stream, "), ");
+         }
+      }
+      mesa_log_stream_printf(stream, " target=block%u",
+                             block_id(instr->cat0.target));
+   }
+
+   if (instr->deps_count) {
+      mesa_log_stream_printf(stream, ", false-deps:");
+      unsigned n = 0;
+      for (unsigned i = 0; i < instr->deps_count; i++) {
+         if (!instr->deps[i])
+            continue;
+         if (n++ > 0)
+            mesa_log_stream_printf(stream, ", ");
+         mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"),
+                                instr->deps[i]->serialno);
+      }
+   }
+
+   mesa_log_stream_printf(stream, "\n");
 }
 
-void ir3_print_instr(struct ir3_instruction *instr)
+void
+ir3_print_instr(struct ir3_instruction *instr)
 {
-       struct log_stream *stream = mesa_log_streami();
-       print_instr(stream, instr, 0);
-       mesa_log_stream_destroy(stream);
+   struct log_stream *stream = mesa_log_streami();
+   print_instr(stream, instr, 0);
+   mesa_log_stream_destroy(stream);
 }
 
 static void
 print_block(struct ir3_block *block, int lvl)
 {
-       struct log_stream *stream = mesa_log_streami();
-
-       tab(stream, lvl); mesa_log_stream_printf(stream, "block%u {\n", block_id(block));
-
-       if (block->predecessors_count > 0) {
-               tab(stream, lvl+1);
-               mesa_log_stream_printf(stream, "pred: ");
-               for (unsigned i = 0; i < block->predecessors_count; i++) {
-                       struct ir3_block *pred = block->predecessors[i];
-                       if (i != 0)
-                               mesa_log_stream_printf(stream, ", ");
-                       mesa_log_stream_printf(stream, "block%u", block_id(pred));
-               }
-               mesa_log_stream_printf(stream, "\n");
-       }
-
-       foreach_instr (instr, &block->instr_list) {
-               print_instr(stream, instr, lvl+1);
-       }
-
-       tab(stream, lvl+1); mesa_log_stream_printf(stream, "/* keeps:\n");
-       for (unsigned i = 0; i < block->keeps_count; i++) {
-               print_instr(stream, block->keeps[i], lvl+2);
-       }
-       tab(stream, lvl+1); mesa_log_stream_printf(stream, " */\n");
-
-       if (block->successors[1]) {
-               /* leading into if/else: */
-               tab(stream, lvl+1);
-               mesa_log_stream_printf(stream, "/* succs: if ");
-               switch (block->brtype) {
-               case IR3_BRANCH_COND:
-                       break;
-               case IR3_BRANCH_ANY:
-                       printf("any ");
-                       break;
-               case IR3_BRANCH_ALL:
-                       printf("all ");
-                       break;
-               case IR3_BRANCH_GETONE:
-                       printf("getone ");
-                       break;
-               }
-               if (block->condition)
-                       mesa_log_stream_printf(stream, SYN_SSA("ssa_%u")" ", block->condition->serialno);
-               mesa_log_stream_printf(stream, "block%u; else block%u; */\n",
-                               block_id(block->successors[0]),
-                               block_id(block->successors[1]));
-       } else if (block->successors[0]) {
-               tab(stream, lvl+1);
-               mesa_log_stream_printf(stream, "/* succs: block%u; */\n",
-                               block_id(block->successors[0]));
-       }
-       tab(stream, lvl); mesa_log_stream_printf(stream, "}\n");
+   struct log_stream *stream = mesa_log_streami();
+
+   tab(stream, lvl);
+   mesa_log_stream_printf(stream, "block%u {\n", block_id(block));
+
+   if (block->predecessors_count > 0) {
+      tab(stream, lvl + 1);
+      mesa_log_stream_printf(stream, "pred: ");
+      for (unsigned i = 0; i < block->predecessors_count; i++) {
+         struct ir3_block *pred = block->predecessors[i];
+         if (i != 0)
+            mesa_log_stream_printf(stream, ", ");
+         mesa_log_stream_printf(stream, "block%u", block_id(pred));
+      }
+      mesa_log_stream_printf(stream, "\n");
+   }
+
+   foreach_instr (instr, &block->instr_list) {
+      print_instr(stream, instr, lvl + 1);
+   }
+
+   tab(stream, lvl + 1);
+   mesa_log_stream_printf(stream, "/* keeps:\n");
+   for (unsigned i = 0; i < block->keeps_count; i++) {
+      print_instr(stream, block->keeps[i], lvl + 2);
+   }
+   tab(stream, lvl + 1);
+   mesa_log_stream_printf(stream, " */\n");
+
+   if (block->successors[1]) {
+      /* leading into if/else: */
+      tab(stream, lvl + 1);
+      mesa_log_stream_printf(stream, "/* succs: if ");
+      switch (block->brtype) {
+      case IR3_BRANCH_COND:
+         break;
+      case IR3_BRANCH_ANY:
+         printf("any ");
+         break;
+      case IR3_BRANCH_ALL:
+         printf("all ");
+         break;
+      case IR3_BRANCH_GETONE:
+         printf("getone ");
+         break;
+      }
+      if (block->condition)
+         mesa_log_stream_printf(stream, SYN_SSA("ssa_%u") " ",
+                                block->condition->serialno);
+      mesa_log_stream_printf(stream, "block%u; else block%u; */\n",
+                             block_id(block->successors[0]),
+                             block_id(block->successors[1]));
+   } else if (block->successors[0]) {
+      tab(stream, lvl + 1);
+      mesa_log_stream_printf(stream, "/* succs: block%u; */\n",
+                             block_id(block->successors[0]));
+   }
+   tab(stream, lvl);
+   mesa_log_stream_printf(stream, "}\n");
 }
 
 void
 ir3_print(struct ir3 *ir)
 {
-       foreach_block (block, &ir->block_list)
-               print_block(block, 0);
+   foreach_block (block, &ir->block_list)
+      print_block(block, 0);
 }
index febf3a5..6463b62 100644 (file)
@@ -23,9 +23,9 @@
  */
 
 #include "ir3_ra.h"
-#include "ir3_shader.h"
 #include "util/rb_tree.h"
 #include "util/u_math.h"
+#include "ir3_shader.h"
 
 /* This file implements an SSA-based register allocator. Unlike other
  * SSA-based allocators, it handles vector split/collect "smartly," meaning
 static int
 ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
 {
-       physreg_t reg = *(const physreg_t *)data;
-       const struct ir3_reg_interval *interval = ir3_rb_node_to_interval_const(node);
-       if (interval->reg->interval_start > reg)
-               return -1;
-       else if (interval->reg->interval_end <= reg)
-               return 1;
-       else
-               return 0;
+   physreg_t reg = *(const physreg_t *)data;
+   const struct ir3_reg_interval *interval =
+      ir3_rb_node_to_interval_const(node);
+   if (interval->reg->interval_start > reg)
+      return -1;
+   else if (interval->reg->interval_end <= reg)
+      return 1;
+   else
+      return 0;
 }
 
 static struct ir3_reg_interval *
 ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
 {
-       struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
-       return node ? ir3_rb_node_to_interval(node) : NULL;
+   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
+   return node ? ir3_rb_node_to_interval(node) : NULL;
 }
 
 static struct ir3_reg_interval *
 ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
 {
-       struct rb_node *node = rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
-       return node ? ir3_rb_node_to_interval(node) : NULL;
+   struct rb_node *node =
+      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
+   return node ? ir3_rb_node_to_interval(node) : NULL;
 }
 
 /* Get the interval covering the reg, or the closest to the right if it
@@ -81,155 +83,161 @@ ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
 static struct ir3_reg_interval *
 ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
 {
-       struct ir3_reg_interval *interval = ir3_reg_interval_search_sloppy(tree, offset);
-       if (!interval) {
-               return NULL;
-       } else if (interval->reg->interval_end > offset) {
-               return interval;
-       } else {
-               /* There is no interval covering reg, and ra_file_search_sloppy()
-                * returned the closest range to the left, so the next interval to the
-                * right should be the closest to the right.
-                */
-               return ir3_reg_interval_next_or_null(interval);
-       }
+   struct ir3_reg_interval *interval =
+      ir3_reg_interval_search_sloppy(tree, offset);
+   if (!interval) {
+      return NULL;
+   } else if (interval->reg->interval_end > offset) {
+      return interval;
+   } else {
+      /* There is no interval covering reg, and ra_file_search_sloppy()
+       * returned the closest range to the left, so the next interval to the
+       * right should be the closest to the right.
+       */
+      return ir3_reg_interval_next_or_null(interval);
+   }
 }
 
 static int
 ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
 {
-       const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
-       const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
-       return b->reg->interval_start - a->reg->interval_start;
+   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
+   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
+   return b->reg->interval_start - a->reg->interval_start;
 }
 
 static void
 interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
-                               struct ir3_reg_interval *interval)
-{
-       struct ir3_reg_interval *right =
-               ir3_reg_interval_search_right(tree, interval->reg->interval_start);
-       if (right && right->reg->interval_start < interval->reg->interval_end) {
-               /* We disallow trees where different members have different half-ness.
-                * This means that we can't treat bitcasts as copies like normal
-                * split/collect, so something like this would require an extra copy
-                * in mergedregs mode, and count as 4 half-units of register pressure
-                * instead of 2:
-                *
-                * f16vec2 foo = unpackFloat2x16(bar)
-                * ... = foo.x
-                * ... = bar
-                *
-                * However, relaxing this rule would open a huge can of worms. What
-                * happens when there's a vector of 16 things, and the fifth element
-                * has been bitcasted as a half-reg? Would that element alone have to
-                * be small enough to be used as a half-reg source? Let's keep that
-                * can of worms firmly shut for now.
-                */
-               assert((interval->reg->flags & IR3_REG_HALF) ==
-                          (right->reg->flags & IR3_REG_HALF));
-
-               if (right->reg->interval_end <= interval->reg->interval_end &&
-                       right->reg->interval_start >= interval->reg->interval_start) {
-                       /* Check if we're inserting something that's already inserted */
-                       assert(interval != right);
-
-                       /* "right" is contained in "interval" and must become a child of
-                        * it. There may be further children too.
-                        */
-                       for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
-                                right && right->reg->interval_start < interval->reg->interval_end;
-                                right = next, next = ir3_reg_interval_next_or_null(next)) {
-                               /* "right" must be contained in "interval." */
-                               assert(right->reg->interval_end <= interval->reg->interval_end);
-                               assert((interval->reg->flags & IR3_REG_HALF) ==
-                                          (right->reg->flags & IR3_REG_HALF));
-                               if (!right->parent)
-                                       ctx->interval_delete(ctx, right);
-                               right->parent = interval;
-                               rb_tree_remove(tree, &right->node);
-                               rb_tree_insert(&interval->children, &right->node,
-                                                          ir3_reg_interval_insert_cmp);
-                       }
-               } else {
-                       /* "right" must contain "interval," since intervals must form a
-                        * tree.
-                        */
-                       assert(right->reg->interval_start <= interval->reg->interval_start);
-                       interval->parent = right;
-                       interval_insert(ctx, &right->children, interval);
-                       return;
-               }
-       }
-
-       if (!interval->parent)
-               ctx->interval_add(ctx, interval);
-       rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
-       interval->inserted = true;
+                struct ir3_reg_interval *interval)
+{
+   struct ir3_reg_interval *right =
+      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
+   if (right && right->reg->interval_start < interval->reg->interval_end) {
+      /* We disallow trees where different members have different half-ness.
+       * This means that we can't treat bitcasts as copies like normal
+       * split/collect, so something like this would require an extra copy
+       * in mergedregs mode, and count as 4 half-units of register pressure
+       * instead of 2:
+       *
+       * f16vec2 foo = unpackFloat2x16(bar)
+       * ... = foo.x
+       * ... = bar
+       *
+       * However, relaxing this rule would open a huge can of worms. What
+       * happens when there's a vector of 16 things, and the fifth element
+       * has been bitcasted as a half-reg? Would that element alone have to
+       * be small enough to be used as a half-reg source? Let's keep that
+       * can of worms firmly shut for now.
+       */
+      assert((interval->reg->flags & IR3_REG_HALF) ==
+             (right->reg->flags & IR3_REG_HALF));
+
+      if (right->reg->interval_end <= interval->reg->interval_end &&
+          right->reg->interval_start >= interval->reg->interval_start) {
+         /* Check if we're inserting something that's already inserted */
+         assert(interval != right);
+
+         /* "right" is contained in "interval" and must become a child of
+          * it. There may be further children too.
+          */
+         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
+              right && right->reg->interval_start < interval->reg->interval_end;
+              right = next, next = ir3_reg_interval_next_or_null(next)) {
+            /* "right" must be contained in "interval." */
+            assert(right->reg->interval_end <= interval->reg->interval_end);
+            assert((interval->reg->flags & IR3_REG_HALF) ==
+                   (right->reg->flags & IR3_REG_HALF));
+            if (!right->parent)
+               ctx->interval_delete(ctx, right);
+            right->parent = interval;
+            rb_tree_remove(tree, &right->node);
+            rb_tree_insert(&interval->children, &right->node,
+                           ir3_reg_interval_insert_cmp);
+         }
+      } else {
+         /* "right" must contain "interval," since intervals must form a
+          * tree.
+          */
+         assert(right->reg->interval_start <= interval->reg->interval_start);
+         interval->parent = right;
+         interval_insert(ctx, &right->children, interval);
+         return;
+      }
+   }
+
+   if (!interval->parent)
+      ctx->interval_add(ctx, interval);
+   rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
+   interval->inserted = true;
 }
 
 void
-ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
+ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
+                        struct ir3_reg_interval *interval)
 {
-       interval_insert(ctx, &ctx->intervals, interval);
+   interval_insert(ctx, &ctx->intervals, interval);
 }
 
 void
-ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
-{
-       if (interval->parent) {
-               rb_tree_remove(&interval->parent->children, &interval->node);
-       } else {
-               ctx->interval_delete(ctx, interval);
-               rb_tree_remove(&ctx->intervals, &interval->node);
-       }
-
-       rb_tree_foreach_safe(struct ir3_reg_interval, child, &interval->children, node) {
-               rb_tree_remove(&interval->children, &child->node);
-               child->parent = interval->parent;
-
-               if (interval->parent) {
-                       rb_tree_insert(&child->parent->children, &child->node,
-                                                  ir3_reg_interval_insert_cmp);
-               } else {
-                       ctx->interval_readd(ctx, interval, child);
-                       rb_tree_insert(&ctx->intervals, &child->node,
-                                                  ir3_reg_interval_insert_cmp);
-               }
-       }
-
-       interval->inserted = false;
+ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
+                        struct ir3_reg_interval *interval)
+{
+   if (interval->parent) {
+      rb_tree_remove(&interval->parent->children, &interval->node);
+   } else {
+      ctx->interval_delete(ctx, interval);
+      rb_tree_remove(&ctx->intervals, &interval->node);
+   }
+
+   rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
+                         node) {
+      rb_tree_remove(&interval->children, &child->node);
+      child->parent = interval->parent;
+
+      if (interval->parent) {
+         rb_tree_insert(&child->parent->children, &child->node,
+                        ir3_reg_interval_insert_cmp);
+      } else {
+         ctx->interval_readd(ctx, interval, child);
+         rb_tree_insert(&ctx->intervals, &child->node,
+                        ir3_reg_interval_insert_cmp);
+      }
+   }
+
+   interval->inserted = false;
 }
 
 void
-ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
+ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
+                            struct ir3_reg_interval *interval)
 {
-       assert(!interval->parent);
+   assert(!interval->parent);
 
-       ctx->interval_delete(ctx, interval);
-       rb_tree_remove(&ctx->intervals, &interval->node);
+   ctx->interval_delete(ctx, interval);
+   rb_tree_remove(&ctx->intervals, &interval->node);
 }
 
 static void
 interval_dump(struct ir3_reg_interval *interval, unsigned indent)
 {
-       for (unsigned i = 0; i < indent; i++)
-               printf("\t");
-       printf("reg %u start %u\n", interval->reg->name, interval->reg->interval_start);
+   for (unsigned i = 0; i < indent; i++)
+      printf("\t");
+   printf("reg %u start %u\n", interval->reg->name,
+          interval->reg->interval_start);
 
-       rb_tree_foreach(struct ir3_reg_interval, child, &interval->children, node) {
-               interval_dump(child, indent + 1);
-       }
+   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
+      interval_dump(child, indent + 1);
+   }
 
-       for (unsigned i = 0; i < indent; i++)
-               printf("\t");
-       printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end);
+   for (unsigned i = 0; i < indent; i++)
+      printf("\t");
+   printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end);
 }
 
 void
 ir3_reg_interval_dump(struct ir3_reg_interval *interval)
 {
-       interval_dump(interval, 0);
+   interval_dump(interval, 0);
 }
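
The interval-tree insert above relies on a simple containment rule: overlapping intervals must nest perfectly, so an existing interval either becomes a child of the new one or the new one is pushed down into the existing one's children. A standalone sketch of that decision, using bare integer ranges rather than the ir3_reg_interval types (names here are made up for illustration):

/* Toy illustration of the containment rule used by interval_insert():
 * intervals are half-open [start, end) ranges and must nest perfectly.
 * Standalone sketch, not the actual ir3 data structures.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct range { unsigned start, end; };

/* true if "inner" is fully contained in "outer" */
static bool
contains(struct range outer, struct range inner)
{
   return outer.start <= inner.start && inner.end <= outer.end;
}

int
main(void)
{
   struct range existing = {2, 4};   /* e.g. one component already inserted */
   struct range incoming = {0, 8};   /* e.g. the whole vector being inserted */

   /* they overlap, so one must contain the other */
   assert(!(existing.end <= incoming.start || incoming.end <= existing.start));

   if (contains(incoming, existing))
      printf("existing [2,4) becomes a child of incoming [0,8)\n");
   else if (contains(existing, incoming))
      printf("recurse: insert incoming into existing's children\n");
   else
      printf("partial overlap: not allowed, intervals must nest\n");

   return 0;
}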
 
 /* These are the core datastructures used by the register allocator. First
@@ -238,34 +246,34 @@ ir3_reg_interval_dump(struct ir3_reg_interval *interval)
  */
 
 struct ra_interval {
-       struct ir3_reg_interval interval;
-
-       struct rb_node physreg_node;
-       physreg_t physreg_start, physreg_end;
-
-       /* True if this is a source of the current instruction which is entirely
-        * killed. This means we can allocate the dest over it, but we can't break
-        * it up.
-        */
-       bool is_killed;
-
-       /* True if this interval cannot be moved from its position. This is only
-        * used for precolored inputs to ensure that other inputs don't get
-        * allocated on top of them.
-        */
-       bool frozen;
+   struct ir3_reg_interval interval;
+
+   struct rb_node physreg_node;
+   physreg_t physreg_start, physreg_end;
+
+   /* True if this is a source of the current instruction which is entirely
+    * killed. This means we can allocate the dest over it, but we can't break
+    * it up.
+    */
+   bool is_killed;
+
+   /* True if this interval cannot be moved from its position. This is only
+    * used for precolored inputs to ensure that other inputs don't get
+    * allocated on top of them.
+    */
+   bool frozen;
 };
 
 struct ra_file {
-       struct ir3_reg_ctx reg_ctx;
+   struct ir3_reg_ctx reg_ctx;
 
-       BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
-       BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
+   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
+   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
 
-       struct rb_tree physreg_intervals;
+   struct rb_tree physreg_intervals;
 
-       unsigned size;
-       unsigned start;
+   unsigned size;
+   unsigned start;
 };
 
 /* State for inter-block tracking. When we split a live range to make space
@@ -275,123 +283,127 @@ struct ra_file {
  */
 
 struct ra_block_state {
-       /* Map of defining ir3_register -> physreg it was allocated to at the end
-        * of the block.
-        */
-       struct hash_table *renames;
-
-       /* For loops, we need to process a block before all its predecessors have
-        * been processed. In particular, we need to pick registers for values
-        * without knowing if all the predecessors have been renamed. This keeps
-        * track of the registers we chose so that when we visit the back-edge we
-        * can move them appropriately. If all predecessors have been visited
-        * before this block is visited then we don't need to fill this out. This
-        * is a map from ir3_register -> physreg.
-        */
-       struct hash_table *entry_regs;
-
-       /* True if the block has been visited and "renames" is complete.
-        */
-       bool visited;
-
-       /* True if the block is unreachable via the logical CFG. This happens for
-        * blocks after an if where both sides end in a break/continue. We ignore
-        * it for everything but shared registers.
-        */
-       bool logical_unreachable;
+   /* Map of defining ir3_register -> physreg it was allocated to at the end
+    * of the block.
+    */
+   struct hash_table *renames;
+
+   /* For loops, we need to process a block before all its predecessors have
+    * been processed. In particular, we need to pick registers for values
+    * without knowing if all the predecessors have been renamed. This keeps
+    * track of the registers we chose so that when we visit the back-edge we
+    * can move them appropriately. If all predecessors have been visited
+    * before this block is visited then we don't need to fill this out. This
+    * is a map from ir3_register -> physreg.
+    */
+   struct hash_table *entry_regs;
+
+   /* True if the block has been visited and "renames" is complete.
+    */
+   bool visited;
+
+   /* True if the block is unreachable via the logical CFG. This happens for
+    * blocks after an if where both sides end in a break/continue. We ignore
+    * it for everything but shared registers.
+    */
+   bool logical_unreachable;
 };
 
 struct ra_parallel_copy {
-       struct ra_interval *interval;
-       physreg_t src;
+   struct ra_interval *interval;
+   physreg_t src;
 };
 
 /* The main context: */
 
 struct ra_ctx {
-       /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
-        * half of this file too.
-        */
-       struct ra_file full;
+   /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
+    * half of this file too.
+    */
+   struct ra_file full;
 
-       /* hr0.x - hr63.w, only used without merged-regs. */
-       struct ra_file half;
+   /* hr0.x - hr63.w, only used without merged-regs. */
+   struct ra_file half;
 
-       /* Shared regs. */
-       struct ra_file shared;
+   /* Shared regs. */
+   struct ra_file shared;
 
-       struct ir3 *ir;
+   struct ir3 *ir;
 
-       struct ir3_liveness *live;
+   struct ir3_liveness *live;
 
-       struct ir3_block *block;
+   struct ir3_block *block;
 
-       const struct ir3_compiler *compiler;
-       gl_shader_stage stage;
+   const struct ir3_compiler *compiler;
+   gl_shader_stage stage;
 
-       /* Pending moves of top-level intervals that will be emitted once we're
-        * finished:
-        */
-       DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
+   /* Pending moves of top-level intervals that will be emitted once we're
+    * finished:
+    */
+   DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
 
-       struct ra_interval *intervals;
-       struct ra_block_state *blocks;
+   struct ra_interval *intervals;
+   struct ra_block_state *blocks;
 
-       bool merged_regs;
+   bool merged_regs;
 };
 
-#define foreach_interval(interval, file) \
-       rb_tree_foreach(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
-#define foreach_interval_rev(interval, file) \
-       rb_tree_foreach(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
-#define foreach_interval_safe(interval, file) \
-       rb_tree_foreach_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
-#define foreach_interval_rev_safe(interval, file) \
-       rb_tree_foreach_rev_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
+#define foreach_interval(interval, file)                                       \
+   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
+                    physreg_node)
+#define foreach_interval_rev(interval, file)                                   \
+   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
+                    physreg_node)
+#define foreach_interval_safe(interval, file)                                  \
+   rb_tree_foreach_safe (struct ra_interval, interval,                         \
+                         &(file)->physreg_intervals, physreg_node)
+#define foreach_interval_rev_safe(interval, file)                              \
+   rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
+                            &(file)->physreg_intervals, physreg_node)
 
 static struct ra_interval *
 rb_node_to_interval(struct rb_node *node)
 {
-       return rb_node_data(struct ra_interval, node, physreg_node);
+   return rb_node_data(struct ra_interval, node, physreg_node);
 }
 
 static const struct ra_interval *
 rb_node_to_interval_const(const struct rb_node *node)
 {
-       return rb_node_data(struct ra_interval, node, physreg_node);
+   return rb_node_data(struct ra_interval, node, physreg_node);
 }
 
 static struct ra_interval *
 ra_interval_next(struct ra_interval *interval)
 {
-       struct rb_node *next = rb_node_next(&interval->physreg_node);
-       return next ? rb_node_to_interval(next) : NULL;
+   struct rb_node *next = rb_node_next(&interval->physreg_node);
+   return next ? rb_node_to_interval(next) : NULL;
 }
 
 static struct ra_interval *
 ra_interval_next_or_null(struct ra_interval *interval)
 {
-       return interval ? ra_interval_next(interval) : NULL;
+   return interval ? ra_interval_next(interval) : NULL;
 }
 
 static int
 ra_interval_cmp(const struct rb_node *node, const void *data)
 {
-       physreg_t reg = *(const physreg_t *)data;
-       const struct ra_interval *interval = rb_node_to_interval_const(node);
-       if (interval->physreg_start > reg)
-               return -1;
-       else if (interval->physreg_end <= reg)
-               return 1;
-       else
-               return 0;
+   physreg_t reg = *(const physreg_t *)data;
+   const struct ra_interval *interval = rb_node_to_interval_const(node);
+   if (interval->physreg_start > reg)
+      return -1;
+   else if (interval->physreg_end <= reg)
+      return 1;
+   else
+      return 0;
 }
 
 static struct ra_interval *
 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
 {
-       struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
-       return node ? rb_node_to_interval(node) : NULL;
+   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
+   return node ? rb_node_to_interval(node) : NULL;
 }
 
 /* Get the interval covering the reg, or the closest to the right if it
@@ -400,218 +412,220 @@ ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
 static struct ra_interval *
 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
 {
-       struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
-       if (!interval) {
-               return NULL;
-       } else if (interval->physreg_end > reg) {
-               return interval;
-       } else {
-               /* There is no interval covering reg, and ra_file_search_sloppy()
-                * returned the closest range to the left, so the next interval to the
-                * right should be the closest to the right.
-                */
-               return ra_interval_next_or_null(interval);
-       }
+   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
+   if (!interval) {
+      return NULL;
+   } else if (interval->physreg_end > reg) {
+      return interval;
+   } else {
+      /* There is no interval covering reg, and ra_file_search_sloppy()
+       * returned the closest range to the left, so the next interval to the
+       * right should be the closest to the right.
+       */
+      return ra_interval_next_or_null(interval);
+   }
 }
 
 static struct ra_interval *
 ra_file_search_right(struct ra_file *file, physreg_t reg)
 {
-       return ra_interval_search_right(&file->physreg_intervals, reg);
+   return ra_interval_search_right(&file->physreg_intervals, reg);
 }
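
The "sloppy search, then step right" idea in ra_interval_search_right() can be shown with a plain sorted array instead of the rb-tree helpers. This is a standalone sketch with made-up types; it only mirrors the logic of "find the interval covering reg, or the nearest one starting above it":

/* Standalone sketch of ra_interval_search_right()'s logic over a sorted
 * array of half-open [start, end) intervals.
 */
#include <stdio.h>

struct ival { unsigned start, end; };

static const struct ival *
search_right(const struct ival *ivals, unsigned n, unsigned reg)
{
   const struct ival *best = NULL;   /* closest interval starting at or left of reg */
   for (unsigned i = 0; i < n; i++) {
      if (ivals[i].start <= reg)
         best = &ivals[i];           /* what the sloppy search would hand back */
   }

   if (!best)
      return n ? &ivals[0] : NULL;   /* everything starts to the right of reg */
   if (best->end > reg)
      return best;                   /* it actually covers reg */
   /* nothing covers reg, so the next interval is the closest to the right */
   return (best + 1 < ivals + n) ? best + 1 : NULL;
}

int
main(void)
{
   const struct ival ivals[] = {{0, 2}, {4, 6}, {8, 12}};
   const struct ival *r = search_right(ivals, 3, 3);   /* 3 lies in a gap */
   printf("closest to the right of reg 3: [%u, %u)\n", r->start, r->end);
   return 0;
}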
 
 static int
 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
 {
-       const struct ra_interval *a = rb_node_to_interval_const(_a);
-       const struct ra_interval *b = rb_node_to_interval_const(_b);
-       return b->physreg_start - a->physreg_start;
+   const struct ra_interval *a = rb_node_to_interval_const(_a);
+   const struct ra_interval *b = rb_node_to_interval_const(_b);
+   return b->physreg_start - a->physreg_start;
 }
 
 static struct ra_interval *
 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
 {
-       return rb_node_data(struct ra_interval, interval, interval);
+   return rb_node_data(struct ra_interval, interval, interval);
 }
 
 static struct ra_file *
 ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
 {
-       return rb_node_data(struct ra_file, ctx, reg_ctx);
+   return rb_node_data(struct ra_file, ctx, reg_ctx);
 }
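
Both conversion helpers above use the embedded-struct idiom: the generic ir3_reg_interval/ir3_reg_ctx is a member of the RA-specific struct, and rb_node_data() walks back from the member pointer to the containing struct. A minimal standalone version of that idiom, written with offsetof directly instead of mesa's rb_node_data macro (the struct names are invented for the sketch):

/* Standalone illustration of the container-of idiom behind
 * ir3_reg_interval_to_ra_interval()/ir3_reg_ctx_to_file(): subtract the
 * member's offset to recover the enclosing structure.
 */
#include <assert.h>
#include <stddef.h>

struct base_interval { unsigned start, end; };

struct ra_like_interval {
   struct base_interval interval;   /* embedded "base class" */
   unsigned physreg;                /* extra, RA-specific state */
};

#define container_of_sample(ptr, type, member) \
   ((type *)((char *)(ptr) - offsetof(type, member)))

int
main(void)
{
   struct ra_like_interval ra = { .interval = {0, 4}, .physreg = 12 };
   struct base_interval *base = &ra.interval;

   struct ra_like_interval *back =
      container_of_sample(base, struct ra_like_interval, interval);
   assert(back == &ra && back->physreg == 12);
   return 0;
}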
 
 static void
 interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
 {
-       struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
-       struct ra_file *file = ir3_reg_ctx_to_file(ctx);
+   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
+   struct ra_file *file = ir3_reg_ctx_to_file(ctx);
 
-       /* We can assume in this case that physreg_start/physreg_end is already
-        * initialized.
-        */
-       for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
-               BITSET_CLEAR(file->available, i);
-               BITSET_CLEAR(file->available_to_evict, i);
-       }
+   /* We can assume in this case that physreg_start/physreg_end is already
+    * initialized.
+    */
+   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
+      BITSET_CLEAR(file->available, i);
+      BITSET_CLEAR(file->available_to_evict, i);
+   }
 
-       rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
-                                  ra_interval_insert_cmp);
+   rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
+                  ra_interval_insert_cmp);
 }
 
 static void
 interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
 {
-       struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
-       struct ra_file *file = ir3_reg_ctx_to_file(ctx);
+   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
+   struct ra_file *file = ir3_reg_ctx_to_file(ctx);
 
-       for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
-               BITSET_SET(file->available, i);
-               BITSET_SET(file->available_to_evict, i);
-       }
+   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
+      BITSET_SET(file->available, i);
+      BITSET_SET(file->available_to_evict, i);
+   }
 
-       rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
+   rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
 }
 
 static void
 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
-                          struct ir3_reg_interval *_child)
+               struct ir3_reg_interval *_child)
 {
-       struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
-       struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
+   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
+   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
 
-       child->physreg_start = parent->physreg_start +
-               (child->interval.reg->interval_start - parent->interval.reg->interval_start);
-       child->physreg_end = child->physreg_start +
-               (child->interval.reg->interval_end - child->interval.reg->interval_start);
+   child->physreg_start =
+      parent->physreg_start + (child->interval.reg->interval_start -
+                               parent->interval.reg->interval_start);
+   child->physreg_end =
+      child->physreg_start +
+      (child->interval.reg->interval_end - child->interval.reg->interval_start);
 
-       interval_add(ctx, _child);
+   interval_add(ctx, _child);
 }
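
The re-basing arithmetic in interval_readd() is purely offset math: with made-up numbers, if the parent covers interval units 8..16 and sits at physreg 40, a child covering units 10..12 is re-added at physreg 40 + (10 - 8) = 42 and ends at 42 + (12 - 10) = 44.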
 
-
 static void
 ra_file_init(struct ra_file *file)
 {
-       for (unsigned i = 0; i < file->size; i++) {
-               BITSET_SET(file->available, i);
-               BITSET_SET(file->available_to_evict, i);
-       }
+   for (unsigned i = 0; i < file->size; i++) {
+      BITSET_SET(file->available, i);
+      BITSET_SET(file->available_to_evict, i);
+   }
 
-       file->start = 0;
+   file->start = 0;
 
-       rb_tree_init(&file->reg_ctx.intervals);
-       rb_tree_init(&file->physreg_intervals);
+   rb_tree_init(&file->reg_ctx.intervals);
+   rb_tree_init(&file->physreg_intervals);
 
-       file->reg_ctx.interval_add = interval_add;
-       file->reg_ctx.interval_delete = interval_delete;
-       file->reg_ctx.interval_readd = interval_readd;
+   file->reg_ctx.interval_add = interval_add;
+   file->reg_ctx.interval_delete = interval_delete;
+   file->reg_ctx.interval_readd = interval_readd;
 }
 
 static void
 ra_file_insert(struct ra_file *file, struct ra_interval *interval)
 {
-       assert(interval->physreg_start < interval->physreg_end);
-       assert(interval->physreg_end <= file->size);
-       if (interval->interval.reg->flags & IR3_REG_HALF)
-               assert(interval->physreg_end <= RA_HALF_SIZE);
+   assert(interval->physreg_start < interval->physreg_end);
+   assert(interval->physreg_end <= file->size);
+   if (interval->interval.reg->flags & IR3_REG_HALF)
+      assert(interval->physreg_end <= RA_HALF_SIZE);
 
-       ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
+   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
 }
 
 static void
 ra_file_remove(struct ra_file *file, struct ra_interval *interval)
 {
-       ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
+   ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
 }
 
 static void
 ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
 {
-       assert(!interval->interval.parent);
+   assert(!interval->interval.parent);
 
-       for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
-               BITSET_SET(file->available, i);
-       }
+   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
+      BITSET_SET(file->available, i);
+   }
 
-       interval->is_killed = true;
+   interval->is_killed = true;
 }
 
 static physreg_t
 ra_interval_get_physreg(const struct ra_interval *interval)
 {
-       unsigned child_start = interval->interval.reg->interval_start;
+   unsigned child_start = interval->interval.reg->interval_start;
 
-       while (interval->interval.parent) {
-               interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
-       }
+   while (interval->interval.parent) {
+      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
+   }
 
-       return interval->physreg_start +
-               (child_start - interval->interval.reg->interval_start);
+   return interval->physreg_start +
+          (child_start - interval->interval.reg->interval_start);
 }
 
 static unsigned
 ra_interval_get_num(const struct ra_interval *interval)
 {
-       return ra_physreg_to_num(ra_interval_get_physreg(interval),
-                                                        interval->interval.reg->flags);
+   return ra_physreg_to_num(ra_interval_get_physreg(interval),
+                            interval->interval.reg->flags);
 }
 
 static void
 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
 {
-       ir3_reg_interval_init(&interval->interval, reg);
-       interval->is_killed = false;
-       interval->frozen = false;
+   ir3_reg_interval_init(&interval->interval, reg);
+   interval->is_killed = false;
+   interval->frozen = false;
 }
 
 static void
 ra_interval_dump(struct ra_interval *interval)
 {
-       printf("physreg %u ", interval->physreg_start);
+   printf("physreg %u ", interval->physreg_start);
 
-       ir3_reg_interval_dump(&interval->interval);
+   ir3_reg_interval_dump(&interval->interval);
 }
 
 static void
 ra_file_dump(struct ra_file *file)
 {
-       rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
-               ra_interval_dump(interval);
-       }
+   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
+                    physreg_node) {
+      ra_interval_dump(interval);
+   }
 
-       unsigned start, end;
-       printf("available:\n");
-       BITSET_FOREACH_RANGE(start, end, file->available, file->size) {
-               printf("%u-%u ", start, end);
-       }
-       printf("\n");
+   unsigned start, end;
+   printf("available:\n");
+   BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
+      printf("%u-%u ", start, end);
+   }
+   printf("\n");
 
-       printf("available to evict:\n");
-       BITSET_FOREACH_RANGE(start, end, file->available_to_evict, file->size) {
-               printf("%u-%u ", start, end);
-       }
-       printf("\n");
-       printf("start: %u\n", file->start);
+   printf("available to evict:\n");
+   BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
+      printf("%u-%u ", start, end);
+   }
+   printf("\n");
+   printf("start: %u\n", file->start);
 }
 
 static void
 ra_ctx_dump(struct ra_ctx *ctx)
 {
-       printf("full:\n");
-       ra_file_dump(&ctx->full);
-       printf("half:\n");
-       ra_file_dump(&ctx->half);
-       printf("shared:\n");
-       ra_file_dump(&ctx->shared);
+   printf("full:\n");
+   ra_file_dump(&ctx->full);
+   printf("half:\n");
+   ra_file_dump(&ctx->half);
+   printf("shared:\n");
+   ra_file_dump(&ctx->shared);
 }
 
 static unsigned
 reg_file_size(struct ra_file *file, struct ir3_register *reg)
 {
-       /* Half-regs can only take up the first half of the combined regfile */
-       if (reg->flags & IR3_REG_HALF)
-               return MIN2(file->size, RA_HALF_SIZE);
-       else
-               return file->size;
+   /* Half-regs can only take up the first half of the combined regfile */
+   if (reg->flags & IR3_REG_HALF)
+      return MIN2(file->size, RA_HALF_SIZE);
+   else
+      return file->size;
 }
 
 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
@@ -620,68 +634,72 @@ reg_file_size(struct ra_file *file, struct ir3_register *reg)
  */
 
 struct ra_removed_interval {
-       struct ra_interval *interval;
-       unsigned size;
+   struct ra_interval *interval;
+   unsigned size;
 };
 
 static struct ra_removed_interval
 ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
-                       struct ra_interval *interval)
+                struct ra_interval *interval)
 {
-       assert(!interval->interval.parent);
+   assert(!interval->interval.parent);
 
-       /* Check if we've already moved this reg before */
-       unsigned pcopy_index;
-       for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count; pcopy_index++) {
-               if (ctx->parallel_copies[pcopy_index].interval == interval)
-                       break;
-       }
+   /* Check if we've already moved this reg before */
+   unsigned pcopy_index;
+   for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
+        pcopy_index++) {
+      if (ctx->parallel_copies[pcopy_index].interval == interval)
+         break;
+   }
 
-       if (pcopy_index == ctx->parallel_copies_count) {
-               array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) {
-                               .interval = interval,
-                               .src = interval->physreg_start,
-               });
-       }
+   if (pcopy_index == ctx->parallel_copies_count) {
+      array_insert(ctx, ctx->parallel_copies,
+                   (struct ra_parallel_copy){
+                      .interval = interval,
+                      .src = interval->physreg_start,
+                   });
+   }
 
-       ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval);
+   ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval);
 
-       return (struct ra_removed_interval) {
-               .interval = interval,
-               .size = interval->physreg_end - interval->physreg_start,
-       };
+   return (struct ra_removed_interval){
+      .interval = interval,
+      .size = interval->physreg_end - interval->physreg_start,
+   };
 }
 
 static void
 ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
-                                const struct ra_removed_interval *removed, physreg_t dst)
+                 const struct ra_removed_interval *removed, physreg_t dst)
 {
-       struct ra_interval *interval = removed->interval;
+   struct ra_interval *interval = removed->interval;
 
-       interval->physreg_start = dst;
-       interval->physreg_end = dst + removed->size;
+   interval->physreg_start = dst;
+   interval->physreg_end = dst + removed->size;
 
-       ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
+   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
 }
 
 /* Pick up the interval and place it at "dst". */
 static void
 ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
-                                struct ra_interval *interval, physreg_t dst)
+                 struct ra_interval *interval, physreg_t dst)
 {
-       struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
-       ra_push_interval(ctx, file, &temp, dst);
+   struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
+   ra_push_interval(ctx, file, &temp, dst);
 }
 
 static bool
-get_reg_specified(struct ra_file *file, struct ir3_register *reg, physreg_t physreg, bool is_source)
+get_reg_specified(struct ra_file *file, struct ir3_register *reg,
+                  physreg_t physreg, bool is_source)
 {
-       for (unsigned i = 0; i < reg_size(reg); i++) {
-               if (!BITSET_TEST(is_source ? file->available_to_evict : file->available, physreg + i))
-                       return false;
-       }
+   for (unsigned i = 0; i < reg_size(reg); i++) {
+      if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
+                       physreg + i))
+         return false;
+   }
 
-       return true;
+   return true;
 }
 
 /* Try to evict any registers conflicting with the proposed spot "physreg" for
@@ -691,105 +709,111 @@ get_reg_specified(struct ra_file *file, struct ir3_register *reg, physreg_t phys
 
 static bool
 try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
-                      struct ir3_register *reg, physreg_t physreg,
-                          unsigned *_eviction_count, bool is_source, bool speculative)
-{
-       BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
-       memcpy(available_to_evict, file->available_to_evict, sizeof(available_to_evict));
-
-       for (unsigned i = 0; i < reg_size(reg); i++)
-               BITSET_CLEAR(available_to_evict, physreg + i);
-       
-       unsigned eviction_count = 0;
-       /* Iterate over each range conflicting with physreg */
-       for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
-                *next = ra_interval_next_or_null(conflicting);
-                conflicting != NULL && conflicting->physreg_start < physreg + reg_size(reg);
-                conflicting = next, next = ra_interval_next_or_null(next)) {
-               if (!is_source && conflicting->is_killed)
-                       continue;
-
-               if (conflicting->frozen) {
-                       assert(speculative);
-                       return false;
-               }
-
-               unsigned avail_start, avail_end;
-               bool evicted = false;
-               BITSET_FOREACH_RANGE(avail_start, avail_end, available_to_evict,
-                                                        reg_file_size(file, conflicting->interval.reg)) {
-                       unsigned size = avail_end - avail_start;
-
-                       /* non-half registers must be aligned */
-                       if (!(conflicting->interval.reg->flags & IR3_REG_HALF) && avail_start % 2 == 1) {
-                               avail_start++;
-                               size--;
-                       }
-
-                       if (size >= conflicting->physreg_end - conflicting->physreg_start) {
-                               for (unsigned i = 0; i < conflicting->physreg_end - conflicting->physreg_start; i++)
-                                       BITSET_CLEAR(available_to_evict, avail_start + i);
-                               eviction_count += conflicting->physreg_end - conflicting->physreg_start;
-                               if (!speculative)
-                                       ra_move_interval(ctx, file, conflicting, avail_start);
-                               evicted = true;
-                               break;
-                       }
-               }
-
-               if (!evicted)
-                       return false;
-       }
-
-       *_eviction_count = eviction_count;
-       return true;
-}
-
-static int removed_interval_cmp(const void *_i1, const void *_i2)
-{
-       const struct ra_removed_interval *i1 = _i1;
-       const struct ra_removed_interval *i2 = _i2;
-
-       /* We sort the registers as follows:
-        *
-        * |--------------------------------------------------------------------|
-        * |                    |             |             |                   |
-        * |  Half live-through | Half killed | Full killed | Full live-through |
-        * |                    |             |             |                   |
-        * |--------------------------------------------------------------------|
-        *                        |                 |
-        *                        |   Destination   |
-        *                        |                 |
-        *                        |-----------------|
-        *
-        * Half-registers have to be first so that they stay in the low half of
-        * the register file. Then half and full killed must stay together so that
-        * there's a contiguous range where we can put the register. With this
-        * structure we should be able to accomodate any collection of intervals
-        * such that the total number of half components is within the half limit
-        * and the combined components are within the full limit.
-        */
-
-       unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
-       unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
-       if (i1_align > i2_align)
-               return 1;
-       if (i1_align < i2_align)
-               return -1;
-
-       if (i1_align == 1) {
-               if (i2->interval->is_killed)
-                       return -1;
-               if (i1->interval->is_killed)
-                       return 1;
-       } else {
-               if (i2->interval->is_killed)
-                       return 1;
-               if (i1->interval->is_killed)
-                       return -1;
-       }
-
-       return 0;
+               struct ir3_register *reg, physreg_t physreg,
+               unsigned *_eviction_count, bool is_source, bool speculative)
+{
+   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
+   memcpy(available_to_evict, file->available_to_evict,
+          sizeof(available_to_evict));
+
+   for (unsigned i = 0; i < reg_size(reg); i++)
+      BITSET_CLEAR(available_to_evict, physreg + i);
+
+   unsigned eviction_count = 0;
+   /* Iterate over each range conflicting with physreg */
+   for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
+                           *next = ra_interval_next_or_null(conflicting);
+        conflicting != NULL &&
+        conflicting->physreg_start < physreg + reg_size(reg);
+        conflicting = next, next = ra_interval_next_or_null(next)) {
+      if (!is_source && conflicting->is_killed)
+         continue;
+
+      if (conflicting->frozen) {
+         assert(speculative);
+         return false;
+      }
+
+      unsigned avail_start, avail_end;
+      bool evicted = false;
+      BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
+                            reg_file_size(file, conflicting->interval.reg)) {
+         unsigned size = avail_end - avail_start;
+
+         /* non-half registers must be aligned */
+         if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
+             avail_start % 2 == 1) {
+            avail_start++;
+            size--;
+         }
+
+         if (size >= conflicting->physreg_end - conflicting->physreg_start) {
+            for (unsigned i = 0;
+                 i < conflicting->physreg_end - conflicting->physreg_start; i++)
+               BITSET_CLEAR(available_to_evict, avail_start + i);
+            eviction_count +=
+               conflicting->physreg_end - conflicting->physreg_start;
+            if (!speculative)
+               ra_move_interval(ctx, file, conflicting, avail_start);
+            evicted = true;
+            break;
+         }
+      }
+
+      if (!evicted)
+         return false;
+   }
+
+   *_eviction_count = eviction_count;
+   return true;
+}
+
+static int
+removed_interval_cmp(const void *_i1, const void *_i2)
+{
+   const struct ra_removed_interval *i1 = _i1;
+   const struct ra_removed_interval *i2 = _i2;
+
+   /* We sort the registers as follows:
+    *
+    * |--------------------------------------------------------------------|
+    * |                    |             |             |                   |
+    * |  Half live-through | Half killed | Full killed | Full live-through |
+    * |                    |             |             |                   |
+    * |--------------------------------------------------------------------|
+    *                        |                 |
+    *                        |   Destination   |
+    *                        |                 |
+    *                        |-----------------|
+    *
+    * Half-registers have to be first so that they stay in the low half of
+    * the register file. Then half and full killed must stay together so that
+    * there's a contiguous range where we can put the register. With this
+    * structure we should be able to accomodate any collection of intervals
+    * such that the total number of half components is within the half limit
+    * and the combined components are within the full limit.
+    */
+
+   unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
+   unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
+   if (i1_align > i2_align)
+      return 1;
+   if (i1_align < i2_align)
+      return -1;
+
+   if (i1_align == 1) {
+      if (i2->interval->is_killed)
+         return -1;
+      if (i1->interval->is_killed)
+         return 1;
+   } else {
+      if (i2->interval->is_killed)
+         return 1;
+      if (i1->interval->is_killed)
+         return -1;
+   }
+
+   return 0;
 }
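
The ordering this comparator produces is easiest to see by feeding a small mixed set through qsort(). A standalone toy (its struct and names are made up, not the real ra_removed_interval) that reproduces the same half/killed ordering:

/* Standalone toy reproducing removed_interval_cmp()'s ordering:
 * half live-through, half killed, full killed, full live-through.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy { unsigned align; int killed; const char *name; };

static int
toy_cmp(const void *_a, const void *_b)
{
   const struct toy *a = _a, *b = _b;
   if (a->align != b->align)
      return a->align > b->align ? 1 : -1;   /* halves (align 1) sort first */
   if (a->align == 1)
      return a->killed - b->killed;          /* half: live-through before killed */
   return b->killed - a->killed;             /* full: killed before live-through */
}

int
main(void)
{
   struct toy regs[] = {
      {2, 0, "full live-through"},
      {1, 1, "half killed"},
      {2, 1, "full killed"},
      {1, 0, "half live-through"},
   };
   qsort(regs, 4, sizeof(regs[0]), toy_cmp);
   for (unsigned i = 0; i < 4; i++)
      printf("%u: %s\n", i, regs[i].name);
   /* prints: half live-through, half killed, full killed, full live-through */
   return 0;
}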
 
 /* "Compress" all the live intervals so that there is enough space for the
@@ -803,158 +827,165 @@ static int removed_interval_cmp(const void *_i1, const void *_i2)
  */
 static physreg_t
 compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
-                                  unsigned align, bool is_source)
-{
-       DECLARE_ARRAY(struct ra_removed_interval, intervals);
-       intervals_count = intervals_sz = 0;
-       intervals = NULL;
-
-       unsigned removed_full_size = 0;
-       unsigned removed_half_size = 0;
-       unsigned file_size = align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
-       physreg_t start_reg = 0;
-
-       foreach_interval_rev_safe(interval, file) {
-               /* Check if we can sort the intervals *after* this one and have
-                * enough space leftover to accomodate "size" units.
-                */
-               if (align == 1) {
-                       if (interval->physreg_end + removed_half_size <= file_size - size) {
-                               start_reg = interval->physreg_end;
-                               break;
-                       }
-               } else {
-                       if (interval->physreg_end + removed_half_size <= file_size -
-                                       removed_full_size - size) {
-                               start_reg = interval->physreg_end;
-                               break;
-                       }
-               }
-
-               /* We assume that all frozen intervals are at the start and that we
-                * can avoid popping them.
-                */
-               assert(!interval->frozen);
-
-               /* Killed sources don't count because they go at the end and can
-                * overlap the register we're trying to add.
-                */
-               if (!interval->is_killed && !is_source) {
-                       if (interval->interval.reg->flags & IR3_REG_HALF)
-                               removed_half_size += interval->physreg_end - interval->physreg_start;
-                       else
-                               removed_full_size += interval->physreg_end - interval->physreg_start;
-               }
-
-               /* Now that we've done the accounting, pop this off */
-               d("popping interval %u physreg %u\n", interval->interval.reg->name, interval->physreg_start);
-               array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
-       }
-
-       /* TODO: In addition to skipping registers at the beginning that are
-        * well-packed, we should try to skip registers at the end.
-        */
-
-       qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
-
-       physreg_t physreg = start_reg;
-       physreg_t ret_reg = (physreg_t) ~0;
-       for (unsigned i = 0; i < intervals_count; i++) {
-               if (ret_reg == (physreg_t) ~0 &&
-                       ((intervals[i].interval->is_killed && !is_source) ||
-                       !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
-                       ret_reg = ALIGN(physreg, align);
-               }
-
-               if (ret_reg != (physreg_t) ~0 &&
-                       (is_source || !intervals[i].interval->is_killed)) {
-                       physreg = MAX2(physreg, ret_reg + size);
-               }
-
-               if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
-                       physreg = ALIGN(physreg, 2);
-               }
-
-               if (physreg + intervals[i].size >
-                       reg_file_size(file, intervals[i].interval->interval.reg)) {
-                       d("ran out of room for interval %u!\n", intervals[i].interval->interval.reg->name);
-                       unreachable("reg pressure calculation was wrong!");
-                       return 0;
-               }
-
-               d("pushing interval %u physreg %u\n", intervals[i].interval->interval.reg->name, physreg);
-               ra_push_interval(ctx, file, &intervals[i], physreg);
-
-               physreg += intervals[i].size;
-       }
-
-       if (ret_reg == (physreg_t) ~0)
-               ret_reg = physreg;
-
-       ret_reg = ALIGN(ret_reg, align);
-       if (ret_reg + size > file_size) {
-               d("ran out of room for the new interval!\n");
-               unreachable("reg pressure calculation was wrong!");
-               return 0;
-       }
-
-       return ret_reg;
+                   unsigned align, bool is_source)
+{
+   DECLARE_ARRAY(struct ra_removed_interval, intervals);
+   intervals_count = intervals_sz = 0;
+   intervals = NULL;
+
+   unsigned removed_full_size = 0;
+   unsigned removed_half_size = 0;
+   unsigned file_size =
+      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
+   physreg_t start_reg = 0;
+
+   foreach_interval_rev_safe (interval, file) {
+      /* Check if we can sort the intervals *after* this one and have
+       * enough space leftover to accomodate "size" units.
+       */
+      if (align == 1) {
+         if (interval->physreg_end + removed_half_size <= file_size - size) {
+            start_reg = interval->physreg_end;
+            break;
+         }
+      } else {
+         if (interval->physreg_end + removed_half_size <=
+             file_size - removed_full_size - size) {
+            start_reg = interval->physreg_end;
+            break;
+         }
+      }
+
+      /* We assume that all frozen intervals are at the start and that we
+       * can avoid popping them.
+       */
+      assert(!interval->frozen);
+
+      /* Killed sources don't count because they go at the end and can
+       * overlap the register we're trying to add.
+       */
+      if (!interval->is_killed && !is_source) {
+         if (interval->interval.reg->flags & IR3_REG_HALF)
+            removed_half_size +=
+               interval->physreg_end - interval->physreg_start;
+         else
+            removed_full_size +=
+               interval->physreg_end - interval->physreg_start;
+      }
+
+      /* Now that we've done the accounting, pop this off */
+      d("popping interval %u physreg %u\n", interval->interval.reg->name,
+        interval->physreg_start);
+      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
+   }
+
+   /* TODO: In addition to skipping registers at the beginning that are
+    * well-packed, we should try to skip registers at the end.
+    */
+
+   qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
+
+   physreg_t physreg = start_reg;
+   physreg_t ret_reg = (physreg_t)~0;
+   for (unsigned i = 0; i < intervals_count; i++) {
+      if (ret_reg == (physreg_t)~0 &&
+          ((intervals[i].interval->is_killed && !is_source) ||
+           !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
+         ret_reg = ALIGN(physreg, align);
+      }
+
+      if (ret_reg != (physreg_t)~0 &&
+          (is_source || !intervals[i].interval->is_killed)) {
+         physreg = MAX2(physreg, ret_reg + size);
+      }
+
+      if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
+         physreg = ALIGN(physreg, 2);
+      }
+
+      if (physreg + intervals[i].size >
+          reg_file_size(file, intervals[i].interval->interval.reg)) {
+         d("ran out of room for interval %u!\n",
+           intervals[i].interval->interval.reg->name);
+         unreachable("reg pressure calculation was wrong!");
+         return 0;
+      }
+
+      d("pushing interval %u physreg %u\n",
+        intervals[i].interval->interval.reg->name, physreg);
+      ra_push_interval(ctx, file, &intervals[i], physreg);
+
+      physreg += intervals[i].size;
+   }
+
+   if (ret_reg == (physreg_t)~0)
+      ret_reg = physreg;
+
+   ret_reg = ALIGN(ret_reg, align);
+   if (ret_reg + size > file_size) {
+      d("ran out of room for the new interval!\n");
+      unreachable("reg pressure calculation was wrong!");
+      return 0;
+   }
+
+   return ret_reg;
 }
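
A rough worked example of the compression above (all numbers invented): file_size = 16, the request is a full destination of size 2, and the topmost interval occupies units 12..16. It leaves no room at the top, so it is popped; if it is live-through it is counted (removed_full_size = 4). If the next interval down ends at unit 8, the walk stops there because 8 + 0 <= 16 - 4 - 2, giving start_reg = 8: the destination is returned at unit 8 and the popped interval is re-packed above it at units 10..14. Had the popped interval been a killed source instead, it would not be counted, and the destination would simply be placed on top of it at unit 8.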
 
 static void
 update_affinity(struct ir3_register *reg, physreg_t physreg)
 {
-       if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t) ~0)
-               return;
+   if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
+      return;
 
-       if (physreg < reg->merge_set_offset)
-               return;
+   if (physreg < reg->merge_set_offset)
+      return;
 
-       reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
+   reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
 }
 
 /* Try to find free space for a register without shuffling anything. This uses
  * a round-robin algorithm to reduce false dependencies.
  */
 static physreg_t
-find_best_gap(struct ra_file *file, unsigned file_size,
-                     unsigned size, unsigned align, bool is_source)
-{
-       BITSET_WORD *available = is_source ? file->available_to_evict : file->available;
-
-       unsigned start = ALIGN(file->start, align) % (file_size - size + align);
-       unsigned candidate = start;
-       do {
-               bool is_available = true;
-               for (unsigned i = 0; i < size; i++) {
-                       if (!BITSET_TEST(available, candidate + i)) {
-                               is_available = false;
-                               break;
-                       }
-               }
-
-               if (is_available) {
-                       file->start = (candidate + size) % file_size;
-                       return candidate;
-               }
-
-               candidate += align;
-               if (candidate + size > file_size)
-                       candidate = 0;
-       } while (candidate != start);
-       
-       return (physreg_t) ~0;
+find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
+              unsigned align, bool is_source)
+{
+   BITSET_WORD *available =
+      is_source ? file->available_to_evict : file->available;
+
+   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
+   unsigned candidate = start;
+   do {
+      bool is_available = true;
+      for (unsigned i = 0; i < size; i++) {
+         if (!BITSET_TEST(available, candidate + i)) {
+            is_available = false;
+            break;
+         }
+      }
+
+      if (is_available) {
+         file->start = (candidate + size) % file_size;
+         return candidate;
+      }
+
+      candidate += align;
+      if (candidate + size > file_size)
+         candidate = 0;
+   } while (candidate != start);
+
+   return (physreg_t)~0;
 }
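
The round-robin scan in find_best_gap() can be reproduced with an ordinary bool array in place of the BITSET machinery. A standalone sketch (made-up names, fixed toy file size) showing how the moving start pointer spreads consecutive allocations out even when earlier slots are still free:

/* Standalone sketch of find_best_gap()'s round-robin scan: start where the
 * previous allocation left off, wrap around once, return ~0 on failure.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_FILE_SIZE 8

static unsigned next_start;   /* plays the role of file->start */

static unsigned
best_gap(const bool available[TOY_FILE_SIZE], unsigned size, unsigned align)
{
   unsigned start = ((next_start + align - 1) / align * align) %
                    (TOY_FILE_SIZE - size + align);
   unsigned candidate = start;
   do {
      bool ok = true;
      for (unsigned i = 0; i < size; i++) {
         if (!available[candidate + i]) {
            ok = false;
            break;
         }
      }

      if (ok) {
         next_start = (candidate + size) % TOY_FILE_SIZE;
         return candidate;
      }

      candidate += align;
      if (candidate + size > TOY_FILE_SIZE)
         candidate = 0;
   } while (candidate != start);

   return ~0u;
}

int
main(void)
{
   /* regs 2..3 and 6..7 are free */
   bool avail[TOY_FILE_SIZE] = {0, 0, 1, 1, 0, 0, 1, 1};
   printf("first:  %u\n", best_gap(avail, 2, 2)); /* 2 */
   /* note: 2..3 was not marked used, yet the moving start pointer alone
    * steers the second allocation elsewhere, reducing false dependencies.
    */
   printf("second: %u\n", best_gap(avail, 2, 2)); /* 6 */
   return 0;
}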
 
 static struct ra_file *
 ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
 {
-       if (reg->flags & IR3_REG_SHARED)
-               return &ctx->shared;
-       else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
-               return &ctx->full;
-       else
-               return &ctx->half;
+   if (reg->flags & IR3_REG_SHARED)
+      return &ctx->shared;
+   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
+      return &ctx->full;
+   else
+      return &ctx->half;
 }
 
 /* This is the main entrypoint for picking a register. Pick a free register
@@ -968,195 +999,200 @@ ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
 
 static physreg_t
 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
-               bool is_source)
-{
-       unsigned file_size = reg_file_size(file, reg);
-       if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t) ~0) {
-               physreg_t preferred_reg =
-                       reg->merge_set->preferred_reg + reg->merge_set_offset;
-               if (preferred_reg < file_size &&
-                       preferred_reg % reg_elem_size(reg) == 0 &&
-                       get_reg_specified(file, reg, preferred_reg, is_source))
-                       return preferred_reg;
-       }
-
-       /* If this register is a subset of a merge set which we have not picked a
-        * register for, first try to allocate enough space for the entire merge
-        * set.
-        */
-       unsigned size = reg_size(reg);
-       if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
-               size < reg->merge_set->size) {
-               physreg_t best_reg =
-                       find_best_gap(file, file_size, reg->merge_set->size, reg->merge_set->alignment, is_source);
-               if (best_reg != (physreg_t) ~0u) {
-                       best_reg += reg->merge_set_offset;
-                       return best_reg;
-               }
-       }
-
-       /* For ALU and SFU instructions, if the src reg is avail to pick, use it.
-        * Because this doesn't introduce unnecessary dependencies, and it
-        * potentially avoids needing (ss) syncs for write after read hazards for
-        * SFU instructions:
-        */
-       if (is_sfu(reg->instr) || is_alu(reg->instr)) {
-               for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
-                       struct ir3_register *src = reg->instr->srcs[i];
-                       if (!ra_reg_is_src(src))
-                               continue;
-                       if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
-                               struct ra_interval *src_interval =
-                                       &ctx->intervals[src->def->name];
-                               physreg_t src_physreg = ra_interval_get_physreg(src_interval);
-                               if (src_physreg % reg_elem_size(reg) == 0 &&
-                                       src_physreg + size <= file_size &&
-                                       get_reg_specified(file, reg, src_physreg, is_source))
-                                       return src_physreg;
-                       }
-               }
-       }
-
-       physreg_t best_reg =
-               find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
-       if (best_reg != (physreg_t) ~0u) {
-               return best_reg;
-       }
-
-       /* Ok, we couldn't find anything that fits. Here is where we have to start
-        * moving things around to make stuff fit. First try solely evicting
-        * registers in the way.
-        */
-       unsigned best_eviction_count = ~0;
-       for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
-               unsigned eviction_count;
-               if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
-                       if (eviction_count < best_eviction_count) {
-                               best_eviction_count = eviction_count;
-                               best_reg = i;
-                       }
-               }
-       }
-       
-       if (best_eviction_count != ~0) {
-               ASSERTED bool result =
-                       try_evict_regs(ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
-               assert(result);
-               return best_reg;
-       }
-
-       /* Use the dumb fallback only if try_evict_regs() fails. */
-       return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg), is_source);
+        bool is_source)
+{
+   unsigned file_size = reg_file_size(file, reg);
+   if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
+      physreg_t preferred_reg =
+         reg->merge_set->preferred_reg + reg->merge_set_offset;
+      if (preferred_reg < file_size &&
+          preferred_reg % reg_elem_size(reg) == 0 &&
+          get_reg_specified(file, reg, preferred_reg, is_source))
+         return preferred_reg;
+   }
+
+   /* If this register is a subset of a merge set which we have not picked a
+    * register for, first try to allocate enough space for the entire merge
+    * set.
+    */
+   unsigned size = reg_size(reg);
+   if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
+       size < reg->merge_set->size) {
+      physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
+                                         reg->merge_set->alignment, is_source);
+      if (best_reg != (physreg_t)~0u) {
+         best_reg += reg->merge_set_offset;
+         return best_reg;
+      }
+   }
+
+   /* For ALU and SFU instructions, if the src reg is avail to pick, use it.
+    * Because this doesn't introduce unnecessary dependencies, and it
+    * potentially avoids needing (ss) syncs for write after read hazards for
+    * SFU instructions:
+    */
+   if (is_sfu(reg->instr) || is_alu(reg->instr)) {
+      for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
+         struct ir3_register *src = reg->instr->srcs[i];
+         if (!ra_reg_is_src(src))
+            continue;
+         if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
+            struct ra_interval *src_interval = &ctx->intervals[src->def->name];
+            physreg_t src_physreg = ra_interval_get_physreg(src_interval);
+            if (src_physreg % reg_elem_size(reg) == 0 &&
+                src_physreg + size <= file_size &&
+                get_reg_specified(file, reg, src_physreg, is_source))
+               return src_physreg;
+         }
+      }
+   }
+
+   physreg_t best_reg =
+      find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
+   if (best_reg != (physreg_t)~0u) {
+      return best_reg;
+   }
+
+   /* Ok, we couldn't find anything that fits. Here is where we have to start
+    * moving things around to make stuff fit. First try solely evicting
+    * registers in the way.
+    */
+   unsigned best_eviction_count = ~0;
+   for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
+      unsigned eviction_count;
+      if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
+         if (eviction_count < best_eviction_count) {
+            best_eviction_count = eviction_count;
+            best_reg = i;
+         }
+      }
+   }
+
+   if (best_eviction_count != ~0) {
+      ASSERTED bool result = try_evict_regs(
+         ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
+      assert(result);
+      return best_reg;
+   }
+
+   /* Use the dumb fallback only if try_evict_regs() fails. */
+   return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
+                             is_source);
 }
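
In short, get_reg() above falls back in this order: the merge set's preferred physreg, a fresh gap large enough for the whole not-yet-placed merge set, reusing the slot of a source of the same ALU/SFU instruction when it is available, any free gap via find_best_gap(), the cheapest eviction found by try_evict_regs(), and finally compress_regs_left() as the last-resort shuffle.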
 
 static void
-assign_reg(struct ir3_instruction *instr, struct ir3_register *reg, unsigned num)
+assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
+           unsigned num)
 {
-       if (reg->flags & IR3_REG_ARRAY) {
-               reg->array.base = num;
-               if (reg->flags & IR3_REG_RELATIV)
-                       reg->array.offset += num;
-               else
-                       reg->num = num + reg->array.offset;
-       } else {
-               reg->num = num;
-       }
+   if (reg->flags & IR3_REG_ARRAY) {
+      reg->array.base = num;
+      if (reg->flags & IR3_REG_RELATIV)
+         reg->array.offset += num;
+      else
+         reg->num = num + reg->array.offset;
+   } else {
+      reg->num = num;
+   }
 }
 
 static void
 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
 {
-       struct ra_interval *interval = &ctx->intervals[src->def->name];
+   struct ra_interval *interval = &ctx->intervals[src->def->name];
+
+   if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
+       interval->interval.parent ||
+       !rb_tree_is_empty(&interval->interval.children))
+      return;
 
-       if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
-               interval->interval.parent || !rb_tree_is_empty(&interval->interval.children))
-               return;
-       
-       ra_file_mark_killed(ra_get_file(ctx, src), interval);
+   ra_file_mark_killed(ra_get_file(ctx, src), interval);
 }
 
 static void
 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
 {
-       struct ra_file *file = ra_get_file(ctx, dst);
-       struct ra_interval *interval = &ctx->intervals[dst->name];
+   struct ra_file *file = ra_get_file(ctx, dst);
+   struct ra_interval *interval = &ctx->intervals[dst->name];
 
-       d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
+   d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
 
-       if (!(dst->flags & IR3_REG_UNUSED))
-               ra_file_insert(file, interval);
+   if (!(dst->flags & IR3_REG_UNUSED))
+      ra_file_insert(file, interval);
 
-       assign_reg(dst->instr, dst, ra_interval_get_num(interval));
+   assign_reg(dst->instr, dst, ra_interval_get_num(interval));
 }
 
 static void
-allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst, physreg_t physreg)
+allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
+                   physreg_t physreg)
 {
-       struct ra_interval *interval = &ctx->intervals[dst->name];
-       update_affinity(dst, physreg);
+   struct ra_interval *interval = &ctx->intervals[dst->name];
+   update_affinity(dst, physreg);
 
-       ra_interval_init(interval, dst);
-       interval->physreg_start = physreg;
-       interval->physreg_end = physreg + reg_size(dst);
+   ra_interval_init(interval, dst);
+   interval->physreg_start = physreg;
+   interval->physreg_end = physreg + reg_size(dst);
 }
 
 static void
 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
 {
-       struct ra_file *file = ra_get_file(ctx, dst);
-
-       struct ir3_register *tied = dst->tied;
-       if (tied) {
-               struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
-               struct ra_interval *dst_interval = &ctx->intervals[dst->name];
-               physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
-               if (tied_interval->is_killed) {
-                       /* The easy case: the source is killed, so we can just reuse it
-                        * for the destination.
-                        */
-                       allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
-               } else {
-                       /* The source is live-through, so we need to get a free register
-                        * (which is free for both the source and destination!), copy the
-                        * original source to it, then use that for the source and
-                        * destination.
-                        */
-                       physreg_t physreg = get_reg(ctx, file, dst, true);
-                       allocate_dst_fixed(ctx, dst, physreg);
-                       array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) {
-                                       .interval = dst_interval,
-                                       .src = tied_physreg,
-                       });
-               }
-
-               return;
-       }
-
-       /* All the hard work is done by get_reg here. */
-       physreg_t physreg = get_reg(ctx, file, dst, false);
-
-       allocate_dst_fixed(ctx, dst, physreg);
+   struct ra_file *file = ra_get_file(ctx, dst);
+
+   struct ir3_register *tied = dst->tied;
+   if (tied) {
+      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
+      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
+      physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
+      if (tied_interval->is_killed) {
+         /* The easy case: the source is killed, so we can just reuse it
+          * for the destination.
+          */
+         allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
+      } else {
+         /* The source is live-through, so we need to get a free register
+          * (which is free for both the source and destination!), copy the
+          * original source to it, then use that for the source and
+          * destination.
+          */
+         physreg_t physreg = get_reg(ctx, file, dst, true);
+         allocate_dst_fixed(ctx, dst, physreg);
+         array_insert(ctx, ctx->parallel_copies,
+                      (struct ra_parallel_copy){
+                         .interval = dst_interval,
+                         .src = tied_physreg,
+                      });
+      }
+
+      return;
+   }
+
+   /* All the hard work is done by get_reg here. */
+   physreg_t physreg = get_reg(ctx, file, dst, false);
+
+   allocate_dst_fixed(ctx, dst, physreg);
 }
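
The tied-destination handling above has two outcomes: reuse the tied source's register when it dies at this instruction, or grab a fresh register and queue a copy when it is live-through. A small self-contained sketch of that decision, using an invented bump allocator in place of get_reg() and a flag in place of a real ra_parallel_copy entry:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the tied case in allocate_dst(): if the tied source is killed
 * here, the destination simply reuses its register; otherwise it gets a new
 * register and a copy from the old location must be emitted first.
 */
struct toy_alloc {
   unsigned next_free; /* pretend allocator: hand out increasing regs */
};

static unsigned
toy_get_reg(struct toy_alloc *a)
{
   return a->next_free++;
}

static unsigned
alloc_tied_dst(struct toy_alloc *a, unsigned tied_reg, bool tied_killed,
               bool *need_copy)
{
   if (tied_killed) {
      *need_copy = false;
      return tied_reg;
   }
   *need_copy = true; /* copy tied_reg -> returned reg before the instruction */
   return toy_get_reg(a);
}

int
main(void)
{
   struct toy_alloc a = {.next_free = 4};
   bool copy;
   unsigned r = alloc_tied_dst(&a, 2, false, &copy);
   printf("dst r%u, copy needed: %s\n", r, copy ? "yes" : "no");
   return 0;
}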
 
 static void
-assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
+assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
+           struct ir3_register *src)
 {
-       struct ra_interval *interval = &ctx->intervals[src->def->name];
-       struct ra_file *file = ra_get_file(ctx, src);
+   struct ra_interval *interval = &ctx->intervals[src->def->name];
+   struct ra_file *file = ra_get_file(ctx, src);
 
-       struct ir3_register *tied = src->tied;
-       physreg_t physreg;
-       if (tied) {
-               struct ra_interval *tied_interval = &ctx->intervals[tied->name];
-               physreg = ra_interval_get_physreg(tied_interval);
-       } else {
-               physreg = ra_interval_get_physreg(interval);
-       }
+   struct ir3_register *tied = src->tied;
+   physreg_t physreg;
+   if (tied) {
+      struct ra_interval *tied_interval = &ctx->intervals[tied->name];
+      physreg = ra_interval_get_physreg(tied_interval);
+   } else {
+      physreg = ra_interval_get_physreg(interval);
+   }
 
-       assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
+   assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
 
-       if (src->flags & IR3_REG_FIRST_KILL)
-               ra_file_remove(file, interval);
+   if (src->flags & IR3_REG_FIRST_KILL)
+      ra_file_remove(file, interval);
 }
 
 /* Insert a parallel copy instruction before the instruction with the parallel
@@ -1165,190 +1201,190 @@ assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_registe
 static void
 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (ctx->parallel_copies_count == 0)
-               return;
-
-       struct ir3_instruction *pcopy =
-               ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
-                                                ctx->parallel_copies_count,
-                                                ctx->parallel_copies_count);
-
-       for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
-               struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
-               struct ir3_register *reg =
-                       ir3_dst_create(pcopy, INVALID_REG,
-                                                  entry->interval->interval.reg->flags & ~IR3_REG_SSA);
-               reg->size = entry->interval->interval.reg->size;
-               reg->wrmask = entry->interval->interval.reg->wrmask;
-               assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
-       }
-
-       for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
-               struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
-               struct ir3_register *reg =
-                       ir3_src_create(pcopy, INVALID_REG,
-                                                  entry->interval->interval.reg->flags & ~IR3_REG_SSA);
-               reg->size = entry->interval->interval.reg->size;
-               reg->wrmask = entry->interval->interval.reg->wrmask;
-               assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
-       }
-
-       list_del(&pcopy->node);
-       list_addtail(&pcopy->node, &instr->node);
-       ctx->parallel_copies_count = 0;
+   if (ctx->parallel_copies_count == 0)
+      return;
+
+   struct ir3_instruction *pcopy =
+      ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
+                       ctx->parallel_copies_count, ctx->parallel_copies_count);
+
+   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
+      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
+      struct ir3_register *reg =
+         ir3_dst_create(pcopy, INVALID_REG,
+                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
+      reg->size = entry->interval->interval.reg->size;
+      reg->wrmask = entry->interval->interval.reg->wrmask;
+      assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
+   }
+
+   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
+      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
+      struct ir3_register *reg =
+         ir3_src_create(pcopy, INVALID_REG,
+                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
+      reg->size = entry->interval->interval.reg->size;
+      reg->wrmask = entry->interval->interval.reg->wrmask;
+      assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
+   }
+
+   list_del(&pcopy->node);
+   list_addtail(&pcopy->node, &instr->node);
+   ctx->parallel_copies_count = 0;
 }
 
 static void
 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       /* First, mark sources as going-to-be-killed while allocating the dest. */
-       ra_foreach_src(src, instr) {
-               mark_src_killed(ctx, src);
-       }
+   /* First, mark sources as going-to-be-killed while allocating the dest. */
+   ra_foreach_src (src, instr) {
+      mark_src_killed(ctx, src);
+   }
 
-       /* Allocate the destination. */
-       ra_foreach_dst(dst, instr) {
-               allocate_dst(ctx, dst);
-       }
+   /* Allocate the destination. */
+   ra_foreach_dst (dst, instr) {
+      allocate_dst(ctx, dst);
+   }
 
-       /* Now handle sources. Go backward so that in case there are multiple
-        * sources with the same def and that def is killed we only remove it at
-        * the end.
-        */
-       ra_foreach_src_rev(src, instr) {
-               assign_src(ctx, instr, src);
-       }
+   /* Now handle sources. Go backward so that in case there are multiple
+    * sources with the same def and that def is killed we only remove it at
+    * the end.
+    */
+   ra_foreach_src_rev (src, instr) {
+      assign_src(ctx, instr, src);
+   }
 
-       /* Now finally insert the destination into the map. */
-       ra_foreach_dst(dst, instr) {
-               insert_dst(ctx, dst);
-       }
+   /* Now finally insert the destination into the map. */
+   ra_foreach_dst (dst, instr) {
+      insert_dst(ctx, dst);
+   }
 
-       insert_parallel_copy_instr(ctx, instr);
+   insert_parallel_copy_instr(ctx, instr);
 }
 
 static void
 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       struct ir3_register *dst = instr->dsts[0];
-       struct ir3_register *src = instr->srcs[0];
+   struct ir3_register *dst = instr->dsts[0];
+   struct ir3_register *src = instr->srcs[0];
 
-       if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
-               handle_normal_instr(ctx, instr);
-               return;
-       }
+   if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
+      handle_normal_instr(ctx, instr);
+      return;
+   }
 
-       struct ra_interval *src_interval = &ctx->intervals[src->def->name];
+   struct ra_interval *src_interval = &ctx->intervals[src->def->name];
 
-       physreg_t physreg = ra_interval_get_physreg(src_interval);
-       assign_src(ctx, instr, src);
+   physreg_t physreg = ra_interval_get_physreg(src_interval);
+   assign_src(ctx, instr, src);
 
-       allocate_dst_fixed(ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
-       insert_dst(ctx, dst);
+   allocate_dst_fixed(
+      ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
+   insert_dst(ctx, dst);
 }
 
 static void
 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
-       unsigned dst_offset = instr->dsts[0]->merge_set_offset;
-       
-       if (!dst_set || dst_set->regs_count == 1) {
-               handle_normal_instr(ctx, instr);
-               return;
-       }
-
-       /* We need to check if any of the sources are contained in an interval
-        * that is at least as large as the vector. In this case, we should put
-        * the vector inside that larger interval. (There should be one
-        * unambiguous place to put it, because values sharing the same merge set
-        * should be allocated together.) This can happen in a case like:
-        *
-        * ssa_1 (wrmask=0xf) = ...
-        * ssa_2 = split ssa_1 off:0
-        * ssa_3 = split ssa_1 off:1
-        * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
-        * ... = (kill)ssa_1
-        * ... = (kill)ssa_4
-        *
-        * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
-        */
-       physreg_t dst_fixed = (physreg_t) ~0u;
-
-       for (unsigned i = 0; i < instr->srcs_count; i++) {
-               if (!ra_reg_is_src(instr->srcs[i]))
-                       continue;
-
-               if (instr->srcs[i]->flags & IR3_REG_FIRST_KILL) {
-                       mark_src_killed(ctx, instr->srcs[i]);
-               }
-
-               struct ir3_register *src = instr->srcs[i];
-               struct ra_interval *interval = &ctx->intervals[src->def->name];
-
-               if (src->def->merge_set != dst_set || interval->is_killed)
-                       continue;
-               while (interval->interval.parent != NULL) {
-                       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
-               }
-               if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
-                       dst_fixed = interval->physreg_start - interval->interval.reg->merge_set_offset + dst_offset;
-               } else {
-                       /* For sources whose root interval is smaller than the
-                        * destination (i.e. the normal case), we will shuffle them
-                        * around after allocating the destination. Mark them killed so
-                        * that the destination can be allocated over them, even if they
-                        * aren't actually killed.
-                        */
-                       ra_file_mark_killed(ra_get_file(ctx, src), interval);
-               }
-       }
-
-       if (dst_fixed != (physreg_t) ~0u)
-               allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
-       else
-               allocate_dst(ctx, instr->dsts[0]);
-
-       /* Remove the temporary is_killed we added */
-       for (unsigned i = 0; i < instr->srcs_count; i++) {
-               if (!ra_reg_is_src(instr->srcs[i]))
-                       continue;
-
-               struct ir3_register *src = instr->srcs[i];
-               struct ra_interval *interval = &ctx->intervals[src->def->name];
-               while (interval->interval.parent != NULL) {
-                       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
-               }
-
-               /* Filter out cases where it actually should be killed */
-               if (interval != &ctx->intervals[src->def->name] ||
-                       !(src->flags & IR3_REG_KILL))
-                       interval->is_killed = false;
-       }
-
-
-       ra_foreach_src_rev(src, instr) {
-               assign_src(ctx, instr, src);
-       }
-
-       /* We need to do this before insert_dst(), so that children of the
-        * destination which got marked as killed and then shuffled around to make
-        * space for the destination have the correct pcopy destination that
-        * matches what we assign the source of the collect to in assign_src().
-        *
-        * TODO: In this case we'll wind up copying the value in the pcopy and
-        * then again in the collect. We could avoid one of those by updating the
-        * pcopy destination to match up with the final location of the source
-        * after the collect and making the collect a no-op. However this doesn't
-        * seem to happen often.
-        */
-       insert_parallel_copy_instr(ctx, instr);
-
-       /* Note: insert_dst will automatically shuffle around any intervals that
-        * are a child of the collect by making them children of the collect.
-        */
-
-       insert_dst(ctx, instr->dsts[0]);
+   struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
+   unsigned dst_offset = instr->dsts[0]->merge_set_offset;
+
+   if (!dst_set || dst_set->regs_count == 1) {
+      handle_normal_instr(ctx, instr);
+      return;
+   }
+
+   /* We need to check if any of the sources are contained in an interval
+    * that is at least as large as the vector. In this case, we should put
+    * the vector inside that larger interval. (There should be one
+    * unambiguous place to put it, because values sharing the same merge set
+    * should be allocated together.) This can happen in a case like:
+    *
+    * ssa_1 (wrmask=0xf) = ...
+    * ssa_2 = split ssa_1 off:0
+    * ssa_3 = split ssa_1 off:1
+    * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
+    * ... = (kill)ssa_1
+    * ... = (kill)ssa_4
+    *
+    * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
+    */
+   physreg_t dst_fixed = (physreg_t)~0u;
+
+   for (unsigned i = 0; i < instr->srcs_count; i++) {
+      if (!ra_reg_is_src(instr->srcs[i]))
+         continue;
+
+      if (instr->srcs[i]->flags & IR3_REG_FIRST_KILL) {
+         mark_src_killed(ctx, instr->srcs[i]);
+      }
+
+      struct ir3_register *src = instr->srcs[i];
+      struct ra_interval *interval = &ctx->intervals[src->def->name];
+
+      if (src->def->merge_set != dst_set || interval->is_killed)
+         continue;
+      while (interval->interval.parent != NULL) {
+         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
+      }
+      if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
+         dst_fixed = interval->physreg_start -
+                     interval->interval.reg->merge_set_offset + dst_offset;
+      } else {
+         /* For sources whose root interval is smaller than the
+          * destination (i.e. the normal case), we will shuffle them
+          * around after allocating the destination. Mark them killed so
+          * that the destination can be allocated over them, even if they
+          * aren't actually killed.
+          */
+         ra_file_mark_killed(ra_get_file(ctx, src), interval);
+      }
+   }
+
+   if (dst_fixed != (physreg_t)~0u)
+      allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
+   else
+      allocate_dst(ctx, instr->dsts[0]);
+
+   /* Remove the temporary is_killed we added */
+   for (unsigned i = 0; i < instr->srcs_count; i++) {
+      if (!ra_reg_is_src(instr->srcs[i]))
+         continue;
+
+      struct ir3_register *src = instr->srcs[i];
+      struct ra_interval *interval = &ctx->intervals[src->def->name];
+      while (interval->interval.parent != NULL) {
+         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
+      }
+
+      /* Filter out cases where it actually should be killed */
+      if (interval != &ctx->intervals[src->def->name] ||
+          !(src->flags & IR3_REG_KILL))
+         interval->is_killed = false;
+   }
+
+   ra_foreach_src_rev (src, instr) {
+      assign_src(ctx, instr, src);
+   }
+
+   /* We need to do this before insert_dst(), so that children of the
+    * destination which got marked as killed and then shuffled around to make
+    * space for the destination have the correct pcopy destination that
+    * matches what we assign the source of the collect to in assign_src().
+    *
+    * TODO: In this case we'll wind up copying the value in the pcopy and
+    * then again in the collect. We could avoid one of those by updating the
+    * pcopy destination to match up with the final location of the source
+    * after the collect and making the collect a no-op. However this doesn't
+    * seem to happen often.
+    */
+   insert_parallel_copy_instr(ctx, instr);
+
+   /* Note: insert_dst will automatically shuffle around any intervals that
+    * are a child of the collect by making them children of the collect.
+    */
+
+   insert_dst(ctx, instr->dsts[0]);
 }
 
 /* Parallel copies before RA should only be at the end of the block, for
@@ -1358,9 +1394,9 @@ handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
 static void
 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       ra_foreach_src_rev(src, instr) {
-               assign_src(ctx, instr, src);
-       }
+   ra_foreach_src_rev (src, instr) {
+      assign_src(ctx, instr, src);
+   }
 }
 
 /* Some inputs may need to be precolored. We need to handle those first, so
@@ -1372,46 +1408,46 @@ handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
 static void
 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (instr->dsts[0]->num == INVALID_REG)
-               return;
+   if (instr->dsts[0]->num == INVALID_REG)
+      return;
 
-       struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
-       physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
-       allocate_dst_fixed(ctx, instr->dsts[0], physreg);
-       insert_dst(ctx, instr->dsts[0]);
-       interval->frozen = true;
+   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
+   physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
+   allocate_dst_fixed(ctx, instr->dsts[0], physreg);
+   insert_dst(ctx, instr->dsts[0]);
+   interval->frozen = true;
 }
 
 static void
 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (instr->dsts[0]->num != INVALID_REG)
-               return;
+   if (instr->dsts[0]->num != INVALID_REG)
+      return;
 
-       allocate_dst(ctx, instr->dsts[0]);
+   allocate_dst(ctx, instr->dsts[0]);
 
-       struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
-       struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
-       ra_file_insert(file, interval);
+   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
+   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
+   ra_file_insert(file, interval);
 }
 
 static void
 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
-       struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
+   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
+   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
 
-       if (instr->dsts[0]->num == INVALID_REG) {
-               assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
-       } else {
-               interval->frozen = false;
-       }
+   if (instr->dsts[0]->num == INVALID_REG) {
+      assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
+   } else {
+      interval->frozen = false;
+   }
 
-       if (instr->dsts[0]->flags & IR3_REG_UNUSED)
-               ra_file_remove(file, interval);
+   if (instr->dsts[0]->flags & IR3_REG_UNUSED)
+      ra_file_remove(file, interval);
 
-       ra_foreach_src_rev(src, instr)
-               assign_src(ctx, instr, src);
+   ra_foreach_src_rev (src, instr)
+      assign_src(ctx, instr, src);
 }
 
 /* chmask is a bit weird, because it has pre-colored sources due to the need
@@ -1433,156 +1469,158 @@ assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 static void
 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
 {
-       struct ra_file *file = ra_get_file(ctx, src);
-       struct ra_interval *interval = &ctx->intervals[src->def->name];
-       physreg_t physreg = ra_reg_get_physreg(src);
+   struct ra_file *file = ra_get_file(ctx, src);
+   struct ra_interval *interval = &ctx->intervals[src->def->name];
+   physreg_t physreg = ra_reg_get_physreg(src);
 
-       if (ra_interval_get_num(interval) == src->num)
-               return;
+   if (ra_interval_get_num(interval) == src->num)
+      return;
 
-       /* Try evicting stuff in our way if it isn't free. This won't move
-        * anything unless it overlaps with our precolored physreg, so we don't
-        * have to worry about evicting other precolored sources.
-        */
-       if (!get_reg_specified(file, src, physreg, true)) {
-               unsigned eviction_count;
-               if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true, false)) {
-                       unreachable("failed to evict for precolored source!");
-                       return;
-               }
-       }
+   /* Try evicting stuff in our way if it isn't free. This won't move
+    * anything unless it overlaps with our precolored physreg, so we don't
+    * have to worry about evicting other precolored sources.
+    */
+   if (!get_reg_specified(file, src, physreg, true)) {
+      unsigned eviction_count;
+      if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
+                          false)) {
+         unreachable("failed to evict for precolored source!");
+         return;
+      }
+   }
 
-       ra_move_interval(ctx, file, interval, physreg);
+   ra_move_interval(ctx, file, interval, physreg);
 }
 
 static void
 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-       /* Note: we purposely don't mark sources as killed, so that we can reuse
-        * some of the get_reg() machinery as-if the source is a destination.
-        * Marking it as killed would make e.g. get_reg_specified() wouldn't work
-        * correctly.
-        */
-       ra_foreach_src(src, instr) {
-               assert(src->num != INVALID_REG);
-               handle_precolored_source(ctx, src);
-       }
+   /* Note: we purposely don't mark sources as killed, so that we can reuse
+    * some of the get_reg() machinery as if the source were a destination.
+    * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
+    * work correctly.
+    */
+   ra_foreach_src (src, instr) {
+      assert(src->num != INVALID_REG);
+      handle_precolored_source(ctx, src);
+   }
 
-       ra_foreach_src(src, instr) {
-               struct ra_file *file = ra_get_file(ctx, src);
-               struct ra_interval *interval = &ctx->intervals[src->def->name];
-               if (src->flags & IR3_REG_FIRST_KILL)
-                       ra_file_remove(file, interval);
-       }
+   ra_foreach_src (src, instr) {
+      struct ra_file *file = ra_get_file(ctx, src);
+      struct ra_interval *interval = &ctx->intervals[src->def->name];
+      if (src->flags & IR3_REG_FIRST_KILL)
+         ra_file_remove(file, interval);
+   }
 
-       insert_parallel_copy_instr(ctx, instr);
+   insert_parallel_copy_instr(ctx, instr);
 }
 
 static physreg_t
-read_register(struct ra_ctx *ctx, struct ir3_block *block, struct ir3_register *def)
+read_register(struct ra_ctx *ctx, struct ir3_block *block,
+              struct ir3_register *def)
 {
-       struct ra_block_state *state = &ctx->blocks[block->index];
-       if (state->renames) {
-               struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
-               if (entry) {
-                       return (physreg_t)(uintptr_t)entry->data;
-               }
-       }
+   struct ra_block_state *state = &ctx->blocks[block->index];
+   if (state->renames) {
+      struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
+      if (entry) {
+         return (physreg_t)(uintptr_t)entry->data;
+      }
+   }
 
-       return ra_reg_get_physreg(def);
+   return ra_reg_get_physreg(def);
 }
 
 static void
 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
 {
-       physreg_t physreg = ~0;
-       for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
-               struct ir3_block *pred = ctx->block->predecessors[i];
-               struct ra_block_state *pred_state = &ctx->blocks[pred->index];
+   physreg_t physreg = ~0;
+   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
+      struct ir3_block *pred = ctx->block->predecessors[i];
+      struct ra_block_state *pred_state = &ctx->blocks[pred->index];
 
-               if (!pred_state->visited ||
-                       (pred_state->logical_unreachable && !(def->flags & IR3_REG_SHARED)))
-                       continue;
+      if (!pred_state->visited ||
+          (pred_state->logical_unreachable && !(def->flags & IR3_REG_SHARED)))
+         continue;
 
-               physreg = read_register(ctx, pred, def);
-               break;
-       }
+      physreg = read_register(ctx, pred, def);
+      break;
+   }
 
-       assert(physreg != (physreg_t)~0);
+   assert(physreg != (physreg_t)~0);
 
-       struct ra_interval *interval = &ctx->intervals[def->name];
-       struct ra_file *file = ra_get_file(ctx, def);
-       ra_interval_init(interval, def);
-       interval->physreg_start = physreg;
-       interval->physreg_end = physreg + reg_size(def);
-       ra_file_insert(file, interval);
+   struct ra_interval *interval = &ctx->intervals[def->name];
+   struct ra_file *file = ra_get_file(ctx, def);
+   ra_interval_init(interval, def);
+   interval->physreg_start = physreg;
+   interval->physreg_end = physreg + reg_size(def);
+   ra_file_insert(file, interval);
 }
 
 static void
 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
 {
-       /* Skip parallelcopy's which in the original program are only used as phi
-        * arguments. Even though phi arguments are live out, they are only
-        * assigned when the phi is.
-        */
-       if (def->instr->opc == OPC_META_PARALLEL_COPY)
-               return;
+   /* Skip parallel copies which in the original program are only used as phi
+    * arguments. Even though phi arguments are live out, they are only
+    * assigned when the phi is.
+    */
+   if (def->instr->opc == OPC_META_PARALLEL_COPY)
+      return;
 
-       struct ra_block_state *state = &ctx->blocks[ctx->block->index];
-       struct ra_interval *interval = &ctx->intervals[def->name];
-       physreg_t physreg = ra_interval_get_physreg(interval);
-       if (physreg != ra_reg_get_physreg(def)) {
-               if (!state->renames)
-                       state->renames = _mesa_pointer_hash_table_create(ctx);
-               _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
-       }
+   struct ra_block_state *state = &ctx->blocks[ctx->block->index];
+   struct ra_interval *interval = &ctx->intervals[def->name];
+   physreg_t physreg = ra_interval_get_physreg(interval);
+   if (physreg != ra_reg_get_physreg(def)) {
+      if (!state->renames)
+         state->renames = _mesa_pointer_hash_table_create(ctx);
+      _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
+   }
 }
 
 static void
 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
 {
-       struct ra_file *file = ra_get_file(ctx, def);
-       struct ra_interval *interval = &ctx->intervals[def->name];
+   struct ra_file *file = ra_get_file(ctx, def);
+   struct ra_interval *interval = &ctx->intervals[def->name];
 
-       /* phis are always scalar, so they should already be the smallest possible 
-        * size. However they may be coalesced with other live-in values/phi
-        * nodes, so check for that here.
-        */
-       struct ir3_reg_interval *parent_ir3 =
-               ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
-       physreg_t physreg;
-       if (parent_ir3) {
-               struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
-               physreg = ra_interval_get_physreg(parent) +
-                       (def->interval_start - parent_ir3->reg->interval_start);
-       } else {
-               physreg = get_reg(ctx, file, def, false);
-       }
+   /* phis are always scalar, so they should already be the smallest possible
+    * size. However, they may be coalesced with other live-in values/phi
+    * nodes, so check for that here.
+    */
+   struct ir3_reg_interval *parent_ir3 =
+      ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
+   physreg_t physreg;
+   if (parent_ir3) {
+      struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
+      physreg = ra_interval_get_physreg(parent) +
+                (def->interval_start - parent_ir3->reg->interval_start);
+   } else {
+      physreg = get_reg(ctx, file, def, false);
+   }
 
-       allocate_dst_fixed(ctx, def, physreg);
+   allocate_dst_fixed(ctx, def, physreg);
 
-       ra_file_insert(file, interval);
+   ra_file_insert(file, interval);
 }
 
 static void
 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
 {
-       struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
-       struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
-       assert(!interval->interval.parent);
-       unsigned num = ra_interval_get_num(interval);
-       assign_reg(phi, phi->dsts[0], num);
+   struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
+   struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
+   assert(!interval->interval.parent);
+   unsigned num = ra_interval_get_num(interval);
+   assign_reg(phi, phi->dsts[0], num);
 
-       /* Assign the parallelcopy sources of this phi */
-       for (unsigned i = 0; i < phi->srcs_count; i++) {
-               if (phi->srcs[i]->def) {
-                       assign_reg(phi, phi->srcs[i], num);
-                       assign_reg(phi, phi->srcs[i]->def, num);
-               }
-       }
+   /* Assign the parallelcopy sources of this phi */
+   for (unsigned i = 0; i < phi->srcs_count; i++) {
+      if (phi->srcs[i]->def) {
+         assign_reg(phi, phi->srcs[i], num);
+         assign_reg(phi, phi->srcs[i]->def, num);
+      }
+   }
 
-       if (phi->dsts[0]->flags & IR3_REG_UNUSED)
-               ra_file_remove(file, interval);
+   if (phi->dsts[0]->flags & IR3_REG_UNUSED)
+      ra_file_remove(file, interval);
 }
 
 /* When we split a live range, we sometimes need to emit fixup code at the end
@@ -1609,421 +1647,423 @@ assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
 
 static void
 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
-                                       struct ir3_register *reg)
-{
-       struct ir3_instruction *old_pcopy = NULL;
-       if (!list_is_empty(&block->instr_list)) {
-               struct ir3_instruction *last =
-                       LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
-               if (last->opc == OPC_META_PARALLEL_COPY)
-                       old_pcopy = last;
-       }
-
-       unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
-       struct ir3_instruction *pcopy =
-               ir3_instr_create(block, OPC_META_PARALLEL_COPY,
-                                                old_pcopy_srcs + 1, old_pcopy_srcs + 1);
-
-       for (unsigned i = 0; i < old_pcopy_srcs; i++) {
-               old_pcopy->dsts[i]->instr = pcopy;
-               pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
-       }
-
-       struct ir3_register *dst_reg =
-               ir3_dst_create(pcopy, INVALID_REG,
-                                          reg->flags & ~IR3_REG_SSA);
-       dst_reg->wrmask = reg->wrmask;
-       dst_reg->size = reg->size;
-       assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
-
-       for (unsigned i = 0; i < old_pcopy_srcs; i++) {
-               pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
-       }
-
-       struct ir3_register *src_reg =
-               ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
-       src_reg->wrmask = reg->wrmask;
-       src_reg->size = reg->size;
-       assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
-
-       if (old_pcopy)
-               list_del(&old_pcopy->node);
+                    struct ir3_register *reg)
+{
+   struct ir3_instruction *old_pcopy = NULL;
+   if (!list_is_empty(&block->instr_list)) {
+      struct ir3_instruction *last =
+         LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
+      if (last->opc == OPC_META_PARALLEL_COPY)
+         old_pcopy = last;
+   }
+
+   unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
+   struct ir3_instruction *pcopy = ir3_instr_create(
+      block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
+
+   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
+      old_pcopy->dsts[i]->instr = pcopy;
+      pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
+   }
+
+   struct ir3_register *dst_reg =
+      ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
+   dst_reg->wrmask = reg->wrmask;
+   dst_reg->size = reg->size;
+   assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
+
+   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
+      pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
+   }
+
+   struct ir3_register *src_reg =
+      ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
+   src_reg->wrmask = reg->wrmask;
+   src_reg->size = reg->size;
+   assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
+
+   if (old_pcopy)
+      list_del(&old_pcopy->node);
 }
 
 static void
 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
 {
-       physreg_t physreg = ra_interval_get_physreg(interval);
-       
-       bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
-       struct ir3_block **predecessors =
-               shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
-       unsigned predecessors_count =
-               shared ? ctx->block->physical_predecessors_count : ctx->block->predecessors_count;
-
-       for (unsigned i = 0; i < predecessors_count; i++) {
-               struct ir3_block *pred = predecessors[i];
-               struct ra_block_state *pred_state = &ctx->blocks[pred->index];
-
-               if (!pred_state->visited)
-                       continue;
-
-               physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
-               if (pred_reg != physreg) {
-                       insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
-
-                       /* This is a bit tricky, but when visiting the destination of a
-                        * physical-only edge, we have two predecessors (the if and the
-                        * header block) and both have multiple successors. We pick the
-                        * register for all live-ins from the normal edge, which should
-                        * guarantee that there's no need for shuffling things around in
-                        * the normal predecessor as long as there are no phi nodes, but
-                        * we still may need to insert fixup code in the physical
-                        * predecessor (i.e. the last block of the if) and that has
-                        * another successor (the block after the if) so we need to update
-                        * the renames state for when we process the other successor. This
-                        * crucially depends on the other successor getting processed
-                        * after this.
-                        *
-                        * For normal (non-physical) edges we disallow critical edges so
-                        * that hacks like this aren't necessary.
-                        */
-                       if (!pred_state->renames)
-                               pred_state->renames = _mesa_pointer_hash_table_create(ctx);
-                       _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
-                                                                       (void *)(uintptr_t)physreg);
-               }
-       }
+   physreg_t physreg = ra_interval_get_physreg(interval);
+
+   bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
+   struct ir3_block **predecessors =
+      shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
+   unsigned predecessors_count = shared
+                                    ? ctx->block->physical_predecessors_count
+                                    : ctx->block->predecessors_count;
+
+   for (unsigned i = 0; i < predecessors_count; i++) {
+      struct ir3_block *pred = predecessors[i];
+      struct ra_block_state *pred_state = &ctx->blocks[pred->index];
+
+      if (!pred_state->visited)
+         continue;
+
+      physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
+      if (pred_reg != physreg) {
+         insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
+
+         /* This is a bit tricky, but when visiting the destination of a
+          * physical-only edge, we have two predecessors (the if and the
+          * header block) and both have multiple successors. We pick the
+          * register for all live-ins from the normal edge, which should
+          * guarantee that there's no need for shuffling things around in
+          * the normal predecessor as long as there are no phi nodes, but
+          * we still may need to insert fixup code in the physical
+          * predecessor (i.e. the last block of the if) and that has
+          * another successor (the block after the if) so we need to update
+          * the renames state for when we process the other successor. This
+          * crucially depends on the other successor getting processed
+          * after this.
+          *
+          * For normal (non-physical) edges we disallow critical edges so
+          * that hacks like this aren't necessary.
+          */
+         if (!pred_state->renames)
+            pred_state->renames = _mesa_pointer_hash_table_create(ctx);
+         _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
+                                 (void *)(uintptr_t)physreg);
+      }
+   }
 }
 
 static void
 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
 {
-       BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
-       rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
-               /* Skip phi nodes. This needs to happen after phi nodes are allocated,
-                * because we may have to move live-ins around to make space for phi
-                * nodes, but we shouldn't be handling phi nodes here.
-                */
-               if (BITSET_TEST(live_in, interval->interval.reg->name))
-                       insert_live_in_move(ctx, interval);
-       }
+   BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
+   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
+                    physreg_node) {
+      /* Skip phi nodes. This needs to happen after phi nodes are allocated,
+       * because we may have to move live-ins around to make space for phi
+       * nodes, but we shouldn't be handling phi nodes here.
+       */
+      if (BITSET_TEST(live_in, interval->interval.reg->name))
+         insert_live_in_move(ctx, interval);
+   }
 }
 
 static void
 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
 {
-       rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
-               _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
-                               (void *)(uintptr_t)interval->physreg_start);
-       }
+   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
+                    physreg_node) {
+      _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
+                              (void *)(uintptr_t)interval->physreg_start);
+   }
 }
 
 static void
 insert_live_in_moves(struct ra_ctx *ctx)
 {
-       insert_file_live_in_moves(ctx, &ctx->full);
-       insert_file_live_in_moves(ctx, &ctx->half);
-       insert_file_live_in_moves(ctx, &ctx->shared);
-
-       /* If not all predecessors are visited, insert live-in regs so that
-        * insert_live_out_moves() will work.
-        */
-       bool all_preds_visited = true;
-       for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
-               if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
-                       all_preds_visited = false;
-                       break;
-               }
-       }
-
-       if (!all_preds_visited) {
-               struct ra_block_state *state = &ctx->blocks[ctx->block->index];
-               state->entry_regs = _mesa_pointer_hash_table_create(ctx);
-               
-               insert_entry_regs(state, &ctx->full);
-               insert_entry_regs(state, &ctx->half);
-               insert_entry_regs(state, &ctx->shared);
-       }
+   insert_file_live_in_moves(ctx, &ctx->full);
+   insert_file_live_in_moves(ctx, &ctx->half);
+   insert_file_live_in_moves(ctx, &ctx->shared);
+
+   /* If not all predecessors are visited, insert live-in regs so that
+    * insert_live_out_moves() will work.
+    */
+   bool all_preds_visited = true;
+   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
+      if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
+         all_preds_visited = false;
+         break;
+      }
+   }
+
+   if (!all_preds_visited) {
+      struct ra_block_state *state = &ctx->blocks[ctx->block->index];
+      state->entry_regs = _mesa_pointer_hash_table_create(ctx);
+
+      insert_entry_regs(state, &ctx->full);
+      insert_entry_regs(state, &ctx->half);
+      insert_entry_regs(state, &ctx->shared);
+   }
 }
 
 static void
 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
 {
-       for (unsigned i = 0; i < 2; i++) {
-               if (!ctx->block->successors[i])
-                       continue;
+   for (unsigned i = 0; i < 2; i++) {
+      if (!ctx->block->successors[i])
+         continue;
 
-               struct ir3_block *succ = ctx->block->successors[i];
-               struct ra_block_state *succ_state = &ctx->blocks[succ->index];
+      struct ir3_block *succ = ctx->block->successors[i];
+      struct ra_block_state *succ_state = &ctx->blocks[succ->index];
 
-               if (!succ_state->visited)
-                       continue;
+      if (!succ_state->visited)
+         continue;
 
-               struct hash_entry *entry =
-                       _mesa_hash_table_search(succ_state->entry_regs, interval->interval.reg);
-               if (!entry)
-                       continue;
+      struct hash_entry *entry = _mesa_hash_table_search(
+         succ_state->entry_regs, interval->interval.reg);
+      if (!entry)
+         continue;
 
-               physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
-               if (new_reg != interval->physreg_start) {
-                       insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
-                                                               interval->interval.reg);
-               }
-       }
+      physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
+      if (new_reg != interval->physreg_start) {
+         insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
+                             interval->interval.reg);
+      }
+   }
 }
 
 static void
 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
 {
-       rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
-               insert_live_out_move(ctx, interval);
-       }
+   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
+                    physreg_node) {
+      insert_live_out_move(ctx, interval);
+   }
 }
 
 static void
 insert_live_out_moves(struct ra_ctx *ctx)
 {
-       insert_file_live_out_moves(ctx, &ctx->full);
-       insert_file_live_out_moves(ctx, &ctx->half);
-       insert_file_live_out_moves(ctx, &ctx->shared);
+   insert_file_live_out_moves(ctx, &ctx->full);
+   insert_file_live_out_moves(ctx, &ctx->half);
+   insert_file_live_out_moves(ctx, &ctx->shared);
 }
 
 static void
 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
 {
-       ctx->block = block;
-
-       /* Reset the register files from the last block */
-       ra_file_init(&ctx->full);
-       ra_file_init(&ctx->half);
-       ra_file_init(&ctx->shared);
-
-       bool unreachable = false;
-       if (block != ir3_start_block(ctx->ir)) {
-               unreachable = true;
-               for (unsigned i = 0; i < block->predecessors_count; i++) {
-                       struct ra_block_state *pred_state =
-                               &ctx->blocks[block->predecessors[i]->index];
-                       if (!pred_state->logical_unreachable) {
-                               unreachable = false;
-                               break;
-                       }
-               }
-       }
-
-       ctx->blocks[block->index].logical_unreachable = unreachable;
-
-       /* Handle live-ins, phis, and input meta-instructions. These all appear
-        * live at the beginning of the block, and interfere with each other
-        * therefore need to be allocated "in parallel". This means that we
-        * have to allocate all of them, inserting them into the file, and then
-        * delay updating the IR until all of them are allocated.
-        *
-        * Handle precolored inputs first, because we need to make sure that other
-        * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
-        * and inputs at the same time, because the first block doesn't have
-        * predecessors. Therefore handle_live_in doesn't have to worry about
-        * them.
-        */
-
-       foreach_instr (instr, &block->instr_list) {
-               if (instr->opc == OPC_META_INPUT)
-                       handle_precolored_input(ctx, instr);
-               else
-                       break;
-       }
-
-       unsigned name;
-       BITSET_FOREACH_SET(name, ctx->live->live_in[block->index],
-                                          ctx->live->definitions_count) {
-               struct ir3_register *reg = ctx->live->definitions[name];
-               if (unreachable && !(reg->flags & IR3_REG_SHARED))
-                       continue;
-               handle_live_in(ctx, reg);
-       }
-
-       foreach_instr (instr, &block->instr_list) {
-               if (instr->opc == OPC_META_PHI)
-                       handle_phi(ctx, instr->dsts[0]);
-               else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH)
-                       handle_input(ctx, instr);
-               else
-                       break;
-       }
-
-       /* After this point, every live-in/phi/input has an interval assigned to
-        * it. We delay actually assigning values until everything has been
-        * allocated, so we can simply ignore any parallel copy entries created
-        * when shuffling them around.
-        */
-       ctx->parallel_copies_count = 0;
-
-       insert_live_in_moves(ctx);
-
-       if (RA_DEBUG) {
-               printf("after live-in block %u:\n", block->index);
-               ra_ctx_dump(ctx);
-       }
-
-       /* Now we're done with processing live-ins, and can handle the body of the
-        * block.
-        */
-       foreach_instr (instr, &block->instr_list) {
-               if (RA_DEBUG) {
-                       printf("processing: ");
-                       ir3_print_instr(instr);
-               }
-
-               if (instr->opc == OPC_META_PHI)
-                       assign_phi(ctx, instr);
-               else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH)
-                       assign_input(ctx, instr);
-               else if (instr->opc == OPC_META_SPLIT)
-                       handle_split(ctx, instr);
-               else if (instr->opc == OPC_META_COLLECT)
-                       handle_collect(ctx, instr);
-               else if (instr->opc == OPC_META_PARALLEL_COPY)
-                       handle_pcopy(ctx, instr);
-               else if (instr->opc == OPC_CHMASK)
-                       handle_chmask(ctx, instr);
-               else
-                       handle_normal_instr(ctx, instr);
-
-               if (RA_DEBUG)
-                       ra_ctx_dump(ctx);
-       }
-
-       insert_live_out_moves(ctx);
-
-       BITSET_FOREACH_SET(name, ctx->live->live_out[block->index],
-                                          ctx->live->definitions_count) {
-               struct ir3_register *reg = ctx->live->definitions[name];
-               handle_live_out(ctx, reg);
-       }
-
-       ctx->blocks[block->index].visited = true;
+   ctx->block = block;
+
+   /* Reset the register files from the last block */
+   ra_file_init(&ctx->full);
+   ra_file_init(&ctx->half);
+   ra_file_init(&ctx->shared);
+
+   bool unreachable = false;
+   if (block != ir3_start_block(ctx->ir)) {
+      unreachable = true;
+      for (unsigned i = 0; i < block->predecessors_count; i++) {
+         struct ra_block_state *pred_state =
+            &ctx->blocks[block->predecessors[i]->index];
+         if (!pred_state->logical_unreachable) {
+            unreachable = false;
+            break;
+         }
+      }
+   }
+
+   ctx->blocks[block->index].logical_unreachable = unreachable;
+
+   /* Handle live-ins, phis, and input meta-instructions. These all appear
+    * live at the beginning of the block and interfere with each other, so
+    * they need to be allocated "in parallel". This means that we
+    * have to allocate all of them, inserting them into the file, and then
+    * delay updating the IR until all of them are allocated.
+    *
+    * Handle precolored inputs first, because we need to make sure that other
+    * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
+    * and inputs at the same time, because the first block doesn't have
+    * predecessors. Therefore handle_live_in doesn't have to worry about
+    * them.
+    */
+
+   foreach_instr (instr, &block->instr_list) {
+      if (instr->opc == OPC_META_INPUT)
+         handle_precolored_input(ctx, instr);
+      else
+         break;
+   }
+
+   unsigned name;
+   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
+                       ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      if (unreachable && !(reg->flags & IR3_REG_SHARED))
+         continue;
+      handle_live_in(ctx, reg);
+   }
+
+   foreach_instr (instr, &block->instr_list) {
+      if (instr->opc == OPC_META_PHI)
+         handle_phi(ctx, instr->dsts[0]);
+      else if (instr->opc == OPC_META_INPUT ||
+               instr->opc == OPC_META_TEX_PREFETCH)
+         handle_input(ctx, instr);
+      else
+         break;
+   }
+
+   /* After this point, every live-in/phi/input has an interval assigned to
+    * it. We delay actually assigning values until everything has been
+    * allocated, so we can simply ignore any parallel copy entries created
+    * when shuffling them around.
+    */
+   ctx->parallel_copies_count = 0;
+
+   insert_live_in_moves(ctx);
+
+   if (RA_DEBUG) {
+      printf("after live-in block %u:\n", block->index);
+      ra_ctx_dump(ctx);
+   }
+
+   /* Now we're done with processing live-ins, and can handle the body of the
+    * block.
+    */
+   foreach_instr (instr, &block->instr_list) {
+      if (RA_DEBUG) {
+         printf("processing: ");
+         ir3_print_instr(instr);
+      }
+
+      if (instr->opc == OPC_META_PHI)
+         assign_phi(ctx, instr);
+      else if (instr->opc == OPC_META_INPUT ||
+               instr->opc == OPC_META_TEX_PREFETCH)
+         assign_input(ctx, instr);
+      else if (instr->opc == OPC_META_SPLIT)
+         handle_split(ctx, instr);
+      else if (instr->opc == OPC_META_COLLECT)
+         handle_collect(ctx, instr);
+      else if (instr->opc == OPC_META_PARALLEL_COPY)
+         handle_pcopy(ctx, instr);
+      else if (instr->opc == OPC_CHMASK)
+         handle_chmask(ctx, instr);
+      else
+         handle_normal_instr(ctx, instr);
+
+      if (RA_DEBUG)
+         ra_ctx_dump(ctx);
+   }
+
+   insert_live_out_moves(ctx);
+
+   BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
+                       ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      handle_live_out(ctx, reg);
+   }
+
+   ctx->blocks[block->index].visited = true;
 }
 
 static unsigned
 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
 {
-       /* Registers are allocated in units of vec4, so switch from units of
-        * half-regs to vec4.
-        */
-       unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
+   /* Registers are allocated in units of vec4, so switch from units of
+    * half-regs to vec4.
+    */
+   unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
 
-       bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
+   bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
 
-       unsigned target = reg_count;
-       unsigned reg_independent_max_waves =
-               ir3_get_reg_independent_max_waves(v, double_threadsize);
-       unsigned reg_dependent_max_waves =
-               ir3_get_reg_dependent_max_waves(v->shader->compiler, reg_count,
-                               double_threadsize);
-       unsigned target_waves =
-               MIN2(reg_independent_max_waves, reg_dependent_max_waves);
+   unsigned target = reg_count;
+   unsigned reg_independent_max_waves =
+      ir3_get_reg_independent_max_waves(v, double_threadsize);
+   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
+      v->shader->compiler, reg_count, double_threadsize);
+   unsigned target_waves =
+      MIN2(reg_independent_max_waves, reg_dependent_max_waves);
 
-       while (target <= RA_FULL_SIZE / (2 * 4) &&
-                  ir3_should_double_threadsize(v, target) == double_threadsize &&
-                  ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
-                                                                                  double_threadsize) >= target_waves)
-               target++;
+   while (target <= RA_FULL_SIZE / (2 * 4) &&
+          ir3_should_double_threadsize(v, target) == double_threadsize &&
+          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
+                                          double_threadsize) >= target_waves)
+      target++;
 
-       return (target - 1) * 2 * 4;
+   return (target - 1) * 2 * 4;
 }
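
For concreteness, pressure is tracked in half-register (16-bit) units, so a peak
pressure of 37 halves needs DIV_ROUND_UP(37, 2 * 4) = 5 vec4 registers, and the
result is converted back with the same factor. A minimal sketch of just that
arithmetic (illustrative only; example_pressure_units is not part of the tree,
DIV_ROUND_UP comes from util/u_math.h):

   #include <assert.h>

   /* Mirrors the unit conversion in calc_target_full_pressure():
    * half-register units -> vec4 units and back.
    */
   static void
   example_pressure_units(void)
   {
      unsigned pressure_halves = 37;
      unsigned vec4s = DIV_ROUND_UP(pressure_halves, 2 * 4);
      assert(vec4s == 5);
      assert(vec4s * 2 * 4 == 40); /* rounded back up to whole vec4s, in halves */
   }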
 
 int
 ir3_ra(struct ir3_shader_variant *v)
 {
-       ir3_calc_dominance(v->ir);
+   ir3_calc_dominance(v->ir);
 
-       ir3_create_parallel_copies(v->ir);
+   ir3_create_parallel_copies(v->ir);
 
-       struct ir3_liveness *live = ir3_calc_liveness(v);
+   struct ir3_liveness *live = ir3_calc_liveness(v);
 
-       ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
+   ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
 
-       ir3_merge_regs(live, v->ir);
+   ir3_merge_regs(live, v->ir);
 
-       struct ir3_pressure max_pressure;
-       ir3_calc_pressure(v, live, &max_pressure);
-       d("max pressure:");
-       d("\tfull: %u", max_pressure.full);
-       d("\thalf: %u", max_pressure.half);
-       d("\tshared: %u", max_pressure.shared);
+   struct ir3_pressure max_pressure;
+   ir3_calc_pressure(v, live, &max_pressure);
+   d("max pressure:");
+   d("\tfull: %u", max_pressure.full);
+   d("\thalf: %u", max_pressure.half);
+   d("\tshared: %u", max_pressure.shared);
 
-       if (v->mergedregs) {
-               max_pressure.full += max_pressure.half;
-               max_pressure.half = 0;
-       }
+   if (v->mergedregs) {
+      max_pressure.full += max_pressure.half;
+      max_pressure.half = 0;
+   }
 
-       if (max_pressure.full > RA_FULL_SIZE ||
-               max_pressure.half > RA_HALF_SIZE ||
-               max_pressure.shared > RA_SHARED_SIZE) {
-               d("max pressure exceeded!");
-               return 1;
-       }
+   if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE ||
+       max_pressure.shared > RA_SHARED_SIZE) {
+      d("max pressure exceeded!");
+      return 1;
+   }
 
-       struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
+   struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
 
-       ctx->ir = v->ir;
-       ctx->merged_regs = v->mergedregs;
-       ctx->compiler = v->shader->compiler;
-       ctx->stage = v->type;
-       ctx->live = live;
-       ctx->intervals = rzalloc_array(ctx, struct ra_interval, live->definitions_count);
-       ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
+   ctx->ir = v->ir;
+   ctx->merged_regs = v->mergedregs;
+   ctx->compiler = v->shader->compiler;
+   ctx->stage = v->type;
+   ctx->live = live;
+   ctx->intervals =
+      rzalloc_array(ctx, struct ra_interval, live->definitions_count);
+   ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
 
-       ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
-       d("full size: %u", ctx->full.size);
-       
-       if (!v->mergedregs)
-               ctx->half.size = RA_HALF_SIZE;
+   ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
+   d("full size: %u", ctx->full.size);
 
-       ctx->shared.size = RA_SHARED_SIZE;
+   if (!v->mergedregs)
+      ctx->half.size = RA_HALF_SIZE;
 
-       foreach_block (block, &v->ir->block_list)
-               handle_block(ctx, block);
+   ctx->shared.size = RA_SHARED_SIZE;
 
-       ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
+   foreach_block (block, &v->ir->block_list)
+      handle_block(ctx, block);
 
-       /* Strip array-ness and SSA-ness at the end, because various helpers still
-        * need to work even on definitions that have already been assigned. For
-        * example, we need to preserve array-ness so that array live-ins have the
-        * right size.
-        */
-       foreach_block (block, &v->ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       for (unsigned i = 0; i < instr->dsts_count; i++) {
-                               instr->dsts[i]->flags &= ~IR3_REG_SSA;
+   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
 
-                               /* Parallel copies of array registers copy the whole register,
-                                * and we need some way to let the parallel copy code know
-                                * that this was an array whose size is determined by
-                                * reg->size. So keep the array flag on those.
-                                */
-                               if (!is_meta(instr))
-                                       instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
-                       }
+   /* Strip array-ness and SSA-ness at the end, because various helpers still
+    * need to work even on definitions that have already been assigned. For
+    * example, we need to preserve array-ness so that array live-ins have the
+    * right size.
+    */
+   foreach_block (block, &v->ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         for (unsigned i = 0; i < instr->dsts_count; i++) {
+            instr->dsts[i]->flags &= ~IR3_REG_SSA;
 
-                       for (unsigned i = 0; i < instr->srcs_count; i++) {
-                               instr->srcs[i]->flags &= ~IR3_REG_SSA;
+            /* Parallel copies of array registers copy the whole register,
+             * and we need some way to let the parallel copy code know
+             * that this was an array whose size is determined by
+             * reg->size. So keep the array flag on those.
+             */
+            if (!is_meta(instr))
+               instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
+         }
 
-                               if (!is_meta(instr))
-                                       instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
-                       }
-               }
-       }
+         for (unsigned i = 0; i < instr->srcs_count; i++) {
+            instr->srcs[i]->flags &= ~IR3_REG_SSA;
 
-       ir3_debug_print(v->ir, "AFTER: register allocation");
+            if (!is_meta(instr))
+               instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
+         }
+      }
+   }
 
-       ir3_lower_copies(v);
+   ir3_debug_print(v->ir, "AFTER: register allocation");
 
-       ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
+   ir3_lower_copies(v);
 
-       ralloc_free(ctx);
-       ralloc_free(live);
-       return 0;
-}
+   ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
 
+   ralloc_free(ctx);
+   ralloc_free(live);
+   return 0;
+}
index 688795d..98533a3 100644 (file)
 #ifndef _IR3_RA_H
 #define _IR3_RA_H
 
+#include "util/rb_tree.h"
 #include "ir3.h"
 #include "ir3_compiler.h"
-#include "util/rb_tree.h"
 
 #ifdef DEBUG
 #define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS)
 #else
 #define RA_DEBUG 0
 #endif
-#define d(fmt, ...) do { if (RA_DEBUG) { \
-       printf("RA: "fmt"\n", ##__VA_ARGS__); \
-} } while (0)
-
-#define di(instr, fmt, ...) do { if (RA_DEBUG) { \
-       printf("RA: "fmt": ", ##__VA_ARGS__); \
-       ir3_print_instr(instr); \
-} } while (0)
+#define d(fmt, ...)                                                            \
+   do {                                                                        \
+      if (RA_DEBUG) {                                                          \
+         printf("RA: " fmt "\n", ##__VA_ARGS__);                               \
+      }                                                                        \
+   } while (0)
+
+#define di(instr, fmt, ...)                                                    \
+   do {                                                                        \
+      if (RA_DEBUG) {                                                          \
+         printf("RA: " fmt ": ", ##__VA_ARGS__);                               \
+         ir3_print_instr(instr);                                               \
+      }                                                                        \
+   } while (0)
 
 typedef uint16_t physreg_t;
 
 static inline unsigned
 ra_physreg_to_num(physreg_t physreg, unsigned flags)
 {
-       if (!(flags & IR3_REG_HALF))
-               physreg /= 2;
-       if (flags & IR3_REG_SHARED)
-               physreg += 48 * 4;
-       return physreg;
+   if (!(flags & IR3_REG_HALF))
+      physreg /= 2;
+   if (flags & IR3_REG_SHARED)
+      physreg += 48 * 4;
+   return physreg;
 }
 
 static inline physreg_t
 ra_num_to_physreg(unsigned num, unsigned flags)
 {
-       if (flags & IR3_REG_SHARED)
-               num -= 48 * 4;
-       if (!(flags & IR3_REG_HALF))
-               num *= 2;
-       return num;
+   if (flags & IR3_REG_SHARED)
+      num -= 48 * 4;
+   if (!(flags & IR3_REG_HALF))
+      num *= 2;
+   return num;
 }
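
To make the mapping above concrete: physreg_t counts 16-bit halves, so a full
register r5 starts at physreg 10, and shared registers live past r48 (48 * 4
components) in the register-number space. A minimal round-trip sketch, assuming
only the two helpers above (example_physreg_roundtrip is an illustrative name,
not a real helper):

   #include <assert.h>

   static void
   example_physreg_roundtrip(void)
   {
      /* a full (32-bit), non-shared register: r5.x */
      unsigned flags = 0;
      physreg_t p = ra_num_to_physreg(5, flags);
      assert(p == 10);                          /* two halves per full component */
      assert(ra_physreg_to_num(p, flags) == 5); /* and back again */
   }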
 
 static inline unsigned
 ra_reg_get_num(const struct ir3_register *reg)
 {
-       return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num;
+   return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num;
 }
 
 static inline physreg_t
 ra_reg_get_physreg(const struct ir3_register *reg)
 {
-       return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags);
+   return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags);
 }
 
 static inline bool
 def_is_gpr(const struct ir3_register *reg)
 {
-       return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
+   return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
 }
 
 /* Note: don't count undef as a source.
@@ -87,16 +93,14 @@ def_is_gpr(const struct ir3_register *reg)
 static inline bool
 ra_reg_is_src(const struct ir3_register *reg)
 {
-       return (reg->flags & IR3_REG_SSA) && reg->def &&
-               def_is_gpr(reg->def);
+   return (reg->flags & IR3_REG_SSA) && reg->def && def_is_gpr(reg->def);
 }
 
 static inline bool
 ra_reg_is_dst(const struct ir3_register *reg)
 {
-       return (reg->flags & IR3_REG_SSA) &&
-               def_is_gpr(reg) &&
-               ((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
+   return (reg->flags & IR3_REG_SSA) && def_is_gpr(reg) &&
+          ((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
 }
 
 /* Iterators for sources and destinations which:
@@ -105,53 +109,54 @@ ra_reg_is_dst(const struct ir3_register *reg)
  * - Consider array destinations as both a source and a destination
  */
 
-#define ra_foreach_src(__srcreg, __instr) \
-       for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
-               for (unsigned __cnt = (__instr)->srcs_count, __i = 0; __i < __cnt; __i++) \
-                       if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
-
-#define ra_foreach_src_rev(__srcreg, __instr) \
-       for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
-               for (int __cnt = (__instr)->srcs_count, __i = __cnt - 1; __i >= 0; __i--) \
-                       if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
-
-#define ra_foreach_dst(__dstreg, __instr) \
-       for (struct ir3_register *__dstreg = (void *)~0; __dstreg; __dstreg = NULL) \
-               for (unsigned __cnt = (__instr)->dsts_count, __i = 0; __i < __cnt; __i++) \
-                       if (ra_reg_is_dst((__dstreg = (__instr)->dsts[__i])))
-
-
-#define RA_HALF_SIZE (4 * 48)
-#define RA_FULL_SIZE (4 * 48 * 2)
-#define RA_SHARED_SIZE (2 * 4 * 8)
+#define ra_foreach_src(__srcreg, __instr)                                      \
+   for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
+      for (unsigned __cnt = (__instr)->srcs_count, __i = 0; __i < __cnt;       \
+           __i++)                                                              \
+         if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
+
+#define ra_foreach_src_rev(__srcreg, __instr)                                  \
+   for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
+      for (int __cnt = (__instr)->srcs_count, __i = __cnt - 1; __i >= 0;       \
+           __i--)                                                              \
+         if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
+
+#define ra_foreach_dst(__dstreg, __instr)                                      \
+   for (struct ir3_register *__dstreg = (void *)~0; __dstreg; __dstreg = NULL) \
+      for (unsigned __cnt = (__instr)->dsts_count, __i = 0; __i < __cnt;       \
+           __i++)                                                              \
+         if (ra_reg_is_dst((__dstreg = (__instr)->dsts[__i])))
+
+#define RA_HALF_SIZE     (4 * 48)
+#define RA_FULL_SIZE     (4 * 48 * 2)
+#define RA_SHARED_SIZE   (2 * 4 * 8)
 #define RA_MAX_FILE_SIZE RA_FULL_SIZE
 
 struct ir3_liveness {
-       unsigned block_count;
-       DECLARE_ARRAY(struct ir3_register *, definitions);
-       DECLARE_ARRAY(BITSET_WORD *, live_out);
-       DECLARE_ARRAY(BITSET_WORD *, live_in);
+   unsigned block_count;
+   DECLARE_ARRAY(struct ir3_register *, definitions);
+   DECLARE_ARRAY(BITSET_WORD *, live_out);
+   DECLARE_ARRAY(BITSET_WORD *, live_in);
 };
 
 struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v);
 
 bool ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
-                                               struct ir3_instruction *instr);
+                        struct ir3_instruction *instr);
 
 void ir3_create_parallel_copies(struct ir3 *ir);
 
 void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir);
 
 struct ir3_pressure {
-       unsigned full, half, shared;
+   unsigned full, half, shared;
 };
 
-void ir3_calc_pressure(struct ir3_shader_variant *v,
-                                          struct ir3_liveness *live,
-                                          struct ir3_pressure *max_pressure);
+void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
+                       struct ir3_pressure *max_pressure);
 
-void ir3_ra_validate(struct ir3_shader_variant *v,
-                                        unsigned full_size, unsigned half_size, unsigned block_count);
+void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
+                     unsigned half_size, unsigned block_count);
 
 void ir3_lower_copies(struct ir3_shader_variant *v);
 
@@ -176,91 +181,90 @@ void ir3_lower_copies(struct ir3_shader_variant *v);
  */
 
 struct ir3_reg_interval {
-       struct rb_node node;
+   struct rb_node node;
 
-       struct rb_tree children;
+   struct rb_tree children;
 
-       struct ir3_reg_interval *parent;
+   struct ir3_reg_interval *parent;
 
-       struct ir3_register *reg;
+   struct ir3_register *reg;
 
-       bool inserted;
+   bool inserted;
 };
 
 struct ir3_reg_ctx {
-       /* The tree of top-level intervals in the forest. */
-       struct rb_tree intervals;
-
-       /* Users of ir3_reg_ctx need to keep around additional state that is
-        * modified when top-level intervals are added or removed. For register
-        * pressure tracking, this is just the register pressure, but for RA we
-        * need to keep track of the physreg of each top-level interval. These
-        * callbacks provide a place to let users deriving from ir3_reg_ctx update
-        * their state when top-level intervals are inserted/removed.
-        */
-
-       /* Called when an interval is added and it turns out to be at the top
-        * level.
-        */
-       void (*interval_add)(struct ir3_reg_ctx *ctx,
-                                                struct ir3_reg_interval *interval);
-
-       /* Called when an interval is deleted from the top level. */
-       void (*interval_delete)(struct ir3_reg_ctx *ctx,
-                                                       struct ir3_reg_interval *interval);
-
-       /* Called when an interval is deleted and its child becomes top-level.
-        */
-       void (*interval_readd)(struct ir3_reg_ctx *ctx,
-                                                  struct ir3_reg_interval *parent,
-                                                  struct ir3_reg_interval *child);
+   /* The tree of top-level intervals in the forest. */
+   struct rb_tree intervals;
+
+   /* Users of ir3_reg_ctx need to keep around additional state that is
+    * modified when top-level intervals are added or removed. For register
+    * pressure tracking, this is just the register pressure, but for RA we
+    * need to keep track of the physreg of each top-level interval. These
+    * callbacks provide a place to let users deriving from ir3_reg_ctx update
+    * their state when top-level intervals are inserted/removed.
+    */
+
+   /* Called when an interval is added and it turns out to be at the top
+    * level.
+    */
+   void (*interval_add)(struct ir3_reg_ctx *ctx,
+                        struct ir3_reg_interval *interval);
+
+   /* Called when an interval is deleted from the top level. */
+   void (*interval_delete)(struct ir3_reg_ctx *ctx,
+                           struct ir3_reg_interval *interval);
+
+   /* Called when an interval is deleted and its child becomes top-level.
+    */
+   void (*interval_readd)(struct ir3_reg_ctx *ctx,
+                          struct ir3_reg_interval *parent,
+                          struct ir3_reg_interval *child);
 };
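
As a concrete instance of the pressure-tracking user mentioned in the comment
above, a context embedding ir3_reg_ctx could bump a counter in the add/delete
hooks. This is a hypothetical sketch: pressure_state and the two callbacks are
illustrative names, while ir3_reg_ctx, ir3_reg_interval and reg_size() are the
real types/helpers:

   struct pressure_state {
      struct ir3_reg_ctx ctx; /* must be first so the casts below are valid */
      unsigned cur_pressure;
   };

   static void
   pressure_interval_add(struct ir3_reg_ctx *ctx,
                         struct ir3_reg_interval *interval)
   {
      struct pressure_state *s = (struct pressure_state *)ctx;
      s->cur_pressure += reg_size(interval->reg);
   }

   static void
   pressure_interval_delete(struct ir3_reg_ctx *ctx,
                            struct ir3_reg_interval *interval)
   {
      struct pressure_state *s = (struct pressure_state *)ctx;
      s->cur_pressure -= reg_size(interval->reg);
   }

   /* interval_readd would add the child back, since deleting the parent
    * already subtracted the whole range it covered.
    */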
 
 static inline struct ir3_reg_interval *
 ir3_rb_node_to_interval(struct rb_node *node)
 {
-       return rb_node_data(struct ir3_reg_interval, node, node);
+   return rb_node_data(struct ir3_reg_interval, node, node);
 }
 
 static inline const struct ir3_reg_interval *
 ir3_rb_node_to_interval_const(const struct rb_node *node)
 {
-       return rb_node_data(struct ir3_reg_interval, node, node);
+   return rb_node_data(struct ir3_reg_interval, node, node);
 }
 
 static inline struct ir3_reg_interval *
 ir3_reg_interval_next(struct ir3_reg_interval *interval)
 {
-       struct rb_node *next = rb_node_next(&interval->node);
-       return next ? ir3_rb_node_to_interval(next) : NULL;
+   struct rb_node *next = rb_node_next(&interval->node);
+   return next ? ir3_rb_node_to_interval(next) : NULL;
 }
 
 static inline struct ir3_reg_interval *
 ir3_reg_interval_next_or_null(struct ir3_reg_interval *interval)
 {
-       return interval ? ir3_reg_interval_next(interval) : NULL;
+   return interval ? ir3_reg_interval_next(interval) : NULL;
 }
 
 static inline void
-ir3_reg_interval_init(struct ir3_reg_interval *interval, struct ir3_register *reg)
+ir3_reg_interval_init(struct ir3_reg_interval *interval,
+                      struct ir3_register *reg)
 {
-       rb_tree_init(&interval->children);
-       interval->reg = reg;
-       interval->parent = NULL;
-       interval->inserted = false;
+   rb_tree_init(&interval->children);
+   interval->reg = reg;
+   interval->parent = NULL;
+   interval->inserted = false;
 }
 
-void
-ir3_reg_interval_dump(struct ir3_reg_interval *interval);
+void ir3_reg_interval_dump(struct ir3_reg_interval *interval);
 
 void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
-                                                        struct ir3_reg_interval *interval);
+                             struct ir3_reg_interval *interval);
 
 void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
-                                                        struct ir3_reg_interval *interval);
+                             struct ir3_reg_interval *interval);
 
 void ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
-                                                                struct ir3_reg_interval *interval);
+                                 struct ir3_reg_interval *interval);
 
 #endif
-
index 070ddc1..aab2676 100644 (file)
  */
 
 #define UNKNOWN ((struct ir3_register *)NULL)
-#define UNDEF ((struct ir3_register *)(uintptr_t)1)
+#define UNDEF   ((struct ir3_register *)(uintptr_t)1)
 #define OVERDEF ((struct ir3_register *)(uintptr_t)2)
 
 struct reg_state {
-       struct ir3_register *def;
-       unsigned offset;
+   struct ir3_register *def;
+   unsigned offset;
 };
 
 struct file_state {
-       struct reg_state regs[RA_MAX_FILE_SIZE];
+   struct reg_state regs[RA_MAX_FILE_SIZE];
 };
 
 struct reaching_state {
-       struct file_state half, full, shared;
+   struct file_state half, full, shared;
 };
 
 struct ra_val_ctx {
-       struct ir3_instruction *current_instr;
+   struct ir3_instruction *current_instr;
 
-       struct reaching_state reaching;
-       struct reaching_state *block_reaching;
-       unsigned block_count;
+   struct reaching_state reaching;
+   struct reaching_state *block_reaching;
+   unsigned block_count;
 
-       unsigned full_size, half_size;
+   unsigned full_size, half_size;
 
-       bool merged_regs;
+   bool merged_regs;
 
-       bool failed;
+   bool failed;
 };
 
 static void
 validate_error(struct ra_val_ctx *ctx, const char *condstr)
 {
-       fprintf(stderr, "ra validation fail: %s\n", condstr);
-       fprintf(stderr, "  -> for instruction: ");
-       ir3_print_instr(ctx->current_instr);
-       abort();
+   fprintf(stderr, "ra validation fail: %s\n", condstr);
+   fprintf(stderr, "  -> for instruction: ");
+   ir3_print_instr(ctx->current_instr);
+   abort();
 }
 
-#define validate_assert(ctx, cond) do { \
-       if (!(cond)) { \
-               validate_error(ctx, #cond); \
-       } } while (0)
+#define validate_assert(ctx, cond)                                             \
+   do {                                                                        \
+      if (!(cond)) {                                                           \
+         validate_error(ctx, #cond);                                           \
+      }                                                                        \
+   } while (0)
 
 static unsigned
 get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg)
 {
-       if (reg->flags & IR3_REG_SHARED)
-               return RA_SHARED_SIZE;
-       else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
-               return ctx->full_size;
-       else
-               return ctx->half_size;
+   if (reg->flags & IR3_REG_SHARED)
+      return RA_SHARED_SIZE;
+   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
+      return ctx->full_size;
+   else
+      return ctx->half_size;
 }
 
 /* Validate simple things, like the registers being in-bounds. This way we
@@ -135,438 +137,434 @@ get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg)
 static void
 validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
 {
-       ctx->current_instr = instr;
-       ra_foreach_dst (dst, instr) {
-               unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst);
-               validate_assert(ctx, dst_max <= get_file_size(ctx, dst));
-               if (dst->tied)
-                       validate_assert(ctx, ra_reg_get_num(dst) == ra_reg_get_num(dst->tied));
-       }
-
-       ra_foreach_src (src, instr) {
-               unsigned src_max = ra_reg_get_physreg(src) + reg_size(src);
-               validate_assert(ctx, src_max <= get_file_size(ctx, src));
-       }
+   ctx->current_instr = instr;
+   ra_foreach_dst (dst, instr) {
+      unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst);
+      validate_assert(ctx, dst_max <= get_file_size(ctx, dst));
+      if (dst->tied)
+         validate_assert(ctx, ra_reg_get_num(dst) == ra_reg_get_num(dst->tied));
+   }
+
+   ra_foreach_src (src, instr) {
+      unsigned src_max = ra_reg_get_physreg(src) + reg_size(src);
+      validate_assert(ctx, src_max <= get_file_size(ctx, src));
+   }
 }
 
 /* This is the lattice operator. */
 static bool
 merge_reg(struct reg_state *dst, const struct reg_state *src)
 {
-       if (dst->def == UNKNOWN) {
-               *dst = *src;
-               return src->def != UNKNOWN;
-       } else if (dst->def == OVERDEF) {
-               return false;
-       } else {
-               if (src->def == UNKNOWN)
-                       return false;
-               else if (src->def == OVERDEF) {
-                       *dst = *src;
-                       return true;
-               } else {
-                       if (dst->def != src->def || dst->offset != src->offset) {
-                               dst->def = OVERDEF;
-                               dst->offset = 0;
-                               return true;
-                       } else {
-                               return false;
-                       }
-               }
-       }
+   if (dst->def == UNKNOWN) {
+      *dst = *src;
+      return src->def != UNKNOWN;
+   } else if (dst->def == OVERDEF) {
+      return false;
+   } else {
+      if (src->def == UNKNOWN)
+         return false;
+      else if (src->def == OVERDEF) {
+         *dst = *src;
+         return true;
+      } else {
+         if (dst->def != src->def || dst->offset != src->offset) {
+            dst->def = OVERDEF;
+            dst->offset = 0;
+            return true;
+         } else {
+            return false;
+         }
+      }
+   }
 }
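
merge_reg() above is a join over a three-level lattice: UNKNOWN (no information
yet) at the bottom, a concrete (def, offset) pair in the middle, and OVERDEF
(conflicting or partial definitions) at the top. The same rule written as a pure
function, purely as a sketch (join() is not a real helper; the in-tree version
also reports whether the destination changed so the fixed-point loop knows when
to stop):

   static struct reg_state
   join(struct reg_state a, struct reg_state b)
   {
      if (a.def == UNKNOWN)
         return b;
      if (b.def == UNKNOWN || a.def == OVERDEF)
         return a;
      if (b.def == OVERDEF)
         return b;
      if (a.def == b.def && a.offset == b.offset)
         return a;
      /* two different reaching definitions: collapse to "overdefined" */
      return (struct reg_state){.def = OVERDEF, .offset = 0};
   }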
 
 static bool
 merge_file(struct file_state *dst, const struct file_state *src, unsigned size)
 {
-       bool progress = false;
-       for (unsigned i = 0; i < size; i++)
-               progress |= merge_reg(&dst->regs[i], &src->regs[i]);
-       return progress;
+   bool progress = false;
+   for (unsigned i = 0; i < size; i++)
+      progress |= merge_reg(&dst->regs[i], &src->regs[i]);
+   return progress;
 }
 
 static bool
 merge_state(struct ra_val_ctx *ctx, struct reaching_state *dst,
-                       const struct reaching_state *src)
+            const struct reaching_state *src)
 {
-       bool progress = false;
-       progress |= merge_file(&dst->full, &src->full, ctx->full_size);
-       progress |= merge_file(&dst->half, &src->half, ctx->half_size);
-       return progress;
+   bool progress = false;
+   progress |= merge_file(&dst->full, &src->full, ctx->full_size);
+   progress |= merge_file(&dst->half, &src->half, ctx->half_size);
+   return progress;
 }
 
 static bool
 merge_state_physical(struct ra_val_ctx *ctx, struct reaching_state *dst,
-                                        const struct reaching_state *src)
+                     const struct reaching_state *src)
 {
-       return merge_file(&dst->shared, &src->shared, RA_SHARED_SIZE);
+   return merge_file(&dst->shared, &src->shared, RA_SHARED_SIZE);
 }
 
 static struct file_state *
 ra_val_get_file(struct ra_val_ctx *ctx, struct ir3_register *reg)
 {
-       if (reg->flags & IR3_REG_SHARED)
-               return &ctx->reaching.shared;
-       else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
-               return &ctx->reaching.full;
-       else
-               return &ctx->reaching.half;
+   if (reg->flags & IR3_REG_SHARED)
+      return &ctx->reaching.shared;
+   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
+      return &ctx->reaching.full;
+   else
+      return &ctx->reaching.half;
 }
 
 static void
 propagate_normal_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
 {
-       ra_foreach_dst (dst, instr) {
-               struct file_state *file = ra_val_get_file(ctx, dst);
-               physreg_t physreg = ra_reg_get_physreg(dst);
-               for (unsigned i = 0; i < reg_size(dst); i++) {
-                       file->regs[physreg + i] = (struct reg_state) {
-                               .def = dst,
-                               .offset = i,
-                       };
-               }
-       }
+   ra_foreach_dst (dst, instr) {
+      struct file_state *file = ra_val_get_file(ctx, dst);
+      physreg_t physreg = ra_reg_get_physreg(dst);
+      for (unsigned i = 0; i < reg_size(dst); i++) {
+         file->regs[physreg + i] = (struct reg_state){
+            .def = dst,
+            .offset = i,
+         };
+      }
+   }
 }
 
 static void
 propagate_split(struct ra_val_ctx *ctx, struct ir3_instruction *split)
 {
-       struct ir3_register *dst = split->dsts[0];
-       struct ir3_register *src = split->srcs[0];
-       physreg_t dst_physreg = ra_reg_get_physreg(dst);
-       physreg_t src_physreg = ra_reg_get_physreg(src);
-       struct file_state *file = ra_val_get_file(ctx, dst);
-
-       unsigned offset = split->split.off * reg_elem_size(src);
-       for (unsigned i = 0; i < reg_elem_size(src); i++) {
-               file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i];
-       }
+   struct ir3_register *dst = split->dsts[0];
+   struct ir3_register *src = split->srcs[0];
+   physreg_t dst_physreg = ra_reg_get_physreg(dst);
+   physreg_t src_physreg = ra_reg_get_physreg(src);
+   struct file_state *file = ra_val_get_file(ctx, dst);
+
+   unsigned offset = split->split.off * reg_elem_size(src);
+   for (unsigned i = 0; i < reg_elem_size(src); i++) {
+      file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i];
+   }
 }
 
 static void
 propagate_collect(struct ra_val_ctx *ctx, struct ir3_instruction *collect)
 {
-       struct ir3_register *dst = collect->dsts[0];
-       physreg_t dst_physreg = ra_reg_get_physreg(dst);
-       struct file_state *file = ra_val_get_file(ctx, dst);
-
-       unsigned size = reg_size(dst);
-       struct reg_state srcs[size];
-
-       for (unsigned i = 0; i < collect->srcs_count; i++) {
-               struct ir3_register *src = collect->srcs[i];
-               unsigned dst_offset = i * reg_elem_size(dst);
-               for (unsigned j = 0; j < reg_elem_size(dst); j++) {
-                       if (!ra_reg_is_src(src)) {
-                               srcs[dst_offset + j] = (struct reg_state) {
-                                       .def = dst,
-                                       .offset = dst_offset + j,
-                               };
-                       } else {
-                               physreg_t src_physreg = ra_reg_get_physreg(src);
-                               srcs[dst_offset + j] = file->regs[src_physreg + j];
-                       }
-               }
-       }
-
-       for (unsigned i = 0; i < size; i++)
-               file->regs[dst_physreg + i] = srcs[i];
+   struct ir3_register *dst = collect->dsts[0];
+   physreg_t dst_physreg = ra_reg_get_physreg(dst);
+   struct file_state *file = ra_val_get_file(ctx, dst);
+
+   unsigned size = reg_size(dst);
+   struct reg_state srcs[size];
+
+   for (unsigned i = 0; i < collect->srcs_count; i++) {
+      struct ir3_register *src = collect->srcs[i];
+      unsigned dst_offset = i * reg_elem_size(dst);
+      for (unsigned j = 0; j < reg_elem_size(dst); j++) {
+         if (!ra_reg_is_src(src)) {
+            srcs[dst_offset + j] = (struct reg_state){
+               .def = dst,
+               .offset = dst_offset + j,
+            };
+         } else {
+            physreg_t src_physreg = ra_reg_get_physreg(src);
+            srcs[dst_offset + j] = file->regs[src_physreg + j];
+         }
+      }
+   }
+
+   for (unsigned i = 0; i < size; i++)
+      file->regs[dst_physreg + i] = srcs[i];
 }
 
 static void
 propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy)
 {
-       unsigned size = 0;
-       for (unsigned i = 0; i < pcopy->dsts_count; i++) {
-               size += reg_size(pcopy->srcs[i]);
-       }
-
-       struct reg_state srcs[size];
-
-       unsigned offset = 0;
-       for (unsigned i = 0; i < pcopy->srcs_count; i++) {
-               struct ir3_register *dst = pcopy->dsts[i];
-               struct ir3_register *src = pcopy->srcs[i];
-               struct file_state *file = ra_val_get_file(ctx, dst);
-
-               for (unsigned j = 0; j < reg_size(dst); j++) {
-                       if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
-                               srcs[offset + j] = (struct reg_state) {
-                                       .def = dst,
-                                       .offset = j,
-                               };
-                       } else {
-                               physreg_t src_physreg = ra_reg_get_physreg(src);
-                               srcs[offset + j] = file->regs[src_physreg + j];
-                       }
-               }
-
-               offset += reg_size(dst);
-       }
-       assert(offset == size);
-
-       offset = 0;
-       for (unsigned i = 0; i < pcopy->dsts_count; i++) {
-               struct ir3_register *dst = pcopy->dsts[i];
-               physreg_t dst_physreg = ra_reg_get_physreg(dst);
-               struct file_state *file = ra_val_get_file(ctx, dst);
-
-               for (unsigned j = 0; j < reg_size(dst); j++)
-                       file->regs[dst_physreg + j] = srcs[offset + j];
-
-               offset += reg_size(dst);
-       }
-       assert(offset == size);
+   unsigned size = 0;
+   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
+      size += reg_size(pcopy->srcs[i]);
+   }
+
+   struct reg_state srcs[size];
+
+   unsigned offset = 0;
+   for (unsigned i = 0; i < pcopy->srcs_count; i++) {
+      struct ir3_register *dst = pcopy->dsts[i];
+      struct ir3_register *src = pcopy->srcs[i];
+      struct file_state *file = ra_val_get_file(ctx, dst);
+
+      for (unsigned j = 0; j < reg_size(dst); j++) {
+         if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
+            srcs[offset + j] = (struct reg_state){
+               .def = dst,
+               .offset = j,
+            };
+         } else {
+            physreg_t src_physreg = ra_reg_get_physreg(src);
+            srcs[offset + j] = file->regs[src_physreg + j];
+         }
+      }
+
+      offset += reg_size(dst);
+   }
+   assert(offset == size);
+
+   offset = 0;
+   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
+      struct ir3_register *dst = pcopy->dsts[i];
+      physreg_t dst_physreg = ra_reg_get_physreg(dst);
+      struct file_state *file = ra_val_get_file(ctx, dst);
+
+      for (unsigned j = 0; j < reg_size(dst); j++)
+         file->regs[dst_physreg + j] = srcs[offset + j];
+
+      offset += reg_size(dst);
+   }
+   assert(offset == size);
 }
 
 static void
 propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (instr->opc == OPC_META_SPLIT)
-               propagate_split(ctx, instr);
-       else if (instr->opc == OPC_META_COLLECT)
-               propagate_collect(ctx, instr);
-       else if (instr->opc == OPC_META_PARALLEL_COPY)
-               propagate_parallelcopy(ctx, instr);
-       else
-               propagate_normal_instr(ctx, instr);
+   if (instr->opc == OPC_META_SPLIT)
+      propagate_split(ctx, instr);
+   else if (instr->opc == OPC_META_COLLECT)
+      propagate_collect(ctx, instr);
+   else if (instr->opc == OPC_META_PARALLEL_COPY)
+      propagate_parallelcopy(ctx, instr);
+   else
+      propagate_normal_instr(ctx, instr);
 }
 
 static bool
 propagate_block(struct ra_val_ctx *ctx, struct ir3_block *block)
 {
-       ctx->reaching = ctx->block_reaching[block->index];
-
-       foreach_instr (instr, &block->instr_list) {
-               propagate_instr(ctx, instr);
-       }
-
-       bool progress = false;
-       for (unsigned i = 0; i < 2; i++) {
-               struct ir3_block *succ = block->successors[i];
-               if (!succ)
-                       continue;
-               progress |= merge_state(ctx,
-                                                               &ctx->block_reaching[succ->index],
-                                                               &ctx->reaching);
-       }
-       for (unsigned i = 0; i < 2; i++) {
-               struct ir3_block *succ = block->physical_successors[i];
-               if (!succ)
-                       continue;
-               progress |= merge_state_physical(ctx,
-                                                                                &ctx->block_reaching[succ->index],
-                                                                                &ctx->reaching);
-       }
-       return progress;
+   ctx->reaching = ctx->block_reaching[block->index];
+
+   foreach_instr (instr, &block->instr_list) {
+      propagate_instr(ctx, instr);
+   }
+
+   bool progress = false;
+   for (unsigned i = 0; i < 2; i++) {
+      struct ir3_block *succ = block->successors[i];
+      if (!succ)
+         continue;
+      progress |=
+         merge_state(ctx, &ctx->block_reaching[succ->index], &ctx->reaching);
+   }
+   for (unsigned i = 0; i < 2; i++) {
+      struct ir3_block *succ = block->physical_successors[i];
+      if (!succ)
+         continue;
+      progress |= merge_state_physical(ctx, &ctx->block_reaching[succ->index],
+                                       &ctx->reaching);
+   }
+   return progress;
 }
 
 static void
 chase_definition(struct reg_state *state)
 {
-       while (true) {
-               struct ir3_instruction *instr = state->def->instr;
-               switch (instr->opc) {
-               case OPC_META_SPLIT: {
-                       struct ir3_register *new_def = instr->srcs[0]->def;
-                       unsigned offset = instr->split.off * reg_elem_size(new_def);
-                       *state = (struct reg_state) {
-                               .def = new_def,
-                               .offset = state->offset + offset,
-                       };
-                       break;
-               }
-               case OPC_META_COLLECT: {
-                       unsigned src_idx = state->offset / reg_elem_size(state->def);
-                       unsigned src_offset = state->offset % reg_elem_size(state->def);
-                       struct ir3_register *new_def = instr->srcs[src_idx]->def;
-                       if (new_def) {
-                               *state = (struct reg_state) {
-                                       .def = new_def,
-                                       .offset = src_offset,
-                               };
-                       } else {
-                               /* Bail on immed/const */
-                               return;
-                       }
-                       break;
-               }
-               case OPC_META_PARALLEL_COPY: {
-                       unsigned dst_idx = ~0;
-                       for (unsigned i = 0; i < instr->dsts_count; i++) {
-                               if (instr->dsts[i] == state->def) {
-                                       dst_idx = i;
-                                       break;
-                               }
-                       }
-                       assert(dst_idx != ~0);
-
-                       struct ir3_register *new_def = instr->srcs[dst_idx]->def;
-                       if (new_def) {
-                               state->def = new_def;
-                       } else {
-                               /* Bail on immed/const */
-                               return;
-                       }
-                       break;
-               }
-               default:
-                       return;
-               }
-       }
+   while (true) {
+      struct ir3_instruction *instr = state->def->instr;
+      switch (instr->opc) {
+      case OPC_META_SPLIT: {
+         struct ir3_register *new_def = instr->srcs[0]->def;
+         unsigned offset = instr->split.off * reg_elem_size(new_def);
+         *state = (struct reg_state){
+            .def = new_def,
+            .offset = state->offset + offset,
+         };
+         break;
+      }
+      case OPC_META_COLLECT: {
+         unsigned src_idx = state->offset / reg_elem_size(state->def);
+         unsigned src_offset = state->offset % reg_elem_size(state->def);
+         struct ir3_register *new_def = instr->srcs[src_idx]->def;
+         if (new_def) {
+            *state = (struct reg_state){
+               .def = new_def,
+               .offset = src_offset,
+            };
+         } else {
+            /* Bail on immed/const */
+            return;
+         }
+         break;
+      }
+      case OPC_META_PARALLEL_COPY: {
+         unsigned dst_idx = ~0;
+         for (unsigned i = 0; i < instr->dsts_count; i++) {
+            if (instr->dsts[i] == state->def) {
+               dst_idx = i;
+               break;
+            }
+         }
+         assert(dst_idx != ~0);
+
+         struct ir3_register *new_def = instr->srcs[dst_idx]->def;
+         if (new_def) {
+            state->def = new_def;
+         } else {
+            /* Bail on immed/const */
+            return;
+         }
+         break;
+      }
+      default:
+         return;
+      }
+   }
 }
 
 static void
 dump_reg_state(struct reg_state *state)
 {
-       if (state->def == UNDEF) {
-               fprintf(stderr, "no reaching definition");
-       } else if (state->def == OVERDEF) {
-               fprintf(stderr, "more than one reaching definition or partial definition");
-       } else {
-               /* The analysis should always remove UNKNOWN eventually. */
-               assert(state->def != UNKNOWN);
-
-               fprintf(stderr, "ssa_%u:%u(%sr%u.%c) + %u",
-                               state->def->instr->serialno, state->def->name,
-                               (state->def->flags & IR3_REG_HALF) ? "h" : "",
-                               state->def->num / 4, "xyzw"[state->def->num % 4],
-                               state->offset);
-       }
+   if (state->def == UNDEF) {
+      fprintf(stderr, "no reaching definition");
+   } else if (state->def == OVERDEF) {
+      fprintf(stderr,
+              "more than one reaching definition or partial definition");
+   } else {
+      /* The analysis should always remove UNKNOWN eventually. */
+      assert(state->def != UNKNOWN);
+
+      fprintf(stderr, "ssa_%u:%u(%sr%u.%c) + %u", state->def->instr->serialno,
+              state->def->name, (state->def->flags & IR3_REG_HALF) ? "h" : "",
+              state->def->num / 4, "xyzw"[state->def->num % 4],
+              state->offset);
+   }
 }
 
 static void
 check_reaching_src(struct ra_val_ctx *ctx, struct ir3_instruction *instr,
-                                  struct ir3_register *src)
+                   struct ir3_register *src)
 {
-       struct file_state *file = ra_val_get_file(ctx, src);
-       physreg_t physreg = ra_reg_get_physreg(src);
-       for (unsigned i = 0; i < reg_size(src); i++) {
-               struct reg_state expected = (struct reg_state) {
-                       .def = src->def,
-                       .offset = i,
-               };
-               chase_definition(&expected);
-
-               struct reg_state actual = file->regs[physreg + i];
-
-               if (expected.def != actual.def ||
-                       expected.offset != actual.offset) {
-                       fprintf(stderr, "ra validation fail: wrong definition reaches source ssa_%u:%u + %u\n",
-                                       src->def->instr->serialno, src->def->name, i);
-                       fprintf(stderr, "expected: ");
-                       dump_reg_state(&expected);
-                       fprintf(stderr, "\n");
-                       fprintf(stderr, "actual: ");
-                       dump_reg_state(&actual);
-                       fprintf(stderr, "\n");
-                       fprintf(stderr, "-> for instruction: ");
-                       ir3_print_instr(instr);
-                       ctx->failed = true;
-               }
-       }
+   struct file_state *file = ra_val_get_file(ctx, src);
+   physreg_t physreg = ra_reg_get_physreg(src);
+   for (unsigned i = 0; i < reg_size(src); i++) {
+      struct reg_state expected = (struct reg_state){
+         .def = src->def,
+         .offset = i,
+      };
+      chase_definition(&expected);
+
+      struct reg_state actual = file->regs[physreg + i];
+
+      if (expected.def != actual.def || expected.offset != actual.offset) {
+         fprintf(
+            stderr,
+            "ra validation fail: wrong definition reaches source ssa_%u:%u + %u\n",
+            src->def->instr->serialno, src->def->name, i);
+         fprintf(stderr, "expected: ");
+         dump_reg_state(&expected);
+         fprintf(stderr, "\n");
+         fprintf(stderr, "actual: ");
+         dump_reg_state(&actual);
+         fprintf(stderr, "\n");
+         fprintf(stderr, "-> for instruction: ");
+         ir3_print_instr(instr);
+         ctx->failed = true;
+      }
+   }
 }
 
 static void
 check_reaching_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (instr->opc == OPC_META_SPLIT ||
-               instr->opc == OPC_META_COLLECT ||
-               instr->opc == OPC_META_PARALLEL_COPY ||
-               instr->opc == OPC_META_PHI) {
-               return;
-       }
-
-       ra_foreach_src (src, instr) {
-               check_reaching_src(ctx, instr, src);
-       }
+   if (instr->opc == OPC_META_SPLIT || instr->opc == OPC_META_COLLECT ||
+       instr->opc == OPC_META_PARALLEL_COPY || instr->opc == OPC_META_PHI) {
+      return;
+   }
+
+   ra_foreach_src (src, instr) {
+      check_reaching_src(ctx, instr, src);
+   }
 }
 
 static void
 check_reaching_block(struct ra_val_ctx *ctx, struct ir3_block *block)
 {
-       ctx->reaching = ctx->block_reaching[block->index];
-
-       foreach_instr (instr, &block->instr_list) {
-               check_reaching_instr(ctx, instr);
-               propagate_instr(ctx, instr);
-       }
-
-       for (unsigned i = 0; i < 2; i++) {
-               struct ir3_block *succ = block->successors[i];
-               if (!succ)
-                       continue;
-
-               unsigned pred_idx = ir3_block_get_pred_index(succ, block);
-               foreach_instr (instr, &succ->instr_list) {
-                       if (instr->opc != OPC_META_PHI)
-                               break;
-                       if (instr->srcs[pred_idx]->def)
-                               check_reaching_src(ctx, instr, instr->srcs[pred_idx]);
-               }
-       }
+   ctx->reaching = ctx->block_reaching[block->index];
+
+   foreach_instr (instr, &block->instr_list) {
+      check_reaching_instr(ctx, instr);
+      propagate_instr(ctx, instr);
+   }
+
+   for (unsigned i = 0; i < 2; i++) {
+      struct ir3_block *succ = block->successors[i];
+      if (!succ)
+         continue;
+
+      unsigned pred_idx = ir3_block_get_pred_index(succ, block);
+      foreach_instr (instr, &succ->instr_list) {
+         if (instr->opc != OPC_META_PHI)
+            break;
+         if (instr->srcs[pred_idx]->def)
+            check_reaching_src(ctx, instr, instr->srcs[pred_idx]);
+      }
+   }
 }
 
 static void
 check_reaching_defs(struct ra_val_ctx *ctx, struct ir3 *ir)
 {
-       ctx->block_reaching =
-               rzalloc_array(ctx, struct reaching_state, ctx->block_count);
-
-       struct reaching_state *start = &ctx->block_reaching[0];
-       for (unsigned i = 0; i < ctx->full_size; i++)
-               start->full.regs[i].def = UNDEF;
-       for (unsigned i = 0; i < ctx->half_size; i++)
-               start->half.regs[i].def = UNDEF;
-       for (unsigned i = 0; i < RA_SHARED_SIZE; i++)
-               start->shared.regs[i].def = UNDEF;
-
-       bool progress;
-       do {
-               progress = false;
-               foreach_block (block, &ir->block_list) {
-                       progress |= propagate_block(ctx, block);
-               }
-       } while (progress);
-       
-       foreach_block (block, &ir->block_list) {
-               check_reaching_block(ctx, block);
-       }
-
-       if (ctx->failed) {
-               fprintf(stderr, "failing shader:\n");
-               ir3_print(ir);
-               abort();
-       }
+   ctx->block_reaching =
+      rzalloc_array(ctx, struct reaching_state, ctx->block_count);
+
+   struct reaching_state *start = &ctx->block_reaching[0];
+   for (unsigned i = 0; i < ctx->full_size; i++)
+      start->full.regs[i].def = UNDEF;
+   for (unsigned i = 0; i < ctx->half_size; i++)
+      start->half.regs[i].def = UNDEF;
+   for (unsigned i = 0; i < RA_SHARED_SIZE; i++)
+      start->shared.regs[i].def = UNDEF;
+
+   bool progress;
+   do {
+      progress = false;
+      foreach_block (block, &ir->block_list) {
+         progress |= propagate_block(ctx, block);
+      }
+   } while (progress);
+
+   foreach_block (block, &ir->block_list) {
+      check_reaching_block(ctx, block);
+   }
+
+   if (ctx->failed) {
+      fprintf(stderr, "failing shader:\n");
+      ir3_print(ir);
+      abort();
+   }
 }
 
 void
-ir3_ra_validate(struct ir3_shader_variant *v,
-                           unsigned full_size, unsigned half_size, unsigned block_count)
+ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
+                unsigned half_size, unsigned block_count)
 {
 #ifdef NDEBUG
-#  define VALIDATE 0
+#define VALIDATE 0
 #else
-#  define VALIDATE 1
+#define VALIDATE 1
 #endif
 
-       if (!VALIDATE)
-               return;
+   if (!VALIDATE)
+      return;
 
-       struct ra_val_ctx *ctx = rzalloc(NULL, struct ra_val_ctx);
-       ctx->merged_regs = v->mergedregs;
-       ctx->full_size = full_size;
-       ctx->half_size = half_size;
-       ctx->block_count = block_count;
+   struct ra_val_ctx *ctx = rzalloc(NULL, struct ra_val_ctx);
+   ctx->merged_regs = v->mergedregs;
+   ctx->full_size = full_size;
+   ctx->half_size = half_size;
+   ctx->block_count = block_count;
 
-       foreach_block (block, &v->ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       validate_simple(ctx, instr);
-               }
-       }
+   foreach_block (block, &v->ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         validate_simple(ctx, instr);
+      }
+   }
 
-       check_reaching_defs(ctx, v->ir);
+   check_reaching_defs(ctx, v->ir);
 
-       ralloc_free(ctx);
+   ralloc_free(ctx);
 }
-
index e8f979c..7b452d6 100644 (file)
@@ -24,7 +24,6 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-
 #include "util/dag.h"
 #include "util/u_math.h"
 
 #else
 #define SCHED_DEBUG 0
 #endif
-#define d(fmt, ...) do { if (SCHED_DEBUG) { \
-       printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
-} } while (0)
-
-#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
-       printf("SCHED: "fmt": ", ##__VA_ARGS__); \
-       ir3_print_instr(instr); \
-} } while (0)
+#define d(fmt, ...)                                                            \
+   do {                                                                        \
+      if (SCHED_DEBUG) {                                                       \
+         printf("SCHED: " fmt "\n", ##__VA_ARGS__);                            \
+      }                                                                        \
+   } while (0)
+
+#define di(instr, fmt, ...)                                                    \
+   do {                                                                        \
+      if (SCHED_DEBUG) {                                                       \
+         printf("SCHED: " fmt ": ", ##__VA_ARGS__);                            \
+         ir3_print_instr(instr);                                               \
+      }                                                                        \
+   } while (0)
 
 /*
  * Instruction Scheduling:
  */
 
 struct ir3_sched_ctx {
-       struct ir3_block *block;           /* the current block */
-       struct dag *dag;
+   struct ir3_block *block; /* the current block */
+   struct dag *dag;
 
-       struct list_head unscheduled_list; /* unscheduled instructions */
-       struct ir3_instruction *scheduled; /* last scheduled instr */
-       struct ir3_instruction *addr0;     /* current a0.x user, if any */
-       struct ir3_instruction *addr1;     /* current a1.x user, if any */
-       struct ir3_instruction *pred;      /* current p0.x user, if any */
+   struct list_head unscheduled_list; /* unscheduled instructions */
+   struct ir3_instruction *scheduled; /* last scheduled instr */
+   struct ir3_instruction *addr0;     /* current a0.x user, if any */
+   struct ir3_instruction *addr1;     /* current a1.x user, if any */
+   struct ir3_instruction *pred;      /* current p0.x user, if any */
 
-       struct ir3_instruction *split;     /* most-recently-split a0/a1/p0 producer */
+   struct ir3_instruction *split; /* most-recently-split a0/a1/p0 producer */
 
-       int remaining_kills;
-       int remaining_tex;
+   int remaining_kills;
+   int remaining_tex;
 
-       bool error;
+   bool error;
 
-       int sfu_delay;
-       int tex_delay;
+   int sfu_delay;
+   int tex_delay;
 
-       /* We order the scheduled tex/SFU instructions, and keep track of the
-        * index of the last waited on instruction, so we can know which
-        * instructions are still outstanding (and therefore would require us to
-        * wait for all outstanding instructions before scheduling a use).
-        */
-       int tex_index, first_outstanding_tex_index;
-       int sfu_index, first_outstanding_sfu_index;
+   /* We order the scheduled tex/SFU instructions, and keep track of the
+    * index of the last waited on instruction, so we can know which
+    * instructions are still outstanding (and therefore would require us to
+    * wait for all outstanding instructions before scheduling a use).
+    */
+   int tex_index, first_outstanding_tex_index;
+   int sfu_index, first_outstanding_sfu_index;
 };
 
 struct ir3_sched_node {
-       struct dag_node dag;     /* must be first for util_dynarray_foreach */
-       struct ir3_instruction *instr;
-
-       unsigned delay;
-       unsigned max_delay;
-
-       unsigned tex_index;
-       unsigned sfu_index;
-
-       /* For instructions that are a meta:collect src, once we schedule
-        * the first src of the collect, the entire vecN is live (at least
-        * from the PoV of the first RA pass.. the 2nd scalar pass can fill
-        * in some of the gaps, but often not all).  So we want to help out
-        * RA, and realize that as soon as we schedule the first collect
-        * src, there is no penalty to schedule the remainder (ie. they
-        * don't make additional values live).  In fact we'd prefer to
-        * schedule the rest ASAP to minimize the live range of the vecN.
-        *
-        * For instructions that are the src of a collect, we track the
-        * corresponding collect, and mark them as partially live as soon
-        * as any one of the src's is scheduled.
-        */
-       struct ir3_instruction *collect;
-       bool partially_live;
-
-       /* Is this instruction a direct or indirect dependency for a kill?
-        * If so, we should prioritize it when possible
-        */
-       bool kill_path;
-
-       /* This node represents a shader output.  A semi-common pattern in
-        * shaders is something along the lines of:
-        *
-        *    fragcolor.w = 1.0
-        *
-        * Which we'd prefer to schedule as late as possible, since it
-        * produces a live value that is never killed/consumed.  So detect
-        * outputs up-front, and avoid scheduling them unless the reduce
-        * register pressure (or at least are neutral)
-        */
-       bool output;
+   struct dag_node dag; /* must be first for util_dynarray_foreach */
+   struct ir3_instruction *instr;
+
+   unsigned delay;
+   unsigned max_delay;
+
+   unsigned tex_index;
+   unsigned sfu_index;
+
+   /* For instructions that are a meta:collect src, once we schedule
+    * the first src of the collect, the entire vecN is live (at least
+    * from the PoV of the first RA pass.. the 2nd scalar pass can fill
+    * in some of the gaps, but often not all).  So we want to help out
+    * RA, and realize that as soon as we schedule the first collect
+    * src, there is no penalty to schedule the remainder (ie. they
+    * don't make additional values live).  In fact we'd prefer to
+    * schedule the rest ASAP to minimize the live range of the vecN.
+    *
+    * For instructions that are the src of a collect, we track the
+    * corresponding collect, and mark them as partially live as soon
+    * as any one of the src's is scheduled.
+    */
+   struct ir3_instruction *collect;
+   bool partially_live;
+
+   /* Is this instruction a direct or indirect dependency for a kill?
+    * If so, we should prioritize it when possible
+    */
+   bool kill_path;
+
+   /* This node represents a shader output.  A semi-common pattern in
+    * shaders is something along the lines of:
+    *
+    *    fragcolor.w = 1.0
+    *
+    * Which we'd prefer to schedule as late as possible, since it
+    * produces a live value that is never killed/consumed.  So detect
+    * outputs up-front, and avoid scheduling them unless they reduce
+    * register pressure (or at least are neutral)
+    */
+   bool output;
 };
 
-#define foreach_sched_node(__n, __list) \
-       list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
+#define foreach_sched_node(__n, __list)                                        \
+   list_for_each_entry (struct ir3_sched_node, __n, __list, dag.link)
 
-static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
-static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);
+static void sched_node_init(struct ir3_sched_ctx *ctx,
+                            struct ir3_instruction *instr);
+static void sched_node_add_dep(struct ir3_instruction *instr,
+                               struct ir3_instruction *src, int i);
 
-static bool is_scheduled(struct ir3_instruction *instr)
+static bool
+is_scheduled(struct ir3_instruction *instr)
 {
-       return !!(instr->flags & IR3_INSTR_MARK);
+   return !!(instr->flags & IR3_INSTR_MARK);
 }
 
 /* check_src_cond() passing a ir3_sched_ctx. */
 static bool
 sched_check_src_cond(struct ir3_instruction *instr,
-                                        bool (*cond)(struct ir3_instruction *, struct ir3_sched_ctx *),
-                                        struct ir3_sched_ctx *ctx)
+                     bool (*cond)(struct ir3_instruction *,
+                                  struct ir3_sched_ctx *),
+                     struct ir3_sched_ctx *ctx)
 {
-       foreach_ssa_src (src, instr) {
-               /* meta:split/collect aren't real instructions, the thing that
-                * we actually care about is *their* srcs
-                */
-               if ((src->opc == OPC_META_SPLIT) || (src->opc == OPC_META_COLLECT)) {
-                       if (sched_check_src_cond(src, cond, ctx))
-                               return true;
-               } else {
-                       if (cond(src, ctx))
-                               return true;
-               }
-       }
-
-       return false;
+   foreach_ssa_src (src, instr) {
+      /* meta:split/collect aren't real instructions; the thing that
+       * we actually care about is *their* srcs
+       */
+      if ((src->opc == OPC_META_SPLIT) || (src->opc == OPC_META_COLLECT)) {
+         if (sched_check_src_cond(src, cond, ctx))
+            return true;
+      } else {
+         if (cond(src, ctx))
+            return true;
+      }
+   }
+
+   return false;
 }
 
 /* Is this a prefetch or tex that hasn't been waited on yet? */
 
 static bool
-is_outstanding_tex_or_prefetch(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
+is_outstanding_tex_or_prefetch(struct ir3_instruction *instr,
+                               struct ir3_sched_ctx *ctx)
 {
-       if (!is_tex_or_prefetch(instr))
-               return false;
+   if (!is_tex_or_prefetch(instr))
+      return false;
 
-       /* The sched node is only valid within the same block, we cannot
-        * really say anything about src's from other blocks
-        */
-       if (instr->block != ctx->block)
-               return true;
+   /* The sched node is only valid within the same block, we cannot
+    * really say anything about src's from other blocks
+    */
+   if (instr->block != ctx->block)
+      return true;
 
-       struct ir3_sched_node *n = instr->data;
-       return n->tex_index >= ctx->first_outstanding_tex_index;
+   struct ir3_sched_node *n = instr->data;
+   return n->tex_index >= ctx->first_outstanding_tex_index;
 }
 
 static bool
 is_outstanding_sfu(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
 {
-       if (!is_sfu(instr))
-               return false;
+   if (!is_sfu(instr))
+      return false;
 
-       /* The sched node is only valid within the same block, we cannot
-        * really say anything about src's from other blocks
-        */
-       if (instr->block != ctx->block)
-               return true;
+   /* The sched node is only valid within the same block, we cannot
+    * really say anything about src's from other blocks
+    */
+   if (instr->block != ctx->block)
+      return true;
 
-       struct ir3_sched_node *n = instr->data;
-       return n->sfu_index >= ctx->first_outstanding_sfu_index;
+   struct ir3_sched_node *n = instr->data;
+   return n->sfu_index >= ctx->first_outstanding_sfu_index;
 }
 
 static unsigned
 cycle_count(struct ir3_instruction *instr)
 {
-       if (instr->opc == OPC_META_COLLECT) {
-               /* Assume that only immed/const sources produce moves */
-               unsigned n = 0;
-               foreach_src(src, instr) {
-                       if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
-                               n++;
-               }
-               return n;
-       } else if (is_meta(instr)) {
-               return 0;
-       } else {
-               return 1;
-       }
+   if (instr->opc == OPC_META_COLLECT) {
+      /* Assume that only immed/const sources produce moves */
+      unsigned n = 0;
+      foreach_src (src, instr) {
+         if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
+            n++;
+      }
+      return n;
+   } else if (is_meta(instr)) {
+      return 0;
+   } else {
+      return 1;
+   }
 }
 
 static void
 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-       debug_assert(ctx->block == instr->block);
-
-       /* remove from depth list:
-        */
-       list_delinit(&instr->node);
-
-       if (writes_addr0(instr)) {
-               debug_assert(ctx->addr0 == NULL);
-               ctx->addr0 = instr;
-       }
-
-       if (writes_addr1(instr)) {
-               debug_assert(ctx->addr1 == NULL);
-               ctx->addr1 = instr;
-       }
-
-       if (writes_pred(instr)) {
-               debug_assert(ctx->pred == NULL);
-               ctx->pred = instr;
-       }
-
-       instr->flags |= IR3_INSTR_MARK;
-
-       di(instr, "schedule");
-
-       list_addtail(&instr->node, &instr->block->instr_list);
-       ctx->scheduled = instr;
-
-       if (is_kill_or_demote(instr)){
-               assert(ctx->remaining_kills > 0);
-               ctx->remaining_kills--;
-       }
-
-       struct ir3_sched_node *n = instr->data;
-
-       /* If this instruction is a meta:collect src, mark the remaining
-        * collect srcs as partially live.
-        */
-       if (n->collect) {
-               foreach_ssa_src (src, n->collect) {
-                       if (src->block != instr->block)
-                               continue;
-                       struct ir3_sched_node *sn = src->data;
-                       sn->partially_live = true;
-               }
-       }
-
-       dag_prune_head(ctx->dag, &n->dag);
-
-       unsigned cycles = cycle_count(instr);
-
-       if (is_sfu(instr)) {
-               ctx->sfu_delay = 8;
-               n->sfu_index = ctx->sfu_index++;
-       } else if (!is_meta(instr) &&
-                          sched_check_src_cond(instr, is_outstanding_sfu, ctx)) {
-               ctx->sfu_delay = 0;
-               ctx->first_outstanding_sfu_index = ctx->sfu_index;
-       } else if (ctx->sfu_delay > 0) {
-               ctx->sfu_delay -= MIN2(cycles, ctx->sfu_delay);
-       }
-
-       if (is_tex_or_prefetch(instr)) {
-               /* NOTE that this isn't an attempt to hide texture fetch latency,
-                * but an attempt to hide the cost of switching to another warp.
-                * If we can, we'd like to try to schedule another texture fetch
-                * before scheduling something that would sync.
-                */
-               ctx->tex_delay = 10;
-               assert(ctx->remaining_tex > 0);
-               ctx->remaining_tex--;
-               n->tex_index = ctx->tex_index++;
-       } else if (!is_meta(instr) &&
-                          sched_check_src_cond(instr, is_outstanding_tex_or_prefetch, ctx)) {
-               ctx->tex_delay = 0;
-               ctx->first_outstanding_tex_index = ctx->tex_index;
-       } else if (ctx->tex_delay > 0) {
-               ctx->tex_delay -= MIN2(cycles, ctx->tex_delay);
-       }
+   debug_assert(ctx->block == instr->block);
+
+   /* remove from depth list:
+    */
+   list_delinit(&instr->node);
+
+   if (writes_addr0(instr)) {
+      debug_assert(ctx->addr0 == NULL);
+      ctx->addr0 = instr;
+   }
+
+   if (writes_addr1(instr)) {
+      debug_assert(ctx->addr1 == NULL);
+      ctx->addr1 = instr;
+   }
+
+   if (writes_pred(instr)) {
+      debug_assert(ctx->pred == NULL);
+      ctx->pred = instr;
+   }
+
+   instr->flags |= IR3_INSTR_MARK;
+
+   di(instr, "schedule");
+
+   list_addtail(&instr->node, &instr->block->instr_list);
+   ctx->scheduled = instr;
+
+   if (is_kill_or_demote(instr)) {
+      assert(ctx->remaining_kills > 0);
+      ctx->remaining_kills--;
+   }
+
+   struct ir3_sched_node *n = instr->data;
+
+   /* If this instruction is a meta:collect src, mark the remaining
+    * collect srcs as partially live.
+    */
+   if (n->collect) {
+      foreach_ssa_src (src, n->collect) {
+         if (src->block != instr->block)
+            continue;
+         struct ir3_sched_node *sn = src->data;
+         sn->partially_live = true;
+      }
+   }
+
+   dag_prune_head(ctx->dag, &n->dag);
+
+   unsigned cycles = cycle_count(instr);
+
+   if (is_sfu(instr)) {
+      ctx->sfu_delay = 8;
+      n->sfu_index = ctx->sfu_index++;
+   } else if (!is_meta(instr) &&
+              sched_check_src_cond(instr, is_outstanding_sfu, ctx)) {
+      ctx->sfu_delay = 0;
+      ctx->first_outstanding_sfu_index = ctx->sfu_index;
+   } else if (ctx->sfu_delay > 0) {
+      ctx->sfu_delay -= MIN2(cycles, ctx->sfu_delay);
+   }
+
+   if (is_tex_or_prefetch(instr)) {
+      /* NOTE that this isn't an attempt to hide texture fetch latency,
+       * but an attempt to hide the cost of switching to another warp.
+       * If we can, we'd like to try to schedule another texture fetch
+       * before scheduling something that would sync.
+       */
+      ctx->tex_delay = 10;
+      assert(ctx->remaining_tex > 0);
+      ctx->remaining_tex--;
+      n->tex_index = ctx->tex_index++;
+   } else if (!is_meta(instr) &&
+              sched_check_src_cond(instr, is_outstanding_tex_or_prefetch,
+                                   ctx)) {
+      ctx->tex_delay = 0;
+      ctx->first_outstanding_tex_index = ctx->tex_index;
+   } else if (ctx->tex_delay > 0) {
+      ctx->tex_delay -= MIN2(cycles, ctx->tex_delay);
+   }
 }
 
 struct ir3_sched_notes {
-       /* there is at least one kill which could be scheduled, except
-        * for unscheduled bary.f's:
-        */
-       bool blocked_kill;
-       /* there is at least one instruction that could be scheduled,
-        * except for conflicting address/predicate register usage:
-        */
-       bool addr0_conflict, addr1_conflict, pred_conflict;
+   /* there is at least one kill which could be scheduled, except
+    * for unscheduled bary.f's:
+    */
+   bool blocked_kill;
+   /* there is at least one instruction that could be scheduled,
+    * except for conflicting address/predicate register usage:
+    */
+   bool addr0_conflict, addr1_conflict, pred_conflict;
 };
 
 /* could an instruction be scheduled if specified ssa src was scheduled? */
 static bool
 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
 {
-       foreach_ssa_src (other_src, instr) {
-               /* if dependency not scheduled, we aren't ready yet: */
-               if ((src != other_src) && !is_scheduled(other_src)) {
-                       return false;
-               }
-       }
-       return true;
+   foreach_ssa_src (other_src, instr) {
+      /* if dependency not scheduled, we aren't ready yet: */
+      if ((src != other_src) && !is_scheduled(other_src)) {
+         return false;
+      }
+   }
+   return true;
 }
 
 /* Check if instruction is ok to schedule.  Make sure it is not blocked
@@ -353,117 +364,117 @@ could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
  */
 static bool
 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-               struct ir3_instruction *instr)
+            struct ir3_instruction *instr)
 {
-       debug_assert(!is_scheduled(instr));
-
-       if (instr == ctx->split) {
-               /* Don't schedule instructions created by splitting a a0.x/a1.x/p0.x
-                * write until another "normal" instruction has been scheduled.
-                */
-               return false;
-       }
-
-       if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
-               /* avoid texture/memory access if we have unscheduled kills
-                * that could make the expensive operation unnecessary.  By
-                * definition, if there are remaining kills, and this instr
-                * is not a dependency of a kill, there are other instructions
-                * that we can choose from.
-                */
-               struct ir3_sched_node *n = instr->data;
-               if (!n->kill_path)
-                       return false;
-       }
-
-       /* For instructions that write address register we need to
-        * make sure there is at least one instruction that uses the
-        * addr value which is otherwise ready.
-        *
-        * NOTE if any instructions use pred register and have other
-        * src args, we would need to do the same for writes_pred()..
-        */
-       if (writes_addr0(instr)) {
-               struct ir3 *ir = instr->block->shader;
-               bool ready = false;
-               for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
-                       struct ir3_instruction *indirect = ir->a0_users[i];
-                       if (!indirect)
-                               continue;
-                       if (indirect->address->def != instr->dsts[0])
-                               continue;
-                       ready = could_sched(indirect, instr);
-               }
-
-               /* nothing could be scheduled, so keep looking: */
-               if (!ready)
-                       return false;
-       }
-
-       if (writes_addr1(instr)) {
-               struct ir3 *ir = instr->block->shader;
-               bool ready = false;
-               for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
-                       struct ir3_instruction *indirect = ir->a1_users[i];
-                       if (!indirect)
-                               continue;
-                       if (indirect->address->def != instr->dsts[0])
-                               continue;
-                       ready = could_sched(indirect, instr);
-               }
-
-               /* nothing could be scheduled, so keep looking: */
-               if (!ready)
-                       return false;
-       }
-
-       /* if this is a write to address/predicate register, and that
-        * register is currently in use, we need to defer until it is
-        * free:
-        */
-       if (writes_addr0(instr) && ctx->addr0) {
-               debug_assert(ctx->addr0 != instr);
-               notes->addr0_conflict = true;
-               return false;
-       }
-
-       if (writes_addr1(instr) && ctx->addr1) {
-               debug_assert(ctx->addr1 != instr);
-               notes->addr1_conflict = true;
-               return false;
-       }
-
-       if (writes_pred(instr) && ctx->pred) {
-               debug_assert(ctx->pred != instr);
-               notes->pred_conflict = true;
-               return false;
-       }
-
-       /* if the instruction is a kill, we need to ensure *every*
-        * bary.f is scheduled.  The hw seems unhappy if the thread
-        * gets killed before the end-input (ei) flag is hit.
-        *
-        * We could do this by adding each bary.f instruction as
-        * virtual ssa src for the kill instruction.  But we have
-        * fixed length instr->srcs[].
-        *
-        * TODO we could handle this by false-deps now, probably.
-        */
-       if (is_kill_or_demote(instr)) {
-               struct ir3 *ir = instr->block->shader;
-
-               for (unsigned i = 0; i < ir->baryfs_count; i++) {
-                       struct ir3_instruction *baryf = ir->baryfs[i];
-                       if (baryf->flags & IR3_INSTR_UNUSED)
-                               continue;
-                       if (!is_scheduled(baryf)) {
-                               notes->blocked_kill = true;
-                               return false;
-                       }
-               }
-       }
-
-       return true;
+   debug_assert(!is_scheduled(instr));
+
+   if (instr == ctx->split) {
+      /* Don't schedule instructions created by splitting an a0.x/a1.x/p0.x
+       * write until another "normal" instruction has been scheduled.
+       */
+      return false;
+   }
+
+   if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
+      /* avoid texture/memory access if we have unscheduled kills
+       * that could make the expensive operation unnecessary.  By
+       * definition, if there are remaining kills, and this instr
+       * is not a dependency of a kill, there are other instructions
+       * that we can choose from.
+       */
+      struct ir3_sched_node *n = instr->data;
+      if (!n->kill_path)
+         return false;
+   }
+
+   /* For instructions that write the address register, we need to
+    * make sure there is at least one otherwise-ready instruction
+    * that uses the addr value.
+    *
+    * NOTE if any instructions use the pred register and have other
+    * src args, we would need to do the same for writes_pred()..
+    */
+   if (writes_addr0(instr)) {
+      struct ir3 *ir = instr->block->shader;
+      bool ready = false;
+      for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
+         struct ir3_instruction *indirect = ir->a0_users[i];
+         if (!indirect)
+            continue;
+         if (indirect->address->def != instr->dsts[0])
+            continue;
+         ready = could_sched(indirect, instr);
+      }
+
+      /* nothing could be scheduled, so keep looking: */
+      if (!ready)
+         return false;
+   }
+
+   if (writes_addr1(instr)) {
+      struct ir3 *ir = instr->block->shader;
+      bool ready = false;
+      for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
+         struct ir3_instruction *indirect = ir->a1_users[i];
+         if (!indirect)
+            continue;
+         if (indirect->address->def != instr->dsts[0])
+            continue;
+         ready = could_sched(indirect, instr);
+      }
+
+      /* nothing could be scheduled, so keep looking: */
+      if (!ready)
+         return false;
+   }
+
+   /* if this is a write to address/predicate register, and that
+    * register is currently in use, we need to defer until it is
+    * free:
+    */
+   if (writes_addr0(instr) && ctx->addr0) {
+      debug_assert(ctx->addr0 != instr);
+      notes->addr0_conflict = true;
+      return false;
+   }
+
+   if (writes_addr1(instr) && ctx->addr1) {
+      debug_assert(ctx->addr1 != instr);
+      notes->addr1_conflict = true;
+      return false;
+   }
+
+   if (writes_pred(instr) && ctx->pred) {
+      debug_assert(ctx->pred != instr);
+      notes->pred_conflict = true;
+      return false;
+   }
+
+   /* if the instruction is a kill, we need to ensure *every*
+    * bary.f is scheduled.  The hw seems unhappy if the thread
+    * gets killed before the end-input (ei) flag is hit.
+    *
+    * We could do this by adding each bary.f instruction as
+    * virtual ssa src for the kill instruction.  But we have
+    * fixed length instr->srcs[].
+    *
+    * TODO we could handle this by false-deps now, probably.
+    */
+   if (is_kill_or_demote(instr)) {
+      struct ir3 *ir = instr->block->shader;
+
+      for (unsigned i = 0; i < ir->baryfs_count; i++) {
+         struct ir3_instruction *baryf = ir->baryfs[i];
+         if (baryf->flags & IR3_INSTR_UNUSED)
+            continue;
+         if (!is_scheduled(baryf)) {
+            notes->blocked_kill = true;
+            return false;
+         }
+      }
+   }
+
+   return true;
 }
 
 /* Find the instr->ip of the closest use of an instruction, in
@@ -474,64 +485,69 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
  * to choose, to avoid creating too much parallelism (ie. blowing
  * up register pressure)
  *
- * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
+ * See
+ * dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
  */
 static int
 nearest_use(struct ir3_instruction *instr)
 {
-       unsigned nearest = ~0;
-       foreach_ssa_use (use, instr)
-               if (!is_scheduled(use))
-                       nearest = MIN2(nearest, use->ip);
-
-       /* slight hack.. this heuristic tends to push bary.f's to later
-        * in the shader, closer to their uses.  But we actually would
-        * prefer to get these scheduled earlier, to unlock varying
-        * storage for more VS jobs:
-        */
-       if (is_input(instr))
-               nearest /= 2;
-
-       return nearest;
+   unsigned nearest = ~0;
+   foreach_ssa_use (use, instr)
+      if (!is_scheduled(use))
+         nearest = MIN2(nearest, use->ip);
+
+   /* slight hack.. this heuristic tends to push bary.f's to later
+    * in the shader, closer to their uses.  But we actually would
+    * prefer to get these scheduled earlier, to unlock varying
+    * storage for more VS jobs:
+    */
+   if (is_input(instr))
+      nearest /= 2;
+
+   return nearest;
 }
 
 static bool
-is_only_nonscheduled_use(struct ir3_instruction *instr, struct ir3_instruction *use)
+is_only_nonscheduled_use(struct ir3_instruction *instr,
+                         struct ir3_instruction *use)
 {
-       foreach_ssa_use (other_use, instr) {
-               if (other_use != use && !is_scheduled(other_use))
-                       return false;
-       }
+   foreach_ssa_use (other_use, instr) {
+      if (other_use != use && !is_scheduled(other_use))
+         return false;
+   }
 
-       return true;
+   return true;
 }
 
 /* find net change to live values if instruction were scheduled: */
 static int
 live_effect(struct ir3_instruction *instr)
 {
-       struct ir3_sched_node *n = instr->data;
-       int new_live = (n->partially_live || !instr->uses || instr->uses->entries == 0) ? 0 : dest_regs(instr);
-       int freed_live = 0;
-
-       /* if we schedule something that causes a vecN to be live,
-        * then count all it's other components too:
-        */
-       if (n->collect)
-               new_live *= n->collect->srcs_count;
-
-       foreach_ssa_src_n (src, n, instr) {
-               if (__is_false_dep(instr, n))
-                       continue;
-
-               if (instr->block != src->block)
-                       continue;
-
-               if (is_only_nonscheduled_use(src, instr))
-                       freed_live += dest_regs(src);
-       }
-
-       return new_live - freed_live;
+   struct ir3_sched_node *n = instr->data;
+   int new_live =
+      (n->partially_live || !instr->uses || instr->uses->entries == 0)
+         ? 0
+         : dest_regs(instr);
+   int freed_live = 0;
+
+   /* if we schedule something that causes a vecN to be live,
+    * then count all its other components too:
+    */
+   if (n->collect)
+      new_live *= n->collect->srcs_count;
+
+   foreach_ssa_src_n (src, n, instr) {
+      if (__is_false_dep(instr, n))
+         continue;
+
+      if (instr->block != src->block)
+         continue;
+
+      if (is_only_nonscheduled_use(src, instr))
+         freed_live += dest_regs(src);
+   }
+
+   return new_live - freed_live;
 }
 
 /* Determine if this is an instruction that we'd prefer not to schedule
@@ -543,39 +559,39 @@ live_effect(struct ir3_instruction *instr)
 static bool
 should_defer(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (ctx->sfu_delay) {
-               if (sched_check_src_cond(instr, is_outstanding_sfu, ctx))
-                       return true;
-       }
-
-       /* We mostly just want to try to schedule another texture fetch
-        * before scheduling something that would (sy) sync, so we can
-        * limit this rule to cases where there are remaining texture
-        * fetches
-        */
-       if (ctx->tex_delay && ctx->remaining_tex) {
-               if (sched_check_src_cond(instr, is_outstanding_tex_or_prefetch, ctx))
-                       return true;
-       }
-
-       /* Avoid scheduling too many outstanding texture or sfu instructions at
-        * once by deferring further tex/SFU instructions. This both prevents
-        * stalls when the queue of texture/sfu instructions becomes too large,
-        * and prevents unacceptably large increases in register pressure from too
-        * many outstanding texture instructions.
-        */
-       if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_tex(instr))
-               return true;
-
-       if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_sfu(instr))
-               return true;
-
-       return false;
+   if (ctx->sfu_delay) {
+      if (sched_check_src_cond(instr, is_outstanding_sfu, ctx))
+         return true;
+   }
+
+   /* We mostly just want to try to schedule another texture fetch
+    * before scheduling something that would (sy) sync, so we can
+    * limit this rule to cases where there are remaining texture
+    * fetches
+    */
+   if (ctx->tex_delay && ctx->remaining_tex) {
+      if (sched_check_src_cond(instr, is_outstanding_tex_or_prefetch, ctx))
+         return true;
+   }
+
+   /* Avoid scheduling too many outstanding texture or sfu instructions at
+    * once by deferring further tex/SFU instructions. This both prevents
+    * stalls when the queue of texture/sfu instructions becomes too large,
+    * and prevents unacceptably large increases in register pressure from too
+    * many outstanding texture instructions.
+    */
+   if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_tex(instr))
+      return true;
+
+   if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_sfu(instr))
+      return true;
+
+   return false;
 }
 
-static struct ir3_sched_node *
-choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-               bool defer, bool avoid_output);
+static struct ir3_sched_node *choose_instr_inc(struct ir3_sched_ctx *ctx,
+                                               struct ir3_sched_notes *notes,
+                                               bool defer, bool avoid_output);
 
 /**
  * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
@@ -586,110 +602,110 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
  */
 static struct ir3_sched_node *
 choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-               bool defer)
+                 bool defer)
 {
-       const char *mode = defer ? "-d" : "";
-       struct ir3_sched_node *chosen = NULL;
+   const char *mode = defer ? "-d" : "";
+   struct ir3_sched_node *chosen = NULL;
 
-       /* Find a ready inst with regs freed and pick the one with max cost. */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (defer && should_defer(ctx, n->instr))
-                       continue;
+   /* Find a ready inst with regs freed and pick the one with max cost. */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (defer && should_defer(ctx, n->instr))
+         continue;
 
-               /* Note: mergedregs is only used post-RA, just set it to false */
-               unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
+      /* Note: mergedregs is only used post-RA, just set it to false */
+      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
-               if (d > 0)
-                       continue;
+      if (d > 0)
+         continue;
 
-               if (live_effect(n->instr) > -1)
-                       continue;
+      if (live_effect(n->instr) > -1)
+         continue;
 
-               if (!check_instr(ctx, notes, n->instr))
-                       continue;
+      if (!check_instr(ctx, notes, n->instr))
+         continue;
 
-               if (!chosen || chosen->max_delay < n->max_delay) {
-                       chosen = n;
-               }
-       }
+      if (!chosen || chosen->max_delay < n->max_delay) {
+         chosen = n;
+      }
+   }
 
-       if (chosen) {
-               di(chosen->instr, "dec%s: chose (freed+ready)", mode);
-               return chosen;
-       }
+   if (chosen) {
+      di(chosen->instr, "dec%s: chose (freed+ready)", mode);
+      return chosen;
+   }
 
-       /* Find a leader with regs freed and pick the one with max cost. */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (defer && should_defer(ctx, n->instr))
-                       continue;
+   /* Find a leader with regs freed and pick the one with max cost. */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (defer && should_defer(ctx, n->instr))
+         continue;
 
-               if (live_effect(n->instr) > -1)
-                       continue;
+      if (live_effect(n->instr) > -1)
+         continue;
 
-               if (!check_instr(ctx, notes, n->instr))
-                       continue;
+      if (!check_instr(ctx, notes, n->instr))
+         continue;
 
-               if (!chosen || chosen->max_delay < n->max_delay) {
-                       chosen = n;
-               }
-       }
+      if (!chosen || chosen->max_delay < n->max_delay) {
+         chosen = n;
+      }
+   }
 
-       if (chosen) {
-               di(chosen->instr, "dec%s: chose (freed)", mode);
-               return chosen;
-       }
+   if (chosen) {
+      di(chosen->instr, "dec%s: chose (freed)", mode);
+      return chosen;
+   }
 
-       /* Contra the paper, pick a leader with no effect on used regs.  This may
-        * open up new opportunities, as otherwise a single-operand instr consuming
-        * a value will tend to block finding freeing that value.  This had a
-        * massive effect on reducing spilling on V3D.
-        *
-        * XXX: Should this prioritize ready?
-        */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (defer && should_defer(ctx, n->instr))
-                       continue;
+   /* Contra the paper, pick a leader with no effect on used regs.  This may
+    * open up new opportunities, as otherwise a single-operand instr consuming
+    * a value will tend to block freeing that value.  This had a
+    * massive effect on reducing spilling on V3D.
+    *
+    * XXX: Should this prioritize ready?
+    */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (defer && should_defer(ctx, n->instr))
+         continue;
 
-               unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
+      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
-               if (d > 0)
-                       continue;
+      if (d > 0)
+         continue;
 
-               if (live_effect(n->instr) > 0)
-                       continue;
+      if (live_effect(n->instr) > 0)
+         continue;
 
-               if (!check_instr(ctx, notes, n->instr))
-                       continue;
+      if (!check_instr(ctx, notes, n->instr))
+         continue;
 
-               if (!chosen || chosen->max_delay < n->max_delay)
-                       chosen = n;
-       }
+      if (!chosen || chosen->max_delay < n->max_delay)
+         chosen = n;
+   }
 
-       if (chosen) {
-               di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
-               return chosen;
-       }
+   if (chosen) {
+      di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
+      return chosen;
+   }
 
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (defer && should_defer(ctx, n->instr))
-                       continue;
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (defer && should_defer(ctx, n->instr))
+         continue;
 
-               if (live_effect(n->instr) > 0)
-                       continue;
+      if (live_effect(n->instr) > 0)
+         continue;
 
-               if (!check_instr(ctx, notes, n->instr))
-                       continue;
+      if (!check_instr(ctx, notes, n->instr))
+         continue;
 
-               if (!chosen || chosen->max_delay < n->max_delay)
-                       chosen = n;
-       }
+      if (!chosen || chosen->max_delay < n->max_delay)
+         chosen = n;
+   }
 
-       if (chosen) {
-               di(chosen->instr, "dec%s: chose (neutral)", mode);
-               return chosen;
-       }
+   if (chosen) {
+      di(chosen->instr, "dec%s: chose (neutral)", mode);
+      return chosen;
+   }
 
-       return choose_instr_inc(ctx, notes, defer, true);
+   return choose_instr_inc(ctx, notes, defer, true);
 }
 
 /**
@@ -698,72 +714,72 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
  */
 static struct ir3_sched_node *
 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-               bool defer, bool avoid_output)
+                 bool defer, bool avoid_output)
 {
-       const char *mode = defer ? "-d" : "";
-       struct ir3_sched_node *chosen = NULL;
+   const char *mode = defer ? "-d" : "";
+   struct ir3_sched_node *chosen = NULL;
 
-       /*
-        * From hear on out, we are picking something that increases
-        * register pressure.  So try to pick something which will
-        * be consumed soon:
-        */
-       unsigned chosen_distance = 0;
+   /*
+    * From here on out, we are picking something that increases
+    * register pressure.  So try to pick something which will
+    * be consumed soon:
+    */
+   unsigned chosen_distance = 0;
 
-       /* Pick the max delay of the remaining ready set. */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (avoid_output && n->output)
-                       continue;
+   /* Pick the max delay of the remaining ready set. */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (avoid_output && n->output)
+         continue;
 
-               if (defer && should_defer(ctx, n->instr))
-                       continue;
+      if (defer && should_defer(ctx, n->instr))
+         continue;
 
-               unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
+      unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
-               if (d > 0)
-                       continue;
+      if (d > 0)
+         continue;
 
-               if (!check_instr(ctx, notes, n->instr))
-                       continue;
+      if (!check_instr(ctx, notes, n->instr))
+         continue;
 
-               unsigned distance = nearest_use(n->instr);
+      unsigned distance = nearest_use(n->instr);
 
-               if (!chosen || distance < chosen_distance) {
-                       chosen = n;
-                       chosen_distance = distance;
-               }
-       }
+      if (!chosen || distance < chosen_distance) {
+         chosen = n;
+         chosen_distance = distance;
+      }
+   }
 
-       if (chosen) {
-               di(chosen->instr, "inc%s: chose (distance+ready)", mode);
-               return chosen;
-       }
+   if (chosen) {
+      di(chosen->instr, "inc%s: chose (distance+ready)", mode);
+      return chosen;
+   }
 
-       /* Pick the max delay of the remaining leaders. */
-       foreach_sched_node (n, &ctx->dag->heads) {
-               if (avoid_output && n->output)
-                       continue;
+   /* Pick the max delay of the remaining leaders. */
+   foreach_sched_node (n, &ctx->dag->heads) {
+      if (avoid_output && n->output)
+         continue;
 
-               if (defer && should_defer(ctx, n->instr))
-                       continue;
+      if (defer && should_defer(ctx, n->instr))
+         continue;
 
-               if (!check_instr(ctx, notes, n->instr))
-                       continue;
+      if (!check_instr(ctx, notes, n->instr))
+         continue;
 
-               unsigned distance = nearest_use(n->instr);
+      unsigned distance = nearest_use(n->instr);
 
-               if (!chosen || distance < chosen_distance) {
-                       chosen = n;
-                       chosen_distance = distance;
-               }
-       }
+      if (!chosen || distance < chosen_distance) {
+         chosen = n;
+         chosen_distance = distance;
+      }
+   }
 
-       if (chosen) {
-               di(chosen->instr, "inc%s: chose (distance)", mode);
-               return chosen;
-       }
+   if (chosen) {
+      di(chosen->instr, "inc%s: chose (distance)", mode);
+      return chosen;
+   }
 
-       return NULL;
+   return NULL;
 }
 
 /* Handles instruction selections for instructions we want to prioritize
@@ -772,86 +788,85 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 static struct ir3_sched_node *
 choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
 {
-       struct ir3_sched_node *chosen = NULL;
-
-       foreach_sched_node (n, &ctx->dag->heads) {
-               /*
-                * - phi nodes and inputs must be scheduled first
-                * - split should be scheduled first, so that the vector value is
-                *   killed as soon as possible. RA cannot split up the vector and
-                *   reuse components that have been killed until it's been killed.
-                * - collect, on the other hand, should be treated as a "normal"
-                *   instruction, and may add to register pressure if its sources are
-                *   part of another vector or immediates.
-                */
-               if (!is_meta(n->instr) || n->instr->opc == OPC_META_COLLECT)
-                       continue;
-
-               if (!chosen || (chosen->max_delay < n->max_delay))
-                       chosen = n;
-       }
-
-       if (chosen) {
-               di(chosen->instr, "prio: chose (meta)");
-               return chosen;
-       }
-
-       return NULL;
+   struct ir3_sched_node *chosen = NULL;
+
+   foreach_sched_node (n, &ctx->dag->heads) {
+      /*
+       * - phi nodes and inputs must be scheduled first
+       * - split should be scheduled first, so that the vector value is
+       *   killed as soon as possible. RA cannot split up the vector and
+       *   reuse its killed components until the whole vector has been killed.
+       * - collect, on the other hand, should be treated as a "normal"
+       *   instruction, and may add to register pressure if its sources are
+       *   part of another vector or immediates.
+       */
+      if (!is_meta(n->instr) || n->instr->opc == OPC_META_COLLECT)
+         continue;
+
+      if (!chosen || (chosen->max_delay < n->max_delay))
+         chosen = n;
+   }
+
+   if (chosen) {
+      di(chosen->instr, "prio: chose (meta)");
+      return chosen;
+   }
+
+   return NULL;
 }
 
 static void
 dump_state(struct ir3_sched_ctx *ctx)
 {
-       if (!SCHED_DEBUG)
-               return;
+   if (!SCHED_DEBUG)
+      return;
 
-       foreach_sched_node (n, &ctx->dag->heads) {
-               di(n->instr, "maxdel=%3d le=%d del=%u ",
-                               n->max_delay, live_effect(n->instr),
-                               ir3_delay_calc_prera(ctx->block, n->instr));
+   foreach_sched_node (n, &ctx->dag->heads) {
+      di(n->instr, "maxdel=%3d le=%d del=%u ", n->max_delay,
+         live_effect(n->instr), ir3_delay_calc_prera(ctx->block, n->instr));
 
-               util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
-                       struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
+      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
+         struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
 
-                       di(child->instr, " -> (%d parents) ", child->dag.parent_count);
-               }
-       }
+         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
+      }
+   }
 }
 
 /* find instruction to schedule: */
 static struct ir3_instruction *
 choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
 {
-       struct ir3_sched_node *chosen;
+   struct ir3_sched_node *chosen;
 
-       dump_state(ctx);
+   dump_state(ctx);
 
-       chosen = choose_instr_prio(ctx, notes);
-       if (chosen)
-               return chosen->instr;
+   chosen = choose_instr_prio(ctx, notes);
+   if (chosen)
+      return chosen->instr;
 
-       chosen = choose_instr_dec(ctx, notes, true);
-       if (chosen)
-               return chosen->instr;
+   chosen = choose_instr_dec(ctx, notes, true);
+   if (chosen)
+      return chosen->instr;
 
-       chosen = choose_instr_dec(ctx, notes, false);
-       if (chosen)
-               return chosen->instr;
+   chosen = choose_instr_dec(ctx, notes, false);
+   if (chosen)
+      return chosen->instr;
 
-       chosen = choose_instr_inc(ctx, notes, false, false);
-       if (chosen)
-               return chosen->instr;
+   chosen = choose_instr_inc(ctx, notes, false, false);
+   if (chosen)
+      return chosen->instr;
 
-       return NULL;
+   return NULL;
 }
 
 static struct ir3_instruction *
 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
 {
-       struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
-       di(new_instr, "split instruction");
-       sched_node_init(ctx, new_instr);
-       return new_instr;
+   struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
+   di(new_instr, "split instruction");
+   sched_node_init(ctx, new_instr);
+   return new_instr;
 }
 
 /* "spill" the address registers by remapping any unscheduled
@@ -860,45 +875,45 @@ split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
  */
 static struct ir3_instruction *
 split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
-                  struct ir3_instruction **users, unsigned users_count)
+           struct ir3_instruction **users, unsigned users_count)
 {
-       struct ir3_instruction *new_addr = NULL;
-       unsigned i;
-
-       debug_assert(*addr);
-
-       for (i = 0; i < users_count; i++) {
-               struct ir3_instruction *indirect = users[i];
-
-               if (!indirect)
-                       continue;
-
-               /* skip instructions already scheduled: */
-               if (is_scheduled(indirect))
-                       continue;
-
-               /* remap remaining instructions using current addr
-                * to new addr:
-                */
-               if (indirect->address->def == (*addr)->dsts[0]) {
-                       if (!new_addr) {
-                               new_addr = split_instr(ctx, *addr);
-                               /* original addr is scheduled, but new one isn't: */
-                               new_addr->flags &= ~IR3_INSTR_MARK;
-                       }
-                       indirect->address->def = new_addr->dsts[0];
-                       /* don't need to remove old dag edge since old addr is
-                        * already scheduled:
-                        */
-                       sched_node_add_dep(indirect, new_addr, 0);
-                       di(indirect, "new address");
-               }
-       }
-
-       /* all remaining indirects remapped to new addr: */
-       *addr = NULL;
-
-       return new_addr;
+   struct ir3_instruction *new_addr = NULL;
+   unsigned i;
+
+   debug_assert(*addr);
+
+   for (i = 0; i < users_count; i++) {
+      struct ir3_instruction *indirect = users[i];
+
+      if (!indirect)
+         continue;
+
+      /* skip instructions already scheduled: */
+      if (is_scheduled(indirect))
+         continue;
+
+      /* remap remaining instructions using current addr
+       * to new addr:
+       */
+      if (indirect->address->def == (*addr)->dsts[0]) {
+         if (!new_addr) {
+            new_addr = split_instr(ctx, *addr);
+            /* original addr is scheduled, but new one isn't: */
+            new_addr->flags &= ~IR3_INSTR_MARK;
+         }
+         indirect->address->def = new_addr->dsts[0];
+         /* don't need to remove old dag edge since old addr is
+          * already scheduled:
+          */
+         sched_node_add_dep(indirect, new_addr, 0);
+         di(indirect, "new address");
+      }
+   }
+
+   /* all remaining indirects remapped to new addr: */
+   *addr = NULL;
+
+   return new_addr;
 }
 
 /* "spill" the predicate register by remapping any unscheduled
@@ -908,452 +923,456 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
 static struct ir3_instruction *
 split_pred(struct ir3_sched_ctx *ctx)
 {
-       struct ir3 *ir;
-       struct ir3_instruction *new_pred = NULL;
-       unsigned i;
-
-       debug_assert(ctx->pred);
-
-       ir = ctx->pred->block->shader;
-
-       for (i = 0; i < ir->predicates_count; i++) {
-               struct ir3_instruction *predicated = ir->predicates[i];
-
-               if (!predicated)
-                       continue;
-
-               /* skip instructions already scheduled: */
-               if (is_scheduled(predicated))
-                       continue;
-
-               /* remap remaining instructions using current pred
-                * to new pred:
-                *
-                * TODO is there ever a case when pred isn't first
-                * (and only) src?
-                */
-               if (ssa(predicated->srcs[0]) == ctx->pred) {
-                       if (!new_pred) {
-                               new_pred = split_instr(ctx, ctx->pred);
-                               /* original pred is scheduled, but new one isn't: */
-                               new_pred->flags &= ~IR3_INSTR_MARK;
-                       }
-                       predicated->srcs[0]->instr = new_pred;
-                       /* don't need to remove old dag edge since old pred is
-                        * already scheduled:
-                        */
-                       sched_node_add_dep(predicated, new_pred, 0);
-                       di(predicated, "new predicate");
-               }
-       }
-
-       if (ctx->block->condition == ctx->pred) {
-               if (!new_pred) {
-                       new_pred = split_instr(ctx, ctx->pred);
-                       /* original pred is scheduled, but new one isn't: */
-                       new_pred->flags &= ~IR3_INSTR_MARK;
-               }
-               ctx->block->condition = new_pred;
-               d("new branch condition");
-       }
-
-       /* all remaining predicated remapped to new pred: */
-       ctx->pred = NULL;
-
-       return new_pred;
+   struct ir3 *ir;
+   struct ir3_instruction *new_pred = NULL;
+   unsigned i;
+
+   debug_assert(ctx->pred);
+
+   ir = ctx->pred->block->shader;
+
+   for (i = 0; i < ir->predicates_count; i++) {
+      struct ir3_instruction *predicated = ir->predicates[i];
+
+      if (!predicated)
+         continue;
+
+      /* skip instructions already scheduled: */
+      if (is_scheduled(predicated))
+         continue;
+
+      /* remap remaining instructions using current pred
+       * to new pred:
+       *
+       * TODO is there ever a case when pred isn't first
+       * (and only) src?
+       */
+      if (ssa(predicated->srcs[0]) == ctx->pred) {
+         if (!new_pred) {
+            new_pred = split_instr(ctx, ctx->pred);
+            /* original pred is scheduled, but new one isn't: */
+            new_pred->flags &= ~IR3_INSTR_MARK;
+         }
+         predicated->srcs[0]->instr = new_pred;
+         /* don't need to remove old dag edge since old pred is
+          * already scheduled:
+          */
+         sched_node_add_dep(predicated, new_pred, 0);
+         di(predicated, "new predicate");
+      }
+   }
+
+   if (ctx->block->condition == ctx->pred) {
+      if (!new_pred) {
+         new_pred = split_instr(ctx, ctx->pred);
+         /* original pred is scheduled, but new one isn't: */
+         new_pred->flags &= ~IR3_INSTR_MARK;
+      }
+      ctx->block->condition = new_pred;
+      d("new branch condition");
+   }
+
+   /* all remaining predicated remapped to new pred: */
+   ctx->pred = NULL;
+
+   return new_pred;
 }
 
 static void
 sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-       struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
+   struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
 
-       dag_init_node(ctx->dag, &n->dag);
+   dag_init_node(ctx->dag, &n->dag);
 
-       n->instr = instr;
-       instr->data = n;
+   n->instr = instr;
+   instr->data = n;
 }
 
 static void
-sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
+sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
+                   int i)
 {
-       /* don't consider dependencies in other blocks: */
-       if (src->block != instr->block)
-               return;
-
-       /* we could have false-dep's that end up unused: */
-       if (src->flags & IR3_INSTR_UNUSED) {
-               debug_assert(__is_false_dep(instr, i));
-               return;
-       }
+   /* don't consider dependencies in other blocks: */
+   if (src->block != instr->block)
+      return;
 
-       struct ir3_sched_node *n = instr->data;
-       struct ir3_sched_node *sn = src->data;
+   /* we could have false-dep's that end up unused: */
+   if (src->flags & IR3_INSTR_UNUSED) {
+      debug_assert(__is_false_dep(instr, i));
+      return;
+   }
 
-       /* If src is consumed by a collect, track that to realize that once
-        * any of the collect srcs are live, we should hurry up and schedule
-        * the rest.
-        */
-       if (instr->opc == OPC_META_COLLECT)
-               sn->collect = instr;
+   struct ir3_sched_node *n = instr->data;
+   struct ir3_sched_node *sn = src->data;
 
-       dag_add_edge(&sn->dag, &n->dag, NULL);
+   /* If src is consumed by a collect, track that to realize that once
+    * any of the collect srcs are live, we should hurry up and schedule
+    * the rest.
+    */
+   if (instr->opc == OPC_META_COLLECT)
+      sn->collect = instr;
 
+   dag_add_edge(&sn->dag, &n->dag, NULL);
 
-       unsigned d = ir3_delayslots(src, instr, i, true);
+   unsigned d = ir3_delayslots(src, instr, i, true);
 
-       n->delay = MAX2(n->delay, d);
+   n->delay = MAX2(n->delay, d);
 }
 
 static void
 mark_kill_path(struct ir3_instruction *instr)
 {
-       struct ir3_sched_node *n = instr->data;
+   struct ir3_sched_node *n = instr->data;
 
-       if (n->kill_path) {
-               return;
-       }
+   if (n->kill_path) {
+      return;
+   }
 
-       n->kill_path = true;
+   n->kill_path = true;
 
-       foreach_ssa_src (src, instr) {
-               if (src->block != instr->block)
-                       continue;
-               mark_kill_path(src);
-       }
+   foreach_ssa_src (src, instr) {
+      if (src->block != instr->block)
+         continue;
+      mark_kill_path(src);
+   }
 }
 
 /* Is it an output? */
 static bool
 is_output_collect(struct ir3_instruction *instr)
 {
-       if (instr->opc != OPC_META_COLLECT)
-               return false;
+   if (instr->opc != OPC_META_COLLECT)
+      return false;
 
-       foreach_ssa_use (use, instr) {
-               if (use->opc != OPC_END && use->opc != OPC_CHMASK)
-                       return false;
-       }
+   foreach_ssa_use (use, instr) {
+      if (use->opc != OPC_END && use->opc != OPC_CHMASK)
+         return false;
+   }
 
-       return true;
+   return true;
 }
 
 /* Is it's only use as output? */
 static bool
 is_output_only(struct ir3_instruction *instr)
 {
-       if (!writes_gpr(instr))
-               return false;
+   if (!writes_gpr(instr))
+      return false;
 
-       if (!(instr->dsts[0]->flags & IR3_REG_SSA))
-               return false;
+   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
+      return false;
 
-       foreach_ssa_use (use, instr)
-               if (!is_output_collect(use))
-                       return false;
+   foreach_ssa_use (use, instr)
+      if (!is_output_collect(use))
+         return false;
 
-       return true;
+   return true;
 }
 
 static void
 sched_node_add_deps(struct ir3_instruction *instr)
 {
-       /* There's nothing to do for phi nodes, since they always go first. And
-        * phi nodes can reference sources later in the same block, so handling
-        * sources is not only unnecessary but could cause problems.
-        */
-       if (instr->opc == OPC_META_PHI)
-               return;
-
-       /* Since foreach_ssa_src() already handles false-dep's we can construct
-        * the DAG easily in a single pass.
-        */
-       foreach_ssa_src_n (src, i, instr) {
-               sched_node_add_dep(instr, src, i);
-       }
-
-       /* NOTE that all inputs must be scheduled before a kill, so
-        * mark these to be prioritized as well:
-        */
-       if (is_kill_or_demote(instr) || is_input(instr)) {
-               mark_kill_path(instr);
-       }
-
-       if (is_output_only(instr)) {
-               struct ir3_sched_node *n = instr->data;
-               n->output = true;
-       }
+   /* There's nothing to do for phi nodes, since they always go first. And
+    * phi nodes can reference sources later in the same block, so handling
+    * sources is not only unnecessary but could cause problems.
+    */
+   if (instr->opc == OPC_META_PHI)
+      return;
+
+   /* Since foreach_ssa_src() already handles false-dep's we can construct
+    * the DAG easily in a single pass.
+    */
+   foreach_ssa_src_n (src, i, instr) {
+      sched_node_add_dep(instr, src, i);
+   }
+
+   /* NOTE that all inputs must be scheduled before a kill, so
+    * mark these to be prioritized as well:
+    */
+   if (is_kill_or_demote(instr) || is_input(instr)) {
+      mark_kill_path(instr);
+   }
+
+   if (is_output_only(instr)) {
+      struct ir3_sched_node *n = instr->data;
+      n->output = true;
+   }
 }
 
 static void
 sched_dag_max_delay_cb(struct dag_node *node, void *state)
 {
-       struct ir3_sched_node *n = (struct ir3_sched_node *)node;
-       uint32_t max_delay = 0;
+   struct ir3_sched_node *n = (struct ir3_sched_node *)node;
+   uint32_t max_delay = 0;
 
-       util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
-               struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
-               max_delay = MAX2(child->max_delay, max_delay);
-       }
+   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
+      struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
+      max_delay = MAX2(child->max_delay, max_delay);
+   }
 
-       n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
+   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
 }
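
The bottom-up traversal above computes each node's max_delay as the longest delay-weighted path from that node down to any leaf of the DAG, which the scheduler later uses to prefer instructions on the critical path. Below is a minimal sketch of the same recurrence on a toy tree, using made-up node types rather than the dag_* API:

   #include <stdio.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   struct toy_node {
      unsigned delay;     /* cycles consumers must wait on this node */
      unsigned max_delay; /* longest delay-weighted path to any leaf */
      int children[4];    /* child indices, terminated by -1 */
   };

   /* Visit children first (bottom-up), then fold their max_delay in,
    * mirroring the shape of sched_dag_max_delay_cb() above.
    */
   static void
   compute_max_delay(struct toy_node *nodes, int i)
   {
      unsigned max_child = 0;
      for (int c = 0; nodes[i].children[c] >= 0; c++) {
         int child = nodes[i].children[c];
         compute_max_delay(nodes, child);
         max_child = MAX2(max_child, nodes[child].max_delay);
      }
      nodes[i].max_delay = max_child + nodes[i].delay;
   }

   int
   main(void)
   {
      /* chain 0 -> 1 -> 2: delays 1, 3 and 0 give a critical path of 4 */
      struct toy_node nodes[3] = {
         {.delay = 1, .children = {1, -1}},
         {.delay = 3, .children = {2, -1}},
         {.delay = 0, .children = {-1}},
      };
      compute_max_delay(nodes, 0);
      printf("max_delay of root = %u\n", nodes[0].max_delay); /* prints 4 */
      return 0;
   }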
 
 static void
 sched_dag_init(struct ir3_sched_ctx *ctx)
 {
-       ctx->dag = dag_create(ctx);
+   ctx->dag = dag_create(ctx);
 
-       foreach_instr (instr, &ctx->unscheduled_list) {
-               sched_node_init(ctx, instr);
-               sched_node_add_deps(instr);
-       }
+   foreach_instr (instr, &ctx->unscheduled_list) {
+      sched_node_init(ctx, instr);
+      sched_node_add_deps(instr);
+   }
 
-       dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
+   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
 }
 
 static void
 sched_dag_destroy(struct ir3_sched_ctx *ctx)
 {
-       ralloc_free(ctx->dag);
-       ctx->dag = NULL;
+   ralloc_free(ctx->dag);
+   ctx->dag = NULL;
 }
 
 static void
 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-       ctx->block = block;
-
-       /* addr/pred writes are per-block: */
-       ctx->addr0 = NULL;
-       ctx->addr1 = NULL;
-       ctx->pred = NULL;
-       ctx->tex_delay = 0;
-       ctx->sfu_delay = 0;
-       ctx->tex_index = ctx->first_outstanding_tex_index = 0;
-       ctx->sfu_index = ctx->first_outstanding_sfu_index = 0;
-
-       /* move all instructions to the unscheduled list, and
-        * empty the block's instruction list (to which we will
-        * be inserting).
-        */
-       list_replace(&block->instr_list, &ctx->unscheduled_list);
-       list_inithead(&block->instr_list);
-
-       sched_dag_init(ctx);
-
-       ctx->remaining_kills = 0;
-       ctx->remaining_tex = 0;
-       foreach_instr_safe (instr, &ctx->unscheduled_list) {
-               if (is_kill_or_demote(instr))
-                       ctx->remaining_kills++;
-               if (is_tex_or_prefetch(instr))
-                       ctx->remaining_tex++;
-       }
-
-       /* First schedule all meta:input and meta:phi instructions, followed by
-        * tex-prefetch.  We want all of the instructions that load values into
-        * registers before the shader starts to go before any other instructions.
-        * But in particular we want inputs to come before prefetches.  This is
-        * because a FS's bary_ij input may not actually be live in the shader,
-        * but it should not be scheduled on top of any other input (but can be
-        * overwritten by a tex prefetch)
-        *
-        * Note: Because the first block cannot have predecessors, meta:input and
-        * meta:phi cannot exist in the same block.
-        */
-       foreach_instr_safe (instr, &ctx->unscheduled_list)
-               if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_PHI)
-                       schedule(ctx, instr);
-
-       foreach_instr_safe (instr, &ctx->unscheduled_list)
-               if (instr->opc == OPC_META_TEX_PREFETCH)
-                       schedule(ctx, instr);
-
-       while (!list_is_empty(&ctx->unscheduled_list)) {
-               struct ir3_sched_notes notes = {0};
-               struct ir3_instruction *instr;
-
-               instr = choose_instr(ctx, &notes);
-               if (instr) {
-                       unsigned delay = ir3_delay_calc_prera(ctx->block, instr);
-                       d("delay=%u", delay);
-
-                       /* and if we run out of instructions that can be scheduled,
-                        * then it is time for nop's:
-                        */
-                       debug_assert(delay <= 6);
-                       while (delay > 0) {
-                               ir3_NOP(block);
-                               delay--;
-                       }
-
-                       schedule(ctx, instr);
-
-                       /* Since we've scheduled a "real" instruction, we can now
-                        * schedule any split instruction created by the scheduler again.
-                        */
-                       ctx->split = NULL;
-               } else {
-                       struct ir3_instruction *new_instr = NULL;
-                       struct ir3 *ir = block->shader;
-
-                       /* nothing available to schedule.. if we are blocked on
-                        * address/predicate register conflict, then break the
-                        * deadlock by cloning the instruction that wrote that
-                        * reg:
-                        */
-                       if (notes.addr0_conflict) {
-                               new_instr = split_addr(ctx, &ctx->addr0,
-                                                                          ir->a0_users, ir->a0_users_count);
-                       } else if (notes.addr1_conflict) {
-                               new_instr = split_addr(ctx, &ctx->addr1,
-                                                                          ir->a1_users, ir->a1_users_count);
-                       } else if (notes.pred_conflict) {
-                               new_instr = split_pred(ctx);
-                       } else {
-                               d("unscheduled_list:");
-                               foreach_instr (instr, &ctx->unscheduled_list)
-                                       di(instr, "unscheduled: ");
-                               debug_assert(0);
-                               ctx->error = true;
-                               return;
-                       }
-
-                       if (new_instr) {
-                               list_delinit(&new_instr->node);
-                               list_addtail(&new_instr->node, &ctx->unscheduled_list);
-                       }
-
-                       /* If we produced a new instruction, do not schedule it next to
-                        * guarantee progress.
-                        */
-                       ctx->split = new_instr;
-               }
-       }
-
-       sched_dag_destroy(ctx);
+   ctx->block = block;
+
+   /* addr/pred writes are per-block: */
+   ctx->addr0 = NULL;
+   ctx->addr1 = NULL;
+   ctx->pred = NULL;
+   ctx->tex_delay = 0;
+   ctx->sfu_delay = 0;
+   ctx->tex_index = ctx->first_outstanding_tex_index = 0;
+   ctx->sfu_index = ctx->first_outstanding_sfu_index = 0;
+
+   /* move all instructions to the unscheduled list, and
+    * empty the block's instruction list (to which we will
+    * be inserting).
+    */
+   list_replace(&block->instr_list, &ctx->unscheduled_list);
+   list_inithead(&block->instr_list);
+
+   sched_dag_init(ctx);
+
+   ctx->remaining_kills = 0;
+   ctx->remaining_tex = 0;
+   foreach_instr_safe (instr, &ctx->unscheduled_list) {
+      if (is_kill_or_demote(instr))
+         ctx->remaining_kills++;
+      if (is_tex_or_prefetch(instr))
+         ctx->remaining_tex++;
+   }
+
+   /* First schedule all meta:input and meta:phi instructions, followed by
+    * tex-prefetch.  We want all of the instructions that load values into
+    * registers before the shader starts to go before any other instructions.
+    * But in particular we want inputs to come before prefetches.  This is
+    * because a FS's bary_ij input may not actually be live in the shader,
+    * but it should not be scheduled on top of any other input (but can be
+    * overwritten by a tex prefetch)
+    *
+    * Note: Because the first block cannot have predecessors, meta:input and
+    * meta:phi cannot exist in the same block.
+    */
+   foreach_instr_safe (instr, &ctx->unscheduled_list)
+      if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_PHI)
+         schedule(ctx, instr);
+
+   foreach_instr_safe (instr, &ctx->unscheduled_list)
+      if (instr->opc == OPC_META_TEX_PREFETCH)
+         schedule(ctx, instr);
+
+   while (!list_is_empty(&ctx->unscheduled_list)) {
+      struct ir3_sched_notes notes = {0};
+      struct ir3_instruction *instr;
+
+      instr = choose_instr(ctx, &notes);
+      if (instr) {
+         unsigned delay = ir3_delay_calc_prera(ctx->block, instr);
+         d("delay=%u", delay);
+
+         /* and if we run out of instructions that can be scheduled,
+          * then it is time for nop's:
+          */
+         debug_assert(delay <= 6);
+         while (delay > 0) {
+            ir3_NOP(block);
+            delay--;
+         }
+
+         schedule(ctx, instr);
+
+         /* Since we've scheduled a "real" instruction, we can now
+          * schedule any split instruction created by the scheduler again.
+          */
+         ctx->split = NULL;
+      } else {
+         struct ir3_instruction *new_instr = NULL;
+         struct ir3 *ir = block->shader;
+
+         /* nothing available to schedule.. if we are blocked on
+          * address/predicate register conflict, then break the
+          * deadlock by cloning the instruction that wrote that
+          * reg:
+          */
+         if (notes.addr0_conflict) {
+            new_instr =
+               split_addr(ctx, &ctx->addr0, ir->a0_users, ir->a0_users_count);
+         } else if (notes.addr1_conflict) {
+            new_instr =
+               split_addr(ctx, &ctx->addr1, ir->a1_users, ir->a1_users_count);
+         } else if (notes.pred_conflict) {
+            new_instr = split_pred(ctx);
+         } else {
+            d("unscheduled_list:");
+            foreach_instr (instr, &ctx->unscheduled_list)
+               di(instr, "unscheduled: ");
+            debug_assert(0);
+            ctx->error = true;
+            return;
+         }
+
+         if (new_instr) {
+            list_delinit(&new_instr->node);
+            list_addtail(&new_instr->node, &ctx->unscheduled_list);
+         }
+
+         /* If we produced a new instruction, do not schedule it next, to
+          * guarantee progress.
+          */
+         ctx->split = new_instr;
+      }
+   }
+
+   sched_dag_destroy(ctx);
 }
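
When choose_instr() hands back an instruction that still needs delay slots relative to its producers, the loop above simply pads with nops before emitting it. A rough, self-contained illustration of that pattern (a plain cycle count stands in for the ir3 scheduler state):

   #include <stdio.h>

   /* Pad with nops until the remaining producer latency is consumed, then
    * emit the instruction; ir3_NOP()/schedule() are the real equivalents.
    */
   static void
   emit_with_delay_fill(const char *name, unsigned delay)
   {
      while (delay > 0) {
         puts("nop");
         delay--;
      }
      puts(name);
   }

   int
   main(void)
   {
      emit_with_delay_fill("add.f r0.x, r1.x, r2.x", 2); /* two nops, then add */
      return 0;
   }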
 
-int ir3_sched(struct ir3 *ir)
+int
+ir3_sched(struct ir3 *ir)
 {
-       struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
+   struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
 
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       instr->data = NULL;
-               }
-       }
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         instr->data = NULL;
+      }
+   }
 
-       ir3_count_instructions(ir);
-       ir3_clear_mark(ir);
-       ir3_find_ssa_uses(ir, ctx, false);
+   ir3_count_instructions(ir);
+   ir3_clear_mark(ir);
+   ir3_find_ssa_uses(ir, ctx, false);
 
-       foreach_block (block, &ir->block_list) {
-               sched_block(ctx, block);
-       }
+   foreach_block (block, &ir->block_list) {
+      sched_block(ctx, block);
+   }
 
-       int ret = ctx->error ? -1 : 0;
+   int ret = ctx->error ? -1 : 0;
 
-       ralloc_free(ctx);
+   ralloc_free(ctx);
 
-       return ret;
+   return ret;
 }
 
 static unsigned
 get_array_id(struct ir3_instruction *instr)
 {
-       /* The expectation is that there is only a single array
-        * src or dst, ir3_cp should enforce this.
-        */
-
-       foreach_dst (dst, instr)
-               if (dst->flags & IR3_REG_ARRAY)
-                       return dst->array.id;
-       foreach_src (src, instr)
-               if (src->flags & IR3_REG_ARRAY)
-                       return src->array.id;
-
-       unreachable("this was unexpected");
+   /* The expectation is that there is only a single array
+    * src or dst, ir3_cp should enforce this.
+    */
+
+   foreach_dst (dst, instr)
+      if (dst->flags & IR3_REG_ARRAY)
+         return dst->array.id;
+   foreach_src (src, instr)
+      if (src->flags & IR3_REG_ARRAY)
+         return src->array.id;
+
+   unreachable("this was unexpected");
 }
 
 /* does instruction 'prior' need to be scheduled before 'instr'? */
 static bool
 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
 {
-       /* TODO for dependencies that are related to a specific object, ie
-        * a specific SSBO/image/array, we could relax this constraint to
-        * make accesses to unrelated objects not depend on each other (at
-        * least as long as not declared coherent)
-        */
-       if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
-                       ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
-               return true;
-
-       if (instr->barrier_class & prior->barrier_conflict) {
-               if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
-                       /* if only array barrier, then we can further limit false-deps
-                        * by considering the array-id, ie reads/writes to different
-                        * arrays do not depend on each other (no aliasing)
-                        */
-                       if (get_array_id(instr) != get_array_id(prior)) {
-                               return false;
-                       }
-               }
-
-               return true;
-       }
-
-       return false;
+   /* TODO for dependencies that are related to a specific object, ie
+    * a specific SSBO/image/array, we could relax this constraint to
+    * make accesses to unrelated objects not depend on each other (at
+    * least as long as not declared coherent)
+    */
+   if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) &&
+        prior->barrier_class) ||
+       ((prior->barrier_class & IR3_BARRIER_EVERYTHING) &&
+        instr->barrier_class))
+      return true;
+
+   if (instr->barrier_class & prior->barrier_conflict) {
+      if (!(instr->barrier_class &
+            ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
+         /* if only array barrier, then we can further limit false-deps
+          * by considering the array-id, ie reads/writes to different
+          * arrays do not depend on each other (no aliasing)
+          */
+         if (get_array_id(instr) != get_array_id(prior)) {
+            return false;
+         }
+      }
+
+      return true;
+   }
+
+   return false;
 }
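
depends_on() keys false dependencies off the barrier_class/barrier_conflict bitmasks, with one relaxation: when the only classes involved are array reads/writes, accesses to different array ids are allowed to reorder. A simplified standalone model of that test, using hypothetical bit values and structs rather than the IR3_BARRIER_* definitions:

   #include <stdbool.h>
   #include <stdio.h>

   /* Hypothetical class bits, for illustration only. */
   #define BAR_ARRAY_R    (1u << 0)
   #define BAR_ARRAY_W    (1u << 1)
   #define BAR_EVERYTHING (1u << 7)

   struct access {
      unsigned barrier_class;    /* what kind of access this is */
      unsigned barrier_conflict; /* which classes it must not reorder with */
      unsigned array_id;         /* which array it touches, if any */
   };

   static bool
   must_keep_order(const struct access *later, const struct access *earlier)
   {
      if (((later->barrier_class & BAR_EVERYTHING) && earlier->barrier_class) ||
          ((earlier->barrier_class & BAR_EVERYTHING) && later->barrier_class))
         return true;

      if (later->barrier_class & earlier->barrier_conflict) {
         /* pure array accesses to different arrays cannot alias */
         if (!(later->barrier_class & ~(BAR_ARRAY_R | BAR_ARRAY_W)) &&
             later->array_id != earlier->array_id)
            return false;
         return true;
      }

      return false;
   }

   int
   main(void)
   {
      struct access write_a0 = {BAR_ARRAY_W, BAR_ARRAY_R | BAR_ARRAY_W, 0};
      struct access read_a1  = {BAR_ARRAY_R, BAR_ARRAY_W, 1};
      /* different array ids, so no false dependency is needed: prints "yes" */
      printf("reorder allowed: %s\n",
             must_keep_order(&read_a1, &write_a0) ? "no" : "yes");
      return 0;
   }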
 
 static void
 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
 {
-       struct list_head *prev = instr->node.prev;
-       struct list_head *next = instr->node.next;
-
-       /* add dependencies on previous instructions that must be scheduled
-        * prior to the current instruction
-        */
-       while (prev != &block->instr_list) {
-               struct ir3_instruction *pi =
-                       LIST_ENTRY(struct ir3_instruction, prev, node);
-
-               prev = prev->prev;
-
-               if (is_meta(pi))
-                       continue;
-
-               if (instr->barrier_class == pi->barrier_class) {
-                       ir3_instr_add_dep(instr, pi);
-                       break;
-               }
-
-               if (depends_on(instr, pi))
-                       ir3_instr_add_dep(instr, pi);
-       }
-
-       /* add dependencies on this instruction to following instructions
-        * that must be scheduled after the current instruction:
-        */
-       while (next != &block->instr_list) {
-               struct ir3_instruction *ni =
-                       LIST_ENTRY(struct ir3_instruction, next, node);
-
-               next = next->next;
-
-               if (is_meta(ni))
-                       continue;
-
-               if (instr->barrier_class == ni->barrier_class) {
-                       ir3_instr_add_dep(ni, instr);
-                       break;
-               }
-
-               if (depends_on(ni, instr))
-                       ir3_instr_add_dep(ni, instr);
-       }
+   struct list_head *prev = instr->node.prev;
+   struct list_head *next = instr->node.next;
+
+   /* add dependencies on previous instructions that must be scheduled
+    * prior to the current instruction
+    */
+   while (prev != &block->instr_list) {
+      struct ir3_instruction *pi =
+         LIST_ENTRY(struct ir3_instruction, prev, node);
+
+      prev = prev->prev;
+
+      if (is_meta(pi))
+         continue;
+
+      if (instr->barrier_class == pi->barrier_class) {
+         ir3_instr_add_dep(instr, pi);
+         break;
+      }
+
+      if (depends_on(instr, pi))
+         ir3_instr_add_dep(instr, pi);
+   }
+
+   /* add dependencies on this instruction to following instructions
+    * that must be scheduled after the current instruction:
+    */
+   while (next != &block->instr_list) {
+      struct ir3_instruction *ni =
+         LIST_ENTRY(struct ir3_instruction, next, node);
+
+      next = next->next;
+
+      if (is_meta(ni))
+         continue;
+
+      if (instr->barrier_class == ni->barrier_class) {
+         ir3_instr_add_dep(ni, instr);
+         break;
+      }
+
+      if (depends_on(ni, instr))
+         ir3_instr_add_dep(ni, instr);
+   }
 }
 
 /* before scheduling a block, we need to add any necessary false-dependencies
@@ -1368,16 +1387,16 @@ add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
 bool
 ir3_sched_add_deps(struct ir3 *ir)
 {
-       bool progress = false;
-
-       foreach_block (block, &ir->block_list) {
-               foreach_instr (instr, &block->instr_list) {
-                       if (instr->barrier_class) {
-                               add_barrier_deps(block, instr);
-                               progress = true;
-                       }
-               }
-       }
-
-       return progress;
+   bool progress = false;
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->barrier_class) {
+            add_barrier_deps(block, instr);
+            progress = true;
+         }
+      }
+   }
+
+   return progress;
 }
index 70a67eb..8a93c34 100644 (file)
  *    Rob Clark <robclark@freedesktop.org>
  */
 
+#include "util/format/u_format.h"
 #include "util/u_atomic.h"
-#include "util/u_string.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/format/u_format.h"
+#include "util/u_string.h"
 
 #include "drm/freedreno_drmif.h"
 
-#include "ir3_shader.h"
+#include "ir3_assembler.h"
 #include "ir3_compiler.h"
 #include "ir3_nir.h"
-#include "ir3_assembler.h"
 #include "ir3_parser.h"
+#include "ir3_shader.h"
 
 #include "isa/isa.h"
 
@@ -45,7 +45,7 @@
 int
 ir3_glsl_type_size(const struct glsl_type *type, bool bindless)
 {
-       return glsl_count_attribute_slots(type, false);
+   return glsl_count_attribute_slots(type, false);
 }
 
 /* for vertex shader, the inputs are loaded into registers before the shader
@@ -61,244 +61,249 @@ ir3_glsl_type_size(const struct glsl_type *type, bool bindless)
 static void
 fixup_regfootprint(struct ir3_shader_variant *v)
 {
-       unsigned i;
-
-       for (i = 0; i < v->inputs_count; i++) {
-               /* skip frag inputs fetch via bary.f since their reg's are
-                * not written by gpu before shader starts (and in fact the
-                * regid's might not even be valid)
-                */
-               if (v->inputs[i].bary)
-                       continue;
-
-               /* ignore high regs that are global to all threads in a warp
-                * (they exist by default) (a5xx+)
-                */
-               if (v->inputs[i].regid >= regid(48,0))
-                       continue;
-
-               if (v->inputs[i].compmask) {
-                       unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
-                       int32_t regid = v->inputs[i].regid + n;
-                       if (v->inputs[i].half) {
-                               if (!v->mergedregs) {
-                                       v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
-                               } else {
-                                       v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
-                               }
-                       } else {
-                               v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
-                       }
-               }
-       }
-
-       for (i = 0; i < v->outputs_count; i++) {
-               /* for ex, VS shaders with tess don't have normal varying outs: */
-               if (!VALIDREG(v->outputs[i].regid))
-                       continue;
-               int32_t regid = v->outputs[i].regid + 3;
-               if (v->outputs[i].half) {
-                       if (!v->mergedregs) {
-                               v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
-                       } else {
-                               v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
-                       }
-               } else {
-                       v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
-               }
-       }
-
-       for (i = 0; i < v->num_sampler_prefetch; i++) {
-               unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
-               int32_t regid = v->sampler_prefetch[i].dst + n;
-               if (v->sampler_prefetch[i].half_precision) {
-                       if (!v->mergedregs) {
-                               v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
-                       } else {
-                               v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
-                       }
-               } else {
-                       v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
-               }
-       }
+   unsigned i;
+
+   for (i = 0; i < v->inputs_count; i++) {
+      /* skip frag inputs fetched via bary.f since their regs are
+       * not written by gpu before shader starts (and in fact the
+       * regid's might not even be valid)
+       */
+      if (v->inputs[i].bary)
+         continue;
+
+      /* ignore high regs that are global to all threads in a warp
+       * (they exist by default) (a5xx+)
+       */
+      if (v->inputs[i].regid >= regid(48, 0))
+         continue;
+
+      if (v->inputs[i].compmask) {
+         unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
+         int32_t regid = v->inputs[i].regid + n;
+         if (v->inputs[i].half) {
+            if (!v->mergedregs) {
+               v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
+            } else {
+               v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
+            }
+         } else {
+            v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
+         }
+      }
+   }
+
+   for (i = 0; i < v->outputs_count; i++) {
+      /* for ex, VS shaders with tess don't have normal varying outs: */
+      if (!VALIDREG(v->outputs[i].regid))
+         continue;
+      int32_t regid = v->outputs[i].regid + 3;
+      if (v->outputs[i].half) {
+         if (!v->mergedregs) {
+            v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
+         } else {
+            v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
+         }
+      } else {
+         v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
+      }
+   }
+
+   for (i = 0; i < v->num_sampler_prefetch; i++) {
+      unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
+      int32_t regid = v->sampler_prefetch[i].dst + n;
+      if (v->sampler_prefetch[i].half_precision) {
+         if (!v->mergedregs) {
+            v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
+         } else {
+            v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
+         }
+      } else {
+         v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
+      }
+   }
 }
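
The footprint bookkeeping above leans on the regid encoding used throughout ir3: the low two bits select the component (x/y/z/w) and the remaining bits the register index, so the highest full register touched is regid >> 2 (and >> 3 for half regs once they share the file on merged-register GPUs). A small standalone example of that arithmetic:

   #include <stdio.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   /* regid packs (register, component) as reg * 4 + comp */
   static unsigned
   make_regid(unsigned reg, unsigned comp)
   {
      return (reg << 2) | (comp & 0x3);
   }

   int
   main(void)
   {
      unsigned regids[] = {
         make_regid(0, 3), /* r0.w */
         make_regid(2, 1), /* r2.y */
         make_regid(5, 0), /* r5.x */
      };

      int max_reg = -1;
      for (unsigned i = 0; i < 3; i++)
         max_reg = MAX2(max_reg, (int)(regids[i] >> 2));

      printf("max full register touched: r%d\n", max_reg); /* r5 */
      return 0;
   }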
 
 /* wrapper for ir3_assemble() which does some info fixup based on
  * shader state.  Non-static since used by ir3_cmdline too.
  */
-void * ir3_shader_assemble(struct ir3_shader_variant *v)
+void *
+ir3_shader_assemble(struct ir3_shader_variant *v)
 {
-       const struct ir3_compiler *compiler = v->shader->compiler;
-       struct ir3_info *info = &v->info;
-       uint32_t *bin;
-
-       ir3_collect_info(v);
-
-       if (v->constant_data_size) {
-               /* Make sure that where we're about to place the constant_data is safe
-                * to indirectly upload from.
-                */
-               info->constant_data_offset =
-                       align(info->size, v->shader->compiler->const_upload_unit * 16);
-               info->size = info->constant_data_offset + v->constant_data_size;
-       }
-
-       /* Pad out the size so that when turnip uploads the shaders in
-        * sequence, the starting offset of the next one is properly aligned.
-        */
-       info->size = align(info->size, compiler->instr_align * sizeof(instr_t));
-
-       bin = isa_assemble(v);
-       if (!bin)
-               return NULL;
-
-       /* Append the immediates after the end of the program.  This lets us emit
-        * the immediates as an indirect load, while avoiding creating another BO.
-        */
-       if (v->constant_data_size)
-               memcpy(&bin[info->constant_data_offset / 4], v->constant_data, v->constant_data_size);
-       ralloc_free(v->constant_data);
-       v->constant_data = NULL;
-
-       /* NOTE: if relative addressing is used, we set constlen in
-        * the compiler (to worst-case value) since we don't know in
-        * the assembler what the max addr reg value can be:
-        */
-       v->constlen = MAX2(v->constlen, info->max_const + 1);
-
-       if (v->constlen > ir3_const_state(v)->offsets.driver_param)
-               v->need_driver_params = true;
-
-       /* On a4xx and newer, constlen must be a multiple of 16 dwords even though
-        * uploads are in units of 4 dwords. Round it up here to make calculations
-        * regarding the shared constlen simpler.
-        */
-       if (compiler->gpu_id >= 400)
-               v->constlen = align(v->constlen, 4);
-
-       /* Use the per-wave layout by default on a6xx for compute shaders. It
-        * should result in better performance when loads/stores are to a uniform
-        * index.
-        */
-       v->pvtmem_per_wave =
-               compiler->gpu_id >= 600 && !info->multi_dword_ldp_stp &&
-               v->type == MESA_SHADER_COMPUTE;
-
-       fixup_regfootprint(v);
-
-       return bin;
+   const struct ir3_compiler *compiler = v->shader->compiler;
+   struct ir3_info *info = &v->info;
+   uint32_t *bin;
+
+   ir3_collect_info(v);
+
+   if (v->constant_data_size) {
+      /* Make sure that where we're about to place the constant_data is safe
+       * to indirectly upload from.
+       */
+      info->constant_data_offset =
+         align(info->size, v->shader->compiler->const_upload_unit * 16);
+      info->size = info->constant_data_offset + v->constant_data_size;
+   }
+
+   /* Pad out the size so that when turnip uploads the shaders in
+    * sequence, the starting offset of the next one is properly aligned.
+    */
+   info->size = align(info->size, compiler->instr_align * sizeof(instr_t));
+
+   bin = isa_assemble(v);
+   if (!bin)
+      return NULL;
+
+   /* Append the immediates after the end of the program.  This lets us emit
+    * the immediates as an indirect load, while avoiding creating another BO.
+    */
+   if (v->constant_data_size)
+      memcpy(&bin[info->constant_data_offset / 4], v->constant_data,
+             v->constant_data_size);
+   ralloc_free(v->constant_data);
+   v->constant_data = NULL;
+
+   /* NOTE: if relative addressing is used, we set constlen in
+    * the compiler (to worst-case value) since we don't know in
+    * the assembler what the max addr reg value can be:
+    */
+   v->constlen = MAX2(v->constlen, info->max_const + 1);
+
+   if (v->constlen > ir3_const_state(v)->offsets.driver_param)
+      v->need_driver_params = true;
+
+   /* On a4xx and newer, constlen must be a multiple of 16 dwords even though
+    * uploads are in units of 4 dwords. Round it up here to make calculations
+    * regarding the shared constlen simpler.
+    */
+   if (compiler->gpu_id >= 400)
+      v->constlen = align(v->constlen, 4);
+
+   /* Use the per-wave layout by default on a6xx for compute shaders. It
+    * should result in better performance when loads/stores are to a uniform
+    * index.
+    */
+   v->pvtmem_per_wave = compiler->gpu_id >= 600 && !info->multi_dword_ldp_stp &&
+                        v->type == MESA_SHADER_COMPUTE;
+
+   fixup_regfootprint(v);
+
+   return bin;
 }
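
Placing constant_data right after the instructions only requires rounding the offset up to the constant-upload granularity so the indirect load stays aligned. The helper below is a stand-in for the util align() used above, with made-up sizes:

   #include <stdio.h>

   /* round v up to the next multiple of a (a must be a power of two here) */
   static unsigned
   align_up(unsigned v, unsigned a)
   {
      return (v + a - 1) & ~(a - 1);
   }

   int
   main(void)
   {
      unsigned shader_size = 1000; /* bytes of instructions (hypothetical) */
      unsigned upload_unit = 64;   /* hypothetical const_upload_unit * 16 bytes */

      printf("constant_data_offset = %u\n",
             align_up(shader_size, upload_unit)); /* prints 1024 */
      return 0;
   }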
 
 static bool
-try_override_shader_variant(struct ir3_shader_variant *v, const char *identifier)
+try_override_shader_variant(struct ir3_shader_variant *v,
+                            const char *identifier)
 {
-       assert(ir3_shader_override_path);
+   assert(ir3_shader_override_path);
 
-       char *name = ralloc_asprintf(NULL, "%s/%s.asm", ir3_shader_override_path, identifier);
+   char *name =
+      ralloc_asprintf(NULL, "%s/%s.asm", ir3_shader_override_path, identifier);
 
-       FILE* f = fopen(name, "r");
+   FILE *f = fopen(name, "r");
 
-       if (!f) {
-               ralloc_free(name);
-               return false;
-       }
+   if (!f) {
+      ralloc_free(name);
+      return false;
+   }
 
-       struct ir3_kernel_info info;
-       info.numwg = INVALID_REG;
-       v->ir = ir3_parse(v, &info, f);
+   struct ir3_kernel_info info;
+   info.numwg = INVALID_REG;
+   v->ir = ir3_parse(v, &info, f);
 
-       fclose(f);
+   fclose(f);
 
-       if (!v->ir) {
-               fprintf(stderr, "Failed to parse %s\n", name);
-               exit(1);
-       }
+   if (!v->ir) {
+      fprintf(stderr, "Failed to parse %s\n", name);
+      exit(1);
+   }
 
-       v->bin = ir3_shader_assemble(v);
-       if (!v->bin) {
-               fprintf(stderr, "Failed to assemble %s\n", name);
-               exit(1);
-       }
+   v->bin = ir3_shader_assemble(v);
+   if (!v->bin) {
+      fprintf(stderr, "Failed to assemble %s\n", name);
+      exit(1);
+   }
 
-       ralloc_free(name);
-       return true;
+   ralloc_free(name);
+   return true;
 }
 
 static void
 assemble_variant(struct ir3_shader_variant *v)
 {
-       v->bin = ir3_shader_assemble(v);
-
-       bool dbg_enabled = shader_debug_enabled(v->shader->type);
-       if (dbg_enabled || ir3_shader_override_path || v->disasm_info.write_disasm) {
-               unsigned char sha1[21];
-               char sha1buf[41];
-
-               _mesa_sha1_compute(v->bin, v->info.size, sha1);
-               _mesa_sha1_format(sha1buf, sha1);
-
-               bool shader_overridden =
-                       ir3_shader_override_path && try_override_shader_variant(v, sha1buf);
-
-               if (v->disasm_info.write_disasm) {
-                       char *stream_data = NULL;
-                       size_t stream_size = 0;
-                       FILE *stream = open_memstream(&stream_data, &stream_size);
-
-                       fprintf(stream, "Native code%s for unnamed %s shader %s with sha1 %s:\n",
-                               shader_overridden ? " (overridden)" : "",
-                               ir3_shader_stage(v), v->shader->nir->info.name, sha1buf);
-                       ir3_shader_disasm(v, v->bin, stream);
-
-                       fclose(stream);
-
-                       v->disasm_info.disasm = ralloc_size(v->shader, stream_size + 1);
-                       memcpy(v->disasm_info.disasm, stream_data, stream_size);
-                       v->disasm_info.disasm[stream_size] = 0;
-                       free(stream_data);
-               }
-
-               if (dbg_enabled || shader_overridden) {
-                       char *stream_data = NULL;
-                       size_t stream_size = 0;
-                       FILE *stream = open_memstream(&stream_data, &stream_size);
-
-                       fprintf(stream, "Native code%s for unnamed %s shader %s with sha1 %s:\n",
-                               shader_overridden ? " (overridden)" : "",
-                               ir3_shader_stage(v), v->shader->nir->info.name, sha1buf);
-                       if (v->shader->type == MESA_SHADER_FRAGMENT)
-                               fprintf(stream, "SIMD0\n");
-                       ir3_shader_disasm(v, v->bin, stream);
-                       fclose(stream);
-
-                       mesa_log_multiline(MESA_LOG_INFO, stream_data);
-                       free(stream_data);
-               }
-       }
-
-       /* no need to keep the ir around beyond this point: */
-       ir3_destroy(v->ir);
-       v->ir = NULL;
+   v->bin = ir3_shader_assemble(v);
+
+   bool dbg_enabled = shader_debug_enabled(v->shader->type);
+   if (dbg_enabled || ir3_shader_override_path || v->disasm_info.write_disasm) {
+      unsigned char sha1[21];
+      char sha1buf[41];
+
+      _mesa_sha1_compute(v->bin, v->info.size, sha1);
+      _mesa_sha1_format(sha1buf, sha1);
+
+      bool shader_overridden =
+         ir3_shader_override_path && try_override_shader_variant(v, sha1buf);
+
+      if (v->disasm_info.write_disasm) {
+         char *stream_data = NULL;
+         size_t stream_size = 0;
+         FILE *stream = open_memstream(&stream_data, &stream_size);
+
+         fprintf(stream,
+                 "Native code%s for unnamed %s shader %s with sha1 %s:\n",
+                 shader_overridden ? " (overridden)" : "", ir3_shader_stage(v),
+                 v->shader->nir->info.name, sha1buf);
+         ir3_shader_disasm(v, v->bin, stream);
+
+         fclose(stream);
+
+         v->disasm_info.disasm = ralloc_size(v->shader, stream_size + 1);
+         memcpy(v->disasm_info.disasm, stream_data, stream_size);
+         v->disasm_info.disasm[stream_size] = 0;
+         free(stream_data);
+      }
+
+      if (dbg_enabled || shader_overridden) {
+         char *stream_data = NULL;
+         size_t stream_size = 0;
+         FILE *stream = open_memstream(&stream_data, &stream_size);
+
+         fprintf(stream,
+                 "Native code%s for unnamed %s shader %s with sha1 %s:\n",
+                 shader_overridden ? " (overridden)" : "", ir3_shader_stage(v),
+                 v->shader->nir->info.name, sha1buf);
+         if (v->shader->type == MESA_SHADER_FRAGMENT)
+            fprintf(stream, "SIMD0\n");
+         ir3_shader_disasm(v, v->bin, stream);
+         fclose(stream);
+
+         mesa_log_multiline(MESA_LOG_INFO, stream_data);
+         free(stream_data);
+      }
+   }
+
+   /* no need to keep the ir around beyond this point: */
+   ir3_destroy(v->ir);
+   v->ir = NULL;
 }
 
 static bool
 compile_variant(struct ir3_shader_variant *v)
 {
-       int ret = ir3_compile_shader_nir(v->shader->compiler, v);
-       if (ret) {
-               mesa_loge("compile failed! (%s:%s)", v->shader->nir->info.name,
-                               v->shader->nir->info.label);
-               return false;
-       }
-
-       assemble_variant(v);
-       if (!v->bin) {
-               mesa_loge("assemble failed! (%s:%s)", v->shader->nir->info.name,
-                               v->shader->nir->info.label);
-               return false;
-       }
-
-       return true;
+   int ret = ir3_compile_shader_nir(v->shader->compiler, v);
+   if (ret) {
+      mesa_loge("compile failed! (%s:%s)", v->shader->nir->info.name,
+                v->shader->nir->info.label);
+      return false;
+   }
+
+   assemble_variant(v);
+   if (!v->bin) {
+      mesa_loge("assemble failed! (%s:%s)", v->shader->nir->info.name,
+                v->shader->nir->info.label);
+      return false;
+   }
+
+   return true;
 }
 
 /*
@@ -308,137 +313,138 @@ compile_variant(struct ir3_shader_variant *v)
  */
 static struct ir3_shader_variant *
 alloc_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
-               struct ir3_shader_variant *nonbinning)
+              struct ir3_shader_variant *nonbinning)
 {
-       void *mem_ctx = shader;
-       /* hang the binning variant off it's non-binning counterpart instead
-        * of the shader, to simplify the error cleanup paths
-        */
-       if (nonbinning)
-               mem_ctx = nonbinning;
-       struct ir3_shader_variant *v = rzalloc_size(mem_ctx, sizeof(*v));
-
-       if (!v)
-               return NULL;
-
-       v->id = ++shader->variant_count;
-       v->shader = shader;
-       v->binning_pass = !!nonbinning;
-       v->nonbinning = nonbinning;
-       v->key = *key;
-       v->type = shader->type;
-       v->mergedregs = shader->compiler->gpu_id >= 600;
-
-       if (!v->binning_pass)
-               v->const_state = rzalloc_size(v, sizeof(*v->const_state));
-
-       return v;
+   void *mem_ctx = shader;
+   /* hang the binning variant off its non-binning counterpart instead
+    * of the shader, to simplify the error cleanup paths
+    */
+   if (nonbinning)
+      mem_ctx = nonbinning;
+   struct ir3_shader_variant *v = rzalloc_size(mem_ctx, sizeof(*v));
+
+   if (!v)
+      return NULL;
+
+   v->id = ++shader->variant_count;
+   v->shader = shader;
+   v->binning_pass = !!nonbinning;
+   v->nonbinning = nonbinning;
+   v->key = *key;
+   v->type = shader->type;
+   v->mergedregs = shader->compiler->gpu_id >= 600;
+
+   if (!v->binning_pass)
+      v->const_state = rzalloc_size(v, sizeof(*v->const_state));
+
+   return v;
 }
 
 static bool
 needs_binning_variant(struct ir3_shader_variant *v)
 {
-       if ((v->type == MESA_SHADER_VERTEX) && ir3_has_binning_vs(&v->key))
-               return true;
-       return false;
+   if ((v->type == MESA_SHADER_VERTEX) && ir3_has_binning_vs(&v->key))
+      return true;
+   return false;
 }
 
 static struct ir3_shader_variant *
 create_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
-                               bool write_disasm)
+               bool write_disasm)
 {
-       struct ir3_shader_variant *v = alloc_variant(shader, key, NULL);
+   struct ir3_shader_variant *v = alloc_variant(shader, key, NULL);
 
-       if (!v)
-               goto fail;
+   if (!v)
+      goto fail;
 
-       v->disasm_info.write_disasm = write_disasm;
+   v->disasm_info.write_disasm = write_disasm;
 
-       if (needs_binning_variant(v)) {
-               v->binning = alloc_variant(shader, key, v);
-               if (!v->binning)
-                       goto fail;
-               v->binning->disasm_info.write_disasm = write_disasm;
-       }
+   if (needs_binning_variant(v)) {
+      v->binning = alloc_variant(shader, key, v);
+      if (!v->binning)
+         goto fail;
+      v->binning->disasm_info.write_disasm = write_disasm;
+   }
 
-       if (ir3_disk_cache_retrieve(shader->compiler, v))
-               return v;
+   if (ir3_disk_cache_retrieve(shader->compiler, v))
+      return v;
 
-       if (!shader->nir_finalized) {
-               ir3_nir_post_finalize(shader->compiler, shader->nir);
+   if (!shader->nir_finalized) {
+      ir3_nir_post_finalize(shader->compiler, shader->nir);
 
-               if (ir3_shader_debug & IR3_DBG_DISASM) {
-                       mesa_logi("dump nir%d: type=%d", shader->id, shader->type);
-                       nir_log_shaderi(shader->nir);
-               }
+      if (ir3_shader_debug & IR3_DBG_DISASM) {
+         mesa_logi("dump nir%d: type=%d", shader->id, shader->type);
+         nir_log_shaderi(shader->nir);
+      }
 
-               if (v->disasm_info.write_disasm) {
-                       v->disasm_info.nir = nir_shader_as_str(shader->nir, shader);
-               }
+      if (v->disasm_info.write_disasm) {
+         v->disasm_info.nir = nir_shader_as_str(shader->nir, shader);
+      }
 
-               shader->nir_finalized = true;
-       }
+      shader->nir_finalized = true;
+   }
 
-       if (!compile_variant(v))
-               goto fail;
+   if (!compile_variant(v))
+      goto fail;
 
-       if (needs_binning_variant(v) && !compile_variant(v->binning))
-               goto fail;
+   if (needs_binning_variant(v) && !compile_variant(v->binning))
+      goto fail;
 
-       ir3_disk_cache_store(shader->compiler, v);
+   ir3_disk_cache_store(shader->compiler, v);
 
-       return v;
+   return v;
 
 fail:
-       ralloc_free(v);
-       return NULL;
+   ralloc_free(v);
+   return NULL;
 }
 
 static inline struct ir3_shader_variant *
 shader_variant(struct ir3_shader *shader, const struct ir3_shader_key *key)
 {
-       struct ir3_shader_variant *v;
+   struct ir3_shader_variant *v;
 
-       for (v = shader->variants; v; v = v->next)
-               if (ir3_shader_key_equal(key, &v->key))
-                       return v;
+   for (v = shader->variants; v; v = v->next)
+      if (ir3_shader_key_equal(key, &v->key))
+         return v;
 
-       return NULL;
+   return NULL;
 }
 
 struct ir3_shader_variant *
-ir3_shader_get_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
-               bool binning_pass, bool write_disasm, bool *created)
+ir3_shader_get_variant(struct ir3_shader *shader,
+                       const struct ir3_shader_key *key, bool binning_pass,
+                       bool write_disasm, bool *created)
 {
-       mtx_lock(&shader->variants_lock);
-       struct ir3_shader_variant *v = shader_variant(shader, key);
-
-       if (!v) {
-               /* compile new variant if it doesn't exist already: */
-               v = create_variant(shader, key, write_disasm);
-               if (v) {
-                       v->next = shader->variants;
-                       shader->variants = v;
-                       *created = true;
-               }
-       }
-
-       if (v && binning_pass) {
-               v = v->binning;
-               assert(v);
-       }
-
-       mtx_unlock(&shader->variants_lock);
-
-       return v;
+   mtx_lock(&shader->variants_lock);
+   struct ir3_shader_variant *v = shader_variant(shader, key);
+
+   if (!v) {
+      /* compile new variant if it doesn't exist already: */
+      v = create_variant(shader, key, write_disasm);
+      if (v) {
+         v->next = shader->variants;
+         shader->variants = v;
+         *created = true;
+      }
+   }
+
+   if (v && binning_pass) {
+      v = v->binning;
+      assert(v);
+   }
+
+   mtx_unlock(&shader->variants_lock);
+
+   return v;
 }
 
 void
 ir3_shader_destroy(struct ir3_shader *shader)
 {
-       ralloc_free(shader->nir);
-       mtx_destroy(&shader->variants_lock);
-       ralloc_free(shader);
+   ralloc_free(shader->nir);
+   mtx_destroy(&shader->variants_lock);
+   ralloc_free(shader);
 }
 
 /**
@@ -449,68 +455,68 @@ ir3_shader_destroy(struct ir3_shader *shader)
 static void
 ir3_setup_used_key(struct ir3_shader *shader)
 {
-       nir_shader *nir = shader->nir;
-       struct shader_info *info = &nir->info;
-       struct ir3_shader_key *key = &shader->key_mask;
-
-       /* This key flag is just used to make for a cheaper ir3_shader_key_equal
-        * check in the common case.
-        */
-       key->has_per_samp = true;
-
-       key->safe_constlen = true;
-
-       /* When clip/cull distances are natively supported, we only use
-        * ucp_enables to determine whether to lower legacy clip planes to
-        * gl_ClipDistance.
-        */
-       if (info->stage != MESA_SHADER_FRAGMENT || !shader->compiler->has_clip_cull)
-               key->ucp_enables = 0xff;
-
-       if (info->stage == MESA_SHADER_FRAGMENT) {
-               key->fastc_srgb = ~0;
-               key->fsamples = ~0;
-
-               if (info->inputs_read & VARYING_BITS_COLOR) {
-                       key->rasterflat = true;
-               }
-
-               if (info->inputs_read & VARYING_BIT_LAYER) {
-                       key->layer_zero = true;
-               }
-
-               if (info->inputs_read & VARYING_BIT_VIEWPORT) {
-                       key->view_zero = true;
-               }
-
-               /* Only used for deciding on behavior of
-                * nir_intrinsic_load_barycentric_sample, or the centroid demotion
-                * on older HW.
-                */
-               key->msaa = info->fs.uses_sample_qualifier ||
-                                       (shader->compiler->gpu_id < 600 &&
-                                        (BITSET_TEST(info->system_values_read, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID) ||
-                                         BITSET_TEST(info->system_values_read, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID)));
-       } else {
-               key->tessellation = ~0;
-               key->has_gs = true;
-
-               if (info->stage == MESA_SHADER_VERTEX) {
-                       key->vastc_srgb = ~0;
-                       key->vsamples = ~0;
-               }
-       }
+   nir_shader *nir = shader->nir;
+   struct shader_info *info = &nir->info;
+   struct ir3_shader_key *key = &shader->key_mask;
+
+   /* This key flag is just used to make for a cheaper ir3_shader_key_equal
+    * check in the common case.
+    */
+   key->has_per_samp = true;
+
+   key->safe_constlen = true;
+
+   /* When clip/cull distances are natively supported, we only use
+    * ucp_enables to determine whether to lower legacy clip planes to
+    * gl_ClipDistance.
+    */
+   if (info->stage != MESA_SHADER_FRAGMENT || !shader->compiler->has_clip_cull)
+      key->ucp_enables = 0xff;
+
+   if (info->stage == MESA_SHADER_FRAGMENT) {
+      key->fastc_srgb = ~0;
+      key->fsamples = ~0;
+
+      if (info->inputs_read & VARYING_BITS_COLOR) {
+         key->rasterflat = true;
+      }
+
+      if (info->inputs_read & VARYING_BIT_LAYER) {
+         key->layer_zero = true;
+      }
+
+      if (info->inputs_read & VARYING_BIT_VIEWPORT) {
+         key->view_zero = true;
+      }
+
+      /* Only used for deciding on behavior of
+       * nir_intrinsic_load_barycentric_sample, or the centroid demotion
+       * on older HW.
+       */
+      key->msaa = info->fs.uses_sample_qualifier ||
+                  (shader->compiler->gpu_id < 600 &&
+                   (BITSET_TEST(info->system_values_read,
+                                SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID) ||
+                    BITSET_TEST(info->system_values_read,
+                                SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID)));
+   } else {
+      key->tessellation = ~0;
+      key->has_gs = true;
+
+      if (info->stage == MESA_SHADER_VERTEX) {
+         key->vastc_srgb = ~0;
+         key->vsamples = ~0;
+      }
+   }
 }
 
-
 /* Given an array of constlen's, decrease some of them so that the sum stays
  * within "combined_limit" while trying to fairly share the reduction. Returns
  * a bitfield of which stages should be trimmed.
  */
 static uint32_t
-trim_constlens(unsigned *constlens,
-                          unsigned first_stage, unsigned last_stage,
-                          unsigned combined_limit, unsigned safe_limit)
+trim_constlens(unsigned *constlens, unsigned first_stage, unsigned last_stage,
+               unsigned combined_limit, unsigned safe_limit)
 {
    unsigned cur_total = 0;
    for (unsigned i = first_stage; i <= last_stage; i++) {
@@ -522,17 +528,17 @@ trim_constlens(unsigned *constlens,
    uint32_t trimmed = 0;
 
    while (cur_total > combined_limit) {
-          for (unsigned i = first_stage; i <= last_stage; i++) {
-                  if (constlens[i] >= max_const) {
-                          max_stage = i;
-                          max_const = constlens[i];
-                  }
-          }
-
-          assert(max_const > safe_limit);
-          trimmed |= 1 << max_stage;
-          cur_total = cur_total - max_const + safe_limit;
-          constlens[max_stage] = safe_limit;
+      for (unsigned i = first_stage; i <= last_stage; i++) {
+         if (constlens[i] >= max_const) {
+            max_stage = i;
+            max_const = constlens[i];
+         }
+      }
+
+      assert(max_const > safe_limit);
+      trimmed |= 1 << max_stage;
+      cur_total = cur_total - max_const + safe_limit;
+      constlens[max_stage] = safe_limit;
    }
 
    return trimmed;
@@ -543,298 +549,292 @@ trim_constlens(unsigned *constlens,
  */
 uint32_t
 ir3_trim_constlen(struct ir3_shader_variant **variants,
-                                 const struct ir3_compiler *compiler)
+                  const struct ir3_compiler *compiler)
 {
-       unsigned constlens[MESA_SHADER_STAGES] = {};
-
-       for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
-               if (variants[i])
-                       constlens[i] = variants[i]->constlen;
-       }
-
-       uint32_t trimmed = 0;
-       STATIC_ASSERT(MESA_SHADER_STAGES <= 8 * sizeof(trimmed));
-
-       /* There are two shared limits to take into account, the geometry limit on
-        * a6xx and the total limit. The frag limit on a6xx only matters for a
-        * single stage, so it's always satisfied with the first variant.
-        */
-       if (compiler->gpu_id >= 600) {
-               trimmed |=
-                       trim_constlens(constlens, MESA_SHADER_VERTEX, MESA_SHADER_GEOMETRY,
-                                                  compiler->max_const_geom, compiler->max_const_safe);
-       }
-       trimmed |=
-               trim_constlens(constlens, MESA_SHADER_VERTEX, MESA_SHADER_FRAGMENT,
-                                          compiler->max_const_pipeline, compiler->max_const_safe);
-
-       return trimmed;
+   unsigned constlens[MESA_SHADER_STAGES] = {};
+
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      if (variants[i])
+         constlens[i] = variants[i]->constlen;
+   }
+
+   uint32_t trimmed = 0;
+   STATIC_ASSERT(MESA_SHADER_STAGES <= 8 * sizeof(trimmed));
+
+   /* There are two shared limits to take into account, the geometry limit on
+    * a6xx and the total limit. The frag limit on a6xx only matters for a
+    * single stage, so it's always satisfied with the first variant.
+    */
+   if (compiler->gpu_id >= 600) {
+      trimmed |=
+         trim_constlens(constlens, MESA_SHADER_VERTEX, MESA_SHADER_GEOMETRY,
+                        compiler->max_const_geom, compiler->max_const_safe);
+   }
+   trimmed |=
+      trim_constlens(constlens, MESA_SHADER_VERTEX, MESA_SHADER_FRAGMENT,
+                     compiler->max_const_pipeline, compiler->max_const_safe);
+
+   return trimmed;
 }
 
 struct ir3_shader *
 ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
-               unsigned reserved_user_consts, struct ir3_stream_output_info *stream_output)
+                    unsigned reserved_user_consts,
+                    struct ir3_stream_output_info *stream_output)
 {
-       struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
+   struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
 
-       mtx_init(&shader->variants_lock, mtx_plain);
-       shader->compiler = compiler;
-       shader->id = p_atomic_inc_return(&shader->compiler->shader_count);
-       shader->type = nir->info.stage;
-       if (stream_output)
-               memcpy(&shader->stream_output, stream_output, sizeof(shader->stream_output));
-       shader->num_reserved_user_consts = reserved_user_consts;
-       shader->nir = nir;
+   mtx_init(&shader->variants_lock, mtx_plain);
+   shader->compiler = compiler;
+   shader->id = p_atomic_inc_return(&shader->compiler->shader_count);
+   shader->type = nir->info.stage;
+   if (stream_output)
+      memcpy(&shader->stream_output, stream_output,
+             sizeof(shader->stream_output));
+   shader->num_reserved_user_consts = reserved_user_consts;
+   shader->nir = nir;
 
-       ir3_disk_cache_init_shader_key(compiler, shader);
+   ir3_disk_cache_init_shader_key(compiler, shader);
 
-       ir3_setup_used_key(shader);
+   ir3_setup_used_key(shader);
 
-       return shader;
+   return shader;
 }
 
-static void dump_reg(FILE *out, const char *name, uint32_t r)
+static void
+dump_reg(FILE *out, const char *name, uint32_t r)
 {
-       if (r != regid(63,0)) {
-               const char *reg_type = (r & HALF_REG_ID) ? "hr" : "r";
-               fprintf(out, "; %s: %s%d.%c\n", name, reg_type,
-                               (r & ~HALF_REG_ID) >> 2, "xyzw"[r & 0x3]);
-       }
+   if (r != regid(63, 0)) {
+      const char *reg_type = (r & HALF_REG_ID) ? "hr" : "r";
+      fprintf(out, "; %s: %s%d.%c\n", name, reg_type, (r & ~HALF_REG_ID) >> 2,
+              "xyzw"[r & 0x3]);
+   }
 }
 
-static void dump_output(FILE *out, struct ir3_shader_variant *so,
-               unsigned slot, const char *name)
+static void
+dump_output(FILE *out, struct ir3_shader_variant *so, unsigned slot,
+            const char *name)
 {
-       uint32_t regid;
-       regid = ir3_find_output_regid(so, slot);
-       dump_reg(out, name, regid);
+   uint32_t regid;
+   regid = ir3_find_output_regid(so, slot);
+   dump_reg(out, name, regid);
 }
 
 static const char *
 input_name(struct ir3_shader_variant *so, int i)
 {
-       if (so->inputs[i].sysval) {
-               return gl_system_value_name(so->inputs[i].slot);
-       } else if (so->type == MESA_SHADER_VERTEX) {
-               return gl_vert_attrib_name(so->inputs[i].slot);
-       } else {
-               return gl_varying_slot_name_for_stage(so->inputs[i].slot, so->type);
-       }
+   if (so->inputs[i].sysval) {
+      return gl_system_value_name(so->inputs[i].slot);
+   } else if (so->type == MESA_SHADER_VERTEX) {
+      return gl_vert_attrib_name(so->inputs[i].slot);
+   } else {
+      return gl_varying_slot_name_for_stage(so->inputs[i].slot, so->type);
+   }
 }
 
 static const char *
 output_name(struct ir3_shader_variant *so, int i)
 {
-       if (so->type == MESA_SHADER_FRAGMENT) {
-               return gl_frag_result_name(so->outputs[i].slot);
-       } else {
-               switch (so->outputs[i].slot) {
-               case VARYING_SLOT_GS_HEADER_IR3:
-                       return "GS_HEADER";
-               case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
-                       return "GS_VERTEX_FLAGS";
-               case VARYING_SLOT_TCS_HEADER_IR3:
-                       return "TCS_HEADER";
-               default:
-                       return gl_varying_slot_name_for_stage(so->outputs[i].slot, so->type);
-               }
-       }
+   if (so->type == MESA_SHADER_FRAGMENT) {
+      return gl_frag_result_name(so->outputs[i].slot);
+   } else {
+      switch (so->outputs[i].slot) {
+      case VARYING_SLOT_GS_HEADER_IR3:
+         return "GS_HEADER";
+      case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
+         return "GS_VERTEX_FLAGS";
+      case VARYING_SLOT_TCS_HEADER_IR3:
+         return "TCS_HEADER";
+      default:
+         return gl_varying_slot_name_for_stage(so->outputs[i].slot, so->type);
+      }
+   }
 }
 
 void
 ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
 {
-       struct ir3 *ir = so->ir;
-       struct ir3_register *reg;
-       const char *type = ir3_shader_stage(so);
-       uint8_t regid;
-       unsigned i;
-
-       foreach_input_n (instr, i, ir) {
-               reg = instr->dsts[0];
-               regid = reg->num;
-               fprintf(out, "@in(%sr%d.%c)\tin%d",
-                               (reg->flags & IR3_REG_HALF) ? "h" : "",
-                               (regid >> 2), "xyzw"[regid & 0x3], i);
-
-               if (reg->wrmask > 0x1)
-                       fprintf(out, " (wrmask=0x%x)", reg->wrmask);
-               fprintf(out, "\n");
-       }
-
-       /* print pre-dispatch texture fetches: */
-       for (i = 0; i < so->num_sampler_prefetch; i++) {
-               const struct ir3_sampler_prefetch *fetch = &so->sampler_prefetch[i];
-               fprintf(out, "@tex(%sr%d.%c)\tsrc=%u, samp=%u, tex=%u, wrmask=0x%x, cmd=%u\n",
-                               fetch->half_precision ? "h" : "",
-                               fetch->dst >> 2, "xyzw"[fetch->dst & 0x3],
-                               fetch->src, fetch->samp_id, fetch->tex_id,
-                               fetch->wrmask, fetch->cmd);
-       }
-
-       const struct ir3_const_state *const_state = ir3_const_state(so);
-       for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
-               fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
-               fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
-                               const_state->immediates[i * 4 + 0],
-                               const_state->immediates[i * 4 + 1],
-                               const_state->immediates[i * 4 + 2],
-                               const_state->immediates[i * 4 + 3]);
-       }
-
-       isa_decode(bin, so->info.sizedwords * 4, out, &(struct isa_decode_options){
-               .gpu_id = ir->compiler->gpu_id,
-               .show_errors = true,
-               .branch_labels = true,
-       });
-
-       fprintf(out, "; %s: outputs:", type);
-       for (i = 0; i < so->outputs_count; i++) {
-               uint8_t regid = so->outputs[i].regid;
-               const char *reg_type = so->outputs[i].half ? "hr" : "r";
-               fprintf(out, " %s%d.%c (%s)",
-                               reg_type, (regid >> 2), "xyzw"[regid & 0x3],
-                               output_name(so, i));
-       }
-       fprintf(out, "\n");
-
-       fprintf(out, "; %s: inputs:", type);
-       for (i = 0; i < so->inputs_count; i++) {
-               uint8_t regid = so->inputs[i].regid;
-               fprintf(out, " r%d.%c (%s slot=%d cm=%x,il=%u,b=%u)",
-                               (regid >> 2), "xyzw"[regid & 0x3],
-                               input_name(so, i),
-                               so->inputs[i].slot,
-                               so->inputs[i].compmask,
-                               so->inputs[i].inloc,
-                               so->inputs[i].bary);
-       }
-       fprintf(out, "\n");
-
-       /* print generic shader info: */
-       fprintf(out, "; %s prog %d/%d: %u instr, %u nops, %u non-nops, %u mov, %u cov, %u dwords\n",
-                       type, so->shader->id, so->id,
-                       so->info.instrs_count,
-                       so->info.nops_count,
-                       so->info.instrs_count - so->info.nops_count,
-                       so->info.mov_count, so->info.cov_count,
-                       so->info.sizedwords);
-
-       fprintf(out, "; %s prog %d/%d: %u last-baryf, %d half, %d full, %u constlen\n",
-                       type, so->shader->id, so->id,
-                       so->info.last_baryf,
-                       so->info.max_half_reg + 1,
-                       so->info.max_reg + 1,
-                       so->constlen);
-
-       fprintf(out, "; %s prog %d/%d: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, \n",
-                       type, so->shader->id, so->id,
-                       so->info.instrs_per_cat[0],
-                       so->info.instrs_per_cat[1],
-                       so->info.instrs_per_cat[2],
-                       so->info.instrs_per_cat[3],
-                       so->info.instrs_per_cat[4],
-                       so->info.instrs_per_cat[5],
-                       so->info.instrs_per_cat[6],
-                       so->info.instrs_per_cat[7]);
-
-       fprintf(out, "; %s prog %d/%d: %u sstall, %u (ss), %u (sy), %d max_sun, %d loops\n",
-                       type, so->shader->id, so->id,
-                       so->info.sstall,
-                       so->info.ss,
-                       so->info.sy,
-                       so->max_sun,
-                       so->loops);
-
-       /* print shader type specific info: */
-       switch (so->type) {
-       case MESA_SHADER_VERTEX:
-               dump_output(out, so, VARYING_SLOT_POS, "pos");
-               dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
-               break;
-       case MESA_SHADER_FRAGMENT:
-               dump_reg(out, "pos (ij_pixel)",
-                       ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL));
-               dump_reg(out, "pos (ij_centroid)",
-                       ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID));
-               dump_reg(out, "pos (ij_size)",
-                       ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE));
-               dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
-               if (so->color0_mrt) {
-                       dump_output(out, so, FRAG_RESULT_COLOR, "color");
-               } else {
-                       dump_output(out, so, FRAG_RESULT_DATA0, "data0");
-                       dump_output(out, so, FRAG_RESULT_DATA1, "data1");
-                       dump_output(out, so, FRAG_RESULT_DATA2, "data2");
-                       dump_output(out, so, FRAG_RESULT_DATA3, "data3");
-                       dump_output(out, so, FRAG_RESULT_DATA4, "data4");
-                       dump_output(out, so, FRAG_RESULT_DATA5, "data5");
-                       dump_output(out, so, FRAG_RESULT_DATA6, "data6");
-                       dump_output(out, so, FRAG_RESULT_DATA7, "data7");
-               }
-               dump_reg(out, "fragcoord",
-                       ir3_find_sysval_regid(so, SYSTEM_VALUE_FRAG_COORD));
-               dump_reg(out, "fragface",
-                       ir3_find_sysval_regid(so, SYSTEM_VALUE_FRONT_FACE));
-               break;
-       default:
-               /* TODO */
-               break;
-       }
-
-       fprintf(out, "\n");
+   struct ir3 *ir = so->ir;
+   struct ir3_register *reg;
+   const char *type = ir3_shader_stage(so);
+   uint8_t regid;
+   unsigned i;
+
+   foreach_input_n (instr, i, ir) {
+      reg = instr->dsts[0];
+      regid = reg->num;
+      fprintf(out, "@in(%sr%d.%c)\tin%d",
+              (reg->flags & IR3_REG_HALF) ? "h" : "", (regid >> 2),
+              "xyzw"[regid & 0x3], i);
+
+      if (reg->wrmask > 0x1)
+         fprintf(out, " (wrmask=0x%x)", reg->wrmask);
+      fprintf(out, "\n");
+   }
+
+   /* print pre-dispatch texture fetches: */
+   for (i = 0; i < so->num_sampler_prefetch; i++) {
+      const struct ir3_sampler_prefetch *fetch = &so->sampler_prefetch[i];
+      fprintf(out,
+              "@tex(%sr%d.%c)\tsrc=%u, samp=%u, tex=%u, wrmask=0x%x, cmd=%u\n",
+              fetch->half_precision ? "h" : "", fetch->dst >> 2,
+              "xyzw"[fetch->dst & 0x3], fetch -> src, fetch -> samp_id,
+              fetch -> tex_id, fetch -> wrmask, fetch -> cmd);
+   }
+
+   const struct ir3_const_state *const_state = ir3_const_state(so);
+   for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
+      fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
+      fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
+              const_state->immediates[i * 4 + 0],
+              const_state->immediates[i * 4 + 1],
+              const_state->immediates[i * 4 + 2],
+              const_state->immediates[i * 4 + 3]);
+   }
+
+   isa_decode(bin, so->info.sizedwords * 4, out,
+              &(struct isa_decode_options){
+                 .gpu_id = ir->compiler->gpu_id,
+                 .show_errors = true,
+                 .branch_labels = true,
+              });
+
+   fprintf(out, "; %s: outputs:", type);
+   for (i = 0; i < so->outputs_count; i++) {
+      uint8_t regid = so->outputs[i].regid;
+      const char *reg_type = so->outputs[i].half ? "hr" : "r";
+      fprintf(out, " %s%d.%c (%s)", reg_type, (regid >> 2), "xyzw"[regid & 0x3],
+              output_name(so, i));
+   }
+   fprintf(out, "\n");
+
+   fprintf(out, "; %s: inputs:", type);
+   for (i = 0; i < so->inputs_count; i++) {
+      uint8_t regid = so->inputs[i].regid;
+      fprintf(out, " r%d.%c (%s slot=%d cm=%x,il=%u,b=%u)", (regid >> 2),
+              "xyzw"[regid & 0x3], input_name(so, i), so -> inputs[i].slot,
+              so->inputs[i].compmask, so->inputs[i].inloc, so->inputs[i].bary);
+   }
+   fprintf(out, "\n");
+
+   /* print generic shader info: */
+   fprintf(
+      out,
+      "; %s prog %d/%d: %u instr, %u nops, %u non-nops, %u mov, %u cov, %u dwords\n",
+      type, so->shader->id, so->id, so->info.instrs_count, so->info.nops_count,
+      so->info.instrs_count - so->info.nops_count, so->info.mov_count,
+      so->info.cov_count, so->info.sizedwords);
+
+   fprintf(out,
+           "; %s prog %d/%d: %u last-baryf, %d half, %d full, %u constlen\n",
+           type, so->shader->id, so->id, so->info.last_baryf,
+           so->info.max_half_reg + 1, so->info.max_reg + 1, so->constlen);
+
+   fprintf(
+      out,
+      "; %s prog %d/%d: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, \n",
+      type, so->shader->id, so->id, so->info.instrs_per_cat[0],
+      so->info.instrs_per_cat[1], so->info.instrs_per_cat[2],
+      so->info.instrs_per_cat[3], so->info.instrs_per_cat[4],
+      so->info.instrs_per_cat[5], so->info.instrs_per_cat[6],
+      so->info.instrs_per_cat[7]);
+
+   fprintf(
+      out,
+      "; %s prog %d/%d: %u sstall, %u (ss), %u (sy), %d max_sun, %d loops\n",
+      type, so->shader->id, so->id, so->info.sstall, so->info.ss, so->info.sy,
+      so->max_sun, so->loops);
+
+   /* print shader type specific info: */
+   switch (so->type) {
+   case MESA_SHADER_VERTEX:
+      dump_output(out, so, VARYING_SLOT_POS, "pos");
+      dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
+      break;
+   case MESA_SHADER_FRAGMENT:
+      dump_reg(out, "pos (ij_pixel)",
+               ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL));
+      dump_reg(
+         out, "pos (ij_centroid)",
+         ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID));
+      dump_reg(out, "pos (ij_size)",
+               ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE));
+      dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
+      if (so->color0_mrt) {
+         dump_output(out, so, FRAG_RESULT_COLOR, "color");
+      } else {
+         dump_output(out, so, FRAG_RESULT_DATA0, "data0");
+         dump_output(out, so, FRAG_RESULT_DATA1, "data1");
+         dump_output(out, so, FRAG_RESULT_DATA2, "data2");
+         dump_output(out, so, FRAG_RESULT_DATA3, "data3");
+         dump_output(out, so, FRAG_RESULT_DATA4, "data4");
+         dump_output(out, so, FRAG_RESULT_DATA5, "data5");
+         dump_output(out, so, FRAG_RESULT_DATA6, "data6");
+         dump_output(out, so, FRAG_RESULT_DATA7, "data7");
+      }
+      dump_reg(out, "fragcoord",
+               ir3_find_sysval_regid(so, SYSTEM_VALUE_FRAG_COORD));
+      dump_reg(out, "fragface",
+               ir3_find_sysval_regid(so, SYSTEM_VALUE_FRONT_FACE));
+      break;
+   default:
+      /* TODO */
+      break;
+   }
+
+   fprintf(out, "\n");
 }
 
 uint64_t
 ir3_shader_outputs(const struct ir3_shader *so)
 {
-       return so->nir->info.outputs_written;
+   return so->nir->info.outputs_written;
 }
 
-
 /* Add any missing varyings needed for stream-out.  Otherwise varyings not
  * used by fragment shader will be stripped out.
  */
 void
-ir3_link_stream_out(struct ir3_shader_linkage *l, const struct ir3_shader_variant *v)
+ir3_link_stream_out(struct ir3_shader_linkage *l,
+                    const struct ir3_shader_variant *v)
 {
-       const struct ir3_stream_output_info *strmout = &v->shader->stream_output;
-
-       /*
-        * First, any stream-out varyings not already in linkage map (ie. also
-        * consumed by frag shader) need to be added:
-        */
-       for (unsigned i = 0; i < strmout->num_outputs; i++) {
-               const struct ir3_stream_output *out = &strmout->output[i];
-               unsigned k = out->register_index;
-               unsigned compmask =
-                       (1 << (out->num_components + out->start_component)) - 1;
-               unsigned idx, nextloc = 0;
-
-               /* psize/pos need to be the last entries in linkage map, and will
-                * get added link_stream_out, so skip over them:
-                */
-               if ((v->outputs[k].slot == VARYING_SLOT_PSIZ) ||
-                               (v->outputs[k].slot == VARYING_SLOT_POS))
-                       continue;
-
-               for (idx = 0; idx < l->cnt; idx++) {
-                       if (l->var[idx].regid == v->outputs[k].regid)
-                               break;
-                       nextloc = MAX2(nextloc, l->var[idx].loc + 4);
-               }
-
-               /* add if not already in linkage map: */
-               if (idx == l->cnt)
-                       ir3_link_add(l, v->outputs[k].regid, compmask, nextloc);
-
-               /* expand component-mask if needed, ie streaming out all components
-                * but frag shader doesn't consume all components:
-                */
-               if (compmask & ~l->var[idx].compmask) {
-                       l->var[idx].compmask |= compmask;
-                       l->max_loc = MAX2(l->max_loc,
-                               l->var[idx].loc + util_last_bit(l->var[idx].compmask));
-               }
-       }
+   const struct ir3_stream_output_info *strmout = &v->shader->stream_output;
+
+   /*
+    * First, any stream-out varyings not already in linkage map (ie. also
+    * consumed by frag shader) need to be added:
+    */
+   for (unsigned i = 0; i < strmout->num_outputs; i++) {
+      const struct ir3_stream_output *out = &strmout->output[i];
+      unsigned k = out->register_index;
+      unsigned compmask =
+         (1 << (out->num_components + out->start_component)) - 1;
+      unsigned idx, nextloc = 0;
+
+      /* psize/pos need to be the last entries in linkage map, and will
+       * get added link_stream_out, so skip over them:
+       */
+      if ((v->outputs[k].slot == VARYING_SLOT_PSIZ) ||
+          (v->outputs[k].slot == VARYING_SLOT_POS))
+         continue;
+
+      for (idx = 0; idx < l->cnt; idx++) {
+         if (l->var[idx].regid == v->outputs[k].regid)
+            break;
+         nextloc = MAX2(nextloc, l->var[idx].loc + 4);
+      }
+
+      /* add if not already in linkage map: */
+      if (idx == l->cnt)
+         ir3_link_add(l, v->outputs[k].regid, compmask, nextloc);
+
+      /* expand component-mask if needed, ie streaming out all components
+       * but frag shader doesn't consume all components:
+       */
+      if (compmask & ~l->var[idx].compmask) {
+         l->var[idx].compmask |= compmask;
+         l->max_loc = MAX2(
+            l->max_loc, l->var[idx].loc + util_last_bit(l->var[idx].compmask));
+      }
+   }
 }
index abe8401..864ef22 100644 (file)
@@ -30,8 +30,8 @@
 #include <stdio.h>
 
 #include "c11/threads.h"
-#include "compiler/shader_enums.h"
 #include "compiler/nir/nir.h"
+#include "compiler/shader_enums.h"
 #include "util/bitscan.h"
 #include "util/disk_cache.h"
 
@@ -41,63 +41,63 @@ struct glsl_type;
 
 /* driver param indices: */
 enum ir3_driver_param {
-       /* compute shader driver params: */
-       IR3_DP_NUM_WORK_GROUPS_X = 0,
-       IR3_DP_NUM_WORK_GROUPS_Y = 1,
-       IR3_DP_NUM_WORK_GROUPS_Z = 2,
-       IR3_DP_BASE_GROUP_X = 4,
-       IR3_DP_BASE_GROUP_Y = 5,
-       IR3_DP_BASE_GROUP_Z = 6,
-       IR3_DP_SUBGROUP_SIZE = 7,
-       IR3_DP_LOCAL_GROUP_SIZE_X = 8,
-       IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
-       IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
-       IR3_DP_SUBGROUP_ID_SHIFT = 11,
-       /* NOTE: gl_NumWorkGroups should be vec4 aligned because
-        * glDispatchComputeIndirect() needs to load these from
-        * the info->indirect buffer.  Keep that in mind when/if
-        * adding any addition CS driver params.
-        */
-       IR3_DP_CS_COUNT   = 12,   /* must be aligned to vec4 */
-
-       /* vertex shader driver params: */
-       IR3_DP_DRAWID = 0,
-       IR3_DP_VTXID_BASE = 1,
-       IR3_DP_INSTID_BASE = 2,
-       IR3_DP_VTXCNT_MAX = 3,
-       /* user-clip-plane components, up to 8x vec4's: */
-       IR3_DP_UCP0_X     = 4,
-       /* .... */
-       IR3_DP_UCP7_W     = 35,
-       IR3_DP_VS_COUNT   = 36   /* must be aligned to vec4 */
+   /* compute shader driver params: */
+   IR3_DP_NUM_WORK_GROUPS_X = 0,
+   IR3_DP_NUM_WORK_GROUPS_Y = 1,
+   IR3_DP_NUM_WORK_GROUPS_Z = 2,
+   IR3_DP_BASE_GROUP_X = 4,
+   IR3_DP_BASE_GROUP_Y = 5,
+   IR3_DP_BASE_GROUP_Z = 6,
+   IR3_DP_SUBGROUP_SIZE = 7,
+   IR3_DP_LOCAL_GROUP_SIZE_X = 8,
+   IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
+   IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
+   IR3_DP_SUBGROUP_ID_SHIFT = 11,
+   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
+    * glDispatchComputeIndirect() needs to load these from
+    * the info->indirect buffer.  Keep that in mind when/if
+    * adding any addition CS driver params.
+    */
+   IR3_DP_CS_COUNT = 12, /* must be aligned to vec4 */
+
+   /* vertex shader driver params: */
+   IR3_DP_DRAWID = 0,
+   IR3_DP_VTXID_BASE = 1,
+   IR3_DP_INSTID_BASE = 2,
+   IR3_DP_VTXCNT_MAX = 3,
+   /* user-clip-plane components, up to 8x vec4's: */
+   IR3_DP_UCP0_X = 4,
+   /* .... */
+   IR3_DP_UCP7_W = 35,
+   IR3_DP_VS_COUNT = 36 /* must be aligned to vec4 */
 };
 
-#define IR3_MAX_SHADER_BUFFERS   32
-#define IR3_MAX_SHADER_IMAGES    32
-#define IR3_MAX_SO_BUFFERS        4
-#define IR3_MAX_SO_STREAMS        4
-#define IR3_MAX_SO_OUTPUTS       64
-#define IR3_MAX_UBO_PUSH_RANGES  32
+#define IR3_MAX_SHADER_BUFFERS  32
+#define IR3_MAX_SHADER_IMAGES   32
+#define IR3_MAX_SO_BUFFERS      4
+#define IR3_MAX_SO_STREAMS      4
+#define IR3_MAX_SO_OUTPUTS      64
+#define IR3_MAX_UBO_PUSH_RANGES 32
 
 /* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
 enum ir3_bary {
-       IJ_PERSP_PIXEL,
-       IJ_PERSP_SAMPLE,
-       IJ_PERSP_CENTROID,
-       IJ_PERSP_SIZE,
-       IJ_LINEAR_PIXEL,
-       IJ_LINEAR_CENTROID,
-       IJ_LINEAR_SAMPLE,
-       IJ_COUNT,
+   IJ_PERSP_PIXEL,
+   IJ_PERSP_SAMPLE,
+   IJ_PERSP_CENTROID,
+   IJ_PERSP_SIZE,
+   IJ_LINEAR_PIXEL,
+   IJ_LINEAR_CENTROID,
+   IJ_LINEAR_SAMPLE,
+   IJ_COUNT,
 };
 
 /**
  * Description of a lowered UBO.
  */
 struct ir3_ubo_info {
-       uint32_t block; /* Which constant block */
-       uint16_t bindless_base; /* For bindless, which base register is used */
-       bool bindless;
+   uint32_t block;         /* Which constant block */
+   uint16_t bindless_base; /* For bindless, which base register is used */
+   bool bindless;
 };
 
 /**
@@ -107,16 +107,17 @@ struct ir3_ubo_info {
  * lowered ranges of a single UBO.
  */
 struct ir3_ubo_range {
-       struct ir3_ubo_info ubo;
-       uint32_t offset; /* start offset to push in the const register file */
-       uint32_t start, end; /* range of block that's actually used */
+   struct ir3_ubo_info ubo;
+   uint32_t offset;     /* start offset to push in the const register file */
+   uint32_t start, end; /* range of block that's actually used */
 };
 
 struct ir3_ubo_analysis_state {
-       struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
-       uint32_t num_enabled;
-       uint32_t size;
-       uint32_t cmdstream_size; /* for per-gen backend to stash required cmdstream size */
+   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
+   uint32_t num_enabled;
+   uint32_t size;
+   /* for per-gen backend to stash required cmdstream size */
+   uint32_t cmdstream_size;
 };
 
 /**
@@ -159,86 +160,85 @@ struct ir3_ubo_analysis_state {
  * Note UBO size in bytes should be aligned to vec4
  */
 struct ir3_const_state {
-       unsigned num_ubos;
-       unsigned num_driver_params;   /* scalar */
-
-       /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
-       int32_t constant_data_ubo;
-
-       struct {
-               /* user const start at zero */
-               unsigned ubo;
-               /* NOTE that a3xx might need a section for SSBO addresses too */
-               unsigned ssbo_sizes;
-               unsigned image_dims;
-               unsigned driver_param;
-               unsigned tfbo;
-               unsigned primitive_param;
-               unsigned primitive_map;
-               unsigned immediate;
-       } offsets;
-
-       struct {
-               uint32_t mask;  /* bitmask of SSBOs that have get_ssbo_size */
-               uint32_t count; /* number of consts allocated */
-               /* one const allocated per SSBO which has get_ssbo_size,
-                * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes
-                * consts:
-                */
-               uint32_t off[IR3_MAX_SHADER_BUFFERS];
-       } ssbo_size;
-
-       struct {
-               uint32_t mask;  /* bitmask of images that have image_store */
-               uint32_t count; /* number of consts allocated */
-               /* three const allocated per image which has image_store:
-                *  + cpp         (bytes per pixel)
-                *  + pitch       (y pitch)
-                *  + array_pitch (z pitch)
-                */
-               uint32_t off[IR3_MAX_SHADER_IMAGES];
-       } image_dims;
-
-       unsigned immediates_count;
-       unsigned immediates_size;
-       uint32_t *immediates;
-
-       /* State of ubo access lowered to push consts: */
-       struct ir3_ubo_analysis_state ubo_state;
+   unsigned num_ubos;
+   unsigned num_driver_params; /* scalar */
+
+   /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
+   int32_t constant_data_ubo;
+
+   struct {
+      /* user const start at zero */
+      unsigned ubo;
+      /* NOTE that a3xx might need a section for SSBO addresses too */
+      unsigned ssbo_sizes;
+      unsigned image_dims;
+      unsigned driver_param;
+      unsigned tfbo;
+      unsigned primitive_param;
+      unsigned primitive_map;
+      unsigned immediate;
+   } offsets;
+
+   struct {
+      uint32_t mask;  /* bitmask of SSBOs that have get_ssbo_size */
+      uint32_t count; /* number of consts allocated */
+      /* one const allocated per SSBO which has get_ssbo_size,
+       * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes
+       * consts:
+       */
+      uint32_t off[IR3_MAX_SHADER_BUFFERS];
+   } ssbo_size;
+
+   struct {
+      uint32_t mask;  /* bitmask of images that have image_store */
+      uint32_t count; /* number of consts allocated */
+      /* three const allocated per image which has image_store:
+       *  + cpp         (bytes per pixel)
+       *  + pitch       (y pitch)
+       *  + array_pitch (z pitch)
+       */
+      uint32_t off[IR3_MAX_SHADER_IMAGES];
+   } image_dims;
+
+   unsigned immediates_count;
+   unsigned immediates_size;
+   uint32_t *immediates;
+
+   /* State of ubo access lowered to push consts: */
+   struct ir3_ubo_analysis_state ubo_state;
 };
 
 /**
  * A single output for vertex transform feedback.
  */
 struct ir3_stream_output {
-       unsigned register_index:6;  /**< 0 to 63 (OUT index) */
-       unsigned start_component:2; /** 0 to 3 */
-       unsigned num_components:3;  /** 1 to 4 */
-       unsigned output_buffer:3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
-       unsigned dst_offset:16;     /**< offset into the buffer in dwords */
-       unsigned stream:2;          /**< 0 to 3 */
+   unsigned register_index  : 6;  /**< 0 to 63 (OUT index) */
+   unsigned start_component : 2;  /**< 0 to 3 */
+   unsigned num_components  : 3;  /**< 1 to 4 */
+   unsigned output_buffer   : 3;  /**< 0 to PIPE_MAX_SO_BUFFERS */
+   unsigned dst_offset      : 16; /**< offset into the buffer in dwords */
+   unsigned stream          : 2;  /**< 0 to 3 */
 };
 
 /**
  * Stream output for vertex transform feedback.
  */
 struct ir3_stream_output_info {
-       unsigned num_outputs;
-       /** stride for an entire vertex for each buffer in dwords */
-       uint16_t stride[IR3_MAX_SO_BUFFERS];
-
-       /* These correspond to the VPC_SO_STREAM_CNTL fields */
-       uint8_t streams_written;
-       uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];
-
-       /**
-        * Array of stream outputs, in the order they are to be written in.
-        * Selected components are tightly packed into the output buffer.
-        */
-       struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
+   unsigned num_outputs;
+   /** stride for an entire vertex for each buffer in dwords */
+   uint16_t stride[IR3_MAX_SO_BUFFERS];
+
+   /* These correspond to the VPC_SO_STREAM_CNTL fields */
+   uint8_t streams_written;
+   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];
+
+   /**
+    * Array of stream outputs, in the order they are to be written in.
+    * Selected components are tightly packed into the output buffer.
+    */
+   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
 };
 
-
 /**
  * Starting from a4xx, HW supports pre-dispatching texture sampling
  * instructions prior to scheduling a shader stage, when the
@@ -255,25 +255,24 @@ struct ir3_stream_output_info {
  * This is the output stream value for 'cmd', as used by blob. It may
  * encode the return type (in 3 bits) but it hasn't been verified yet.
  */
-#define IR3_SAMPLER_PREFETCH_CMD 0x4
+#define IR3_SAMPLER_PREFETCH_CMD          0x4
 #define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
 
 /**
  * Stream output for texture sampling pre-dispatches.
  */
 struct ir3_sampler_prefetch {
-       uint8_t src;
-       uint8_t samp_id;
-       uint8_t tex_id;
-       uint16_t samp_bindless_id;
-       uint16_t tex_bindless_id;
-       uint8_t dst;
-       uint8_t wrmask;
-       uint8_t half_precision;
-       uint8_t cmd;
+   uint8_t src;
+   uint8_t samp_id;
+   uint8_t tex_id;
+   uint16_t samp_bindless_id;
+   uint16_t tex_bindless_id;
+   uint8_t dst;
+   uint8_t wrmask;
+   uint8_t half_precision;
+   uint8_t cmd;
 };
 
-
 /* Configuration key used to identify a shader variant.. different
  * shader variants can be used to implement features not supported
  * in hw (two sided color), binning-pass vertex shader, etc.
@@ -282,127 +281,131 @@ struct ir3_sampler_prefetch {
  * output.
  */
 struct ir3_shader_key {
-       union {
-               struct {
-                       /*
-                        * Combined Vertex/Fragment shader parameters:
-                        */
-                       unsigned ucp_enables : 8;
-
-                       /* do we need to check {v,f}saturate_{s,t,r}? */
-                       unsigned has_per_samp : 1;
-
-                       /*
-                        * Fragment shader variant parameters:
-                        */
-                       unsigned sample_shading : 1;
-                       unsigned msaa           : 1;
-                       /* used when shader needs to handle flat varyings (a4xx)
-                        * for front/back color inputs to frag shader:
-                        */
-                       unsigned rasterflat : 1;
-
-                       /* Indicates that this is a tessellation pipeline which requires a
-                        * whole different kind of vertex shader.  In case of
-                        * tessellation, this field also tells us which kind of output
-                        * topology the TES uses, which the TCS needs to know.
-                        */
-#define IR3_TESS_NONE          0
-#define IR3_TESS_TRIANGLES     1
-#define IR3_TESS_QUADS         2
-#define IR3_TESS_ISOLINES      3
-                       unsigned tessellation : 2;
-
-                       unsigned has_gs : 1;
-
-                       /* Whether this variant sticks to the "safe" maximum constlen,
-                        * which guarantees that the combined stages will never go over
-                        * the limit:
-                        */
-                       unsigned safe_constlen : 1;
-
-                       /* Whether gl_Layer must be forced to 0 because it isn't written. */
-                       unsigned layer_zero : 1;
-
-                       /* Whether gl_ViewportIndex must be forced to 0 because it isn't written. */
-                       unsigned view_zero : 1;
-               };
-               uint32_t global;
-       };
-
-       /* bitmask of ms shifts (a3xx) */
-       uint32_t vsamples, fsamples;
-
-       /* bitmask of samplers which need astc srgb workaround (a4xx+a5xx): */
-       uint16_t vastc_srgb, fastc_srgb;
+   union {
+      struct {
+         /*
+          * Combined Vertex/Fragment shader parameters:
+          */
+         unsigned ucp_enables : 8;
+
+         /* do we need to check {v,f}saturate_{s,t,r}? */
+         unsigned has_per_samp : 1;
+
+         /*
+          * Fragment shader variant parameters:
+          */
+         unsigned sample_shading : 1;
+         unsigned msaa           : 1;
+         /* used when shader needs to handle flat varyings (a4xx)
+          * for front/back color inputs to frag shader:
+          */
+         unsigned rasterflat : 1;
+
+         /* Indicates that this is a tessellation pipeline which requires a
+          * whole different kind of vertex shader.  In case of
+          * tessellation, this field also tells us which kind of output
+          * topology the TES uses, which the TCS needs to know.
+          */
+#define IR3_TESS_NONE      0
+#define IR3_TESS_TRIANGLES 1
+#define IR3_TESS_QUADS     2
+#define IR3_TESS_ISOLINES  3
+         unsigned tessellation : 2;
+
+         unsigned has_gs : 1;
+
+         /* Whether this variant sticks to the "safe" maximum constlen,
+          * which guarantees that the combined stages will never go over
+          * the limit:
+          */
+         unsigned safe_constlen : 1;
+
+         /* Whether gl_Layer must be forced to 0 because it isn't written. */
+         unsigned layer_zero : 1;
+
+         /* Whether gl_ViewportIndex must be forced to 0 because it isn't
+          * written. */
+         unsigned view_zero : 1;
+      };
+      uint32_t global;
+   };
+
+   /* bitmask of ms shifts (a3xx) */
+   uint32_t vsamples, fsamples;
+
+   /* bitmask of samplers which need astc srgb workaround (a4xx+a5xx): */
+   uint16_t vastc_srgb, fastc_srgb;
 };
 
 static inline unsigned
 ir3_tess_mode(unsigned gl_tess_mode)
 {
-       switch (gl_tess_mode) {
-       case GL_ISOLINES:
-               return  IR3_TESS_ISOLINES;
-       case GL_TRIANGLES:
-               return IR3_TESS_TRIANGLES;
-       case GL_QUADS:
-               return IR3_TESS_QUADS;
-       default:
-               unreachable("bad tessmode");
-       }
+   switch (gl_tess_mode) {
+   case GL_ISOLINES:
+      return IR3_TESS_ISOLINES;
+   case GL_TRIANGLES:
+      return IR3_TESS_TRIANGLES;
+   case GL_QUADS:
+      return IR3_TESS_QUADS;
+   default:
+      unreachable("bad tessmode");
+   }
 }
 
 static inline bool
-ir3_shader_key_equal(const struct ir3_shader_key *a, const struct ir3_shader_key *b)
+ir3_shader_key_equal(const struct ir3_shader_key *a,
+                     const struct ir3_shader_key *b)
 {
-       /* slow-path if we need to check {v,f}saturate_{s,t,r} */
-       if (a->has_per_samp || b->has_per_samp)
-               return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
-       return a->global == b->global;
+   /* slow-path if we need to check {v,f}saturate_{s,t,r} */
+   if (a->has_per_samp || b->has_per_samp)
+      return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
+   return a->global == b->global;
 }
 
 /* will the two keys produce different lowering for a fragment shader? */
 static inline bool
-ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+ir3_shader_key_changes_fs(struct ir3_shader_key *key,
+                          struct ir3_shader_key *last_key)
 {
-       if (last_key->has_per_samp || key->has_per_samp) {
-               if ((last_key->fsamples != key->fsamples) ||
-                               (last_key->fastc_srgb != key->fastc_srgb))
-                       return true;
-       }
+   if (last_key->has_per_samp || key->has_per_samp) {
+      if ((last_key->fsamples != key->fsamples) ||
+          (last_key->fastc_srgb != key->fastc_srgb))
+         return true;
+   }
 
-       if (last_key->rasterflat != key->rasterflat)
-               return true;
+   if (last_key->rasterflat != key->rasterflat)
+      return true;
 
-       if (last_key->layer_zero != key->layer_zero)
-               return true;
+   if (last_key->layer_zero != key->layer_zero)
+      return true;
 
-       if (last_key->ucp_enables != key->ucp_enables)
-               return true;
+   if (last_key->ucp_enables != key->ucp_enables)
+      return true;
 
-       if (last_key->safe_constlen != key->safe_constlen)
-               return true;
+   if (last_key->safe_constlen != key->safe_constlen)
+      return true;
 
-       return false;
+   return false;
 }
 
 /* will the two keys produce different lowering for a vertex shader? */
 static inline bool
-ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+ir3_shader_key_changes_vs(struct ir3_shader_key *key,
+                          struct ir3_shader_key *last_key)
 {
-       if (last_key->has_per_samp || key->has_per_samp) {
-               if ((last_key->vsamples != key->vsamples) ||
-                               (last_key->vastc_srgb != key->vastc_srgb))
-                       return true;
-       }
+   if (last_key->has_per_samp || key->has_per_samp) {
+      if ((last_key->vsamples != key->vsamples) ||
+          (last_key->vastc_srgb != key->vastc_srgb))
+         return true;
+   }
 
-       if (last_key->ucp_enables != key->ucp_enables)
-               return true;
+   if (last_key->ucp_enables != key->ucp_enables)
+      return true;
 
-       if (last_key->safe_constlen != key->safe_constlen)
-               return true;
+   if (last_key->safe_constlen != key->safe_constlen)
+      return true;
 
-       return false;
+   return false;
 }
 
 /**
@@ -429,274 +432,281 @@ ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *las
  */
 struct ir3_ibo_mapping {
 #define IBO_INVALID 0xff
-       /* Maps logical SSBO state to hw tex state: */
-       uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];
-
-       /* Maps logical Image state to hw tex state: */
-       uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];
-
-       /* Maps hw state back to logical SSBO or Image state:
-        *
-        * note IBO_SSBO ORd into values to indicate that the
-        * hw slot is used for SSBO state vs Image state.
-        */
-#define IBO_SSBO    0x80
-       uint8_t tex_to_image[32];
-
-       /* including real textures */
-       uint8_t num_tex;
-       /* the number of real textures, ie. image/ssbo start here */
-       uint8_t tex_base;
+   /* Maps logical SSBO state to hw tex state: */
+   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];
+
+   /* Maps logical Image state to hw tex state: */
+   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];
+
+   /* Maps hw state back to logical SSBO or Image state:
+    *
+    * note IBO_SSBO ORd into values to indicate that the
+    * hw slot is used for SSBO state vs Image state.
+    */
+#define IBO_SSBO 0x80
+   uint8_t tex_to_image[32];
+
+   /* including real textures */
+   uint8_t num_tex;
+   /* the number of real textures, ie. image/ssbo start here */
+   uint8_t tex_base;
 };
 
 struct ir3_disasm_info {
-       bool write_disasm;
-       char *nir;
-       char *disasm;
+   bool write_disasm;
+   char *nir;
+   char *disasm;
 };
 
 /* Represents half register in regid */
-#define HALF_REG_ID    0x100
+#define HALF_REG_ID 0x100
 
 /**
  * Shader variant which contains the actual hw shader instructions,
  * and necessary info for shader state setup.
  */
 struct ir3_shader_variant {
-       struct fd_bo *bo;
-
-       /* variant id (for debug) */
-       uint32_t id;
-
-       struct ir3_shader_key key;
-
-       /* vertex shaders can have an extra version for hwbinning pass,
-        * which is pointed to by so->binning:
-        */
-       bool binning_pass;
-//     union {
-               struct ir3_shader_variant *binning;
-               struct ir3_shader_variant *nonbinning;
-//     };
-
-       struct ir3 *ir;     /* freed after assembling machine instructions */
-
-       /* shader variants form a linked list: */
-       struct ir3_shader_variant *next;
-
-       /* replicated here to avoid passing extra ptrs everywhere: */
-       gl_shader_stage type;
-       struct ir3_shader *shader;
-
-       /* variant's copy of nir->constant_data (since we don't track the NIR in
-        * the variant, and shader->nir is before the opt pass).  Moves to v->bin
-        * after assembly.
-        */
-       void *constant_data;
-
-       /*
-        * Below here is serialized when written to disk cache:
-        */
-
-       /* The actual binary shader instructions, size given by info.sizedwords: */
-       uint32_t *bin;
-
-       struct ir3_const_state *const_state;
-
-       /*
-        * The following macros are used by the shader disk cache save/
-        * restore paths to serialize/deserialize the variant.  Any
-        * pointers that require special handling in store_variant()
-        * and retrieve_variant() should go above here.
-        */
-#define VARIANT_CACHE_START    offsetof(struct ir3_shader_variant, info)
-#define VARIANT_CACHE_PTR(v)   (((char *)v) + VARIANT_CACHE_START)
-#define VARIANT_CACHE_SIZE     (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)
-
-       struct ir3_info info;
-
-       uint32_t constant_data_size;
-
-       /* Levels of nesting of flow control:
-        */
-       unsigned branchstack;
-
-       unsigned max_sun;
-       unsigned loops;
-
-       /* the instructions length is in units of instruction groups
-        * (4 instructions for a3xx, 16 instructions for a4xx.. each
-        * instruction is 2 dwords):
-        */
-       unsigned instrlen;
-
-       /* the constants length is in units of vec4's, and is the sum of
-        * the uniforms and the built-in compiler constants
-        */
-       unsigned constlen;
-
-       /* The private memory size in bytes */
-       unsigned pvtmem_size;
-       /* Whether we should use the new per-wave layout rather than per-fiber. */
-       bool pvtmem_per_wave;
-
-       /* Size in bytes of required shared memory */
-       unsigned shared_size;
-
-       /* About Linkage:
-        *   + Let the frag shader determine the position/compmask for the
-        *     varyings, since it is the place where we know if the varying
-        *     is actually used, and if so, which components are used.  So
-        *     what the hw calls "outloc" is taken from the "inloc" of the
-        *     frag shader.
-        *   + From the vert shader, we only need the output regid
-        */
-
-       bool frag_face, color0_mrt;
-       uint8_t fragcoord_compmask;
-
-       /* NOTE: for input/outputs, slot is:
-        *   gl_vert_attrib  - for VS inputs
-        *   gl_varying_slot - for VS output / FS input
-        *   gl_frag_result  - for FS output
-        */
-
-       /* varyings/outputs: */
-       unsigned outputs_count;
-       struct {
-               uint8_t slot;
-               uint8_t regid;
-               uint8_t view;
-               bool    half : 1;
-       } outputs[32 + 2];  /* +POSITION +PSIZE */
-       bool writes_pos, writes_smask, writes_psize, writes_stencilref;
-
-       /* Size in dwords of all outputs for VS, size of entire patch for HS. */
-       uint32_t output_size;
-
-       /* Expected size of incoming output_loc for HS, DS, and GS */
-       uint32_t input_size;
-
-       /* Map from location to offset in per-primitive storage. In dwords for
-        * HS, where varyings are read in the next stage via ldg with a dword
-        * offset, and in bytes for all other stages.
-        */
-       unsigned output_loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */
-
-       /* attributes (VS) / varyings (FS):
-        * Note that sysval's should come *after* normal inputs.
-        */
-       unsigned inputs_count;
-       struct {
-               uint8_t slot;
-               uint8_t regid;
-               uint8_t compmask;
-               /* location of input (ie. offset passed to bary.f, etc).  This
-                * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
-                * have the OUTLOCn value offset by 8, presumably to account
-                * for gl_Position/gl_PointSize)
-                */
-               uint8_t inloc;
-               /* vertex shader specific: */
-               bool    sysval     : 1;   /* slot is a gl_system_value */
-               /* fragment shader specific: */
-               bool    bary       : 1;   /* fetched varying (vs one loaded into reg) */
-               bool    rasterflat : 1;   /* special handling for emit->rasterflat */
-               bool    half       : 1;
-               bool    flat       : 1;
-       } inputs[32 + 2];  /* +POSITION +FACE */
-
-       /* sum of input components (scalar).  For frag shaders, it only counts
-        * the varying inputs:
-        */
-       unsigned total_in;
-
-       /* sum of sysval input components (scalar). */
-       unsigned sysval_in;
-
-       /* For frag shaders, the total number of inputs (not scalar,
-        * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
-        */
-       unsigned varying_in;
-
-       /* Remapping table to map Image and SSBO to hw state: */
-       struct ir3_ibo_mapping image_mapping;
-
-       /* number of samplers/textures (which are currently 1:1): */
-       int num_samp;
-
-       /* is there an implicit sampler to read framebuffer (FS only).. if
-        * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
-        * the last "real" texture)
-        */
-       bool fb_read;
-
-       /* do we have one or more SSBO instructions: */
-       bool has_ssbo;
-
-       /* Which bindless resources are used, for filling out sp_xs_config */
-       bool bindless_tex;
-       bool bindless_samp;
-       bool bindless_ibo;
-       bool bindless_ubo;
-
-       /* do we need derivatives: */
-       bool need_pixlod;
-
-       bool need_fine_derivatives;
-
-       /* do we need VS driver params? */
-       bool need_driver_params;
-
-       /* do we have image write, etc (which prevents early-z): */
-       bool no_earlyz;
-
-       /* do we have kill, which also prevents early-z, but not necessarily
-        * early-lrz (as long as lrz-write is disabled, which must be handled
-        * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
-        * side effects that prevent early-lrz discard.
-        */
-       bool has_kill;
-
-       bool per_samp;
-
-       /* Are we using split or merged register file? */
-       bool mergedregs;
-
-       uint8_t clip_mask, cull_mask;
-
-       /* for astc srgb workaround, the number/base of additional
-        * alpha tex states we need, and index of original tex states
-        */
-       struct {
-               unsigned base, count;
-               unsigned orig_idx[16];
-       } astc_srgb;
-
-       /* texture sampler pre-dispatches */
-       uint32_t num_sampler_prefetch;
-       struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];
-
-       uint16_t local_size[3];
-       bool local_size_variable;
-
-       struct ir3_disasm_info disasm_info;
+   struct fd_bo *bo;
+
+   /* variant id (for debug) */
+   uint32_t id;
+
+   struct ir3_shader_key key;
+
+   /* vertex shaders can have an extra version for hwbinning pass,
+    * which is pointed to by so->binning:
+    */
+   bool binning_pass;
+   //  union {
+   struct ir3_shader_variant *binning;
+   struct ir3_shader_variant *nonbinning;
+   //  };
+
+   struct ir3 *ir; /* freed after assembling machine instructions */
+
+   /* shader variants form a linked list: */
+   struct ir3_shader_variant *next;
+
+   /* replicated here to avoid passing extra ptrs everywhere: */
+   gl_shader_stage type;
+   struct ir3_shader *shader;
+
+   /* variant's copy of nir->constant_data (since we don't track the NIR in
+    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
+    * after assembly.
+    */
+   void *constant_data;
+
+   /*
+    * Below here is serialized when written to disk cache:
+    */
+
+   /* The actual binary shader instructions, size given by info.sizedwords: */
+   uint32_t *bin;
+
+   struct ir3_const_state *const_state;
+
+   /*
+    * The following macros are used by the shader disk cache save/
+    * restore paths to serialize/deserialize the variant.  Any
+    * pointers that require special handling in store_variant()
+    * and retrieve_variant() should go above here.
+    */
+#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
+#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
+#define VARIANT_CACHE_SIZE                                                     \
+   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)
+
+   struct ir3_info info;
+
+   uint32_t constant_data_size;
+
+   /* Levels of nesting of flow control:
+    */
+   unsigned branchstack;
+
+   unsigned max_sun;
+   unsigned loops;
+
+   /* the instructions length is in units of instruction groups
+    * (4 instructions for a3xx, 16 instructions for a4xx.. each
+    * instruction is 2 dwords):
+    */
+   unsigned instrlen;
+
+   /* the constants length is in units of vec4's, and is the sum of
+    * the uniforms and the built-in compiler constants
+    */
+   unsigned constlen;
+
+   /* The private memory size in bytes */
+   unsigned pvtmem_size;
+   /* Whether we should use the new per-wave layout rather than per-fiber. */
+   bool pvtmem_per_wave;
+
+   /* Size in bytes of required shared memory */
+   unsigned shared_size;
+
+   /* About Linkage:
+    *   + Let the frag shader determine the position/compmask for the
+    *     varyings, since it is the place where we know if the varying
+    *     is actually used, and if so, which components are used.  So
+    *     what the hw calls "outloc" is taken from the "inloc" of the
+    *     frag shader.
+    *   + From the vert shader, we only need the output regid
+    */
+
+   bool frag_face, color0_mrt;
+   uint8_t fragcoord_compmask;
+
+   /* NOTE: for input/outputs, slot is:
+    *   gl_vert_attrib  - for VS inputs
+    *   gl_varying_slot - for VS output / FS input
+    *   gl_frag_result  - for FS output
+    */
+
+   /* varyings/outputs: */
+   unsigned outputs_count;
+   struct {
+      uint8_t slot;
+      uint8_t regid;
+      uint8_t view;
+      bool half : 1;
+   } outputs[32 + 2]; /* +POSITION +PSIZE */
+   bool writes_pos, writes_smask, writes_psize, writes_stencilref;
+
+   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
+   uint32_t output_size;
+
+   /* Expected size of incoming output_loc for HS, DS, and GS */
+   uint32_t input_size;
+
+   /* Map from location to offset in per-primitive storage. In dwords for
+    * HS, where varyings are read in the next stage via ldg with a dword
+    * offset, and in bytes for all other stages.
+    */
+   unsigned output_loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */
+
+   /* attributes (VS) / varyings (FS):
+    * Note that sysval's should come *after* normal inputs.
+    */
+   unsigned inputs_count;
+   struct {
+      uint8_t slot;
+      uint8_t regid;
+      uint8_t compmask;
+      /* location of input (ie. offset passed to bary.f, etc).  This
+       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
+       * have the OUTLOCn value offset by 8, presumably to account
+       * for gl_Position/gl_PointSize)
+       */
+      uint8_t inloc;
+      /* vertex shader specific: */
+      bool sysval : 1; /* slot is a gl_system_value */
+      /* fragment shader specific: */
+      bool bary       : 1; /* fetched varying (vs one loaded into reg) */
+      bool rasterflat : 1; /* special handling for emit->rasterflat */
+      bool half       : 1;
+      bool flat       : 1;
+   } inputs[32 + 2]; /* +POSITION +FACE */
+
+   /* sum of input components (scalar).  For frag shaders, it only counts
+    * the varying inputs:
+    */
+   unsigned total_in;
+
+   /* sum of sysval input components (scalar). */
+   unsigned sysval_in;
+
+   /* For frag shaders, the total number of inputs (not scalar,
+    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
+    */
+   unsigned varying_in;
+
+   /* Remapping table to map Image and SSBO to hw state: */
+   struct ir3_ibo_mapping image_mapping;
+
+   /* number of samplers/textures (which are currently 1:1): */
+   int num_samp;
+
+   /* is there an implicit sampler to read framebuffer (FS only).. if
+    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
+    * the last "real" texture)
+    */
+   bool fb_read;
+
+   /* do we have one or more SSBO instructions: */
+   bool has_ssbo;
+
+   /* Which bindless resources are used, for filling out sp_xs_config */
+   bool bindless_tex;
+   bool bindless_samp;
+   bool bindless_ibo;
+   bool bindless_ubo;
+
+   /* do we need derivatives: */
+   bool need_pixlod;
+
+   bool need_fine_derivatives;
+
+   /* do we need VS driver params? */
+   bool need_driver_params;
+
+   /* do we have image write, etc (which prevents early-z): */
+   bool no_earlyz;
+
+   /* do we have kill, which also prevents early-z, but not necessarily
+    * early-lrz (as long as lrz-write is disabled, which must be handled
+    * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
+    * side effects that prevent early-lrz discard.
+    */
+   bool has_kill;
+
+   bool per_samp;
+
+   /* Are we using split or merged register file? */
+   bool mergedregs;
+
+   uint8_t clip_mask, cull_mask;
+
+   /* for astc srgb workaround, the number/base of additional
+    * alpha tex states we need, and index of original tex states
+    */
+   struct {
+      unsigned base, count;
+      unsigned orig_idx[16];
+   } astc_srgb;
+
+   /* texture sampler pre-dispatches */
+   uint32_t num_sampler_prefetch;
+   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];
+
+   uint16_t local_size[3];
+   bool local_size_variable;
+
+   struct ir3_disasm_info disasm_info;
 };
 
 static inline const char *
 ir3_shader_stage(struct ir3_shader_variant *v)
 {
-       switch (v->type) {
-       case MESA_SHADER_VERTEX:     return v->binning_pass ? "BVERT" : "VERT";
-       case MESA_SHADER_TESS_CTRL:  return "TCS";
-       case MESA_SHADER_TESS_EVAL:  return "TES";
-       case MESA_SHADER_GEOMETRY:   return "GEOM";
-       case MESA_SHADER_FRAGMENT:   return "FRAG";
-       case MESA_SHADER_COMPUTE:    return "CL";
-       default:
-               unreachable("invalid type");
-               return NULL;
-       }
+   switch (v->type) {
+   case MESA_SHADER_VERTEX:
+      return v->binning_pass ? "BVERT" : "VERT";
+   case MESA_SHADER_TESS_CTRL:
+      return "TCS";
+   case MESA_SHADER_TESS_EVAL:
+      return "TES";
+   case MESA_SHADER_GEOMETRY:
+      return "GEOM";
+   case MESA_SHADER_FRAGMENT:
+      return "FRAG";
+   case MESA_SHADER_COMPUTE:
+      return "CL";
+   default:
+      unreachable("invalid type");
+      return NULL;
+   }
 }
 
 /* Currently we do not do binning for tess.  And for GS there is no
@@ -706,9 +716,9 @@ ir3_shader_stage(struct ir3_shader_variant *v)
 static inline bool
 ir3_has_binning_vs(const struct ir3_shader_key *key)
 {
-       if (key->tessellation || key->has_gs)
-               return false;
-       return true;
+   if (key->tessellation || key->has_gs)
+      return false;
+   return true;
 }
 
 /**
@@ -716,34 +726,34 @@ ir3_has_binning_vs(const struct ir3_shader_key *key)
  * generated.
  */
 struct ir3_shader {
-       gl_shader_stage type;
+   gl_shader_stage type;
 
-       /* shader id (for debug): */
-       uint32_t id;
-       uint32_t variant_count;
+   /* shader id (for debug): */
+   uint32_t id;
+   uint32_t variant_count;
 
-       /* Set by freedreno after shader_state_create, so we can emit debug info
-        * when recompiling a shader at draw time.
-        */
-       bool initial_variants_done;
+   /* Set by freedreno after shader_state_create, so we can emit debug info
+    * when recompiling a shader at draw time.
+    */
+   bool initial_variants_done;
 
-       struct ir3_compiler *compiler;
+   struct ir3_compiler *compiler;
 
-       unsigned num_reserved_user_consts;
+   unsigned num_reserved_user_consts;
 
-       bool nir_finalized;
-       struct nir_shader *nir;
-       struct ir3_stream_output_info stream_output;
+   bool nir_finalized;
+   struct nir_shader *nir;
+   struct ir3_stream_output_info stream_output;
 
-       struct ir3_shader_variant *variants;
-       mtx_t variants_lock;
+   struct ir3_shader_variant *variants;
+   mtx_t variants_lock;
 
-       cache_key cache_key;     /* shader disk-cache key */
+   cache_key cache_key; /* shader disk-cache key */
 
-       /* Bitmask of bits of the shader key used by this shader.  Used to avoid
-        * recompiles for GL NOS that doesn't actually apply to the shader.
-        */
-       struct ir3_shader_key key_mask;
+   /* Bitmask of bits of the shader key used by this shader.  Used to avoid
+    * recompiles for GL NOS that doesn't actually apply to the shader.
+    */
+   struct ir3_shader_key key_mask;
 };
 
 /**
@@ -754,9 +764,9 @@ struct ir3_shader {
 static inline struct ir3_const_state *
 ir3_const_state(const struct ir3_shader_variant *v)
 {
-       if (v->binning_pass)
-               return v->nonbinning->const_state;
-       return v->const_state;
+   if (v->binning_pass)
+      return v->nonbinning->const_state;
+   return v->const_state;
 }
 
 /* Given a variant, calculate the maximum constlen it can have.
@@ -765,32 +775,35 @@ ir3_const_state(const struct ir3_shader_variant *v)
 static inline unsigned
 ir3_max_const(const struct ir3_shader_variant *v)
 {
-       const struct ir3_compiler *compiler = v->shader->compiler;
-
-       if (v->shader->type == MESA_SHADER_COMPUTE) {
-               return compiler->max_const_compute;
-       } else if (v->key.safe_constlen) {
-               return compiler->max_const_safe;
-       } else if (v->shader->type == MESA_SHADER_FRAGMENT) {
-               return compiler->max_const_frag;
-       } else {
-               return compiler->max_const_geom;
-       }
+   const struct ir3_compiler *compiler = v->shader->compiler;
+
+   if (v->shader->type == MESA_SHADER_COMPUTE) {
+      return compiler->max_const_compute;
+   } else if (v->key.safe_constlen) {
+      return compiler->max_const_safe;
+   } else if (v->shader->type == MESA_SHADER_FRAGMENT) {
+      return compiler->max_const_frag;
+   } else {
+      return compiler->max_const_geom;
+   }
 }
 
-void * ir3_shader_assemble(struct ir3_shader_variant *v);
-struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader,
-               const struct ir3_shader_key *key, bool binning_pass, bool keep_ir, bool *created);
-struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
-               unsigned reserved_user_consts, struct ir3_stream_output_info *stream_output);
+void *ir3_shader_assemble(struct ir3_shader_variant *v);
+struct ir3_shader_variant *
+ir3_shader_get_variant(struct ir3_shader *shader,
+                       const struct ir3_shader_key *key, bool binning_pass,
+                       bool keep_ir, bool *created);
+struct ir3_shader *
+ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
+                    unsigned reserved_user_consts,
+                    struct ir3_stream_output_info *stream_output);
 uint32_t ir3_trim_constlen(struct ir3_shader_variant **variants,
-               const struct ir3_compiler *compiler);
+                           const struct ir3_compiler *compiler);
 void ir3_shader_destroy(struct ir3_shader *shader);
 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
 uint64_t ir3_shader_outputs(const struct ir3_shader *so);
 
-int
-ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
+int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
 
 /*
  * Helper/util:
@@ -801,192 +814,192 @@ ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
 static inline void
 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
 {
-       uint32_t *key_bits = (uint32_t *)key;
-       uint32_t *key_mask = (uint32_t *)&shader->key_mask;
-       STATIC_ASSERT(sizeof(*key) % 4 == 0);
-       for (int i = 0; i < sizeof(*key) >> 2; i++)
-               key_bits[i] &= key_mask[i];
+   uint32_t *key_bits = (uint32_t *)key;
+   uint32_t *key_mask = (uint32_t *)&shader->key_mask;
+   STATIC_ASSERT(sizeof(*key) % 4 == 0);
+   for (int i = 0; i < sizeof(*key) >> 2; i++)
+      key_bits[i] &= key_mask[i];
 }
 
 static inline int
 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
 {
-       int j;
-
-       for (j = 0; j < so->outputs_count; j++)
-               if (so->outputs[j].slot == slot)
-                       return j;
-
-       /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
-        * in the vertex shader.. but the fragment shader doesn't know this
-        * so  it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
-        * at link time if there is no matching OUT.BCOLOR[n], we must map
-        * OUT.COLOR[n] to IN.BCOLOR[n].  And visa versa if there is only
-        * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
-        */
-       if (slot == VARYING_SLOT_BFC0) {
-               slot = VARYING_SLOT_COL0;
-       } else if (slot == VARYING_SLOT_BFC1) {
-               slot = VARYING_SLOT_COL1;
-       } else if (slot == VARYING_SLOT_COL0) {
-               slot = VARYING_SLOT_BFC0;
-       } else if (slot == VARYING_SLOT_COL1) {
-               slot = VARYING_SLOT_BFC1;
-       } else {
-               return -1;
-       }
-
-       for (j = 0; j < so->outputs_count; j++)
-               if (so->outputs[j].slot == slot)
-                       return j;
-
-       debug_assert(0);
-
-       return -1;
+   int j;
+
+   for (j = 0; j < so->outputs_count; j++)
+      if (so->outputs[j].slot == slot)
+         return j;
+
+   /* it seems optional to have an OUT.BCOLOR[n] for each OUT.COLOR[n]
+    * in the vertex shader.. but the fragment shader doesn't know this
+    * so it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
+    * at link time if there is no matching OUT.BCOLOR[n], we must map
+    * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
+    * an OUT.BCOLOR[n] but no matching OUT.COLOR[n]
+    */
+   if (slot == VARYING_SLOT_BFC0) {
+      slot = VARYING_SLOT_COL0;
+   } else if (slot == VARYING_SLOT_BFC1) {
+      slot = VARYING_SLOT_COL1;
+   } else if (slot == VARYING_SLOT_COL0) {
+      slot = VARYING_SLOT_BFC0;
+   } else if (slot == VARYING_SLOT_COL1) {
+      slot = VARYING_SLOT_BFC1;
+   } else {
+      return -1;
+   }
+
+   for (j = 0; j < so->outputs_count; j++)
+      if (so->outputs[j].slot == slot)
+         return j;
+
+   debug_assert(0);
+
+   return -1;
 }
 
 static inline int
 ir3_next_varying(const struct ir3_shader_variant *so, int i)
 {
-       while (++i < so->inputs_count)
-               if (so->inputs[i].compmask && so->inputs[i].bary)
-                       break;
-       return i;
+   while (++i < so->inputs_count)
+      if (so->inputs[i].compmask && so->inputs[i].bary)
+         break;
+   return i;
 }
 
 struct ir3_shader_linkage {
-       /* Maximum location either consumed by the fragment shader or produced by
-        * the last geometry stage, i.e. the size required for each vertex in the
-        * VPC in DWORD's.
-        */
-       uint8_t max_loc;
-
-       /* Number of entries in var. */
-       uint8_t cnt;
-
-       /* Bitset of locations used, including ones which are only used by the FS.
-        */
-       uint32_t varmask[4];
-
-       /* Map from VS output to location. */
-       struct {
-               uint8_t regid;
-               uint8_t compmask;
-               uint8_t loc;
-       } var[32];
-
-       /* location for fixed-function gl_PrimitiveID passthrough */
-       uint8_t primid_loc;
-
-       /* location for fixed-function gl_ViewIndex passthrough */
-       uint8_t viewid_loc;
-
-       /* location for combined clip/cull distance arrays */
-       uint8_t clip0_loc, clip1_loc;
+   /* Maximum location either consumed by the fragment shader or produced by
+    * the last geometry stage, i.e. the size required for each vertex in the
+    * VPC in DWORDs.
+    */
+   uint8_t max_loc;
+
+   /* Number of entries in var. */
+   uint8_t cnt;
+
+   /* Bitset of locations used, including ones which are only used by the FS.
+    */
+   uint32_t varmask[4];
+
+   /* Map from VS output to location. */
+   struct {
+      uint8_t regid;
+      uint8_t compmask;
+      uint8_t loc;
+   } var[32];
+
+   /* location for fixed-function gl_PrimitiveID passthrough */
+   uint8_t primid_loc;
+
+   /* location for fixed-function gl_ViewIndex passthrough */
+   uint8_t viewid_loc;
+
+   /* location for combined clip/cull distance arrays */
+   uint8_t clip0_loc, clip1_loc;
 };
 
 static inline void
-ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid_, uint8_t compmask, uint8_t loc)
+ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid_, uint8_t compmask,
+             uint8_t loc)
 {
-       for (int j = 0; j < util_last_bit(compmask); j++) {
-               uint8_t comploc = loc + j;
-               l->varmask[comploc / 32] |= 1 << (comploc % 32);
-       }
+   for (int j = 0; j < util_last_bit(compmask); j++) {
+      uint8_t comploc = loc + j;
+      l->varmask[comploc / 32] |= 1 << (comploc % 32);
+   }
 
-       l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
+   l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
 
-       if (regid_ != regid(63, 0)) {
-               int i = l->cnt++;
-               debug_assert(i < ARRAY_SIZE(l->var));
+   if (regid_ != regid(63, 0)) {
+      int i = l->cnt++;
+      debug_assert(i < ARRAY_SIZE(l->var));
 
-               l->var[i].regid    = regid_;
-               l->var[i].compmask = compmask;
-               l->var[i].loc      = loc;
-       }
+      l->var[i].regid = regid_;
+      l->var[i].compmask = compmask;
+      l->var[i].loc = loc;
+   }
 }
 
 static inline void
 ir3_link_shaders(struct ir3_shader_linkage *l,
-               const struct ir3_shader_variant *vs,
-               const struct ir3_shader_variant *fs,
-               bool pack_vs_out)
+                 const struct ir3_shader_variant *vs,
+                 const struct ir3_shader_variant *fs, bool pack_vs_out)
 {
-       /* On older platforms, varmask isn't programmed at all, and it appears
-        * that the hardware generates a mask of used VPC locations using the VS
-        * output map, and hangs if a FS bary instruction references a location
-        * not in the list. This means that we need to have a dummy entry in the
-        * VS out map for things like gl_PointCoord which aren't written by the
-        * VS. Furthermore we can't use r63.x, so just pick a random register to
-        * use if there is no VS output.
-        */
-       const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
-       int j = -1, k;
-
-       l->primid_loc = 0xff;
-       l->viewid_loc = 0xff;
-       l->clip0_loc = 0xff;
-       l->clip1_loc = 0xff;
-
-       while (l->cnt < ARRAY_SIZE(l->var)) {
-               j = ir3_next_varying(fs, j);
-
-               if (j >= fs->inputs_count)
-                       break;
-
-               if (fs->inputs[j].inloc >= fs->total_in)
-                       continue;
-
-               k = ir3_find_output(vs, fs->inputs[j].slot);
-
-               if (k < 0 && fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
-                       l->primid_loc = fs->inputs[j].inloc;
-               }
-
-               if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
-                       assert(k < 0);
-                       l->viewid_loc = fs->inputs[j].inloc;
-               }
-
-               if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
-                       l->clip0_loc = fs->inputs[j].inloc;
-
-               if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
-                       l->clip1_loc = fs->inputs[j].inloc;
-
-               ir3_link_add(l, k >= 0 ? vs->outputs[k].regid : default_regid,
-                       fs->inputs[j].compmask, fs->inputs[j].inloc);
-       }
+   /* On older platforms, varmask isn't programmed at all, and it appears
+    * that the hardware generates a mask of used VPC locations using the VS
+    * output map, and hangs if a FS bary instruction references a location
+    * not in the list. This means that we need to have a dummy entry in the
+    * VS out map for things like gl_PointCoord which aren't written by the
+    * VS. Furthermore we can't use r63.x, so just pick a random register to
+    * use if there is no VS output.
+    */
+   const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
+   int j = -1, k;
+
+   l->primid_loc = 0xff;
+   l->viewid_loc = 0xff;
+   l->clip0_loc = 0xff;
+   l->clip1_loc = 0xff;
+
+   while (l->cnt < ARRAY_SIZE(l->var)) {
+      j = ir3_next_varying(fs, j);
+
+      if (j >= fs->inputs_count)
+         break;
+
+      if (fs->inputs[j].inloc >= fs->total_in)
+         continue;
+
+      k = ir3_find_output(vs, fs->inputs[j].slot);
+
+      if (k < 0 && fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
+         l->primid_loc = fs->inputs[j].inloc;
+      }
+
+      if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
+         assert(k < 0);
+         l->viewid_loc = fs->inputs[j].inloc;
+      }
+
+      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
+         l->clip0_loc = fs->inputs[j].inloc;
+
+      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
+         l->clip1_loc = fs->inputs[j].inloc;
+
+      ir3_link_add(l, k >= 0 ? vs->outputs[k].regid : default_regid,
+                   fs->inputs[j].compmask, fs->inputs[j].inloc);
+   }
 }
 
 static inline uint32_t
 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
 {
-       int j;
-       for (j = 0; j < so->outputs_count; j++)
-               if (so->outputs[j].slot == slot) {
-                       uint32_t regid = so->outputs[j].regid;
-                       if (so->outputs[j].half)
-                               regid |= HALF_REG_ID;
-                       return regid;
-               }
-       return regid(63, 0);
+   int j;
+   for (j = 0; j < so->outputs_count; j++)
+      if (so->outputs[j].slot == slot) {
+         uint32_t regid = so->outputs[j].regid;
+         if (so->outputs[j].half)
+            regid |= HALF_REG_ID;
+         return regid;
+      }
+   return regid(63, 0);
 }
 
-void ir3_link_stream_out(struct ir3_shader_linkage *l, const struct ir3_shader_variant *v);
-
-#define VARYING_SLOT_GS_HEADER_IR3                     (VARYING_SLOT_MAX + 0)
-#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3       (VARYING_SLOT_MAX + 1)
-#define VARYING_SLOT_TCS_HEADER_IR3                    (VARYING_SLOT_MAX + 2)
+void ir3_link_stream_out(struct ir3_shader_linkage *l,
+                         const struct ir3_shader_variant *v);
 
+#define VARYING_SLOT_GS_HEADER_IR3       (VARYING_SLOT_MAX + 0)
+#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
+#define VARYING_SLOT_TCS_HEADER_IR3      (VARYING_SLOT_MAX + 2)
 
 static inline uint32_t
 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
 {
-       int j;
-       for (j = 0; j < so->inputs_count; j++)
-               if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
-                       return so->inputs[j].regid;
-       return regid(63, 0);
+   int j;
+   for (j = 0; j < so->inputs_count; j++)
+      if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
+         return so->inputs[j].regid;
+   return regid(63, 0);
 }
 
 /* calculate register footprint in terms of half-regs (ie. one full
@@ -995,35 +1008,35 @@ ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
 static inline uint32_t
 ir3_shader_halfregs(const struct ir3_shader_variant *v)
 {
-       return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
+   return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
 }
 
 static inline uint32_t
 ir3_shader_nibo(const struct ir3_shader_variant *v)
 {
-       /* The dummy variant used in binning mode won't have an actual shader. */
-       if (!v->shader)
-               return 0;
+   /* The dummy variant used in binning mode won't have an actual shader. */
+   if (!v->shader)
+      return 0;
 
-       return v->shader->nir->info.num_ssbos + v->shader->nir->info.num_images;
+   return v->shader->nir->info.num_ssbos + v->shader->nir->info.num_images;
 }
 
 static inline uint32_t
 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
 {
-       /* Dummy shader */
-       if (!v->shader)
-               return 0;
-
-       if (v->shader->compiler->gpu_id < 500)
-               return v->branchstack;
-
-       if (v->branchstack > 0) {
-               uint32_t branchstack = v->branchstack / 2 + 1;
-               return MIN2(branchstack, v->shader->compiler->branchstack_size / 2);
-       } else {
-               return 0;
-       }
+   /* Dummy shader */
+   if (!v->shader)
+      return 0;
+
+   if (v->shader->compiler->gpu_id < 500)
+      return v->branchstack;
+
+   if (v->branchstack > 0) {
+      uint32_t branchstack = v->branchstack / 2 + 1;
+      return MIN2(branchstack, v->shader->compiler->branchstack_size / 2);
+   } else {
+      return 0;
+   }
 }
 
 #endif /* IR3_SHADER_H_ */
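Beyond the indentation changes, ir3_link_add() above is worth a moment: it records each consumed varying component as a bit in the 128-bit varmask and tracks the high-water mark in max_loc. Below is a minimal standalone sketch of that bookkeeping, written for illustration only; MAX2() and last_bit() are local stand-ins for the Mesa util helpers, and none of this is part of the diff.

#include <assert.h>
#include <stdint.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* stand-in for util_last_bit(): index of the highest set bit, plus one */
static unsigned
last_bit(unsigned mask)
{
   unsigned n = 0;
   while (mask) {
      n++;
      mask >>= 1;
   }
   return n;
}

int
main(void)
{
   uint32_t varmask[4] = {0};
   uint8_t max_loc = 0;

   /* link one varying: compmask 0xf (xyzw) at location 4 */
   uint8_t compmask = 0xf, loc = 4;
   for (unsigned j = 0; j < last_bit(compmask); j++) {
      uint8_t comploc = loc + j;
      varmask[comploc / 32] |= 1u << (comploc % 32);
   }
   max_loc = MAX2(max_loc, loc + last_bit(compmask));

   assert(varmask[0] == 0xf0); /* components 4..7 marked as used */
   assert(max_loc == 8);       /* per-vertex VPC size needed, in dwords */
   return 0;
}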
index 5f3dee7..2b7ff35 100644 (file)
@@ -21,9 +21,9 @@
  * SOFTWARE.
  */
 
+#include "util/rb_tree.h"
 #include "ir3_ra.h"
 #include "ir3_shader.h"
-#include "util/rb_tree.h"
 
 /*
  * This pass does one thing so far:
  */
 
 struct ra_spill_interval {
-       struct ir3_reg_interval interval;
+   struct ir3_reg_interval interval;
 };
 
 struct ra_spill_ctx {
-       struct ir3_reg_ctx reg_ctx;
+   struct ir3_reg_ctx reg_ctx;
 
-       struct ra_spill_interval *intervals;
+   struct ra_spill_interval *intervals;
 
-       struct ir3_pressure cur_pressure, max_pressure;
+   struct ir3_pressure cur_pressure, max_pressure;
 
-       struct ir3_liveness *live;
+   struct ir3_liveness *live;
 
-       const struct ir3_compiler *compiler;
+   const struct ir3_compiler *compiler;
 };
 
 static void
-ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg)
+ra_spill_interval_init(struct ra_spill_interval *interval,
+                       struct ir3_register *reg)
 {
-       ir3_reg_interval_init(&interval->interval, reg);
+   ir3_reg_interval_init(&interval->interval, reg);
 }
 
 static void
-ra_pressure_add(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
+ra_pressure_add(struct ir3_pressure *pressure,
+                struct ra_spill_interval *interval)
 {
-       unsigned size = reg_size(interval->interval.reg);
-       if (interval->interval.reg->flags & IR3_REG_SHARED)
-               pressure->shared += size;
-       else if (interval->interval.reg->flags & IR3_REG_HALF)
-               pressure->half += size;
-       else
-               pressure->full += size;
+   unsigned size = reg_size(interval->interval.reg);
+   if (interval->interval.reg->flags & IR3_REG_SHARED)
+      pressure->shared += size;
+   else if (interval->interval.reg->flags & IR3_REG_HALF)
+      pressure->half += size;
+   else
+      pressure->full += size;
 }
 
 static void
-ra_pressure_sub(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
+ra_pressure_sub(struct ir3_pressure *pressure,
+                struct ra_spill_interval *interval)
 {
-       unsigned size = reg_size(interval->interval.reg);
-       if (interval->interval.reg->flags & IR3_REG_SHARED)
-               pressure->shared -= size;
-       else if (interval->interval.reg->flags & IR3_REG_HALF)
-               pressure->half -= size;
-       else
-               pressure->full -= size;
+   unsigned size = reg_size(interval->interval.reg);
+   if (interval->interval.reg->flags & IR3_REG_SHARED)
+      pressure->shared -= size;
+   else if (interval->interval.reg->flags & IR3_REG_HALF)
+      pressure->half -= size;
+   else
+      pressure->full -= size;
 }
 
 static struct ra_spill_interval *
 ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
 {
-       return rb_node_data(struct ra_spill_interval, interval, interval);
+   return rb_node_data(struct ra_spill_interval, interval, interval);
 }
 
 static struct ra_spill_ctx *
 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
 {
-       return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
+   return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
 }
 
 static void
 interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
 {
-       struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
-       struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
+   struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
+   struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
 
-       ra_pressure_add(&ctx->cur_pressure, interval);
+   ra_pressure_add(&ctx->cur_pressure, interval);
 }
 
 static void
 interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
 {
-       struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
-       struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
+   struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
+   struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
 
-       ra_pressure_sub(&ctx->cur_pressure, interval);
+   ra_pressure_sub(&ctx->cur_pressure, interval);
 }
 
 static void
 interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
-                          struct ir3_reg_interval *_child)
+               struct ir3_reg_interval *_child)
 {
-       interval_add(_ctx, _child);
+   interval_add(_ctx, _child);
 }
 
 static void
 spill_ctx_init(struct ra_spill_ctx *ctx)
 {
-       rb_tree_init(&ctx->reg_ctx.intervals);
-       ctx->reg_ctx.interval_add = interval_add;
-       ctx->reg_ctx.interval_delete = interval_delete;
-       ctx->reg_ctx.interval_readd = interval_readd;
+   rb_tree_init(&ctx->reg_ctx.intervals);
+   ctx->reg_ctx.interval_add = interval_add;
+   ctx->reg_ctx.interval_delete = interval_delete;
+   ctx->reg_ctx.interval_readd = interval_readd;
 }
 
 static void
-ra_spill_ctx_insert(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
+ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
+                    struct ra_spill_interval *interval)
 {
-       ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
+   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
 }
 
 static void
-ra_spill_ctx_remove(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
+ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
+                    struct ra_spill_interval *interval)
 {
-       ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
+   ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
 }
 
 static void
 init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
 {
-       struct ra_spill_interval *interval = &ctx->intervals[dst->name];
-       ra_spill_interval_init(interval, dst);
+   struct ra_spill_interval *interval = &ctx->intervals[dst->name];
+   ra_spill_interval_init(interval, dst);
 }
 
 static void
 insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
 {
-       struct ra_spill_interval *interval = &ctx->intervals[dst->name];
-       if (interval->interval.inserted)
-               return;
-
-       ra_spill_ctx_insert(ctx, interval);
-
-       /* For precolored inputs, make sure we leave enough registers to allow for
-        * holes in the inputs. It can happen that the binning shader has a lower
-        * register pressure than the main shader, but the main shader decided to
-        * add holes between the inputs which means that the binning shader has a
-        * higher register demand.
-        */
-       if (dst->instr->opc == OPC_META_INPUT &&
-               dst->num != INVALID_REG) {
-               physreg_t physreg = ra_reg_get_physreg(dst);
-               physreg_t max = physreg + reg_size(dst);
-
-               if (interval->interval.reg->flags & IR3_REG_SHARED)
-                       ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
-               else if (interval->interval.reg->flags & IR3_REG_HALF)
-                       ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
-               else
-                       ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
-       }
+   struct ra_spill_interval *interval = &ctx->intervals[dst->name];
+   if (interval->interval.inserted)
+      return;
+
+   ra_spill_ctx_insert(ctx, interval);
+
+   /* For precolored inputs, make sure we leave enough registers to allow for
+    * holes in the inputs. It can happen that the binning shader has a lower
+    * register pressure than the main shader, but the main shader decided to
+    * add holes between the inputs which means that the binning shader has a
+    * higher register demand.
+    */
+   if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
+      physreg_t physreg = ra_reg_get_physreg(dst);
+      physreg_t max = physreg + reg_size(dst);
+
+      if (interval->interval.reg->flags & IR3_REG_SHARED)
+         ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
+      else if (interval->interval.reg->flags & IR3_REG_HALF)
+         ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
+      else
+         ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
+   }
 }
 
 static void
-remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
+remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
+                 struct ir3_register *src)
 {
-       if (!(src->flags & IR3_REG_FIRST_KILL))
-               return;
+   if (!(src->flags & IR3_REG_FIRST_KILL))
+      return;
 
-       struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
+   struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
 
-       if (!interval->interval.inserted || interval->interval.parent ||
-               !rb_tree_is_empty(&interval->interval.children))
-               return;
+   if (!interval->interval.inserted || interval->interval.parent ||
+       !rb_tree_is_empty(&interval->interval.children))
+      return;
 
-       ra_spill_ctx_remove(ctx, interval);
+   ra_spill_ctx_remove(ctx, interval);
 }
 
 static void
-remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
+remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
+           struct ir3_register *src)
 {
-       if (!(src->flags & IR3_REG_FIRST_KILL))
-               return;
+   if (!(src->flags & IR3_REG_FIRST_KILL))
+      return;
 
-       struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
+   struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
 
-       if (!interval->interval.inserted)
-               return;
+   if (!interval->interval.inserted)
+      return;
 
-       ra_spill_ctx_remove(ctx, interval);
+   ra_spill_ctx_remove(ctx, interval);
 }
 
 static void
 remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
 {
-       struct ra_spill_interval *interval = &ctx->intervals[dst->name];
+   struct ra_spill_interval *interval = &ctx->intervals[dst->name];
 
-       if (!interval->interval.inserted)
-               return;
+   if (!interval->interval.inserted)
+      return;
 
-       ra_spill_ctx_remove(ctx, interval);
+   ra_spill_ctx_remove(ctx, interval);
 }
 
 static void
 update_max_pressure(struct ra_spill_ctx *ctx)
 {
-       d("pressure:");
-       d("\tfull: %u", ctx->cur_pressure.full);
-       d("\thalf: %u", ctx->cur_pressure.half);
-       d("\tshared: %u", ctx->cur_pressure.shared);
-
-       ctx->max_pressure.full =
-               MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
-       ctx->max_pressure.half =
-               MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
-       ctx->max_pressure.shared =
-               MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
+   d("pressure:");
+   d("\tfull: %u", ctx->cur_pressure.full);
+   d("\thalf: %u", ctx->cur_pressure.half);
+   d("\tshared: %u", ctx->cur_pressure.shared);
+
+   ctx->max_pressure.full =
+      MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
+   ctx->max_pressure.half =
+      MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
+   ctx->max_pressure.shared =
+      MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
 }
 
 static void
 handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
-       if (RA_DEBUG) {
-               printf("processing: ");
-               ir3_print_instr(instr);
-       }
-
-       ra_foreach_dst(dst, instr) {
-               init_dst(ctx, dst);
-       }
-
-       /* Handle tied destinations. If a destination is tied to a source and that
-        * source is live-through, then we need to allocate a new register for the
-        * destination which is live-through itself and cannot overlap the
-        * sources.
-        */
-
-       ra_foreach_dst(dst, instr) {
-               struct ir3_register *tied_src = dst->tied;
-               if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL))
-                       insert_dst(ctx, dst);
-       }
-
-       update_max_pressure(ctx);
-
-       ra_foreach_src(src, instr) {
-               if (src->flags & IR3_REG_FIRST_KILL)
-                       remove_src_early(ctx, instr, src);
-       }
-
-
-       ra_foreach_dst(dst, instr) {
-               insert_dst(ctx, dst);
-       }
-
-       update_max_pressure(ctx);
-
-       for (unsigned i = 0; i < instr->srcs_count; i++) {
-               if (ra_reg_is_src(instr->srcs[i]) && 
-                       (instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
-                       remove_src(ctx, instr, instr->srcs[i]);
-       }
-       for (unsigned i = 0; i < instr->dsts_count; i++) {
-               if (ra_reg_is_dst(instr->dsts[i]) &&
-                                (instr->dsts[i]->flags & IR3_REG_UNUSED))
-                       remove_dst(ctx, instr->dsts[i]);
-       }
+   if (RA_DEBUG) {
+      printf("processing: ");
+      ir3_print_instr(instr);
+   }
+
+   ra_foreach_dst (dst, instr) {
+      init_dst(ctx, dst);
+   }
+
+   /* Handle tied destinations. If a destination is tied to a source and that
+    * source is live-through, then we need to allocate a new register for the
+    * destination which is live-through itself and cannot overlap the
+    * sources.
+    */
+
+   ra_foreach_dst (dst, instr) {
+      struct ir3_register *tied_src = dst->tied;
+      if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL))
+         insert_dst(ctx, dst);
+   }
+
+   update_max_pressure(ctx);
+
+   ra_foreach_src (src, instr) {
+      if (src->flags & IR3_REG_FIRST_KILL)
+         remove_src_early(ctx, instr, src);
+   }
+
+   ra_foreach_dst (dst, instr) {
+      insert_dst(ctx, dst);
+   }
+
+   update_max_pressure(ctx);
+
+   for (unsigned i = 0; i < instr->srcs_count; i++) {
+      if (ra_reg_is_src(instr->srcs[i]) &&
+          (instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
+         remove_src(ctx, instr, instr->srcs[i]);
+   }
+   for (unsigned i = 0; i < instr->dsts_count; i++) {
+      if (ra_reg_is_dst(instr->dsts[i]) &&
+          (instr->dsts[i]->flags & IR3_REG_UNUSED))
+         remove_dst(ctx, instr->dsts[i]);
+   }
 }
 
 static void
 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
-       init_dst(ctx, instr->dsts[0]);
-       insert_dst(ctx, instr->dsts[0]);
+   init_dst(ctx, instr->dsts[0]);
+   insert_dst(ctx, instr->dsts[0]);
 }
 
 static void
 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
-       ra_foreach_src(src, instr)
-               remove_src(ctx, instr, src);
-       if (instr->dsts[0]->flags & IR3_REG_UNUSED)
-               remove_dst(ctx, instr->dsts[0]);
+   ra_foreach_src (src, instr)
+      remove_src(ctx, instr, src);
+   if (instr->dsts[0]->flags & IR3_REG_UNUSED)
+      remove_dst(ctx, instr->dsts[0]);
 }
 
 static void
 handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def)
 {
-       struct ra_spill_interval *interval = &ctx->intervals[def->name];
-       ra_spill_interval_init(interval, def);
-       insert_dst(ctx, def);
+   struct ra_spill_interval *interval = &ctx->intervals[def->name];
+   ra_spill_interval_init(interval, def);
+   insert_dst(ctx, def);
 }
 
 static void
 handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
 {
-       memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
-       rb_tree_init(&ctx->reg_ctx.intervals);
-
-       unsigned name;
-       BITSET_FOREACH_SET(name, ctx->live->live_in[block->index],
-                                          ctx->live->definitions_count) {
-               struct ir3_register *reg = ctx->live->definitions[name];
-               handle_live_in(ctx, reg);
-       }
-
-       foreach_instr (instr, &block->instr_list) {
-               if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
-                       instr->opc != OPC_META_TEX_PREFETCH)
-                       break;
-               handle_input_phi(ctx, instr);
-       }
-
-       update_max_pressure(ctx);
-
-       foreach_instr (instr, &block->instr_list) {
-               if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
-                       instr->opc == OPC_META_TEX_PREFETCH)
-                       remove_input_phi(ctx, instr);
-               else
-                       handle_instr(ctx, instr);
-       }
+   memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
+   rb_tree_init(&ctx->reg_ctx.intervals);
+
+   unsigned name;
+   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
+                       ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      handle_live_in(ctx, reg);
+   }
+
+   foreach_instr (instr, &block->instr_list) {
+      if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
+          instr->opc != OPC_META_TEX_PREFETCH)
+         break;
+      handle_input_phi(ctx, instr);
+   }
+
+   update_max_pressure(ctx);
+
+   foreach_instr (instr, &block->instr_list) {
+      if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
+          instr->opc == OPC_META_TEX_PREFETCH)
+         remove_input_phi(ctx, instr);
+      else
+         handle_instr(ctx, instr);
+   }
 }
 
 void
 ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
-                                 struct ir3_pressure *max_pressure)
+                  struct ir3_pressure *max_pressure)
 {
-       struct ra_spill_ctx ctx = {};
-       ctx.live = live;
-       ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
-       ctx.compiler = v->shader->compiler;
-       spill_ctx_init(&ctx);
+   struct ra_spill_ctx ctx = {};
+   ctx.live = live;
+   ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
+   ctx.compiler = v->shader->compiler;
+   spill_ctx_init(&ctx);
 
-       foreach_block (block, &v->ir->block_list) {
-               handle_block(&ctx, block);
-       }
+   foreach_block (block, &v->ir->block_list) {
+      handle_block(&ctx, block);
+   }
 
-       assert(ctx.cur_pressure.full == 0);
-       assert(ctx.cur_pressure.half == 0);
-       assert(ctx.cur_pressure.shared == 0);
+   assert(ctx.cur_pressure.full == 0);
+   assert(ctx.cur_pressure.half == 0);
+   assert(ctx.cur_pressure.shared == 0);
 
-       free(ctx.intervals);
+   free(ctx.intervals);
 
-       *max_pressure = ctx.max_pressure;
+   *max_pressure = ctx.max_pressure;
 }
-
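ir3_calc_pressure() above is a bookkeeping pass: insert an interval when a value is defined, remove it at its last kill, and fold the running totals into a maximum. Here is a reduced, standalone sketch of that pattern with a single counter; names such as value_def() and last_use() are invented for illustration and are not the ir3 API.

#include <assert.h>

struct pressure {
   unsigned cur, max;
};

/* a value becomes live: bump current pressure and the recorded peak */
static void
value_def(struct pressure *p, unsigned size)
{
   p->cur += size;
   if (p->cur > p->max)
      p->max = p->cur;
}

/* last use of a value: it stops occupying registers */
static void
last_use(struct pressure *p, unsigned size)
{
   p->cur -= size;
}

int
main(void)
{
   struct pressure p = {0, 0};

   value_def(&p, 2); /* full register: two half-reg slots */
   value_def(&p, 1); /* half register */
   last_use(&p, 2);  /* the full register dies */
   value_def(&p, 1);

   assert(p.max == 3); /* the peak demand, which RA must satisfy */
   assert(p.cur == 2);
   return 0;
}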
index 4eb1b67..4c07b02 100644 (file)
 #include "ir3.h"
 
 struct ir3_validate_ctx {
-       struct ir3 *ir;
+   struct ir3 *ir;
 
-       /* Current instruction being validated: */
-       struct ir3_instruction *current_instr;
+   /* Current instruction being validated: */
+   struct ir3_instruction *current_instr;
 
-       /* Set of instructions found so far, used to validate that we
-        * don't have SSA uses that occure before def's
-        */
-       struct set *defs;
+   /* Set of instructions found so far, used to validate that we
+    * don't have SSA uses that occur before defs
+    */
+   struct set *defs;
 };
 
 static void
 validate_error(struct ir3_validate_ctx *ctx, const char *condstr)
 {
-       fprintf(stderr, "validation fail: %s\n", condstr);
-       fprintf(stderr, "  -> for instruction: ");
-       ir3_print_instr(ctx->current_instr);
-       abort();
+   fprintf(stderr, "validation fail: %s\n", condstr);
+   fprintf(stderr, "  -> for instruction: ");
+   ir3_print_instr(ctx->current_instr);
+   abort();
 }
 
-#define validate_assert(ctx, cond) do { \
-       if (!(cond)) { \
-               validate_error(ctx, #cond); \
-       } } while (0)
+#define validate_assert(ctx, cond)                                             \
+   do {                                                                        \
+      if (!(cond)) {                                                           \
+         validate_error(ctx, #cond);                                           \
+      }                                                                        \
+   } while (0)
 
 static unsigned
 reg_class_flags(struct ir3_register *reg)
 {
-       return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED);
+   return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED);
 }
 
 static void
 validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
-                        struct ir3_register *reg)
+             struct ir3_register *reg)
 {
-       if (!(reg->flags & IR3_REG_SSA) || !reg->def)
-               return;
-
-       struct ir3_register *src = reg->def;
-
-       validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr));
-       validate_assert(ctx, src->wrmask == reg->wrmask);
-       validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
-
-       if (reg->tied) {
-               validate_assert(ctx, reg->tied->tied == reg);
-               bool found = false;
-               foreach_dst (dst, instr) {
-                       if (dst == reg->tied) {
-                               found = true;
-                               break;
-                       }
-               }
-               validate_assert(ctx, found && "tied register not in the same instruction");
-       }
+   if (!(reg->flags & IR3_REG_SSA) || !reg->def)
+      return;
+
+   struct ir3_register *src = reg->def;
+
+   validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr));
+   validate_assert(ctx, src->wrmask == reg->wrmask);
+   validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
+
+   if (reg->tied) {
+      validate_assert(ctx, reg->tied->tied == reg);
+      bool found = false;
+      foreach_dst (dst, instr) {
+         if (dst == reg->tied) {
+            found = true;
+            break;
+         }
+      }
+      validate_assert(ctx,
+                      found && "tied register not in the same instruction");
+   }
 }
 
 /* phi sources are logically read at the end of the predecessor basic block,
@@ -90,275 +93,280 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
  * use comes after the definition for loop phis.
  */
 static void
-validate_phi_src(struct ir3_validate_ctx *ctx, struct ir3_block *block, struct ir3_block *pred)
+validate_phi_src(struct ir3_validate_ctx *ctx, struct ir3_block *block,
+                 struct ir3_block *pred)
 {
-       unsigned pred_idx = ir3_block_get_pred_index(block, pred);
+   unsigned pred_idx = ir3_block_get_pred_index(block, pred);
 
-       foreach_instr (phi, &block->instr_list) {
-               if (phi->opc != OPC_META_PHI)
-                       break;
+   foreach_instr (phi, &block->instr_list) {
+      if (phi->opc != OPC_META_PHI)
+         break;
 
-               ctx->current_instr = phi;
-               validate_assert(ctx, phi->srcs_count == block->predecessors_count);
-               validate_src(ctx, phi, phi->srcs[pred_idx]);
-       }
+      ctx->current_instr = phi;
+      validate_assert(ctx, phi->srcs_count == block->predecessors_count);
+      validate_src(ctx, phi, phi->srcs[pred_idx]);
+   }
 }
 
 static void
 validate_phi(struct ir3_validate_ctx *ctx, struct ir3_instruction *phi)
 {
-       _mesa_set_add(ctx->defs, phi);
-       validate_assert(ctx, phi->dsts_count == 1);
-       validate_assert(ctx, is_dest_gpr(phi->dsts[0]));
+   _mesa_set_add(ctx->defs, phi);
+   validate_assert(ctx, phi->dsts_count == 1);
+   validate_assert(ctx, is_dest_gpr(phi->dsts[0]));
 }
 
 static void
 validate_dst(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
-                        struct ir3_register *reg)
+             struct ir3_register *reg)
 {
-       if (reg->tied) {
-               validate_assert(ctx, reg->tied->tied == reg);
-               validate_assert(ctx, reg_class_flags(reg->tied) == reg_class_flags(reg));
-               validate_assert(ctx, reg->tied->wrmask == reg->wrmask);
-               if (reg->flags & IR3_REG_ARRAY) {
-                       validate_assert(ctx, reg->tied->array.base == reg->array.base);
-                       validate_assert(ctx, reg->tied->size == reg->size);
-               }
-               bool found = false;
-               foreach_src (src, instr) {
-                       if (src == reg->tied) {
-                               found = true;
-                               break;
-                       }
-               }
-               validate_assert(ctx, found && "tied register not in the same instruction");
-       }
-
-       if (reg->flags & IR3_REG_SSA)
-               validate_assert(ctx, reg->instr == instr);
-
-       if (reg->flags & IR3_REG_RELATIV)
-               validate_assert(ctx, instr->address);
+   if (reg->tied) {
+      validate_assert(ctx, reg->tied->tied == reg);
+      validate_assert(ctx, reg_class_flags(reg->tied) == reg_class_flags(reg));
+      validate_assert(ctx, reg->tied->wrmask == reg->wrmask);
+      if (reg->flags & IR3_REG_ARRAY) {
+         validate_assert(ctx, reg->tied->array.base == reg->array.base);
+         validate_assert(ctx, reg->tied->size == reg->size);
+      }
+      bool found = false;
+      foreach_src (src, instr) {
+         if (src == reg->tied) {
+            found = true;
+            break;
+         }
+      }
+      validate_assert(ctx,
+                      found && "tied register not in the same instruction");
+   }
+
+   if (reg->flags & IR3_REG_SSA)
+      validate_assert(ctx, reg->instr == instr);
+
+   if (reg->flags & IR3_REG_RELATIV)
+      validate_assert(ctx, instr->address);
 }
 
-#define validate_reg_size(ctx, reg, type) \
-       validate_assert(ctx, type_size(type) == (((reg)->flags & IR3_REG_HALF) ? 16 : 32))
+#define validate_reg_size(ctx, reg, type)                                      \
+   validate_assert(                                                            \
+      ctx, type_size(type) == (((reg)->flags & IR3_REG_HALF) ? 16 : 32))
 
 static void
 validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
 {
-       struct ir3_register *last_reg = NULL;
-
-       foreach_src_n (reg, n, instr) {
-               if (reg->flags & IR3_REG_RELATIV)
-                       validate_assert(ctx, instr->address);
-
-               validate_src(ctx, instr, reg);
-
-               /* Validate that all src's are either half of full.
-                *
-                * Note: tex instructions w/ .s2en are a bit special in that the
-                * tex/samp src reg is half-reg for non-bindless and full for
-                * bindless, irrespective of the precision of other srcs. The
-                * tex/samp src is the first src reg when .s2en is set
-                */
-               if (reg->tied) {
-                       /* must have the same size as the destination, handled in
-                        * validate_reg().
-                        */
-               } else if (reg == instr->address) {
-                       validate_assert(ctx, reg->flags & IR3_REG_HALF);
-               } else if ((instr->flags & IR3_INSTR_S2EN) && (n < 2)) {
-                       if (n == 0) {
-                               if (instr->flags & IR3_INSTR_B)
-                                       validate_assert(ctx, !(reg->flags & IR3_REG_HALF));
-                               else
-                                       validate_assert(ctx, reg->flags & IR3_REG_HALF);
-                       }
-               } else if (opc_cat(instr->opc) == 6) {
-                       /* handled below */
-               } else if (opc_cat(instr->opc) == 0) {
-                       /* end/chmask/etc are allowed to have different size sources */
-               } else if (n > 0) {
-                       validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == (reg->flags & IR3_REG_HALF));
-               }
-
-               last_reg = reg;
-       }
-       
-       for (unsigned i = 0; i < instr->dsts_count; i++) {
-               struct ir3_register *reg = instr->dsts[i];
-
-               validate_dst(ctx, instr, reg);
-       }
-
-       _mesa_set_add(ctx->defs, instr);
-
-       /* Check that src/dst types match the register types, and for
-        * instructions that have different opcodes depending on type,
-        * that the opcodes are correct.
-        */
-       switch (opc_cat(instr->opc)) {
-       case 1: /* move instructions */
-               if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) {
-                       validate_assert(ctx, instr->dsts_count == 1);
-                       validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED);
-                       validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
-                       validate_assert(ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1));
-               } else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO ||
-                                  instr->opc == OPC_READ_FIRST_MACRO ||
-                                  instr->opc == OPC_READ_COND_MACRO) {
-                       /* nothing yet */
-               } else if (instr->opc == OPC_ELECT_MACRO) {
-                       validate_assert(ctx, instr->dsts_count == 1);
-                       validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
-               } else {
-                       foreach_dst (dst, instr)
-                               validate_reg_size(ctx, dst, instr->cat1.dst_type);
-                       foreach_src (src, instr) {
-                               if (!src->tied && src != instr->address)
-                                       validate_reg_size(ctx, src, instr->cat1.src_type);
-                       }
-
-                       switch (instr->opc) {
-                               case OPC_SWZ:
-                                       validate_assert(ctx, instr->srcs_count == 2);
-                                       validate_assert(ctx, instr->dsts_count == 2);
-                                       break;
-                               case OPC_GAT:
-                                       validate_assert(ctx, instr->srcs_count == 4);
-                                       validate_assert(ctx, instr->dsts_count == 1);
-                                       break;
-                               case OPC_SCT:
-                                       validate_assert(ctx, instr->srcs_count == 1);
-                                       validate_assert(ctx, instr->dsts_count == 4);
-                                       break;
-                               default:
-                                       break;
-                       }
-               }
-
-               if (instr->opc != OPC_MOV)
-                       validate_assert(ctx, !instr->address);
-
-               break;
-       case 3:
-               /* Validate that cat3 opc matches the src type.  We've already checked that all
-                * the src regs are same type
-                */
-               if (instr->srcs[0]->flags & IR3_REG_HALF) {
-                       validate_assert(ctx, instr->opc == cat3_half_opc(instr->opc));
-               } else {
-                       validate_assert(ctx, instr->opc == cat3_full_opc(instr->opc));
-               }
-               break;
-       case 4:
-               /* Validate that cat4 opc matches the dst type: */
-               if (instr->dsts[0]->flags & IR3_REG_HALF) {
-                       validate_assert(ctx, instr->opc == cat4_half_opc(instr->opc));
-               } else {
-                       validate_assert(ctx, instr->opc == cat4_full_opc(instr->opc));
-               }
-               break;
-       case 5:
-               validate_reg_size(ctx, instr->dsts[0], instr->cat5.type);
-               break;
-       case 6:
-               switch (instr->opc) {
-               case OPC_RESINFO:
-               case OPC_RESFMT:
-                       validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
-                       validate_reg_size(ctx, instr->srcs[0], instr->cat6.type);
-                       break;
-               case OPC_L2G:
-               case OPC_G2L:
-                       validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
-                       validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
-                       break;
-               case OPC_STG:
-                       validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
-                       validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
-                       validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
-                       validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
-                       break;
-               case OPC_STG_A:
-                       validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
-                       validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
-                       validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
-                       validate_reg_size(ctx, instr->srcs[4], instr->cat6.type);
-                       validate_assert(ctx, !(instr->srcs[5]->flags & IR3_REG_HALF));
-                       break;
-               case OPC_STL:
-               case OPC_STP:
-               case OPC_STLW:
-                       validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
-                       validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
-                       validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
-                       break;
-               case OPC_STIB:
-                       if (instr->flags & IR3_INSTR_B) {
-                               validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
-                               validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
-                               validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
-                       } else {
-                               validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
-                               validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
-                               validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
-                       }
-                       break;
-               default:
-                       validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
-                       validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
-                       if (instr->srcs_count > 1)
-                               validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
-                       break;
-               }
-       }
+   struct ir3_register *last_reg = NULL;
+
+   foreach_src_n (reg, n, instr) {
+      if (reg->flags & IR3_REG_RELATIV)
+         validate_assert(ctx, instr->address);
+
+      validate_src(ctx, instr, reg);
+
+      /* Validate that all src's are either half or full.
+       *
+       * Note: tex instructions w/ .s2en are a bit special in that the
+       * tex/samp src reg is half-reg for non-bindless and full for
+       * bindless, irrespective of the precision of other srcs. The
+       * tex/samp src is the first src reg when .s2en is set
+       */
+      if (reg->tied) {
+         /* must have the same size as the destination, handled in
+          * validate_reg().
+          */
+      } else if (reg == instr->address) {
+         validate_assert(ctx, reg->flags & IR3_REG_HALF);
+      } else if ((instr->flags & IR3_INSTR_S2EN) && (n < 2)) {
+         if (n == 0) {
+            if (instr->flags & IR3_INSTR_B)
+               validate_assert(ctx, !(reg->flags & IR3_REG_HALF));
+            else
+               validate_assert(ctx, reg->flags & IR3_REG_HALF);
+         }
+      } else if (opc_cat(instr->opc) == 6) {
+         /* handled below */
+      } else if (opc_cat(instr->opc) == 0) {
+         /* end/chmask/etc are allowed to have different size sources */
+      } else if (n > 0) {
+         validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) ==
+                                 (reg->flags & IR3_REG_HALF));
+      }
+
+      last_reg = reg;
+   }
+
+   for (unsigned i = 0; i < instr->dsts_count; i++) {
+      struct ir3_register *reg = instr->dsts[i];
+
+      validate_dst(ctx, instr, reg);
+   }
+
+   _mesa_set_add(ctx->defs, instr);
+
+   /* Check that src/dst types match the register types, and for
+    * instructions that have different opcodes depending on type,
+    * that the opcodes are correct.
+    */
+   switch (opc_cat(instr->opc)) {
+   case 1: /* move instructions */
+      if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) {
+         validate_assert(ctx, instr->dsts_count == 1);
+         validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED);
+         validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
+         validate_assert(
+            ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1));
+      } else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO ||
+                 instr->opc == OPC_READ_FIRST_MACRO ||
+                 instr->opc == OPC_READ_COND_MACRO) {
+         /* nothing yet */
+      } else if (instr->opc == OPC_ELECT_MACRO) {
+         validate_assert(ctx, instr->dsts_count == 1);
+         validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
+      } else {
+         foreach_dst (dst, instr)
+            validate_reg_size(ctx, dst, instr->cat1.dst_type);
+         foreach_src (src, instr) {
+            if (!src->tied && src != instr->address)
+               validate_reg_size(ctx, src, instr->cat1.src_type);
+         }
+
+         switch (instr->opc) {
+         case OPC_SWZ:
+            validate_assert(ctx, instr->srcs_count == 2);
+            validate_assert(ctx, instr->dsts_count == 2);
+            break;
+         case OPC_GAT:
+            validate_assert(ctx, instr->srcs_count == 4);
+            validate_assert(ctx, instr->dsts_count == 1);
+            break;
+         case OPC_SCT:
+            validate_assert(ctx, instr->srcs_count == 1);
+            validate_assert(ctx, instr->dsts_count == 4);
+            break;
+         default:
+            break;
+         }
+      }
+
+      if (instr->opc != OPC_MOV)
+         validate_assert(ctx, !instr->address);
+
+      break;
+   case 3:
+      /* Validate that cat3 opc matches the src type.  We've already checked
+       * that all the src regs are same type
+       */
+      if (instr->srcs[0]->flags & IR3_REG_HALF) {
+         validate_assert(ctx, instr->opc == cat3_half_opc(instr->opc));
+      } else {
+         validate_assert(ctx, instr->opc == cat3_full_opc(instr->opc));
+      }
+      break;
+   case 4:
+      /* Validate that cat4 opc matches the dst type: */
+      if (instr->dsts[0]->flags & IR3_REG_HALF) {
+         validate_assert(ctx, instr->opc == cat4_half_opc(instr->opc));
+      } else {
+         validate_assert(ctx, instr->opc == cat4_full_opc(instr->opc));
+      }
+      break;
+   case 5:
+      validate_reg_size(ctx, instr->dsts[0], instr->cat5.type);
+      break;
+   case 6:
+      switch (instr->opc) {
+      case OPC_RESINFO:
+      case OPC_RESFMT:
+         validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
+         validate_reg_size(ctx, instr->srcs[0], instr->cat6.type);
+         break;
+      case OPC_L2G:
+      case OPC_G2L:
+         validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
+         validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
+         break;
+      case OPC_STG:
+         validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
+         validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
+         validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
+         validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
+         break;
+      case OPC_STG_A:
+         validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
+         validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
+         validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
+         validate_reg_size(ctx, instr->srcs[4], instr->cat6.type);
+         validate_assert(ctx, !(instr->srcs[5]->flags & IR3_REG_HALF));
+         break;
+      case OPC_STL:
+      case OPC_STP:
+      case OPC_STLW:
+         validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
+         validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
+         validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
+         break;
+      case OPC_STIB:
+         if (instr->flags & IR3_INSTR_B) {
+            validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
+            validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
+            validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
+         } else {
+            validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
+            validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
+            validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
+         }
+         break;
+      default:
+         validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
+         validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
+         if (instr->srcs_count > 1)
+            validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
+         break;
+      }
+   }
 }
 
 void
 ir3_validate(struct ir3 *ir)
 {
 #ifdef NDEBUG
-#  define VALIDATE 0
+#define VALIDATE 0
 #else
-#  define VALIDATE 1
+#define VALIDATE 1
 #endif
 
-       if (!VALIDATE)
-               return;
-
-       struct ir3_validate_ctx *ctx = ralloc_size(NULL, sizeof(*ctx));
-
-       ctx->ir = ir;
-       ctx->defs = _mesa_pointer_set_create(ctx);
-
-       foreach_block (block, &ir->block_list) {
-               /* We require that the first block does not have any predecessors,
-                * which allows us to assume that phi nodes and meta:input's do not
-                * appear in the same basic block.
-                */
-               validate_assert(ctx,
-                               block != ir3_start_block(ir) || block->predecessors_count == 0);
-
-               struct ir3_instruction *prev = NULL;
-               foreach_instr (instr, &block->instr_list) {
-                       ctx->current_instr = instr;
-                       if (instr->opc == OPC_META_PHI) {
-                               /* phis must be the first in the block */
-                               validate_assert(ctx, prev == NULL || prev->opc == OPC_META_PHI);
-                               validate_phi(ctx, instr);
-                       } else {
-                               validate_instr(ctx, instr);
-                       }
-                       prev = instr;
-               }
-
-               for (unsigned i = 0; i < 2; i++) {
-                       if (block->successors[i])
-                               validate_phi_src(ctx, block->successors[i], block);
-               }
-       }
-
-       ralloc_free(ctx);
+   if (!VALIDATE)
+      return;
+
+   struct ir3_validate_ctx *ctx = ralloc_size(NULL, sizeof(*ctx));
+
+   ctx->ir = ir;
+   ctx->defs = _mesa_pointer_set_create(ctx);
+
+   foreach_block (block, &ir->block_list) {
+      /* We require that the first block does not have any predecessors,
+       * which allows us to assume that phi nodes and meta:input's do not
+       * appear in the same basic block.
+       */
+      validate_assert(
+         ctx, block != ir3_start_block(ir) || block->predecessors_count == 0);
+
+      struct ir3_instruction *prev = NULL;
+      foreach_instr (instr, &block->instr_list) {
+         ctx->current_instr = instr;
+         if (instr->opc == OPC_META_PHI) {
+            /* phis must be the first in the block */
+            validate_assert(ctx, prev == NULL || prev->opc == OPC_META_PHI);
+            validate_phi(ctx, instr);
+         } else {
+            validate_instr(ctx, instr);
+         }
+         prev = instr;
+      }
+
+      for (unsigned i = 0; i < 2; i++) {
+         if (block->successors[i])
+            validate_phi_src(ctx, block->successors[i], block);
+      }
+   }
+
+   ralloc_free(ctx);
 }
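ir3_validate() above compiles the whole check away in release builds while still keeping the body type-checked: the NDEBUG-dependent VALIDATE macro gates an early return rather than #ifdef-ing out the code. A minimal sketch of the same pattern, using hypothetical names (my_ir, my_validate) rather than the actual ir3 API:

#include <stdio.h>

struct my_ir {
   int num_blocks;
};

static void
my_validate(struct my_ir *ir)
{
/* VALIDATE is 1 in debug builds and 0 when NDEBUG is defined; the early
 * return lets the compiler drop the rest of the body while the checks
 * below are still parsed and type-checked.
 */
#ifdef NDEBUG
#define VALIDATE 0
#else
#define VALIDATE 1
#endif
   if (!VALIDATE)
      return;

   if (ir->num_blocks <= 0)
      fprintf(stderr, "validate: ir has no blocks\n");
}

int
main(void)
{
   struct my_ir ir = {.num_blocks = 1};
   my_validate(&ir); /* becomes a no-op when built with -DNDEBUG */
   return 0;
}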
index 4dded90..ea84a36 100644 (file)
 typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);
 
 typedef struct {
-       bool mergedregs;
-       regmaskstate_t mask;
+   bool mergedregs;
+   regmaskstate_t mask;
 } regmask_t;
 
 static inline bool
 __regmask_get(regmask_t *regmask, bool half, unsigned n)
 {
-       if (regmask->mergedregs) {
-               /* a6xx+ case, with merged register file, we track things in terms
-                * of half-precision registers, with a full precisions register
-                * using two half-precision slots:
-                */
-               if (half) {
-                       return BITSET_TEST(regmask->mask, n);
-               } else {
-                       n *= 2;
-                       return BITSET_TEST(regmask->mask, n) ||
-                               BITSET_TEST(regmask->mask, n+1);
-               }
-       } else {
-               /* pre a6xx case, with separate register file for half and full
-                * precision:
-                */
-               if (half)
-                       n += MAX_REG;
-               return BITSET_TEST(regmask->mask, n);
-       }
+   if (regmask->mergedregs) {
+      /* a6xx+ case, with merged register file, we track things in terms
+       * of half-precision registers, with a full-precision register
+       * using two half-precision slots:
+       */
+      if (half) {
+         return BITSET_TEST(regmask->mask, n);
+      } else {
+         n *= 2;
+         return BITSET_TEST(regmask->mask, n) ||
+                BITSET_TEST(regmask->mask, n + 1);
+      }
+   } else {
+      /* pre a6xx case, with separate register file for half and full
+       * precision:
+       */
+      if (half)
+         n += MAX_REG;
+      return BITSET_TEST(regmask->mask, n);
+   }
 }
 
 static inline void
 __regmask_set(regmask_t *regmask, bool half, unsigned n)
 {
-       if (regmask->mergedregs) {
-               /* a6xx+ case, with merged register file, we track things in terms
-                * of half-precision registers, with a full precisions register
-                * using two half-precision slots:
-                */
-               if (half) {
-                       BITSET_SET(regmask->mask, n);
-               } else {
-                       n *= 2;
-                       BITSET_SET(regmask->mask, n);
-                       BITSET_SET(regmask->mask, n+1);
-               }
-       } else {
-               /* pre a6xx case, with separate register file for half and full
-                * precision:
-                */
-               if (half)
-                       n += MAX_REG;
-               BITSET_SET(regmask->mask, n);
-       }
+   if (regmask->mergedregs) {
+      /* a6xx+ case, with merged register file, we track things in terms
+       * of half-precision registers, with a full-precision register
+       * using two half-precision slots:
+       */
+      if (half) {
+         BITSET_SET(regmask->mask, n);
+      } else {
+         n *= 2;
+         BITSET_SET(regmask->mask, n);
+         BITSET_SET(regmask->mask, n + 1);
+      }
+   } else {
+      /* pre a6xx case, with separate register file for half and full
+       * precision:
+       */
+      if (half)
+         n += MAX_REG;
+      BITSET_SET(regmask->mask, n);
+   }
 }
 
 static inline void
 __regmask_clear(regmask_t *regmask, bool half, unsigned n)
 {
-       if (regmask->mergedregs) {
-               /* a6xx+ case, with merged register file, we track things in terms
-                * of half-precision registers, with a full precisions register
-                * using two half-precision slots:
-                */
-               if (half) {
-                       BITSET_CLEAR(regmask->mask, n);
-               } else {
-                       n *= 2;
-                       BITSET_CLEAR(regmask->mask, n);
-                       BITSET_CLEAR(regmask->mask, n+1);
-               }
-       } else {
-               /* pre a6xx case, with separate register file for half and full
-                * precision:
-                */
-               if (half)
-                       n += MAX_REG;
-               BITSET_CLEAR(regmask->mask, n);
-       }
+   if (regmask->mergedregs) {
+      /* a6xx+ case, with merged register file, we track things in terms
+       * of half-precision registers, with a full-precision register
+       * using two half-precision slots:
+       */
+      if (half) {
+         BITSET_CLEAR(regmask->mask, n);
+      } else {
+         n *= 2;
+         BITSET_CLEAR(regmask->mask, n);
+         BITSET_CLEAR(regmask->mask, n + 1);
+      }
+   } else {
+      /* pre a6xx case, with separate register file for half and full
+       * precision:
+       */
+      if (half)
+         n += MAX_REG;
+      BITSET_CLEAR(regmask->mask, n);
+   }
 }
 
 static inline void
 regmask_init(regmask_t *regmask, bool mergedregs)
 {
-       memset(&regmask->mask, 0, sizeof(regmask->mask));
-       regmask->mergedregs = mergedregs;
+   memset(&regmask->mask, 0, sizeof(regmask->mask));
+   regmask->mergedregs = mergedregs;
 }
 
 static inline void
 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
 {
-       assert(dst->mergedregs == a->mergedregs);
-       assert(dst->mergedregs == b->mergedregs);
+   assert(dst->mergedregs == a->mergedregs);
+   assert(dst->mergedregs == b->mergedregs);
 
-       for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
-               dst->mask[i] = a->mask[i] | b->mask[i];
+   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
+      dst->mask[i] = a->mask[i] | b->mask[i];
 }
 
 #endif /* REGMASK_H_ */
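The comments in __regmask_get/set/clear above describe two layouts: on a6xx+ the register file is merged and tracked in half-register slots, so a full register n occupies slots 2n and 2n+1, while earlier GPUs keep separate half/full files and half registers are simply offset by MAX_REG. A minimal sketch of just that index mapping, using a plain uint8_t bit array and a hypothetical MY_MAX_REG instead of Mesa's BITSET machinery:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MY_MAX_REG 256 /* hypothetical; stands in for ir3's MAX_REG */

struct my_regmask {
   bool mergedregs;
   uint8_t bits[(2 * MY_MAX_REG + 7) / 8];
};

static void
my_set_bit(struct my_regmask *m, unsigned i)
{
   m->bits[i / 8] |= 1u << (i % 8);
}

static bool
my_get_bit(const struct my_regmask *m, unsigned i)
{
   return m->bits[i / 8] & (1u << (i % 8));
}

static void
my_regmask_set(struct my_regmask *m, bool half, unsigned n)
{
   if (m->mergedregs) {
      /* merged file: full register n covers half slots 2n and 2n+1 */
      if (half) {
         my_set_bit(m, n);
      } else {
         my_set_bit(m, 2 * n);
         my_set_bit(m, 2 * n + 1);
      }
   } else {
      /* separate files: half registers live past MY_MAX_REG */
      my_set_bit(m, half ? n + MY_MAX_REG : n);
   }
}

int
main(void)
{
   struct my_regmask m = {.mergedregs = true};
   my_regmask_set(&m, false, 3); /* mark full r3 as written */
   /* with merged regs, full r3 overlaps half registers hr6 and hr7 */
   printf("hr6 busy: %d, hr7 busy: %d\n", my_get_bit(&m, 6),
          my_get_bit(&m, 7));
   return 0;
}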
index 11f56c1..252fd49 100644 (file)
@@ -42,8 +42,8 @@
 /* clang-format on */
 
 static const struct test {
-       const char *asmstr;
-       unsigned expected_delay;
+   const char *asmstr;
+   unsigned expected_delay;
 } tests[] = {
    /* clang-format off */
    TEST(6,
@@ -101,16 +101,16 @@ static const struct test {
 static struct ir3_shader *
 parse_asm(struct ir3_compiler *c, const char *asmstr)
 {
-       struct ir3_kernel_info info = {};
-       FILE *in = fmemopen((void *)asmstr, strlen(asmstr), "r");
-       struct ir3_shader *shader = ir3_parse_asm(c, &info, in);
+   struct ir3_kernel_info info = {};
+   FILE *in = fmemopen((void *)asmstr, strlen(asmstr), "r");
+   struct ir3_shader *shader = ir3_parse_asm(c, &info, in);
 
-       fclose(in);
+   fclose(in);
 
-       if (!shader)
-               errx(-1, "assembler failed");
+   if (!shader)
+      errx(-1, "assembler failed");
 
-       return shader;
+   return shader;
 }
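parse_asm() above feeds an in-memory assembly string to a parser that expects a FILE * by wrapping it with fmemopen(3); the disasm test below uses the same trick. A standalone sketch of that pattern (plain POSIX fmemopen, nothing ir3-specific; the string content is just placeholder text):

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <string.h>

int
main(void)
{
   const char *src = "nop\nend\n";
   /* wrap the string as a read-only FILE * so line-oriented code can use it */
   FILE *in = fmemopen((void *)src, strlen(src), "r");
   if (!in)
      return 1;

   char line[128];
   while (fgets(line, sizeof(line), in))
      printf("got line: %s", line);

   fclose(in);
   return 0;
}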
 
 /**
@@ -124,71 +124,70 @@ parse_asm(struct ir3_compiler *c, const char *asmstr)
 static void
 fixup_wrmask(struct ir3 *ir)
 {
-       struct ir3_block *block = ir3_start_block(ir);
-
-       foreach_instr_safe (instr, &block->instr_list) {
-               instr->dsts[0]->wrmask = MASK(instr->repeat + 1);
-               foreach_src (reg, instr) {
-                       if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-                               continue;
-
-                       if (reg->flags & IR3_REG_R)
-                               reg->wrmask = MASK(instr->repeat + 1);
-                       else
-                               reg->wrmask = 1;
-               }
-       }
+   struct ir3_block *block = ir3_start_block(ir);
+
+   foreach_instr_safe (instr, &block->instr_list) {
+      instr->dsts[0]->wrmask = MASK(instr->repeat + 1);
+      foreach_src (reg, instr) {
+         if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+            continue;
+
+         if (reg->flags & IR3_REG_R)
+            reg->wrmask = MASK(instr->repeat + 1);
+         else
+            reg->wrmask = 1;
+      }
+   }
 }
 
-
 int
 main(int argc, char **argv)
 {
-       struct ir3_compiler *c;
-       int result = 0;
+   struct ir3_compiler *c;
+   int result = 0;
 
-       c = ir3_compiler_create(NULL, 630, false);
+   c = ir3_compiler_create(NULL, 630, false);
 
-       for (int i = 0; i < ARRAY_SIZE(tests); i++) {
-               const struct test *test = &tests[i];
-               struct ir3_shader *shader = parse_asm(c, test->asmstr);
-               struct ir3 *ir = shader->variants->ir;
+   for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+      const struct test *test = &tests[i];
+      struct ir3_shader *shader = parse_asm(c, test->asmstr);
+      struct ir3 *ir = shader->variants->ir;
 
-               fixup_wrmask(ir);
+      fixup_wrmask(ir);
 
-               ir3_debug_print(ir, "AFTER fixup_wrmask");
+      ir3_debug_print(ir, "AFTER fixup_wrmask");
 
-               struct ir3_block *block =
-                       list_first_entry(&ir->block_list, struct ir3_block, node);
-               struct ir3_instruction *last = NULL;
+      struct ir3_block *block =
+         list_first_entry(&ir->block_list, struct ir3_block, node);
+      struct ir3_instruction *last = NULL;
 
-               foreach_instr_rev (instr, &block->instr_list) {
-                       if (is_meta(instr))
-                               continue;
-                       last = instr;
-                       break;
-               }
+      foreach_instr_rev (instr, &block->instr_list) {
+         if (is_meta(instr))
+            continue;
+         last = instr;
+         break;
+      }
 
-               /* The delay calc is expecting the instr to not yet be added to the
-                * block, so remove it from the block so that it doesn't get counted
-                * in the distance from assigner:
-                */
-               list_delinit(&last->node);
+      /* The delay calc expects the instr not to have been added to the
+       * block yet, so remove it so that it doesn't get counted in the
+       * distance from the assigner:
+       */
+      list_delinit(&last->node);
 
-               unsigned n = ir3_delay_calc_exact(block, last, true);
+      unsigned n = ir3_delay_calc_exact(block, last, true);
 
-               if (n != test->expected_delay) {
-                       printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n",
-                               i, test->expected_delay, n, test->asmstr);
-                       result = -1;
-               } else {
-                       printf("%d: PASS\n", i);
-               }
+      if (n != test->expected_delay) {
+         printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n", i,
+                test->expected_delay, n, test->asmstr);
+         result = -1;
+      } else {
+         printf("%d: PASS\n", i);
+      }
 
-               ir3_shader_destroy(shader);
-       }
+      ir3_shader_destroy(shader);
+   }
 
-       ir3_compiler_destroy(c);
+   ir3_compiler_destroy(c);
 
-       return result;
+   return result;
 }
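main() above walks the block backwards to find the last real (non-meta) instruction and unlinks it before asking for the delay. A minimal sketch of that scan-backwards-and-unlink step on a plain intrusive doubly linked list; the node/list types here are hypothetical, and the unlink helper mimics what I understand Mesa's list_delinit to do (unlink and re-point the node at itself):

#include <stdbool.h>
#include <stdio.h>

struct node {
   struct node *prev, *next;
   bool is_meta;
   int id;
};

/* unlink n from its list and make it point at itself */
static void
unlink_node(struct node *n)
{
   n->prev->next = n->next;
   n->next->prev = n->prev;
   n->prev = n->next = n;
}

int
main(void)
{
   struct node head = {&head, &head, false, -1}; /* list sentinel */
   struct node a = {0}, b = {0}, c = {0};
   struct node *nodes[] = {&a, &b, &c};

   /* build the list: a (real), b (real), c (meta) */
   for (int i = 0; i < 3; i++) {
      nodes[i]->id = i;
      nodes[i]->prev = head.prev;
      nodes[i]->next = &head;
      head.prev->next = nodes[i];
      head.prev = nodes[i];
   }
   c.is_meta = true;

   /* scan backwards for the last non-meta node, then unlink it */
   struct node *last = NULL;
   for (struct node *n = head.prev; n != &head; n = n->prev) {
      if (n->is_meta)
         continue;
      last = n;
      break;
   }
   if (last) {
      unlink_node(last);
      printf("last real node: %d\n", last->id); /* prints 1 */
   }
   return 0;
}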
index 8f2c890..24f643a 100644 (file)
 /* clang-format on */
 
 static const struct test {
-       int gpu_id;
-       const char *instr;
-       const char *expected;
-       /**
-        * Do we expect asm parse fail (ie. for things not (yet) supported by ir3_parser.y)
-        */
-       bool parse_fail;
+   int gpu_id;
+   const char *instr;
+   const char *expected;
+   /**
+    * Do we expect the asm parse to fail (i.e., for things not (yet)
+    * supported by ir3_parser.y)?
+    */
+   bool parse_fail;
 } tests[] = {
-/* clang-format off */
+   /* clang-format off */
        /* cat0 */
        INSTR_6XX(00000000_00000000, "nop"),
        INSTR_6XX(00000200_00000000, "(rpt2)nop"),
@@ -351,128 +352,132 @@ static const struct test {
    INSTR_6XX(e0fa0000_00000000, "fence.g.l.r.w"),
    INSTR_6XX(e09a0000_00000000, "fence.r.w"),
    INSTR_6XX(f0420000_00000000, "(sy)bar.g"),
-/* clang-format on */
+   /* clang-format on */
 };
 
 static void
 trim(char *string)
 {
-       for (int len = strlen(string); len > 0 && string[len - 1] == '\n'; len--)
-               string[len - 1] = 0;
+   for (int len = strlen(string); len > 0 && string[len - 1] == '\n'; len--)
+      string[len - 1] = 0;
 }
 
 int
 main(int argc, char **argv)
 {
-       int retval = 0;
-       int decode_fails = 0, asm_fails = 0, encode_fails = 0;
-       const int output_size = 4096;
-       char *disasm_output = malloc(output_size);
-       FILE *fdisasm = fmemopen(disasm_output, output_size, "w+");
-       if (!fdisasm) {
-               fprintf(stderr, "failed to fmemopen\n");
-               return 1;
-       }
-
-       struct ir3_compiler *compilers[10] = {};
-
-       for (int i = 0; i < ARRAY_SIZE(tests); i++) {
-               const struct test *test = &tests[i];
-               printf("Testing a%d %s: \"%s\"...\n",
-                               test->gpu_id, test->instr, test->expected);
-
-               rewind(fdisasm);
-               memset(disasm_output, 0, output_size);
-
-               /*
-                * Test disassembly:
-                */
-
-               uint32_t code[2] = {
-                       strtoll(&test->instr[9], NULL, 16),
-                       strtoll(&test->instr[0], NULL, 16),
-               };
-               isa_decode(code, 8, fdisasm, &(struct isa_decode_options){
-                       .gpu_id = test->gpu_id,
-                       .show_errors = true,
-               });
-               fflush(fdisasm);
-
-               trim(disasm_output);
-
-               if (strcmp(disasm_output, test->expected) != 0) {
-                       printf("FAIL: disasm\n");
-                       printf("  Expected: \"%s\"\n", test->expected);
-                       printf("  Got:      \"%s\"\n", disasm_output);
-                       retval = 1;
-                       decode_fails++;
-                       continue;
-               }
-
-               /*
-                * Test assembly, which should result in the identical binary:
-                */
-
-               unsigned gen = test->gpu_id / 100;
-               if (!compilers[gen]) {
-                       compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
-               }
-
-               FILE *fasm = fmemopen((void *)test->expected, strlen(test->expected), "r");
-
-               struct ir3_kernel_info info = {};
-               struct ir3_shader *shader = ir3_parse_asm(compilers[gen], &info, fasm);
-               fclose(fasm);
-               if (!shader) {
-                       printf("FAIL: %sexpected assembler fail\n", test->parse_fail ? "" : "un");
-                       asm_fails++;
-                       /* If this is an instruction that the asm parser is not expected
-                        * to handle, don't count it as a fail.
-                        */
-                       if (!test->parse_fail)
-                               retval = 1;
-                       continue;
-               } else if (test->parse_fail) {
-                       /* If asm parse starts passing, and we don't expect that, flag
-                        * it as a fail so we don't forget to update the test vector:
-                        */
-                       printf("FAIL: unexpected parse success, please remove '.parse_fail=true'\n");
-                       retval = 1;
-               }
-
-               struct ir3_shader_variant *v = shader->variants;
-               if (memcmp(v->bin, code, sizeof(code))) {
-                       printf("FAIL: assembler\n");
-                       printf("  Expected: %08x_%08x\n", code[1], code[0]);
-                       printf("  Got:      %08x_%08x\n", v->bin[1], v->bin[0]);
-                       retval = 1;
-                       encode_fails++;
-               }
-
-               ir3_shader_destroy(shader);
-       }
-
-       if (decode_fails)
-               printf("%d/%d decode fails\n", decode_fails, (int)ARRAY_SIZE(tests));
-       if (asm_fails)
-               printf("%d/%d assembler fails\n", asm_fails, (int)ARRAY_SIZE(tests));
-       if (encode_fails)
-               printf("%d/%d encode fails\n", encode_fails, (int)ARRAY_SIZE(tests));
-
-       if (retval) {
-               printf("FAILED!\n");
-       } else {
-               printf("PASSED!\n");
-       }
-
-       for (unsigned i = 0; i < ARRAY_SIZE(compilers); i++) {
-               if (!compilers[i])
-                       continue;
-               ir3_compiler_destroy(compilers[i]);
-       }
-
-       fclose(fdisasm);
-       free(disasm_output);
-
-       return retval;
+   int retval = 0;
+   int decode_fails = 0, asm_fails = 0, encode_fails = 0;
+   const int output_size = 4096;
+   char *disasm_output = malloc(output_size);
+   FILE *fdisasm = fmemopen(disasm_output, output_size, "w+");
+   if (!fdisasm) {
+      fprintf(stderr, "failed to fmemopen\n");
+      return 1;
+   }
+
+   struct ir3_compiler *compilers[10] = {};
+
+   for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+      const struct test *test = &tests[i];
+      printf("Testing a%d %s: \"%s\"...\n", test->gpu_id, test->instr,
+             test->expected);
+
+      rewind(fdisasm);
+      memset(disasm_output, 0, output_size);
+
+      /*
+       * Test disassembly:
+       */
+
+      uint32_t code[2] = {
+         strtoll(&test->instr[9], NULL, 16),
+         strtoll(&test->instr[0], NULL, 16),
+      };
+      isa_decode(code, 8, fdisasm,
+                 &(struct isa_decode_options){
+                    .gpu_id = test->gpu_id,
+                    .show_errors = true,
+                 });
+      fflush(fdisasm);
+
+      trim(disasm_output);
+
+      if (strcmp(disasm_output, test->expected) != 0) {
+         printf("FAIL: disasm\n");
+         printf("  Expected: \"%s\"\n", test->expected);
+         printf("  Got:      \"%s\"\n", disasm_output);
+         retval = 1;
+         decode_fails++;
+         continue;
+      }
+
+      /*
+       * Test assembly, which should result in the identical binary:
+       */
+
+      unsigned gen = test->gpu_id / 100;
+      if (!compilers[gen]) {
+         compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
+      }
+
+      FILE *fasm =
+         fmemopen((void *)test->expected, strlen(test->expected), "r");
+
+      struct ir3_kernel_info info = {};
+      struct ir3_shader *shader = ir3_parse_asm(compilers[gen], &info, fasm);
+      fclose(fasm);
+      if (!shader) {
+         printf("FAIL: %sexpected assembler fail\n",
+                test->parse_fail ? "" : "un");
+         asm_fails++;
+         /* If this is an instruction that the asm parser is not expected
+          * to handle, don't count it as a fail.
+          */
+         if (!test->parse_fail)
+            retval = 1;
+         continue;
+      } else if (test->parse_fail) {
+         /* If asm parse starts passing, and we don't expect that, flag
+          * it as a fail so we don't forget to update the test vector:
+          */
+         printf(
+            "FAIL: unexpected parse success, please remove '.parse_fail=true'\n");
+         retval = 1;
+      }
+
+      struct ir3_shader_variant *v = shader->variants;
+      if (memcmp(v->bin, code, sizeof(code))) {
+         printf("FAIL: assembler\n");
+         printf("  Expected: %08x_%08x\n", code[1], code[0]);
+         printf("  Got:      %08x_%08x\n", v->bin[1], v->bin[0]);
+         retval = 1;
+         encode_fails++;
+      }
+
+      ir3_shader_destroy(shader);
+   }
+
+   if (decode_fails)
+      printf("%d/%d decode fails\n", decode_fails, (int)ARRAY_SIZE(tests));
+   if (asm_fails)
+      printf("%d/%d assembler fails\n", asm_fails, (int)ARRAY_SIZE(tests));
+   if (encode_fails)
+      printf("%d/%d encode fails\n", encode_fails, (int)ARRAY_SIZE(tests));
+
+   if (retval) {
+      printf("FAILED!\n");
+   } else {
+      printf("PASSED!\n");
+   }
+
+   for (unsigned i = 0; i < ARRAY_SIZE(compilers); i++) {
+      if (!compilers[i])
+         continue;
+      ir3_compiler_destroy(compilers[i]);
+   }
+
+   fclose(fdisasm);
+   free(disasm_output);
+
+   return retval;
 }
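The test vectors encode each 64-bit instruction as two underscore-separated hex words, and main() above splits them with strtoll(&test->instr[9]) for the low word and strtoll(&test->instr[0]) for the high word (the second call stops parsing at the underscore). A standalone sketch of that split, using one of the encodings from the table above:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
   /* "high_low" layout, as in the INSTR_6XX() test vectors */
   const char *instr = "00000200_00000000";

   uint32_t code[2] = {
      strtoll(&instr[9], NULL, 16), /* low word: everything after '_' */
      strtoll(&instr[0], NULL, 16), /* high word: strtoll stops at '_' */
   };

   /* round-trips back to the original text: 00000200_00000000 */
   printf("%08" PRIx32 "_%08" PRIx32 "\n", code[1], code[0]);
   return 0;
}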