freedreno: slurp in decode tools
authorRob Clark <robdclark@chromium.org>
Fri, 24 Jul 2020 00:32:36 +0000 (17:32 -0700)
committerMarge Bot <eric+marge@anholt.net>
Tue, 28 Jul 2020 09:45:08 +0000 (09:45 +0000)
cffdump, crashdec, etc

At this point there is some duplication with other files in-tree (i.e.
the a2xx and a3xx+ disassemblers), which will be cleaned up in a later commit.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6070>

32 files changed:
src/freedreno/decode/buffers.c [new file with mode: 0644]
src/freedreno/decode/buffers.h [new file with mode: 0644]
src/freedreno/decode/cffdec.c [new file with mode: 0644]
src/freedreno/decode/cffdec.h [new file with mode: 0644]
src/freedreno/decode/cffdump.c [new file with mode: 0644]
src/freedreno/decode/crashdec.c [new file with mode: 0644]
src/freedreno/decode/disasm-a2xx.c [new file with mode: 0644]
src/freedreno/decode/disasm-a3xx.c [new file with mode: 0644]
src/freedreno/decode/disasm.h [new file with mode: 0644]
src/freedreno/decode/instr-a2xx.h [new file with mode: 0644]
src/freedreno/decode/instr-a3xx.h [new file with mode: 0644]
src/freedreno/decode/io.c [new file with mode: 0644]
src/freedreno/decode/io.h [new file with mode: 0644]
src/freedreno/decode/meson.build [new file with mode: 0644]
src/freedreno/decode/pager.c [new file with mode: 0644]
src/freedreno/decode/pager.h [new file with mode: 0644]
src/freedreno/decode/pgmdump.c [new file with mode: 0644]
src/freedreno/decode/pgmdump2.c [new file with mode: 0644]
src/freedreno/decode/redump.h [new file with mode: 0644]
src/freedreno/decode/rnnutil.c [new file with mode: 0644]
src/freedreno/decode/rnnutil.h [new file with mode: 0644]
src/freedreno/decode/script.c [new file with mode: 0644]
src/freedreno/decode/script.h [new file with mode: 0644]
src/freedreno/decode/scripts/analyze.lua [new file with mode: 0644]
src/freedreno/decode/scripts/parse-submits.lua [new file with mode: 0644]
src/freedreno/decode/scripts/sanity-a6xx.lua [new file with mode: 0644]
src/freedreno/decode/scripts/test.lua [new file with mode: 0644]
src/freedreno/decode/scripts/tex3d-layout.lua [new file with mode: 0644]
src/freedreno/decode/scripts/texturator-to-unit-test-5xx.lua [new file with mode: 0644]
src/freedreno/decode/scripts/texturator-to-unit-test.lua [new file with mode: 0644]
src/freedreno/decode/util.h [new file with mode: 0644]
src/freedreno/meson.build

diff --git a/src/freedreno/decode/buffers.c b/src/freedreno/decode/buffers.c
new file mode 100644 (file)
index 0000000..8e696f8
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Helper lib to track gpu buffers contents/address, and map between gpu and
+ * host address while decoding cmdstream/crashdumps
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "buffers.h"
+
+/* One tracked GPU buffer: a host-side copy of its contents plus the GPU
+ * address range that copy corresponds to.
+ */
+struct buffer {
+       void *hostptr;       /* host copy of the contents; free()d in reset_buffers() */
+       unsigned int len;    /* extent of the buffer, in bytes, for both address spaces */
+       uint64_t gpuaddr;    /* GPU address of the first byte */
+
+       /* for 'once' mode, for buffers containing cmdstream, keep track, per
+        * offset into the buffer, of which modes it has already been dumped
+        * in (see has_dumped()):
+        */
+       struct {
+               unsigned offset;
+               unsigned dumped_mask;
+       } offsets[64];
+       unsigned noffsets;   /* number of offsets[] entries in use */
+};
+
+/* Fixed-size table of tracked buffers; only the first nbuffers are valid. */
+static struct buffer buffers[512];
+static int nbuffers;
+
+/* Non-zero when gpuaddr lies in buf's [gpuaddr, gpuaddr + len) range.
+ * The len argument is accepted but not consulted.
+ */
+static int
+buffer_contains_gpuaddr(struct buffer *buf, uint64_t gpuaddr, uint32_t len)
+{
+       if (gpuaddr < buf->gpuaddr)
+               return 0;
+
+       return gpuaddr < (buf->gpuaddr + buf->len);
+}
+
+/* Non-zero when hostptr points into buf's host copy, i.e. into
+ * [buf->hostptr, buf->hostptr + buf->len).
+ *
+ * The comparison is done on char pointers: arithmetic on void * is a GNU
+ * extension rather than ISO C.
+ */
+static int
+buffer_contains_hostptr(struct buffer *buf, void *hostptr)
+{
+       const char *start = buf->hostptr;
+       const char *p = hostptr;
+
+       return (start <= p) && (p < (start + buf->len));
+}
+
+
+/**
+ * Translate a host pointer into the matching GPU address.
+ *
+ * Returns the GPU address for a pointer inside one of the tracked buffers'
+ * host copies, or 0 when no tracked buffer contains hostptr.
+ */
+uint64_t
+gpuaddr(void *hostptr)
+{
+       for (int i = 0; i < nbuffers; i++) {
+               struct buffer *buf = &buffers[i];
+
+               /* byte offset computed on char pointers; void * arithmetic
+                * is a GNU extension
+                */
+               if (buffer_contains_hostptr(buf, hostptr))
+                       return buf->gpuaddr + ((char *)hostptr - (char *)buf->hostptr);
+       }
+       return 0;
+}
+
+/**
+ * Return the GPU address at which the tracked buffer containing gpuaddr
+ * starts, or 0 when gpuaddr is 0 or is not inside any tracked buffer.
+ */
+uint64_t
+gpubaseaddr(uint64_t gpuaddr)
+{
+       if (gpuaddr) {
+               for (int idx = 0; idx < nbuffers; idx++) {
+                       struct buffer *buf = &buffers[idx];
+
+                       if (buffer_contains_gpuaddr(buf, gpuaddr, 0))
+                               return buf->gpuaddr;
+               }
+       }
+       return 0;
+}
+
+/**
+ * Translate a GPU address into a pointer into the host copy of the buffer
+ * that contains it.
+ *
+ * Returns NULL when gpuaddr is 0 or is not inside any tracked buffer.
+ */
+void *
+hostptr(uint64_t gpuaddr)
+{
+       if (!gpuaddr)
+               return NULL;
+
+       for (int i = 0; i < nbuffers; i++) {
+               struct buffer *buf = &buffers[i];
+
+               /* offset applied to a char pointer; void * arithmetic is a
+                * GNU extension
+                */
+               if (buffer_contains_gpuaddr(buf, gpuaddr, 0))
+                       return (char *)buf->hostptr + (gpuaddr - buf->gpuaddr);
+       }
+       return NULL;
+}
+
+/**
+ * Return how many bytes remain between gpuaddr and the end of the tracked
+ * buffer that contains it, or 0 when gpuaddr is 0 or not tracked.
+ */
+unsigned
+hostlen(uint64_t gpuaddr)
+{
+       if (gpuaddr) {
+               for (int idx = 0; idx < nbuffers; idx++) {
+                       struct buffer *buf = &buffers[idx];
+
+                       if (buffer_contains_gpuaddr(buf, gpuaddr, 0))
+                               return buf->len + buf->gpuaddr - gpuaddr;
+               }
+       }
+       return 0;
+}
+
+/**
+ * 'once' mode bookkeeping: report whether the contents at gpuaddr have
+ * already been dumped in every mode present in enable_mask.
+ *
+ * Returns true only when all enable_mask bits were already recorded for
+ * this exact offset.  Otherwise the bits are recorded and false is
+ * returned.  Also returns false when gpuaddr is 0 or not inside any
+ * tracked buffer.
+ *
+ * If a buffer's per-offset table is full the offset cannot be remembered;
+ * false is returned, so the caller may dump that location again, but
+ * nothing is written past the end of the table.
+ */
+bool
+has_dumped(uint64_t gpuaddr, unsigned enable_mask)
+{
+       if (!gpuaddr)
+               return false;
+
+       for (int i = 0; i < nbuffers; i++) {
+               struct buffer *b = &buffers[i];
+
+               if (!buffer_contains_gpuaddr(b, gpuaddr, 0))
+                       continue;
+
+               /* containment guarantees gpuaddr >= b->gpuaddr */
+               unsigned offset = gpuaddr - b->gpuaddr;
+               unsigned n;
+
+               for (n = 0; n < b->noffsets; n++)
+                       if (b->offsets[n].offset == offset)
+                               break;
+
+               /* if needed, allocate a new offset entry: */
+               if (n == b->noffsets) {
+                       /* check capacity *before* touching the slot, so
+                        * the last entry is usable and an overflow can
+                        * never scribble past offsets[]
+                        */
+                       if (n >= ARRAY_SIZE(b->offsets))
+                               return false;
+
+                       b->offsets[n].dumped_mask = 0;
+                       b->offsets[n].offset = offset;
+                       b->noffsets++;
+               }
+
+               if ((b->offsets[n].dumped_mask & enable_mask) == enable_mask)
+                       return true;
+
+               b->offsets[n].dumped_mask |= enable_mask;
+
+               return false;
+       }
+
+       return false;
+}
+
+/**
+ * Forget every tracked buffer: free each host copy, clear the per-buffer
+ * 'once' bookkeeping and mark the table empty.
+ */
+void
+reset_buffers(void)
+{
+       struct buffer *buf = buffers;
+       struct buffer *end = buffers + nbuffers;
+
+       for (; buf < end; buf++) {
+               free(buf->hostptr);
+               buf->hostptr = NULL;
+               buf->len = 0;
+               buf->noffsets = 0;
+       }
+
+       nbuffers = 0;
+}
+
+/**
+ * Record buffer contents, takes ownership of hostptr (freed in
+ * reset_buffers(), or here when the entry is later replaced or cannot
+ * be stored).
+ *
+ * @gpuaddr: GPU address of the start of the buffer
+ * @len:     length of the buffer in bytes
+ * @hostptr: heap-allocated host copy of the contents
+ *
+ * If a buffer with the same gpuaddr is already tracked, its entry is
+ * updated in place and the previous host copy is released.
+ */
+void
+add_buffer(uint64_t gpuaddr, unsigned int len, void *hostptr)
+{
+       int i;
+
+       for (i = 0; i < nbuffers; i++) {
+               if (buffers[i].gpuaddr == gpuaddr)
+                       break;
+       }
+
+       if (i == nbuffers) {
+               /* some traces, like test-perf, with some blob versions,
+                * seem to generate an unreasonable # of gpu buffers (a
+                * leak?), so just ignore them.
+                */
+               if (nbuffers >= ARRAY_SIZE(buffers)) {
+                       free(hostptr);
+                       return;
+               }
+               nbuffers++;
+       } else if (buffers[i].hostptr != hostptr) {
+               /* replacing an existing entry: we own the old host copy,
+                * and nothing else will free it once it is overwritten
+                */
+               free(buffers[i].hostptr);
+       }
+
+       buffers[i].hostptr = hostptr;
+       buffers[i].len     = len;
+       buffers[i].gpuaddr = gpuaddr;
+}
diff --git a/src/freedreno/decode/buffers.h b/src/freedreno/decode/buffers.h
new file mode 100644 (file)
index 0000000..f63f3f3
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __BUFFERS_H__
+#define __BUFFERS_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/* GPU address for a host pointer inside a tracked buffer, or 0 if none. */
+uint64_t gpuaddr(void *hostptr);
+/* Start GPU address of the tracked buffer containing gpuaddr, or 0. */
+uint64_t gpubaseaddr(uint64_t gpuaddr);
+/* Host pointer corresponding to gpuaddr, or a null pointer if untracked. */
+void * hostptr(uint64_t gpuaddr);
+/* Bytes from gpuaddr to the end of its tracked buffer, or 0 if untracked. */
+unsigned hostlen(uint64_t gpuaddr);
+/* True if every enable_mask bit was already recorded for gpuaddr; otherwise
+ * records the bits and returns false.
+ */
+bool has_dumped(uint64_t gpuaddr, unsigned enable_mask);
+
+/* Free all host copies and empty the buffer table. */
+void reset_buffers(void);
+/* Track a buffer; takes ownership of hostptr. */
+void add_buffer(uint64_t gpuaddr, unsigned int len, void *hostptr);
+
+#ifndef ARRAY_SIZE
+#  define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+#endif /* __BUFFERS_H__ */
diff --git a/src/freedreno/decode/cffdec.c b/src/freedreno/decode/cffdec.c
new file mode 100644 (file)
index 0000000..d0b2695
--- /dev/null
@@ -0,0 +1,2717 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <err.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <errno.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "script.h"
+#include "rnnutil.h"
+#include "buffers.h"
+#include "cffdec.h"
+
+/* ************************************************************************* */
+/* originally based on kernel recovery dump code: */
+
+/* Decode options consulted by the helpers below (gpu_id, draw_filter,
+ * color, ...).  It is not assigned in this part of the file.
+ */
+static const struct cffdec_options *options;
+
+static bool needs_wfi = false;
+/* when set, quiet() suppresses output at level 3 and above */
+static bool summary = false;
+/* the reg_dump_tex_*_hi() handlers do nothing unless this is set */
+static bool in_summary = false;
+static int vertices;
+
+/* Upper bound used when validating register offsets: 0xffff when
+ * options->gpu_id is 500 or above, 0x7fff otherwise.
+ */
+static inline unsigned regcnt(void)
+{
+       return (options->gpu_id >= 500) ? 0xffff : 0x7fff;
+}
+
+/* Non-zero when options->gpu_id is 500 or above; callers use it to choose
+ * between 64-bit and 32-bit address formatting.
+ */
+static int is_64b(void)
+{
+       return (options->gpu_id >= 500) ? 1 : 0;
+}
+
+
+static int draws[3];
+/* Per-IB-level decode state, indexed by the current IB level (ib). */
+static struct {
+       uint64_t base;
+       uint32_t size;   /* in dwords */
+       /* Generally cmdstream consists of multiple IB calls to different
+        * buffers, which are themselves often re-used for each tile.  The
+        * triggered flag serves two purposes to help make it more clear
+        * what part of the cmdstream is before vs after the GPU hang:
+        *
+        * 1) if in IB2 we are past the point within the IB2 buffer where
+        *    the GPU hung, but IB1 is not past the point within its
+        *    buffer where the GPU had hung, then we know the GPU hang
+        *    happens on a future use of that IB2 buffer.
+        *
+        * 2) if in an IB1 or IB2 buffer that is not the one where the GPU
+        *    hung, but we've already passed the trigger point at the same
+        *    IB level, we know that we are past the point where the GPU
+        *    had hung.
+        *
+        * So this is a one way switch, false->true.  And a higher #'d
+        * IB level isn't considered triggered unless the lower #'d IB
+        * level is.
+        */
+       bool triggered;
+} ibs[4];
+static int ib;
+
+static int draw_count;
+/* compared against options->draw_filter in quiet() */
+static int current_draw_count;
+
+/* query mode.. to handle symbolic register name queries, we need to
+ * defer parsing query string until after gpu_id is known and rnn db
+ * loaded:
+ */
+static int *queryvals;
+
+/* Decide whether output at verbosity level lvl should be suppressed.
+ *
+ * Output is suppressed when a draw filter is active and does not match
+ * the current draw, when a query or script is in use and lvl >= 2, or
+ * when summary mode is on and lvl >= 3.
+ */
+static bool
+quiet(int lvl)
+{
+       const bool filtered = (options->draw_filter != -1) &&
+                       (options->draw_filter != current_draw_count);
+       const bool scripted = options->querystrs || options->script;
+
+       if (filtered)
+               return true;
+       if (scripted && (lvl >= 2))
+               return true;
+       if (summary && (lvl >= 3))
+               return true;
+
+       return false;
+}
+
+/* printf() to stdout, unless quiet(lvl) says output at this level is
+ * currently suppressed.
+ */
+void
+printl(int lvl, const char *fmt, ...)
+{
+       if (!quiet(lvl)) {
+               va_list ap;
+
+               va_start(ap, fmt);
+               vprintf(fmt, ap);
+               va_end(ap);
+       }
+}
+
+/* Indentation prefixes, indexed by nesting level: entry N (for N = 0..8)
+ * is N+1 tabs.  The remaining entries are the literal "x", which makes
+ * an unexpectedly deep level stand out in the output.
+ */
+static const char *levels[] = {
+               "\t",
+               "\t\t",
+               "\t\t\t",
+               "\t\t\t\t",
+               "\t\t\t\t\t",
+               "\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t\t",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+};
+
+/* Passed to dump_tex_samp(); presumably describes how the state being
+ * dumped was supplied (directly, via an indirect buffer, or bindless) --
+ * TODO confirm against the users further down the file.
+ */
+enum state_src_t {
+       STATE_SRC_DIRECT,
+       STATE_SRC_INDIRECT,
+       STATE_SRC_BINDLESS,
+};
+
+/* SDS (CP_SET_DRAW_STATE) helpers: */
+static void load_all_groups(int level);
+static void disable_all_groups(void);
+
+/* forward declarations; used by the register handlers below */
+static void dump_tex_samp(uint32_t *texsamp, enum state_src_t src, int num_unit, int level);
+static void dump_tex_const(uint32_t *texsamp, int num_unit, int level);
+
+/* Decide whether the line for gpuaddr should be highlighted as being at or
+ * after the estimated crash location for the current IB level.
+ *
+ * Side effects: may latch ibs[ib].triggered (never clears it) and prints
+ * the "ESTIMATED CRASH LOCATION!" marker when the address range is hit.
+ */
+static bool
+highlight_gpuaddr(uint64_t gpuaddr)
+{
+       if (!options->color)
+               return false;
+
+       /* no crash location recorded for this IB level */
+       if (!options->ibs[ib].base)
+               return false;
+
+       /* a level only counts as triggered once the level below it has */
+       if ((ib > 0) && options->ibs[ib-1].base && !ibs[ib-1].triggered)
+               return false;
+
+       /* already latched: everything from here on is highlighted */
+       if (ibs[ib].triggered)
+               return true;
+
+       /* currently decoding a different buffer than the recorded one */
+       if (options->ibs[ib].base != ibs[ib].base)
+               return false;
+
+       /* sizes are in dwords, hence the factor of 4 to get byte addresses */
+       uint64_t start = ibs[ib].base + 4 * (ibs[ib].size - options->ibs[ib].rem);
+       uint64_t end   = ibs[ib].base + 4 * ibs[ib].size;
+
+       /* NOTE(review): 'end' is one past the last byte of the IB, yet the
+        * comparison is inclusive -- confirm that this is intended.
+        */
+       bool triggered = (start <= gpuaddr) && (gpuaddr <= end);
+
+       ibs[ib].triggered |= triggered;
+
+       if (triggered)
+               printf("ESTIMATED CRASH LOCATION!\n");
+
+       return triggered;
+}
+
+/* Hex-dump sizedwords dwords, eight per row, each row prefixed with its GPU
+ * address, the indentation for 'level' and the byte offset into the dump.
+ *
+ * Rows that are entirely zero are skipped (the first row is always shown)
+ * and a run of skipped rows is marked with a single "*" line.  Nothing is
+ * printed when quiet(2) is in effect.
+ */
+static void
+dump_hex(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       /* unsigned, to match sizedwords in the loop comparisons */
+       uint32_t i, j;
+       int lastzero = 1;
+
+       if (quiet(2))
+               return;
+
+       for (i = 0; i < sizedwords; i += 8) {
+               int zero = 1;
+
+               /* always show first row: */
+               if (i == 0)
+                       zero = 0;
+
+               for (j = 0; (j < 8) && (i+j < sizedwords) && zero; j++)
+                       if (dwords[i+j])
+                               zero = 0;
+
+               if (zero && !lastzero)
+                       printf("*\n");
+
+               lastzero = zero;
+
+               if (zero)
+                       continue;
+
+               uint64_t addr = gpuaddr(&dwords[i]);
+               bool highlight = highlight_gpuaddr(addr);
+
+               if (highlight)
+                       printf("\x1b[0;1;31m");
+
+               if (is_64b()) {
+                       /* PRIx64 rather than %lx: uint64_t is not 'long'
+                        * on every platform
+                        */
+                       printf("%016" PRIx64 ":%s", addr, levels[level]);
+               } else {
+                       printf("%08x:%s", (uint32_t)addr, levels[level]);
+               }
+
+               if (highlight)
+                       printf("\x1b[0m");
+
+               printf("%04x:", i * 4);
+
+               for (j = 0; (j < 8) && (i+j < sizedwords); j++) {
+                       printf(" %08x", dwords[i+j]);
+               }
+
+               printf("\n");
+       }
+}
+
+/* Print sizedwords floats, eight per row, each row prefixed with the GPU
+ * address of its first value and the indentation for 'level'.
+ */
+static void
+dump_float(float *dwords, uint32_t sizedwords, int level)
+{
+       /* unsigned, to match sizedwords in the loop comparison */
+       uint32_t i;
+
+       for (i = 0; i < sizedwords; i++) {
+               if ((i % 8) == 0) {
+                       if (is_64b()) {
+                               /* PRIx64 rather than %lx: uint64_t is not
+                                * 'long' on every platform
+                                */
+                               printf("%016" PRIx64 ":%s", gpuaddr(dwords), levels[level]);
+                       } else {
+                               printf("%08x:%s", (uint32_t)gpuaddr(dwords), levels[level]);
+                       }
+               } else {
+                       printf(" ");
+               }
+               printf("%8f", *(dwords++));
+               if ((i % 8) == 7)
+                       printf("\n");
+       }
+
+       /* terminate a final, partially filled row */
+       if (i % 8)
+               printf("\n");
+}
+
+/* I believe the surface format is low bits:
+#define RB_COLOR_INFO__COLOR_FORMAT_MASK                   0x0000000fL
+comments in sys2gmem_tex_const indicate that address is [31:12], but
+looks like at least some of the bits above the format have different meaning..
+*/
+/* Split a dword that packs an address together with flag bits: the bits
+ * selected by mask are returned in *flags, the remaining bits in *gpuaddr.
+ */
+static void parse_dword_addr(uint32_t dword, uint32_t *gpuaddr,
+               uint32_t *flags, uint32_t mask)
+{
+       assert(!is_64b());  /* this is only used on a2xx */
+       *gpuaddr = dword & ~mask;
+       *flags   = dword & mask;
+}
+
+/* Shadow copy of the register file, indexed by register offset. */
+static uint32_t type0_reg_vals[0xffff + 1];
+/* One-bit-per-register bitmaps (bit regbase%8 of byte regbase/8).
+ * NOTE(review): both are sized from sizeof(type0_reg_vals), i.e. the byte
+ * size rather than the element count, so they hold more bits than there
+ * are registers.  Harmless, but confirm nothing relies on the extra room
+ * before shrinking them.
+ */
+static uint8_t type0_reg_rewritten[sizeof(type0_reg_vals)/8];  /* written since last draw */
+static uint8_t type0_reg_written[sizeof(type0_reg_vals)/8];
+/* Per-register values returned by reg_lastval(). */
+static uint32_t lastvals[ARRAY_SIZE(type0_reg_vals)];
+
+/* True when the "rewritten" bit for regbase is set (see reg_set() and
+ * clear_rewritten()).
+ */
+static bool reg_rewritten(uint32_t regbase)
+{
+       const uint8_t bit = 1 << (regbase % 8);
+
+       return (type0_reg_rewritten[regbase / 8] & bit) != 0;
+}
+
+/* True when the "written" bit for regbase is set (see reg_set() and
+ * clear_written()).
+ */
+bool reg_written(uint32_t regbase)
+{
+       const uint8_t bit = 1 << (regbase % 8);
+
+       return (type0_reg_written[regbase / 8] & bit) != 0;
+}
+
+/* Reset every "rewritten" bit. */
+static void clear_rewritten(void)
+{
+       memset(type0_reg_rewritten, 0, sizeof type0_reg_rewritten);
+}
+
+/* Reset every "written" bit, and with it every "rewritten" bit. */
+static void clear_written(void)
+{
+       memset(type0_reg_written, 0, sizeof type0_reg_written);
+       clear_rewritten();
+}
+
+/* Return the lastvals[] entry recorded for register regbase.
+ *
+ * regbase must index into lastvals[]; as in reg_set(), an out-of-range
+ * offset trips an assert instead of reading past the end of the array.
+ */
+uint32_t reg_lastval(uint32_t regbase)
+{
+       assert(regbase < ARRAY_SIZE(lastvals));
+       return lastvals[regbase];
+}
+
+/* Zero every entry of lastvals[]. */
+static void
+clear_lastvals(void)
+{
+       memset(lastvals, 0, sizeof lastvals);
+}
+
+/* Return the shadowed value of register regbase, as last stored by
+ * reg_set().
+ *
+ * regbase must index into the shadow register file; as in reg_set(), an
+ * out-of-range offset trips an assert instead of reading past the end of
+ * the array.
+ */
+uint32_t
+reg_val(uint32_t regbase)
+{
+       assert(regbase < ARRAY_SIZE(type0_reg_vals));
+       return type0_reg_vals[regbase];
+}
+
+/* Store val as the shadowed value of register regbase and set both its
+ * "written" and "rewritten" bits.  regbase must be below regcnt().
+ */
+void
+reg_set(uint32_t regbase, uint32_t val)
+{
+       assert(regbase < regcnt());
+
+       const uint32_t byte = regbase / 8;
+       const uint8_t bit = 1 << (regbase % 8);
+
+       type0_reg_vals[regbase] = val;
+       type0_reg_written[byte] |= bit;
+       type0_reg_rewritten[byte] |= bit;
+}
+
+/* Register handler: print the shadowed values of the four registers at
+ * offsets +4..+7 from the first CP scratch register, as a comma separated
+ * list.  name and dword are unused.  Silent under quiet(3) or when the
+ * scratch register cannot be resolved by name.
+ */
+static void
+reg_dump_scratch(const char *name, uint32_t dword, int level)
+{
+       if (quiet(3))
+               return;
+
+       unsigned base = regbase("CP_SCRATCH[0].REG");
+
+       /* not found: fall back to the old a2xx/a3xx register name */
+       if (!base)
+               base = regbase("CP_SCRATCH_REG0");
+       if (!base)
+               return;
+
+       const uint32_t v4 = reg_val(base + 4);
+       const uint32_t v5 = reg_val(base + 5);
+       const uint32_t v6 = reg_val(base + 6);
+       const uint32_t v7 = reg_val(base + 7);
+
+       printf("%s:%u,%u,%u,%u\n", levels[level], v4, v5, v6, v7);
+}
+
+/* Hex-dump up to sizedwords dwords starting at gpuaddr, one indentation
+ * level deeper than 'level'.
+ *
+ * Does nothing under quiet(quietlvl) or when gpuaddr is not inside a
+ * tracked buffer.  The dump is clamped to what is left of that buffer, so
+ * a request that extends past its end no longer reads beyond the host
+ * copy.
+ */
+static void
+dump_gpuaddr_size(uint64_t gpuaddr, int level, int sizedwords, int quietlvl)
+{
+       uint32_t *buf;
+       uint32_t avail, count;
+
+       if (quiet(quietlvl))
+               return;
+
+       buf = hostptr(gpuaddr);
+       if (!buf)
+               return;
+
+       /* hostlen() is in bytes; dump_hex() counts dwords */
+       avail = hostlen(gpuaddr) / 4;
+       count = (uint32_t)sizedwords;
+       if (count > avail)
+               count = avail;
+
+       dump_hex(buf, count, level+1);
+}
+
+/* Hex-dump the default amount (64 dwords) at gpuaddr, subject to
+ * quiet level 3.
+ */
+static void
+dump_gpuaddr(uint64_t gpuaddr, int level)
+{
+       enum { DEFAULT_DWORDS = 64, QUIET_LEVEL = 3 };
+
+       dump_gpuaddr_size(gpuaddr, level, DEFAULT_DWORDS, QUIET_LEVEL);
+}
+
+/* Register handler: treat the register value as a 32-bit GPU address and
+ * hex-dump what it points at.  name is unused.
+ */
+static void
+reg_dump_gpuaddr(const char *name, uint32_t dword, int level)
+{
+       const uint64_t addr = dword;
+
+       dump_gpuaddr(addr, level);
+}
+
+/* Low 32 bits of a 64-bit address, latched by the *_LO register handler
+ * and combined with the high half by the matching *_HI handler.
+ */
+uint32_t gpuaddr_lo;
+
+/* Register handler for the low half of an address pair: only remember the
+ * value.  name and level are unused.
+ */
+static void
+reg_gpuaddr_lo(const char *name, uint32_t dword, int level)
+{
+       gpuaddr_lo = dword;
+}
+
+/* Register handler for the high half of an address pair: combine it with
+ * the latched low half and hex-dump what the address points at.
+ */
+static void
+reg_dump_gpuaddr_hi(const char *name, uint32_t dword, int level)
+{
+       const uint64_t addr = (((uint64_t)dword) << 32) | gpuaddr_lo;
+
+       dump_gpuaddr(addr, level);
+}
+
+
+/* When options->dump_shaders is set, write bufsz bytes from buf to a file
+ * named "NNNN.<ext>" in the current directory, where NNNN is a counter
+ * that increments on every call that reaches this point.
+ *
+ * Failures are reported on stderr and otherwise ignored; dumping shaders
+ * is best-effort and must not abort decoding.
+ */
+static void
+dump_shader(const char *ext, void *buf, int bufsz)
+{
+       static int n = 0;
+       /* "0000.vo3" alone needs 9 bytes with the terminator; leave room
+        * for a counter beyond four digits too
+        */
+       char filename[32];
+       ssize_t ret;
+       int fd;
+
+       if (!options->dump_shaders)
+               return;
+
+       snprintf(filename, sizeof(filename), "%04d.%s", n++, ext);
+
+       fd = open(filename, O_WRONLY| O_TRUNC | O_CREAT, 0644);
+       if (fd < 0) {
+               warn("could not open %s", filename);
+               return;
+       }
+
+       ret = write(fd, buf, bufsz);
+       if (ret < 0)
+               warn("could not write %s", filename);
+       else if (ret != bufsz)
+               warnx("short write to %s", filename);
+
+       close(fd);
+}
+
+/* Hex-dump and disassemble the shader found at gpuaddr (with the low four
+ * address bits masked off), and hand it to dump_shader() when the register
+ * name identifies the shader stage.
+ *
+ * Does nothing under quiet(3) or when the address is not inside a tracked
+ * buffer.
+ */
+static void
+disasm_gpuaddr(const char *name, uint64_t gpuaddr, int level)
+{
+       /* register-name fragment -> dump file extension, checked in order */
+       static const struct {
+               const char *match;
+               const char *ext;
+       } stages[] = {
+               { "SP_VS_OBJ", "vo3" },
+               { "SP_FS_OBJ", "fo3" },
+               { "SP_GS_OBJ", "go3" },
+               { "SP_CS_OBJ", "co3" },
+       };
+
+       gpuaddr &= 0xfffffffffffffff0;
+
+       if (quiet(3))
+               return;
+
+       void *buf = hostptr(gpuaddr);
+       if (!buf)
+               return;
+
+       /* everything up to the end of the buffer is treated as shader */
+       uint32_t sizedwords = hostlen(gpuaddr) / 4;
+
+       dump_hex(buf, min(64, sizedwords), level+1);
+       disasm_a3xx(buf, sizedwords, level+2, stdout, options->gpu_id);
+
+       /* this is a bit ugly way, but oh well.. */
+       const char *ext = NULL;
+       for (unsigned s = 0; s < ARRAY_SIZE(stages); s++) {
+               if (strstr(name, stages[s].match)) {
+                       ext = stages[s].ext;
+                       break;
+               }
+       }
+
+       if (ext)
+               dump_shader(ext, buf, sizedwords * 4);
+}
+
+/* Register handler: treat the register value as a 32-bit GPU address of a
+ * shader and disassemble it.
+ */
+static void
+reg_disasm_gpuaddr(const char *name, uint32_t dword, int level)
+{
+       const uint64_t addr = dword;
+
+       disasm_gpuaddr(name, addr, level);
+}
+
+/* Register handler for the high half of a shader address pair: combine it
+ * with the latched low half and disassemble the shader found there.
+ */
+static void
+reg_disasm_gpuaddr_hi(const char *name, uint32_t dword, int level)
+{
+       const uint64_t addr = (((uint64_t)dword) << 32) | gpuaddr_lo;
+
+       disasm_gpuaddr(name, addr, level);
+}
+
+/* Look up the value of the TEX_COUNT register that belongs to the named
+ * TEX_SAMP/TEX_CONST register.
+ *
+ * The count register's name is formed by cutting the given name at the
+ * first "CONST" (or, failing that, "SAMP") and appending "COUNT".
+ * Returns 0 when the name contains neither.
+ *
+ * Note, this kinda assumes an equal # of samplers and textures, but not
+ * really sure if there is a much better option.  I suppose on a6xx we
+ * could instead decode the bitfields in SP_xS_CONFIG
+ */
+static int
+get_tex_count(const char *name)
+{
+       char count_reg[strlen(name) + 5];
+       const char *suffix = strstr(name, "CONST");
+
+       if (!suffix)
+               suffix = strstr(name, "SAMP");
+       if (!suffix)
+               return 0;
+
+       /* keep everything before the suffix, then substitute "COUNT" */
+       const int prefix_len = suffix - name;
+       memcpy(count_reg, name, prefix_len);
+       strcpy(count_reg + prefix_len, "COUNT");
+
+       return reg_val(regbase(count_reg));
+}
+
+/* Register handler for the high half of a TEX_SAMP address pair: while in
+ * the summary, dump the sampler state at the combined 64-bit address.
+ * Does nothing outside the summary or when the address is not tracked.
+ */
+static void
+reg_dump_tex_samp_hi(const char *name, uint32_t dword, int level)
+{
+       if (in_summary) {
+               const int num_unit = get_tex_count(name);
+               const uint64_t addr = (((uint64_t)dword) << 32) | gpuaddr_lo;
+               void *samp = hostptr(addr);
+
+               if (samp)
+                       dump_tex_samp(samp, STATE_SRC_DIRECT, num_unit, level+1);
+       }
+}
+
+/* Register handler for the high half of a TEX_CONST address pair: while in
+ * the summary, dump the texture state at the combined 64-bit address.
+ * Does nothing outside the summary or when the address is not tracked.
+ */
+static void
+reg_dump_tex_const_hi(const char *name, uint32_t dword, int level)
+{
+       if (in_summary) {
+               const int num_unit = get_tex_count(name);
+               const uint64_t addr = (((uint64_t)dword) << 32) | gpuaddr_lo;
+               void *consts = hostptr(addr);
+
+               if (consts)
+                       dump_tex_const(consts, num_unit, level+1);
+       }
+}
+
+/*
+ * Registers with special handling (rnndec_decode() handles rest):
+ */
+#define REG(x, fxn) { #x, fxn }
+static struct {
+       const char *regname;
+       void (*fxn)(const char *name, uint32_t dword, int level);
+       uint32_t regbase;
+} reg_a2xx[] = {
+               REG(CP_SCRATCH_REG0, reg_dump_scratch),
+               REG(CP_SCRATCH_REG1, reg_dump_scratch),
+               REG(CP_SCRATCH_REG2, reg_dump_scratch),
+               REG(CP_SCRATCH_REG3, reg_dump_scratch),
+               REG(CP_SCRATCH_REG4, reg_dump_scratch),
+               REG(CP_SCRATCH_REG5, reg_dump_scratch),
+               REG(CP_SCRATCH_REG6, reg_dump_scratch),
+               REG(CP_SCRATCH_REG7, reg_dump_scratch),
+               {NULL},
+}, reg_a3xx[] = {
+               REG(CP_SCRATCH_REG0, reg_dump_scratch),
+               REG(CP_SCRATCH_REG1, reg_dump_scratch),
+               REG(CP_SCRATCH_REG2, reg_dump_scratch),
+               REG(CP_SCRATCH_REG3, reg_dump_scratch),
+               REG(CP_SCRATCH_REG4, reg_dump_scratch),
+               REG(CP_SCRATCH_REG5, reg_dump_scratch),
+               REG(CP_SCRATCH_REG6, reg_dump_scratch),
+               REG(CP_SCRATCH_REG7, reg_dump_scratch),
+               REG(VSC_SIZE_ADDRESS, reg_dump_gpuaddr),
+               REG(SP_VS_PVT_MEM_ADDR_REG, reg_dump_gpuaddr),
+               REG(SP_FS_PVT_MEM_ADDR_REG, reg_dump_gpuaddr),
+               REG(SP_VS_OBJ_START_REG, reg_disasm_gpuaddr),
+               REG(SP_FS_OBJ_START_REG, reg_disasm_gpuaddr),
+               REG(TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+               {NULL},
+}, reg_a4xx[] = {
+               REG(CP_SCRATCH[0].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x1].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x2].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x3].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
+               REG(SP_VS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+               REG(SP_FS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+               REG(SP_GS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+               REG(SP_HS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+               REG(SP_DS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+               REG(SP_CS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+               REG(SP_VS_OBJ_START, reg_disasm_gpuaddr),
+               REG(SP_FS_OBJ_START, reg_disasm_gpuaddr),
+               REG(SP_GS_OBJ_START, reg_disasm_gpuaddr),
+               REG(SP_HS_OBJ_START, reg_disasm_gpuaddr),
+               REG(SP_DS_OBJ_START, reg_disasm_gpuaddr),
+               REG(SP_CS_OBJ_START, reg_disasm_gpuaddr),
+               REG(TPL1_TP_VS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+               REG(TPL1_TP_HS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+               REG(TPL1_TP_DS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+               REG(TPL1_TP_GS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+               REG(TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+               {NULL},
+}, reg_a5xx[] = {
+               REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
+               REG(SP_VS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_VS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_HS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_HS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_DS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_DS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_GS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_GS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_FS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_FS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_CS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_CS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(TPL1_VS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(TPL1_VS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(TPL1_VS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(TPL1_VS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(TPL1_HS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(TPL1_HS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(TPL1_HS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(TPL1_HS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(TPL1_DS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(TPL1_DS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(TPL1_DS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(TPL1_DS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(TPL1_GS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(TPL1_GS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(TPL1_GS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(TPL1_GS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(TPL1_FS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(TPL1_FS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(TPL1_FS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(TPL1_FS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(TPL1_CS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(TPL1_CS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(TPL1_CS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(TPL1_CS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(TPL1_TP_BORDER_COLOR_BASE_ADDR_LO,  reg_gpuaddr_lo),
+               REG(TPL1_TP_BORDER_COLOR_BASE_ADDR_HI,  reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[0].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[0].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[1].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[1].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[2].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[2].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[3].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[3].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[4].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[4].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[5].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[5].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[6].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[6].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT_FLAG_BUFFER[7].ADDR_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT_FLAG_BUFFER[7].ADDR_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_BLIT_FLAG_DST_LO, reg_gpuaddr_lo),
+//             REG(RB_BLIT_FLAG_DST_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_MRT[0].BASE_LO, reg_gpuaddr_lo),
+//             REG(RB_MRT[0].BASE_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_DEPTH_BUFFER_BASE_LO, reg_gpuaddr_lo),
+//             REG(RB_DEPTH_BUFFER_BASE_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_DEPTH_FLAG_BUFFER_BASE_LO, reg_gpuaddr_lo),
+//             REG(RB_DEPTH_FLAG_BUFFER_BASE_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_BLIT_DST_LO, reg_gpuaddr_lo),
+//             REG(RB_BLIT_DST_HI, reg_dump_gpuaddr_hi),
+
+//             REG(RB_2D_SRC_LO, reg_gpuaddr_lo),
+//             REG(RB_2D_SRC_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_2D_SRC_FLAGS_LO, reg_gpuaddr_lo),
+//             REG(RB_2D_SRC_FLAGS_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_2D_DST_LO, reg_gpuaddr_lo),
+//             REG(RB_2D_DST_HI, reg_dump_gpuaddr_hi),
+//             REG(RB_2D_DST_FLAGS_LO, reg_gpuaddr_lo),
+//             REG(RB_2D_DST_FLAGS_HI, reg_dump_gpuaddr_hi),
+
+               {NULL},
+}, reg_a6xx[] = {
+               REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
+               REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
+
+               REG(SP_VS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_VS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_HS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_HS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_DS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_DS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_GS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_GS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_FS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_FS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+               REG(SP_CS_OBJ_START_LO, reg_gpuaddr_lo),
+               REG(SP_CS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+
+               REG(SP_VS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(SP_VS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(SP_VS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(SP_VS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(SP_HS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(SP_HS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(SP_HS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(SP_HS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(SP_DS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(SP_DS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(SP_DS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(SP_DS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(SP_GS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(SP_GS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(SP_GS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(SP_GS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(SP_FS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(SP_FS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(SP_FS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(SP_FS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+               REG(SP_CS_TEX_CONST_LO, reg_gpuaddr_lo),
+               REG(SP_CS_TEX_CONST_HI, reg_dump_tex_const_hi),
+               REG(SP_CS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+               REG(SP_CS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+
+               {NULL},
+}, *type0_reg;
+
+/* Handle to the register database.  It is (re)created by init_rnn() and is
+ * what the name/offset lookup helpers in this file (pktname(), regname(),
+ * regbase(), ...) pass down to the rnn_*() functions.
+ */
+static struct rnn *rnn;
+
+/**
+ * Create the register database for one GPU generation and resolve everything
+ * that is keyed by register name.
+ *
+ * Steps:
+ *  - allocate a new rnn handle (replacing the global one) and load the
+ *    database named by @gpuname,
+ *  - if query strings were supplied, translate each one into a register
+ *    offset stored in queryvals[],
+ *  - resolve the register names of the active type0_reg table into offsets.
+ *
+ * Exits the process if the query table cannot be allocated or if a name in
+ * the type0_reg table is not known to the database.
+ *
+ * @gpuname: database name handed to rnn_load(), e.g. "a6xx"
+ */
+static void
+init_rnn(const char *gpuname)
+{
+       rnn = rnn_new(!options->color);
+
+       rnn_load(rnn, gpuname);
+
+       if (options->querystrs) {
+               queryvals = calloc(options->nquery, sizeof(queryvals[0]));
+
+               /* calloc() may legitimately return NULL for a zero-sized
+                * request, so only treat NULL as failure when we actually
+                * asked for entries:
+                */
+               if (!queryvals && options->nquery)
+                       errx(1, "out of memory allocating query table");
+
+               for (int i = 0; i < options->nquery; i++) {
+                       /* A query is either a numeric register offset.. */
+                       int val = strtol(options->querystrs[i], NULL, 0);
+
+                       /* ..or, if it did not parse to a non-zero number,
+                        * a register name to look up:
+                        */
+                       if (val == 0)
+                               val = regbase(options->querystrs[i]);
+
+                       queryvals[i] = val;
+                       printf("querystr: %s -> 0x%x\n", options->querystrs[i], queryvals[i]);
+               }
+       }
+
+       for (unsigned idx = 0; type0_reg[idx].regname; idx++) {
+               type0_reg[idx].regbase = regbase(type0_reg[idx].regname);
+               if (!type0_reg[idx].regbase) {
+                       printf("invalid register name: %s\n", type0_reg[idx].regname);
+                       exit(1);
+               }
+       }
+}
+
+/* Forget all tracked register state (written flags and last-seen values)
+ * and zero the indirect-buffer bookkeeping in 'ibs'.
+ */
+void
+reset_regs(void)
+{
+       clear_written();
+       clear_lastvals();
+
+       memset(&ibs, 0, sizeof ibs);
+}
+
+/**
+ * (Re)initialize the decoder for a new command stream.
+ *
+ * Stores the caller's options, resets per-stream state (query table,
+ * register tracking, draw counter), selects the per-generation table of
+ * specially handled registers based on options->gpu_id and loads the
+ * matching register database.
+ *
+ * May be called more than once, e.g. when decoding multiple files.
+ * Terminates the process via errx() for a gpu_id outside 200..699.
+ *
+ * @_options: decoder options; the pointer is retained, not copied
+ */
+void
+cffdec_init(const struct cffdec_options *_options)
+{
+       options = _options;
+       summary = options->summary;
+
+       /* in case we're decoding multiple files: drop the previous query
+        * table.  Clear the pointer too, since init_rnn() only assigns a
+        * new table when query strings are present; otherwise a later
+        * call would free the same allocation a second time:
+        */
+       free(queryvals);
+       queryvals = NULL;
+       reset_regs();
+       draw_count = 0;
+
+       /* TODO we need an API to free/cleanup any previous rnn */
+
+       switch (options->gpu_id) {
+       case 200 ... 299:
+               type0_reg = reg_a2xx;
+               init_rnn("a2xx");
+               break;
+       case 300 ... 399:
+               type0_reg = reg_a3xx;
+               init_rnn("a3xx");
+               break;
+       case 400 ... 499:
+               type0_reg = reg_a4xx;
+               init_rnn("a4xx");
+               break;
+       case 500 ... 599:
+               type0_reg = reg_a5xx;
+               init_rnn("a5xx");
+               break;
+       case 600 ... 699:
+               type0_reg = reg_a6xx;
+               init_rnn("a6xx");
+               break;
+       default:
+               errx(-1, "unsupported gpu");
+       }
+}
+
+/* Look up the name of type-3 packet opcode 'opc' in the
+ * "adreno_pm4_type3_packets" enum of the loaded database.
+ */
+const char *
+pktname(unsigned opc)
+{
+       const char *name = rnn_enumname(rnn, "adreno_pm4_type3_packets", opc);
+
+       return name;
+}
+
+/* Return the database's name for the register at offset 'regbase'; 'color'
+ * is forwarded unchanged to rnn_regname().
+ */
+const char *
+regname(uint32_t regbase, int color)
+{
+       const char *name = rnn_regname(rnn, regbase, color);
+
+       return name;
+}
+
+/* Return the register offset the loaded database has for 'name'.  Callers in
+ * this file treat a result of 0 as "name not found".
+ */
+uint32_t
+regbase(const char *name)
+{
+       uint32_t offset = rnn_regbase(rnn, name);
+
+       return offset;
+}
+
+/**
+ * Test whether the name of the register at offset @regbase ends with
+ * @suffix.
+ *
+ * The tail of the name is compared directly, so a name that contains the
+ * suffix more than once (e.g. "FOO_LO_BAR_LO") is still matched, and no
+ * pointer outside the name string is ever formed.  A register without a
+ * name does not match anything.
+ *
+ * Returns non-zero on match, 0 otherwise.
+ */
+static int
+endswith(uint32_t regbase, const char *suffix)
+{
+       const char *name = regname(regbase, 0);
+
+       if (!name)
+               return 0;
+
+       size_t name_len = strlen(name);
+       size_t suffix_len = strlen(suffix);
+
+       /* a name shorter than the suffix cannot end with it: */
+       if (suffix_len > name_len)
+               return 0;
+
+       return strcmp(name + (name_len - suffix_len), suffix) == 0;
+}
+
+/**
+ * Print one register write in decoded form.
+ *
+ * If the database has type information for the register, the value is
+ * printed decoded; if it only knows the name, the raw value is printed in
+ * hex; if the register is unknown, both offset and value are printed in
+ * hex.
+ *
+ * For gpu_id >= 500, a register whose name ends in _LO/_HI and whose
+ * neighbour has the matching _HI/_LO suffix is treated as half of a 64-bit
+ * gpu address.  If that address maps to a known buffer, the buffer base,
+ * the offset into it and the buffer size are appended.
+ *
+ * @regbase: register offset
+ * @dword:   value written to the register
+ * @level:   index into levels[] selecting the line prefix
+ */
+void
+dump_register_val(uint32_t regbase, uint32_t dword, int level)
+{
+       struct rnndecaddrinfo *info = rnn_reginfo(rnn, regbase);
+
+       if (info && info->typeinfo) {
+               uint64_t gpuaddr = 0;
+               char *decoded = rnndec_decodeval(rnn->vc, info->typeinfo, dword);
+               printf("%s%s: %s", levels[level], info->name, decoded);
+
+               /* Try and figure out if we are looking at a gpuaddr.. this
+                * might be useful for other gen's too, but at least a5xx has
+                * the _HI/_LO suffix we can look for.  Maybe a better approach
+                * would be some special annotation in the xml..
+                */
+               if (options->gpu_id >= 500) {
+                       if (endswith(regbase, "_HI") && endswith(regbase-1, "_LO")) {
+                               /* this is the high half, low half was tracked earlier: */
+                               gpuaddr = (((uint64_t)dword) << 32) | reg_val(regbase-1);
+                       } else if (endswith(regbase, "_LO") && endswith(regbase+1, "_HI")) {
+                               /* this is the low half, use the tracked high half: */
+                               gpuaddr = (((uint64_t)reg_val(regbase+1)) << 32) | dword;
+                       }
+               }
+
+               if (gpuaddr && hostptr(gpuaddr)) {
+                       /* The address values are 64 bits wide, which does not
+                        * match %lx/%lu where long is 32 bits; print them as
+                        * unsigned long long instead:
+                        */
+                       printf("\t\tbase=%llx, offset=%llu, size=%u",
+                                       (unsigned long long)gpubaseaddr(gpuaddr),
+                                       (unsigned long long)(gpuaddr - gpubaseaddr(gpuaddr)),
+                                       hostlen(gpubaseaddr(gpuaddr)));
+               }
+
+               printf("\n");
+
+               free(decoded);
+       } else if (info) {
+               printf("%s%s: %08x\n", levels[level], info->name, dword);
+       } else {
+               printf("%s<%04x>: %08x\n", levels[level], regbase, dword);
+       }
+
+       /* the lookup result and its name are ours to release: */
+       if (info) {
+               free(info->name);
+               free(info);
+       }
+}
+
+/* Print one register write (unless suppressed by quiet(3)) and then, if the
+ * register appears in the active type0_reg table, invoke that entry's
+ * handler.  Only the first matching table entry is used.
+ */
+static void
+dump_register(uint32_t regbase, uint32_t dword, int level)
+{
+       unsigned idx;
+
+       if (!quiet(3))
+               dump_register_val(regbase, dword, level);
+
+       for (idx = 0; type0_reg[idx].regname != NULL; idx++) {
+               if (type0_reg[idx].regbase != regbase)
+                       continue;
+
+               type0_reg[idx].fxn(type0_reg[idx].regname, dword, level);
+               return;
+       }
+}
+
+/* True when the register offset lies in the range [0x2000, 0x2400), which
+ * this file treats as the banked register range.
+ */
+static bool
+is_banked_reg(uint32_t regbase)
+{
+       if (regbase < 0x2000)
+               return false;
+
+       return regbase < 0x2400;
+}
+
+/* Record and print a run of 'sizedwords' consecutive register writes that
+ * starts at offset 'regbase'; dwords[n] is the value for register
+ * regbase + n.  The global 'summary' is restored after every register.
+ */
+static void
+dump_registers(uint32_t regbase, uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       for (uint32_t n = 0; n < sizedwords; n++) {
+               const int saved_summary = summary;
+               const uint32_t reg = regbase + n;
+               const uint32_t val = dwords[n];
+
+               /* access to non-banked registers needs a WFI:
+                * TODO banked register range for a2xx??
+                */
+               if (needs_wfi && !is_banked_reg(reg))
+                       printl(2, "NEEDS WFI: %s (%x)\n", regname(reg, 1), reg);
+
+               reg_set(reg, val);
+               dump_register(reg, val, level);
+
+               summary = saved_summary;
+       }
+}
+
+/**
+ * Decode a block of dwords against a named database domain and print it.
+ *
+ * The packet script hook, if installed, is invoked first with the raw
+ * dwords.  Unless suppressed by quiet(2), each dword (or dword pair, for
+ * fields 64 bits wide) is then decoded and printed.  Decoding stops at the
+ * first offset for which the domain has no type information.
+ *
+ * Nothing is done if the domain does not exist in the database.
+ *
+ * @dwords:     data to decode
+ * @sizedwords: number of dwords available
+ * @level:      index into levels[] selecting the line prefix
+ * @name:       domain name to decode against
+ */
+static void
+dump_domain(uint32_t *dwords, uint32_t sizedwords, int level,
+               const char *name)
+{
+       struct rnndomain *dom;
+       int i;
+
+       dom = rnn_finddomain(rnn->db, name);
+
+       if (!dom)
+               return;
+
+       if (script_packet)
+               script_packet(dwords, sizedwords, rnn, dom);
+
+       if (quiet(2))
+               return;
+
+       for (i = 0; i < sizedwords; i++) {
+               struct rnndecaddrinfo *info = rnndec_decodeaddr(rnn->vc, dom, i, 0);
+               char *decoded;
+               if (!(info && info->typeinfo)) {
+                       /* a lookup result without type info still has to
+                        * be released before we stop decoding:
+                        */
+                       if (info) {
+                               free(info->name);
+                               free(info);
+                       }
+                       break;
+               }
+               uint64_t value = dwords[i];
+               if (info->typeinfo->high >= 32 && i < sizedwords - 1) {
+                       value |= (uint64_t) dwords[i + 1] << 32;
+                       i++; /* skip the next dword since we're printing it now */
+               }
+               decoded = rnndec_decodeval(rnn->vc, info->typeinfo, value);
+               /* Unlike the register printing path, we don't print the name
+                * of the register, so if it doesn't contain other named
+                * things (i.e. it isn't a bitset) then print the register
+                * name as if it's a bitset with a single entry. This avoids
+                * having to create a dummy register with a single entry to
+                * get a name in the decoding.
+                */
+               if (info->typeinfo->type == RNN_TTYPE_BITSET ||
+                   info->typeinfo->type == RNN_TTYPE_INLINE_BITSET) {
+                       printf("%s%s\n", levels[level], decoded);
+               } else {
+                       printf("%s{ %s%s%s = %s }\n", levels[level],
+                                       rnn->vc->colors->rname, info->name,
+                                       rnn->vc->colors->reset, decoded);
+               }
+               free(decoded);
+               free(info->name);
+               free(info);
+       }
+}
+
+
+/* Bin bounds printed with each query result.  __do_query() refreshes them
+ * from the window scissor registers for gpu_id 500..699.
+ */
+static uint32_t bin_x1, bin_x2, bin_y1, bin_y2;
+/* NOTE(review): presumably the current render mode bits (see the MODE_*
+ * values below); it is not touched by the query helpers here -- confirm
+ * against the code that sets it.
+ */
+static unsigned mode;
+/* Human readable name of the current render mode, printed by print_mode()
+ * and __do_query(); temporarily overridden by do_query_compare().
+ */
+static const char *render_mode;
+/* Bitmask of render modes.  do_query_compare() narrows it to one pass at a
+ * time before reloading state groups and restores it afterwards.
+ */
+static enum {
+       MODE_BINNING = 0x1,
+       MODE_GMEM    = 0x2,
+       MODE_BYPASS  = 0x4,
+       MODE_ALL     = MODE_BINNING | MODE_GMEM | MODE_BYPASS,
+} enable_mask = MODE_ALL;
+/* Global and local "skip IB2" flags, printed by print_mode() as g= and l=. */
+static bool skip_ib2_enable_global;
+static bool skip_ib2_enable_local;
+
+/* Print the current render mode and the skip_ib2 flags.  Only done for
+ * gpu_id >= 500, and only when quiet(2) does not suppress output.
+ */
+static void
+print_mode(int level)
+{
+       if (options->gpu_id < 500)
+               return;
+
+       if (quiet(2))
+               return;
+
+       printf("%smode: %s\n", levels[level], render_mode);
+       printf("%sskip_ib2: g=%d, l=%d\n", levels[level], skip_ib2_enable_global, skip_ib2_enable_local);
+}
+
+/* Decide whether the query output for the current draw should be skipped,
+ * according to options->query_mode:
+ *
+ *  - QUERY_ALL:     never skip
+ *  - QUERY_WRITTEN: skip unless some queried register has been written and
+ *                   was re-written
+ *  - QUERY_DELTA:   skip unless some queried, written register has a value
+ *                   different from the one remembered in lastvals[]
+ *
+ * Any other mode skips.
+ */
+static bool
+skip_query(void)
+{
+       switch (options->query_mode) {
+       case QUERY_ALL:
+               /* never skip: */
+               return false;
+       case QUERY_WRITTEN:
+               for (int q = 0; q < options->nquery; q++) {
+                       uint32_t reg = queryvals[q];
+
+                       if (reg_written(reg) && reg_rewritten(reg))
+                               return false;
+               }
+               return true;
+       case QUERY_DELTA:
+               for (int q = 0; q < options->nquery; q++) {
+                       uint32_t reg = queryvals[q];
+
+                       if (reg_written(reg)) {
+                               uint32_t current = reg_val(reg);
+
+                               if (current != lastvals[reg])
+                                       return false;
+                       }
+               }
+               return true;
+       }
+       return true;
+}
+
+/* Print one line for every queried register that has been written: draw
+ * number, primitive type, bin bounds, index count, render mode (gpu_id >=
+ * 500), raw value, a '!' marker if the value differs from lastvals[], a '+'
+ * marker if the register was re-written, and finally the decoded value.
+ *
+ * For gpu_id 500..699 the bin bounds are first refreshed from the window
+ * scissor registers.  A blank line follows when more than one register was
+ * printed.
+ */
+static void
+__do_query(const char *primtype, uint32_t num_indices)
+{
+       int printed = 0;
+
+       if ((500 <= options->gpu_id) && (options->gpu_id < 700)) {
+               uint32_t scissor_tl = reg_val(regbase("GRAS_SC_WINDOW_SCISSOR_TL"));
+               uint32_t scissor_br = reg_val(regbase("GRAS_SC_WINDOW_SCISSOR_BR"));
+
+               /* x in the low 16 bits, y in the high 16 bits: */
+               bin_x1 = scissor_tl & 0xffff;
+               bin_y1 = scissor_tl >> 16;
+               bin_x2 = scissor_br & 0xffff;
+               bin_y2 = scissor_br >> 16;
+       }
+
+       for (int q = 0; q < options->nquery; q++) {
+               uint32_t reg = queryvals[q];
+
+               if (!reg_written(reg))
+                       continue;
+
+               uint32_t current = reg_val(reg);
+
+               printf("%4d: %s(%u,%u-%u,%u):%u:", draw_count, primtype,
+                               bin_x1, bin_y1, bin_x2, bin_y2, num_indices);
+               if (options->gpu_id >= 500)
+                       printf("%s:", render_mode);
+               printf("\t%08x", current);
+
+               /* value changed since last time? */
+               if (current != lastvals[reg])
+                       printf("!");
+               else
+                       printf(" ");
+
+               /* register written again? */
+               if (reg_rewritten(reg))
+                       printf("+");
+               else
+                       printf(" ");
+
+               dump_register_val(reg, current, 0);
+               printed++;
+       }
+
+       if (printed > 1)
+               printf("\n");
+}
+
+/* 'query-compare' variant of the per-draw query: reload all state groups
+ * with every render mode enabled to decide whether anything of interest
+ * happened, and if so print the queried registers twice -- once with only
+ * the binning pass enabled and once with the gmem/bypass passes enabled.
+ *
+ * enable_mask and render_mode are modified while this runs and restored
+ * before returning; all groups are disabled again at the end.
+ */
+static void
+do_query_compare(const char *primtype, uint32_t num_indices)
+{
+       unsigned saved_enable_mask = enable_mask;
+       const char *saved_render_mode = render_mode;
+
+       /* in 'query-compare' mode, we want to see if the register is written
+        * or changed in any mode:
+        *
+        * (NOTE: this could cause false-positive for 'query-delta' if the reg
+        * is written with different values in binning vs sysmem/gmem mode, as
+        * we don't track previous values per-mode, but I think we can live with
+        * that)
+        */
+       enable_mask = MODE_ALL;
+
+       clear_rewritten();
+       load_all_groups(0);
+
+       if (!skip_query()) {
+               /* dump binning pass values: */
+               enable_mask = MODE_BINNING;
+               render_mode = "BINNING";
+               clear_rewritten();
+               load_all_groups(0);
+               __do_query(primtype, num_indices);
+
+               /* dump draw pass values: */
+               enable_mask = MODE_GMEM | MODE_BYPASS;
+               render_mode = "DRAW";
+               clear_rewritten();
+               load_all_groups(0);
+               __do_query(primtype, num_indices);
+
+               printf("\n");
+       }
+
+       /* put back the state we overrode above: */
+       enable_mask = saved_enable_mask;
+       render_mode = saved_render_mode;
+
+       disable_all_groups();
+}
+
+/* Per-draw hook: runs the script draw callback (if one is installed) and
+ * then the register query, either in compare mode or as a plain query that
+ * is subject to skip_query().
+ *
+ * NOTE: call this before dump_register_summary()
+ */
+static void
+do_query(const char *primtype, uint32_t num_indices)
+{
+       if (script_draw)
+               script_draw(primtype, num_indices);
+
+       if (options->query_compare)
+               do_query_compare(primtype, num_indices);
+       else if (!skip_query())
+               __do_query(primtype, num_indices);
+}
+
+/* Handler for an immediate shader load packet: dwords[0] selects the shader
+ * type (0 = vertex, 1 = fragment), dwords[1] packs start (high 16 bits) and
+ * size (low 16 bits), and the shader itself follows from dwords[2] on.
+ *
+ * Prints a header line, disassembles the shader with the a2xx disassembler
+ * and, for the two known types, also dumps the raw shader.
+ */
+static void
+cp_im_loadi(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t start = dwords[1] >> 16;
+       const uint32_t size  = dwords[1] & 0xffff;
+       uint32_t *code = dwords + 2;
+       const uint32_t code_dwords = sizedwords - 2;
+       const char *type = "<unknown>";
+       const char *ext = NULL;
+       enum shader_t disasm_type = 0;
+
+       if (dwords[0] == 0) {
+               type = "vertex";
+               ext = "vo";
+               disasm_type = SHADER_VERTEX;
+       } else if (dwords[0] == 1) {
+               type = "fragment";
+               ext = "fo";
+               disasm_type = SHADER_FRAGMENT;
+       }
+
+       printf("%s%s shader, start=%04x, size=%04x\n", levels[level], type, start, size);
+       disasm_a2xx(code, code_dwords, level+2, disasm_type);
+
+       /* dump raw shader (size is given in bytes): */
+       if (ext)
+               dump_shader(ext, code, code_dwords * 4);
+}
+
+/* Handler for a wide register write packet: the low 16 bits of dwords[0]
+ * give the first register offset, and every following dword is the value
+ * for the next consecutive register.  Each value is printed and then
+ * recorded.
+ */
+static void
+cp_wide_reg_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint32_t reg = dwords[0] & 0xffff;
+
+       for (uint32_t idx = 1; idx < sizedwords; idx++, reg++) {
+               dump_register(reg, dwords[idx], level+1);
+               reg_set(reg, dwords[idx]);
+       }
+}
+
+/* Kind of state carried by a load-state packet, as classified by the
+ * *_get_state_type() helpers.  Values start at 1, so a zero-initialized
+ * lookup table entry does not match any of them.
+ */
+enum state_t {
+       TEX_SAMP = 1,
+       TEX_CONST,
+       TEX_MIPADDR,  /* a3xx only */
+       SHADER_PROG,
+       SHADER_CONST,
+
+       // image/ssbo state:
+       SSBO_0,
+       SSBO_1,
+       SSBO_2,
+
+       UBO,
+
+       // unknown things, only hexdumped:
+       UNKNOWN_DWORDS,
+       UNKNOWN_2DWORDS,
+       UNKNOWN_4DWORDS,
+};
+
+/* State block ids; a3xx_get_state_type() uses these as the first index of
+ * its lookup table, with the id taken from a 3-bit field of the first
+ * packet dword.
+ */
+enum adreno_state_block {
+       SB_VERT_TEX = 0,
+       SB_VERT_MIPADDR = 1,
+       SB_FRAG_TEX = 2,
+       SB_FRAG_MIPADDR = 3,
+       SB_VERT_SHADER = 4,
+       SB_GEOM_SHADER = 5,
+       SB_FRAG_SHADER = 6,
+       SB_COMPUTE_SHADER = 7,
+};
+
+/* TODO there is probably a clever way to let rnndec parse things so
+ * we don't have to care about packet format differences across gens
+ */
+
+/**
+ * Classify an a3xx load-state packet.
+ *
+ * The state block id is taken from bits 19..21 of dwords[0], the state type
+ * from the low two bits of dwords[1] and the state source from bits 16..18
+ * of dwords[0].
+ *
+ * @dwords: packet payload (at least two dwords are read)
+ * @stage:  out, shader stage the state belongs to
+ * @state:  out, kind of state; 0 (not a valid enum state_t) when the
+ *          block/type combination has no table entry
+ * @src:    out, STATE_SRC_DIRECT when the source field is 0, otherwise
+ *          STATE_SRC_INDIRECT
+ */
+static void
+a3xx_get_state_type(uint32_t *dwords, enum shader_t *stage, enum state_t *state,
+                   enum state_src_t *src)
+{
+       unsigned state_block_id = (dwords[0] >> 19) & 0x7;
+       unsigned state_type = dwords[1] & 0x3;
+       /* The table must cover every value the two masks above can produce
+        * (8 block ids x 4 types).  With a narrower second dimension a type
+        * of 3 would index past the end of its row and silently pick up the
+        * first entry of the next block.
+        */
+       static const struct {
+               enum shader_t stage;
+               enum state_t state;
+       } lookup[0x8][0x4] = {
+               [SB_VERT_TEX][0]    = { SHADER_VERTEX,    TEX_SAMP },
+               [SB_VERT_TEX][1]    = { SHADER_VERTEX,    TEX_CONST },
+               [SB_FRAG_TEX][0]    = { SHADER_FRAGMENT,  TEX_SAMP },
+               [SB_FRAG_TEX][1]    = { SHADER_FRAGMENT,  TEX_CONST },
+               [SB_VERT_SHADER][0] = { SHADER_VERTEX,    SHADER_PROG },
+               [SB_VERT_SHADER][1] = { SHADER_VERTEX,    SHADER_CONST },
+               [SB_FRAG_SHADER][0] = { SHADER_FRAGMENT,  SHADER_PROG },
+               [SB_FRAG_SHADER][1] = { SHADER_FRAGMENT,  SHADER_CONST },
+       };
+
+       *stage = lookup[state_block_id][state_type].stage;
+       *state = lookup[state_block_id][state_type].state;
+       unsigned state_src = (dwords[0] >> 16) & 0x7;
+       if (state_src == 0 /* SS_DIRECT */)
+               *src = STATE_SRC_DIRECT;
+       else
+               *src = STATE_SRC_INDIRECT;
+}
+
+/* Decode the 2-bit state source field (bits 16..17) of the first dword of
+ * an a4xx+ load-state packet.  Unrecognized values are reported as direct.
+ */
+static enum state_src_t
+_get_state_src(unsigned dword0)
+{
+       const unsigned src_field = (dword0 >> 16) & 0x3;
+
+       /* SS4_INDIRECT / SS6_INDIRECT */
+       if (src_field == 2)
+               return STATE_SRC_INDIRECT;
+
+       /* SS6_BINDLESS */
+       if (src_field == 1)
+               return STATE_SRC_BINDLESS;
+
+       /* 0 is SS4_DIRECT / SS6_DIRECT; anything else falls back to it too */
+       return STATE_SRC_DIRECT;
+}
+
+/* Shared a4xx+ classification of a load-state packet: maps a 4-bit state
+ * block id and a 2-bit state type to the shader stage and the kind of state
+ * being loaded.
+ *
+ * Combinations without a table entry yield stage 0 and state 0; the latter
+ * is not a valid enum state_t value.
+ *
+ * @state_block_id: table row, must be below 0x10
+ * @state_type:     table column, must be below 0x4
+ * @stage:          out, shader stage
+ * @state:          out, kind of state
+ */
+static void
+_get_state_type(unsigned state_block_id, unsigned state_type,
+               enum shader_t *stage, enum state_t *state)
+{
+       static const struct {
+               enum shader_t stage;
+               enum state_t  state;
+       } lookup[0x10][0x4] = {
+               // SB4_VS_TEX:
+               [0x0][0] = { SHADER_VERTEX,    TEX_SAMP },
+               [0x0][1] = { SHADER_VERTEX,    TEX_CONST },
+               [0x0][2] = { SHADER_VERTEX,    UBO },
+               // SB4_HS_TEX:
+               [0x1][0] = { SHADER_TCS,       TEX_SAMP },
+               [0x1][1] = { SHADER_TCS,       TEX_CONST },
+               [0x1][2] = { SHADER_TCS,       UBO },
+               // SB4_DS_TEX:
+               [0x2][0] = { SHADER_TES,       TEX_SAMP },
+               [0x2][1] = { SHADER_TES,       TEX_CONST },
+               [0x2][2] = { SHADER_TES,       UBO },
+               // SB4_GS_TEX:
+               [0x3][0] = { SHADER_GEOM,      TEX_SAMP },
+               [0x3][1] = { SHADER_GEOM,      TEX_CONST },
+               [0x3][2] = { SHADER_GEOM,      UBO },
+               // SB4_FS_TEX:
+               [0x4][0] = { SHADER_FRAGMENT,  TEX_SAMP },
+               [0x4][1] = { SHADER_FRAGMENT,  TEX_CONST },
+               [0x4][2] = { SHADER_FRAGMENT,  UBO },
+               // SB4_CS_TEX:
+               [0x5][0] = { SHADER_COMPUTE,   TEX_SAMP },
+               [0x5][1] = { SHADER_COMPUTE,   TEX_CONST },
+               [0x5][2] = { SHADER_COMPUTE,   UBO },
+               // SB4_VS_SHADER:
+               [0x8][0] = { SHADER_VERTEX,    SHADER_PROG },
+               [0x8][1] = { SHADER_VERTEX,    SHADER_CONST },
+               [0x8][2] = { SHADER_VERTEX,    UBO },
+               // SB4_HS_SHADER
+               [0x9][0] = { SHADER_TCS,       SHADER_PROG },
+               [0x9][1] = { SHADER_TCS,       SHADER_CONST },
+               [0x9][2] = { SHADER_TCS,       UBO },
+               // SB4_DS_SHADER
+               [0xa][0] = { SHADER_TES,       SHADER_PROG },
+               [0xa][1] = { SHADER_TES,       SHADER_CONST },
+               [0xa][2] = { SHADER_TES,       UBO },
+               // SB4_GS_SHADER
+               [0xb][0] = { SHADER_GEOM,      SHADER_PROG },
+               [0xb][1] = { SHADER_GEOM,      SHADER_CONST },
+               [0xb][2] = { SHADER_GEOM,      UBO },
+               // SB4_FS_SHADER:
+               [0xc][0] = { SHADER_FRAGMENT,  SHADER_PROG },
+               [0xc][1] = { SHADER_FRAGMENT,  SHADER_CONST },
+               [0xc][2] = { SHADER_FRAGMENT,  UBO },
+               // SB4_CS_SHADER:
+               [0xd][0] = { SHADER_COMPUTE,   SHADER_PROG },
+               [0xd][1] = { SHADER_COMPUTE,   SHADER_CONST },
+               [0xd][2] = { SHADER_COMPUTE,   UBO },
+               [0xd][3] = { SHADER_COMPUTE,   SSBO_0 },      /* a6xx location */
+               // SB4_SSBO (shared across all stages)
+               [0xe][0] = { 0, SSBO_0 },                     /* a5xx (and a4xx?) location */
+               [0xe][1] = { 0, SSBO_1 },
+               [0xe][2] = { 0, SSBO_2 },
+               // SB4_CS_SSBO
+               [0xf][0] = { SHADER_COMPUTE, SSBO_0 },
+               [0xf][1] = { SHADER_COMPUTE, SSBO_1 },
+               [0xf][2] = { SHADER_COMPUTE, SSBO_2 },
+               // unknown things
+               /* This looks like combined UBO state for 3d stages (a5xx and
+                * before??)  I think a6xx has UBO state per shader stage:
+                */
+               [0x6][2] = { 0, UBO },
+               [0x7][1] = { 0, UNKNOWN_2DWORDS },
+       };
+
+       *stage = lookup[state_block_id][state_type].stage;
+       *state = lookup[state_block_id][state_type].state;
+}
+
+/* Classify an a4xx/a5xx load-state packet: the state block id sits in bits
+ * 18..21 of dwords[0] and the state type in the low two bits of dwords[1].
+ */
+static void
+a4xx_get_state_type(uint32_t *dwords, enum shader_t *stage, enum state_t *state,
+                   enum state_src_t *src)
+{
+       const unsigned block_id = (dwords[0] >> 18) & 0xf;
+       const unsigned type = dwords[1] & 0x3;
+
+       _get_state_type(block_id, type, stage, state);
+       *src = _get_state_src(dwords[0]);
+}
+
+/* Classify an a6xx load-state packet: both fields live in dwords[0], the
+ * state block id in bits 18..21 and the state type in bits 14..15.
+ */
+static void
+a6xx_get_state_type(uint32_t *dwords, enum shader_t *stage, enum state_t *state,
+                   enum state_src_t *src)
+{
+       const unsigned block_id = (dwords[0] >> 18) & 0xf;
+       const unsigned type = (dwords[0] >> 14) & 0x3;
+
+       _get_state_type(block_id, type, stage, state);
+       *src = _get_state_src(dwords[0]);
+}
+
+/* Decode and hexdump up to 'num_unit' texture sampler descriptors.
+ *
+ * The descriptor layout depends on the GPU generation: 2 dwords on a3xx and
+ * a4xx, 4 dwords on a5xx and a6xx.  On a6xx with a bindless source,
+ * consecutive descriptors are 16 dwords apart.  Other generations produce no
+ * output.
+ */
+static void
+dump_tex_samp(uint32_t *texsamp, enum state_src_t src, int num_unit, int level)
+{
+       const char *domain = NULL;   /* rnn domain used for decoding */
+       uint32_t ndwords = 0;        /* dwords decoded per descriptor */
+       int stride = 0;              /* dwords between descriptors */
+
+       if ((300 <= options->gpu_id) && (options->gpu_id < 400)) {
+               domain = "A3XX_TEX_SAMP";
+               ndwords = 2;
+               stride = 2;
+       } else if ((400 <= options->gpu_id) && (options->gpu_id < 500)) {
+               domain = "A4XX_TEX_SAMP";
+               ndwords = 2;
+               stride = 2;
+       } else if ((500 <= options->gpu_id) && (options->gpu_id < 600)) {
+               domain = "A5XX_TEX_SAMP";
+               ndwords = 4;
+               stride = 4;
+       } else if ((600 <= options->gpu_id) && (options->gpu_id < 700)) {
+               domain = "A6XX_TEX_SAMP";
+               ndwords = 4;
+               stride = src == STATE_SRC_BINDLESS ? 16 : 4;
+       }
+
+       for (int unit = 0; unit < num_unit; unit++) {
+               /* work-around to reduce noise for opencl blob which always
+                * writes the max # regardless of # of textures used
+                */
+               if ((num_unit == 16) && (texsamp[0] == 0) && (texsamp[1] == 0))
+                       break;
+
+               /* unhandled generation: nothing to print, nothing to skip */
+               if (!domain)
+                       continue;
+
+               dump_domain(texsamp, ndwords, level+2, domain);
+               dump_hex(texsamp, ndwords, level+1);
+               texsamp += stride;
+       }
+}
+
+/* Decode and hexdump up to 'num_unit' texture state descriptors.
+ *
+ * The descriptor size depends on the GPU generation: 4 dwords on a3xx, 8 on
+ * a4xx, 12 on a5xx and 16 on a6xx.  When options->dump_textures is set, the
+ * address found in the descriptor is additionally dumped on a4xx and later.
+ * Other generations produce no output.
+ */
+static void
+dump_tex_const(uint32_t *texconst, int num_unit, int level)
+{
+       for (int i = 0; i < num_unit; i++) {
+               /* work-around to reduce noise for opencl blob which always
+                * writes the max # regardless of # of textures used
+                */
+               if ((num_unit == 16) &&
+                       (texconst[0] == 0) && (texconst[1] == 0) &&
+                       (texconst[2] == 0) && (texconst[3] == 0))
+                       break;
+
+               if ((300 <= options->gpu_id) && (options->gpu_id < 400)) {
+                       dump_domain(texconst, 4, level+2, "A3XX_TEX_CONST");
+                       dump_hex(texconst, 4, level+1);
+                       texconst += 4;
+               } else if ((400 <= options->gpu_id) && (options->gpu_id < 500)) {
+                       dump_domain(texconst, 8, level+2, "A4XX_TEX_CONST");
+                       if (options->dump_textures) {
+                               /* 32-bit address in dword 4, low 5 bits masked off: */
+                               uint32_t addr = texconst[4] & ~0x1f;
+                               dump_gpuaddr(addr, level-2);
+                       }
+                       dump_hex(texconst, 8, level+1);
+                       texconst += 8;
+               } else if ((500 <= options->gpu_id) && (options->gpu_id < 600)) {
+                       dump_domain(texconst, 12, level+2, "A5XX_TEX_CONST");
+                       if (options->dump_textures) {
+                               /* 64-bit address: dword 4 is the low half, the
+                                * low 17 bits of dword 5 are the high half:
+                                */
+                               uint64_t addr = (((uint64_t)texconst[5] & 0x1ffff) << 32) | texconst[4];
+                               dump_gpuaddr_size(addr, level-2, hostlen(addr) / 4, 3);
+                       }
+                       dump_hex(texconst, 12, level+1);
+                       texconst += 12;
+               } else if ((600 <= options->gpu_id) && (options->gpu_id < 700)) {
+                       dump_domain(texconst, 16, level+2, "A6XX_TEX_CONST");
+                       if (options->dump_textures) {
+                               /* same address packing as the a5xx case above: */
+                               uint64_t addr = (((uint64_t)texconst[5] & 0x1ffff) << 32) | texconst[4];
+                               dump_gpuaddr_size(addr, level-2, hostlen(addr) / 4, 3);
+                       }
+                       dump_hex(texconst, 16, level+1);
+                       texconst += 16;
+               }
+       }
+}
+
+static void
+cp_load_state(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       enum shader_t stage;
+       enum state_t state;
+       enum state_src_t src;
+       uint32_t num_unit = (dwords[0] >> 22) & 0x1ff;
+       uint64_t ext_src_addr;
+       void *contents;
+       int i;
+
+       if (quiet(2) && !options->script)
+               return;
+
+       if (options->gpu_id >= 600)
+               a6xx_get_state_type(dwords, &stage, &state, &src);
+       else if (options->gpu_id >= 400)
+               a4xx_get_state_type(dwords, &stage, &state, &src);
+       else
+               a3xx_get_state_type(dwords, &stage, &state, &src);
+
+       switch (src) {
+       case STATE_SRC_DIRECT: ext_src_addr = 0; break;
+       case STATE_SRC_INDIRECT:
+               if (is_64b()) {
+                       ext_src_addr = dwords[1] & 0xfffffffc;
+                       ext_src_addr |= ((uint64_t)dwords[2]) << 32;
+               } else {
+                       ext_src_addr = dwords[1] & 0xfffffffc;
+               }
+
+               break;
+       case STATE_SRC_BINDLESS: {
+               const unsigned base_reg =
+                       stage == SHADER_COMPUTE ? regbase("HLSQ_CS_BINDLESS_BASE[0]") : regbase("HLSQ_BINDLESS_BASE[0]");
+
+               if (is_64b()) {
+                       const unsigned reg = base_reg + (dwords[1] >> 28) * 2;
+                       ext_src_addr = reg_val(reg) & 0xfffffffc;
+                       ext_src_addr |= ((uint64_t)reg_val(reg + 1)) << 32;
+               } else {
+                       const unsigned reg = base_reg + (dwords[1] >> 28);
+                       ext_src_addr = reg_val(reg) & 0xfffffffc;
+               }
+
+               ext_src_addr += 4 * (dwords[1] & 0xffffff);
+               break;
+       }
+       }
+
+       if (ext_src_addr)
+               contents = hostptr(ext_src_addr);
+       else
+               contents = is_64b() ? dwords + 3 : dwords + 2;
+
+       if (!contents)
+               return;
+
+       switch (state) {
+       case SHADER_PROG: {
+               const char *ext = NULL;
+
+               if (quiet(2))
+                       return;
+
+               if (options->gpu_id >= 400)
+                       num_unit *= 16;
+               else if (options->gpu_id >= 300)
+                       num_unit *= 4;
+
+               /* shaders:
+                *
+                * note: num_unit seems to be # of instruction groups, where
+                * an instruction group has 4 64bit instructions.
+                */
+               if (stage == SHADER_VERTEX) {
+                       ext = "vo3";
+               } else if (stage == SHADER_GEOM) {
+                       ext = "go3";
+               } else if (stage == SHADER_COMPUTE) {
+                       ext = "co3";
+               } else if (stage == SHADER_FRAGMENT){
+                       ext = "fo3";
+               }
+
+               if (contents)
+                       disasm_a3xx(contents, num_unit * 2, level+2, stdout, options->gpu_id);
+
+               /* dump raw shader: */
+               if (ext)
+                       dump_shader(ext, contents, num_unit * 2 * 4);
+
+               break;
+       }
+       case SHADER_CONST: {
+               if (quiet(2))
+                       return;
+
+               /* uniforms/consts:
+                *
+                * note: num_unit seems to be # of pairs of dwords??
+                */
+
+               if (options->gpu_id >= 400)
+                       num_unit *= 2;
+
+               dump_float(contents, num_unit*2, level+1);
+               dump_hex(contents, num_unit*2, level+1);
+
+               break;
+       }
+       case TEX_MIPADDR: {
+               uint32_t *addrs = contents;
+
+               if (quiet(2))
+                       return;
+
+               /* mipmap consts block just appears to be array of num_unit gpu addr's: */
+               for (i = 0; i < num_unit; i++) {
+                       void *ptr = hostptr(addrs[i]);
+                       printf("%s%2d: %08x\n", levels[level+1], i, addrs[i]);
+                       if (options->dump_textures) {
+                               printf("base=%08x\n", (uint32_t)gpubaseaddr(addrs[i]));
+                               dump_hex(ptr, hostlen(addrs[i])/4, level+1);
+                       }
+               }
+               break;
+       }
+       case TEX_SAMP: {
+               dump_tex_samp(contents, src, num_unit, level);
+               break;
+       }
+       case TEX_CONST: {
+               dump_tex_const(contents, num_unit, level);
+               break;
+       }
+       case SSBO_0: {
+               uint32_t *ssboconst = (uint32_t *)contents;
+
+               for (i = 0; i < num_unit; i++) {
+                       int sz = 4;
+                       if (400 <= options->gpu_id && options->gpu_id < 500) {
+                               dump_domain(ssboconst, 4, level+2, "A4XX_SSBO_0");
+                       } else if (500 <= options->gpu_id && options->gpu_id < 600) {
+                               dump_domain(ssboconst, 4, level+2, "A5XX_SSBO_0");
+                       } else if (600 <= options->gpu_id && options->gpu_id < 700) {
+                               sz = 16;
+                               dump_domain(ssboconst, 16, level+2, "A6XX_IBO");
+                       }
+                       dump_hex(ssboconst, sz, level+1);
+                       ssboconst += sz;
+               }
+               break;
+       }
+       case SSBO_1: {
+               uint32_t *ssboconst = (uint32_t *)contents;
+
+               for (i = 0; i < num_unit; i++) {
+                       if (400 <= options->gpu_id && options->gpu_id < 500)
+                               dump_domain(ssboconst, 2, level+2, "A4XX_SSBO_1");
+                       else if (500 <= options->gpu_id && options->gpu_id < 600)
+                               dump_domain(ssboconst, 2, level+2, "A5XX_SSBO_1");
+                       dump_hex(ssboconst, 2, level+1);
+                       ssboconst += 2;
+               }
+               break;
+       }
+       case SSBO_2: {
+               uint32_t *ssboconst = (uint32_t *)contents;
+
+               for (i = 0; i < num_unit; i++) {
+                       /* TODO a4xx and a5xx might be same: */
+                       if ((500 <= options->gpu_id) && (options->gpu_id < 600)) {
+                               dump_domain(ssboconst, 2, level+2, "A5XX_SSBO_2");
+                               dump_hex(ssboconst, 2, level+1);
+                       }
+                       if (options->dump_textures) {
+                               uint64_t addr = (((uint64_t)ssboconst[1] & 0x1ffff) << 32) | ssboconst[0];
+                               dump_gpuaddr_size(addr, level-2, hostlen(addr) / 4, 3);
+                       }
+                       ssboconst += 2;
+               }
+               break;
+       }
+       case UBO: {
+               uint32_t *uboconst = (uint32_t *)contents;
+
+               for (i = 0; i < num_unit; i++) {
+                       // TODO probably similar on a4xx..
+                       if (500 <= options->gpu_id && options->gpu_id < 600)
+                               dump_domain(uboconst, 2, level+2, "A5XX_UBO");
+                       else if (600 <= options->gpu_id && options->gpu_id < 700)
+                               dump_domain(uboconst, 2, level+2, "A6XX_UBO");
+                       dump_hex(uboconst, 2, level+1);
+                       uboconst += src == STATE_SRC_BINDLESS ? 16 : 2;
+               }
+               break;
+       }
+       case UNKNOWN_DWORDS: {
+               if (quiet(2))
+                       return;
+               dump_hex(contents, num_unit, level+1);
+               break;
+       }
+       case UNKNOWN_2DWORDS: {
+               if (quiet(2))
+                       return;
+               dump_hex(contents, num_unit * 2, level+1);
+               break;
+       }
+       case UNKNOWN_4DWORDS: {
+               if (quiet(2))
+                       return;
+               dump_hex(contents, num_unit * 4, level+1);
+               break;
+       }
+       default:
+               if (quiet(2))
+                       return;
+               /* hmm.. */
+               dump_hex(contents, num_unit, level+1);
+               break;
+       }
+}
+
+/* Record the current bin rectangle.  dwords[1] packs the first corner
+ * (x in the low 16 bits, y in the high 16 bits) and dwords[2] packs the
+ * second corner the same way.  sizedwords and level are not used.
+ */
+static void
+cp_set_bin(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t corner1 = dwords[1];
+       const uint32_t corner2 = dwords[2];
+
+       bin_x1 = corner1 & 0xffff;
+       bin_y1 = corner1 >> 16;
+       bin_x2 = corner2 & 0xffff;
+       bin_y2 = corner2 >> 16;
+}
+
+/* Decode and print one a2xx texture constant.
+ *
+ * dwords:     texture constant payload; dwords[0]..dwords[5] are read
+ *             (dwords[4] is currently not decoded)
+ * sizedwords: payload size in dwords (not consulted here)
+ * val:        const offset from the packet header, printed for reference
+ * level:      indentation level, used as an index into levels[]
+ */
+static void
+dump_a2xx_tex_const(uint32_t *dwords, uint32_t sizedwords, uint32_t val, int level)
+{
+       uint32_t w, h, p;
+       uint32_t gpuaddr, flags, mip_gpuaddr, mip_flags;
+       uint32_t min, mag, swiz, clamp_x, clamp_y, clamp_z;
+       /* The clamp and filter selectors below are 2-bit fields, so both
+        * tables carry four entries: the last one is a placeholder for the
+        * encoding we have no name for, so that a value of 3 does not index
+        * past the end of the array.
+        */
+       static const char *filter[] = {
+                       "point", "bilinear", "bicubic", "???",
+       };
+       static const char *clamp[] = {
+                       "wrap", "mirror", "clamp-last-texel", "???",
+       };
+       static const char swiznames[] = "xyzw01??";
+
+       /* see sys2gmem_tex_const[] in adreno_a2xxx.c */
+
+       /* Texture, FormatXYZW=Unsigned, ClampXYZ=Wrap/Repeat,
+        * RFMode=ZeroClamp-1, Dim=1:2d, pitch
+        */
+       p = (dwords[0] >> 22) << 5;
+       clamp_x = (dwords[0] >> 10) & 0x3;
+       clamp_y = (dwords[0] >> 13) & 0x3;
+       clamp_z = (dwords[0] >> 16) & 0x3;
+
+       /* Format=6:8888_WZYX, EndianSwap=0:None, ReqSize=0:256bit, DimHi=0,
+        * NearestClamp=1:OGL Mode
+        */
+       parse_dword_addr(dwords[1], &gpuaddr, &flags, 0xfff);
+
+       /* Width, Height, EndianSwap=0:None */
+       w = (dwords[2] & 0x1fff) + 1;
+       h = ((dwords[2] >> 13) & 0x1fff) + 1;
+
+       /* NumFormat=0:RF, DstSelXYZW=XYZW, ExpAdj=0, MagFilt=MinFilt=0:Point,
+        * Mip=2:BaseMap
+        */
+       mag = (dwords[3] >> 19) & 0x3;
+       min = (dwords[3] >> 21) & 0x3;
+       swiz = (dwords[3] >> 1) & 0xfff;
+
+       /* VolMag=VolMin=0:Point, MinMipLvl=0, MaxMipLvl=1, LodBiasH=V=0,
+        * Dim3d=0
+        */
+       // XXX
+
+       /* BorderColor=0:ABGRBlack, ForceBC=0:disable, TriJuice=0, Aniso=0,
+        * Dim=1:2d, MipPacking=0
+        */
+       parse_dword_addr(dwords[5], &mip_gpuaddr, &mip_flags, 0xfff);
+
+       printf("%sset texture const %04x\n", levels[level], val);
+       printf("%sclamp x/y/z: %s/%s/%s\n", levels[level+1],
+                       clamp[clamp_x], clamp[clamp_y], clamp[clamp_z]);
+       printf("%sfilter min/mag: %s/%s\n", levels[level+1], filter[min], filter[mag]);
+       /* swiz holds four 3-bit selectors, one per output channel: */
+       printf("%sswizzle: %c%c%c%c\n", levels[level+1],
+                       swiznames[(swiz >> 0) & 0x7], swiznames[(swiz >> 3) & 0x7],
+                       swiznames[(swiz >> 6) & 0x7], swiznames[(swiz >> 9) & 0x7]);
+       printf("%saddr=%08x (flags=%03x), size=%dx%d, pitch=%d, format=%s\n",
+                       levels[level+1], gpuaddr, flags, w, h, p,
+                       rnn_enumname(rnn, "a2xx_sq_surfaceformat", flags & 0xf));
+       printf("%smipaddr=%08x (flags=%03x)\n", levels[level+1],
+                       mip_gpuaddr, mip_flags);
+}
+
+/* Print a2xx shader constants written at const offset 'val'.
+ *
+ * The payload is walked one dword at a time: each dword is split into a gpu
+ * address and flags; when the address maps to a known buffer the following
+ * dword is taken as that buffer's size in bytes and (at most the first 64
+ * dwords of) the buffer is dumped as hex and as floats.
+ *
+ * dwords:     payload following the packet header
+ * sizedwords: number of payload dwords
+ * val:        const offset from the packet header, printed for reference
+ * level:      indentation level, used as an index into levels[]
+ */
+static void
+dump_a2xx_shader_const(uint32_t *dwords, uint32_t sizedwords, uint32_t val, int level)
+{
+       uint32_t i;
+       printf("%sset shader const %04x\n", levels[level], val);
+       for (i = 0; i < sizedwords; ) {
+               uint32_t gpuaddr, flags;
+               parse_dword_addr(dwords[i++], &gpuaddr, &flags, 0xf);
+               void *addr = hostptr(gpuaddr);
+
+               /* NOTE(review): the size dword is only consumed when the
+                * address resolves; otherwise it is parsed as the next
+                * address.  Kept as-is -- confirm this is intended.
+                */
+               if (!addr)
+                       continue;
+
+               /* a truncated packet has no size dword left to read: */
+               if (i >= sizedwords)
+                       break;
+
+               const char * fmt =
+                       rnn_enumname(rnn, "a2xx_sq_surfaceformat", flags & 0xf);
+               uint32_t size = dwords[i++];
+               printf("%saddr=%08x, size=%d, format=%s\n", levels[level+1],
+                               gpuaddr, size, fmt);
+               // TODO maybe dump these as bytes instead of dwords?
+               size = (size + 3) / 4; // for now convert to dwords
+
+               /* cap the dump and flag the truncation with "...": */
+               const uint32_t shown = min(size, 64);
+               dump_hex(addr, shown, level + 1);
+               if (size > shown)
+                       printf("%s\t\t...\n", levels[level+1]);
+               dump_float(addr, shown, level + 1);
+               if (size > shown)
+                       printf("%s\t\t...\n", levels[level+1]);
+       }
+}
+
+/* CP_SET_CONSTANT handler.  Bits 16..19 of the header dword select what
+ * kind of constant is being written and the low 16 bits give its offset;
+ * the remaining dwords are the payload.
+ */
+static void
+cp_set_const(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t hdr = dwords[0];
+       uint32_t *payload = dwords + 1;
+       const uint32_t payload_sz = sizedwords - 1;
+       uint32_t val = hdr & 0xffff;
+
+       switch ((hdr >> 16) & 0xf) {
+       case 0x0:
+               dump_float((float *)payload, payload_sz, level+1);
+               break;
+       case 0x1:
+               /* need to figure out how const space is partitioned between
+                * attributes, textures, etc..
+                */
+               if (val < 0x78)
+                       dump_a2xx_tex_const(payload, payload_sz, val, level);
+               else
+                       dump_a2xx_shader_const(payload, payload_sz, val, level);
+               break;
+       case 0x2:
+               printf("%sset bool const %04x\n", levels[level], val);
+               break;
+       case 0x3:
+               printf("%sset loop const %04x\n", levels[level], val);
+               break;
+       case 0x4: {
+               /* register write: the offset is relative to 0x2000 */
+               val += 0x2000;
+
+               /* plain form: payload is written straight to the registers */
+               if (!(hdr & 0x80000000)) {
+                       dump_registers(val, payload, payload_sz, level+1);
+                       break;
+               }
+
+               /* top bit set: destination = immediate + value of a source reg */
+               uint32_t srcreg = dwords[1];
+               uint32_t dstval = dwords[2];
+
+               /* TODO: not sure what happens w/ payload != 2.. */
+               assert(sizedwords == 3);
+               assert(srcreg < ARRAY_SIZE(type0_reg_vals));
+
+               /* note: rnn_regname uses a static buf so we can't do
+                * two regname() calls for one printf..
+                */
+               printf("%s%s = %08x + ", levels[level], regname(val, 1), dstval);
+               printf("%s (%08x)\n", regname(srcreg, 1), type0_reg_vals[srcreg]);
+
+               dstval += type0_reg_vals[srcreg];
+
+               dump_registers(val, &dstval, 1, level+1);
+               break;
+       }
+       }
+}
+
+static void dump_register_summary(int level);
+
+/* CP_EVENT_WRITE handler: print the event name, and for a "BLIT" event on
+ * gpu_id above 500 also run the query, print the mode and dump the
+ * register summary.
+ */
+static void
+cp_event_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const char *name = rnn_enumname(rnn, "vgt_event_type", dwords[0]);
+
+       printl(2, "%sevent %s\n", levels[level], name);
+
+       /* nothing more to do for unnamed events or older gpus: */
+       if (!name || !(options->gpu_id > 500))
+               return;
+
+       /* only the BLIT event triggers a register summary: */
+       if (strcmp(name, "BLIT") != 0)
+               return;
+
+       char eventname[64];
+       snprintf(eventname, sizeof(eventname), "EVENT:%s", name);
+       do_query(eventname, 0);
+       print_mode(level);
+       dump_register_summary(level);
+}
+
+/* Print the current value of every written register, one per line, then
+ * clear the "rewritten" tracking and bump draw_count.
+ *
+ * Each line starts with two marker columns: '!' when the value differs
+ * from the one remembered in lastvals[], and '+' when the register was
+ * rewritten since the tracking was last cleared.  The global 'summary' is
+ * forced to false for the duration of the dump and restored afterwards.
+ */
+static void
+dump_register_summary(int level)
+{
+       const bool summary_was = summary;
+       uint32_t reg;
+
+       summary = false;
+       in_summary = true;
+
+       /* dump current state of registers: */
+       printl(2, "%sdraw[%i] register values\n", levels[level], draw_count);
+       for (reg = 0; reg < regcnt(); reg++) {
+               const uint32_t cur = reg_val(reg);
+
+               /* skip registers that haven't been updated since last draw/blit: */
+               if (!options->allregs && !reg_rewritten(reg))
+                       continue;
+               if (!reg_written(reg))
+                       continue;
+
+               const bool changed = (cur != lastvals[reg]);
+               if (changed)
+                       lastvals[reg] = cur;
+               printl(2, "%s", changed ? "!" : " ");
+
+               const bool rewritten = reg_rewritten(reg);
+               printl(2, "%s", rewritten ? "+" : " ");
+
+               printl(2, "\t%08x", cur);
+               if (!quiet(2))
+                       dump_register(reg, cur, level);
+       }
+
+       clear_rewritten();
+       in_summary = false;
+
+       draw_count++;
+       summary = summary_was;
+}
+
+/* Shared front half of the CP_DRAW_INDX* handlers: decode the primitive
+ * type, source select and index count from the packet, run the query,
+ * print the fields and update the draw/vertex counters.
+ *
+ * Returns the index count taken from dwords[2].
+ */
+static uint32_t
+draw_indx_common(uint32_t *dwords, int level)
+{
+       const uint32_t initiator = dwords[1];
+       const uint32_t prim = initiator & 0x1f;
+       const uint32_t src_sel = (initiator >> 6) & 0x3;
+       const uint32_t count = dwords[2];
+       const char *prim_name = rnn_enumname(rnn, "pc_di_primtype", prim);
+
+       do_query(prim_name, count);
+
+       printl(2, "%sdraw:          %d\n", levels[level], draws[ib]);
+       printl(2, "%sprim_type:     %s (%d)\n", levels[level], prim_name,
+                       prim);
+       /* the source-select name is looked up at the point of use, as before */
+       printl(2, "%ssource_select: %s (%d)\n", levels[level],
+                       rnn_enumname(rnn, "pc_di_src_sel", src_sel),
+                       src_sel);
+       printl(2, "%snum_indices:   %d\n", levels[level], count);
+
+       /* bookkeeping: one more draw in this IB, 'count' more vertices */
+       draws[ib]++;
+       vertices += count;
+
+       return count;
+}
+
+enum pc_di_index_size {          /* index width selector; the draw handlers assemble it from bits 11 and 13 of dwords[1] */
+       INDEX_SIZE_IGN = 0,       /* shares the value 0 with INDEX_SIZE_16_BIT */
+       INDEX_SIZE_16_BIT = 0,    /* index buffer is read as uint16_t elements */
+       INDEX_SIZE_32_BIT = 1,    /* index buffer is read as uint32_t elements */
+       INDEX_SIZE_8_BIT = 2,     /* index buffer is read as uint8_t elements */
+       INDEX_SIZE_INVALID = 0,   /* also 0, so it cannot be told apart from INDEX_SIZE_16_BIT */
+};
+
+/* CP_DRAW_INDX handler (pre-64b gpus only).  Prints the common draw
+ * fields, dumps the referenced index buffer when the packet carries one,
+ * dumps the register summary for non-empty draws and sets needs_wfi.
+ */
+static void
+cp_draw_indx(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint32_t num_indices = draw_indx_common(dwords, level);
+
+       assert(!is_64b());
+
+       /* a five-dword packet carries an index buffer address and byte size: */
+       if (sizedwords == 5) {
+               const uint32_t idx_gpuaddr = dwords[3];
+               const uint32_t idx_bytes = dwords[4];
+               void *ptr = hostptr(idx_gpuaddr);
+
+               printl(2, "%sgpuaddr:       %08x\n", levels[level], idx_gpuaddr);
+               printl(2, "%sidx_size:      %d\n", levels[level], idx_bytes);
+
+               if (ptr && !quiet(2)) {
+                       const enum pc_di_index_size size =
+                                       ((dwords[1] >> 11) & 1) | ((dwords[1] >> 12) & 2);
+                       int i;
+
+                       printf("%sidxs:         ", levels[level]);
+                       switch (size) {
+                       case INDEX_SIZE_8_BIT: {
+                               const uint8_t *idx = ptr;
+                               for (i = 0; i < idx_bytes; i++)
+                                       printf(" %u", idx[i]);
+                               break;
+                       }
+                       case INDEX_SIZE_16_BIT: {
+                               const uint16_t *idx = ptr;
+                               for (i = 0; i < idx_bytes/2; i++)
+                                       printf(" %u", idx[i]);
+                               break;
+                       }
+                       case INDEX_SIZE_32_BIT: {
+                               const uint32_t *idx = ptr;
+                               for (i = 0; i < idx_bytes/4; i++)
+                                       printf(" %u", idx[i]);
+                               break;
+                       }
+                       default:
+                               /* unnamed encoding: nothing is listed */
+                               break;
+                       }
+                       printf("\n");
+                       dump_hex(ptr, idx_bytes/4, level+1);
+               }
+       }
+
+       /* don't bother dumping registers for the dummy draw_indx's.. */
+       if (num_indices > 0)
+               dump_register_summary(level);
+
+       needs_wfi = true;
+}
+
+/* CP_DRAW_INDX_2 handler (pre-64b gpus only): like CP_DRAW_INDX, but the
+ * indices are embedded in the packet starting at dwords[3].
+ */
+static void
+cp_draw_indx_2(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint32_t num_indices = draw_indx_common(dwords, level);
+       const enum pc_di_index_size size =
+                       ((dwords[1] >> 11) & 1) | ((dwords[1] >> 12) & 2);
+       void *ptr = &dwords[3];
+
+       assert(!is_64b());
+
+       /* CP_DRAW_INDX_2 has embedded/inline idx buffer: */
+       if (!quiet(2)) {
+               int width = 0;   /* bytes per index; 0 for an unnamed encoding */
+               int sz;
+               int i;
+
+               if (size == INDEX_SIZE_8_BIT)
+                       width = 1;
+               else if (size == INDEX_SIZE_16_BIT)
+                       width = 2;
+               else if (size == INDEX_SIZE_32_BIT)
+                       width = 4;
+
+               printf("%sidxs:         ", levels[level]);
+               for (i = 0; width && i < num_indices; i++) {
+                       uint32_t v;
+
+                       if (width == 1)
+                               v = ((uint8_t *)ptr)[i];
+                       else if (width == 2)
+                               v = ((uint16_t *)ptr)[i];
+                       else
+                               v = ((uint32_t *)ptr)[i];
+                       printf(" %u", v);
+               }
+               printf("\n");
+
+               /* total size of the inline indices in bytes, dumped as dwords */
+               sz = num_indices * width;
+               dump_hex(ptr, sz / 4, level+1);
+       }
+
+       /* don't bother dumping registers for the dummy draw_indx's.. */
+       if (num_indices > 0)
+               dump_register_summary(level);
+}
+
+/* CP_DRAW_INDX_OFFSET handler: the primitive type is in the low five bits
+ * of dwords[0] and the index count in dwords[2].
+ */
+static void
+cp_draw_indx_offset(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t prim = dwords[0] & 0x1f;
+       const uint32_t count = dwords[2];
+       const char *prim_name = rnn_enumname(rnn, "pc_di_primtype", prim);
+
+       do_query(prim_name, count);
+       print_mode(level);
+
+       /* don't bother dumping registers for the dummy draw_indx's.. */
+       if (count == 0)
+               return;
+
+       dump_register_summary(level);
+}
+
+/* CP_DRAW_INDX_INDIRECT handler: the packet references two buffers by gpu
+ * address; 0x10 bytes of each are dumped before the register summary.
+ * With 64b addressing the addresses sit in dwords[1..2] and dwords[4..5]
+ * (high part masked to 17 bits), otherwise in dwords[1] and dwords[3].
+ */
+static void
+cp_draw_indx_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t prim = dwords[0] & 0x1f;
+       uint64_t first, second;
+
+       do_query(rnn_enumname(rnn, "pc_di_primtype", prim), 0);
+       print_mode(level);
+
+       if (is_64b()) {
+               first  = (((uint64_t)dwords[2] & 0x1ffff) << 32) | dwords[1];
+               second = (((uint64_t)dwords[5] & 0x1ffff) << 32) | dwords[4];
+       } else {
+               first  = dwords[1];
+               second = dwords[3];
+       }
+
+       dump_gpuaddr_size(first, level, 0x10, 2);
+       dump_gpuaddr_size(second, level, 0x10, 2);
+
+       dump_register_summary(level);
+}
+
+/* CP_DRAW_INDIRECT handler: dumps 0x10 bytes at the 64b gpu address held
+ * in dwords[1] (low) and dwords[2] (high, masked to 17 bits), then the
+ * register summary.
+ */
+static void
+cp_draw_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t prim = dwords[0] & 0x1f;
+       const uint64_t hi = (uint64_t)dwords[2] & 0x1ffff;
+       const uint64_t addr = (hi << 32) | dwords[1];
+
+       do_query(rnn_enumname(rnn, "pc_di_primtype", prim), 0);
+       print_mode(level);
+
+       dump_gpuaddr_size(addr, level, 0x10, 2);
+
+       dump_register_summary(level);
+}
+
+static void
+cp_run_cl(uint32_t *dwords, uint32_t sizedwords, int level)   /* packet handler; the payload itself is not inspected */
+{
+       do_query("COMPUTE", 1);          /* query is run under the fixed name "COMPUTE" with a count of 1 */
+       dump_register_summary(level);    /* then the register state is dumped, as the draw handlers do */
+}
+
+/* CP_NOP handler: when marker decoding is enabled, print the payload as
+ * text, stopping at the first NUL byte and skipping non-ASCII bytes.
+ */
+static void
+cp_nop(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       // blob doesn't use CP_NOP for string_marker but it does
+       // use it for things that end up looking like, but aren't
+       // ascii chars, hence the decode_markers opt-in:
+       if (quiet(3) || !options->decode_markers)
+               return;
+
+       const char *text = (void *)dwords;
+       const char *end = text + 4 * sizedwords;
+
+       for (const char *p = text; p != end && *p != '\0'; p++) {
+               if (isascii(*p))
+                       putchar(*p);
+       }
+       putchar('\n');
+}
+
+/* CP_INDIRECT_BUFFER handler: print the target IB's address and size and,
+ * when the address maps to a known buffer, decode it recursively via
+ * dump_commands() with the ib nesting level raised for the duration.
+ *
+ * dwords:     packet payload (address, optional high address, size)
+ * sizedwords: payload size in dwords (not consulted here)
+ * level:      indentation level, used as an index into levels[]
+ */
+static void
+cp_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       /* traverse indirect buffers */
+       uint64_t ibaddr;
+       uint32_t ibsize;
+       uint32_t *ptr = NULL;
+
+       if (is_64b()) {
+               /* a5xx+.. low 32b of gpu addr, high 32b, then size: */
+               ibaddr = dwords[0];
+               ibaddr |= ((uint64_t)dwords[1]) << 32;
+               ibsize = dwords[2];
+       } else {
+               ibaddr = dwords[0];
+               ibsize = dwords[1];
+       }
+
+       if (!quiet(3)) {
+               if (is_64b()) {
+                       /* ibaddr is a uint64_t: PRIx64 is the matching
+                        * conversion, %lx is only right where long is 64b.
+                        */
+                       printf("%sibaddr:%016"PRIx64"\n", levels[level], ibaddr);
+               } else {
+                       printf("%sibaddr:%08x\n", levels[level], (uint32_t)ibaddr);
+               }
+               printf("%sibsize:%08x\n", levels[level], ibsize);
+       }
+
+       if (options->once && has_dumped(ibaddr, enable_mask))
+               return;
+
+       /* 'query-compare' mode implies 'once' mode, although we need only to
+        * process the cmdstream for *any* enable_mask mode, since we are
+        * comparing binning vs draw reg values at the same time, ie. it is
+        * not useful to process the same draw in both binning and draw pass.
+        */
+       if (options->query_compare && has_dumped(ibaddr, MODE_ALL))
+               return;
+
+       /* map gpuaddr back to hostptr: */
+       ptr = hostptr(ibaddr);
+
+       if (ptr) {
+               /* If the GPU hung within the target IB, the trigger point will be
+                * just after the current CP_INDIRECT_BUFFER.  Because the IB is
+                * executed but never returns.  Account for this by checking if
+                * the IB returned:
+                */
+               highlight_gpuaddr(gpuaddr(&dwords[is_64b() ? 3 : 2]));
+
+               /* record the nested IB's extent for the duration of the dump: */
+               ib++;
+               ibs[ib].base = ibaddr;
+               ibs[ib].size = ibsize;
+
+               dump_commands(ptr, ibsize, level);
+               ib--;
+       } else {
+               fprintf(stderr, "could not find: %016"PRIx64" (%d)\n", ibaddr, ibsize);
+       }
+}
+
+static void
+cp_wfi(uint32_t *dwords, uint32_t sizedwords, int level)   /* packet handler; the payload is not inspected */
+{
+       needs_wfi = false;   /* cp_draw_indx() sets this flag and cp_rmw() warns while it is set */
+}
+
+/* CP_MEM_WRITE handler: print the destination gpu address and the data
+ * being written.  With 64b addressing the data is dumped as hex and, if
+ * its first dword looks like a type4/type7 packet header, additionally
+ * decoded as commands; otherwise the data is dumped as floats.
+ */
+static void
+cp_mem_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       if (quiet(2))
+               return;
+
+       if (is_64b()) {
+               uint64_t gpuaddr = dwords[0] | (((uint64_t)dwords[1]) << 32);
+               /* PRIx64 matches uint64_t on every platform, %lx does not: */
+               printf("%sgpuaddr:%016"PRIx64"\n", levels[level], gpuaddr);
+               dump_hex(&dwords[2], sizedwords-2, level+1);
+
+               if (pkt_is_type4(dwords[2]) || pkt_is_type7(dwords[2]))
+                       dump_commands(&dwords[2], sizedwords-2, level+1);
+       } else {
+               uint32_t gpuaddr = dwords[0];
+               printf("%sgpuaddr:%08x\n", levels[level], gpuaddr);
+               dump_float((float *)&dwords[1], sizedwords-1, level+1);
+       }
+}
+
+/* CP_REG_RMW handler: apply (reg & and_mask) | or_bits to the tracked
+ * value of the register named by the low 16 bits of dwords[0], and warn
+ * when the update happens while needs_wfi is set.
+ */
+static void
+cp_rmw(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t reg = dwords[0] & 0xffff;
+       const uint32_t and_mask = dwords[1];
+       const uint32_t or_bits = dwords[2];
+
+       printl(3, "%srmw (%s & 0x%08x) | 0x%08x)\n", levels[level], regname(reg, 1), and_mask, or_bits);
+       if (needs_wfi) {
+               printl(2, "NEEDS WFI: rmw (%s & 0x%08x) | 0x%08x)\n", regname(reg, 1), and_mask, or_bits);
+       }
+
+       reg_set(reg, (reg_val(reg) & and_mask) | or_bits);
+}
+
+/* Handler shared by CP_REG_TO_MEM and CP_MEM_TO_REG: print the base
+ * register (low 16 bits of dwords[0]) and the gpu address (dwords[1] low,
+ * dwords[2] high), then hex-dump 'cnt' dwords of the buffer, where cnt is
+ * the 10-bit field at bit 19 of dwords[0].
+ */
+static void
+cp_reg_mem(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint32_t val = dwords[0] & 0xffff;
+       printl(3, "%sbase register: %s\n", levels[level], regname(val, 1));
+
+       if (quiet(2))
+               return;
+
+       uint64_t gpuaddr = dwords[1] | (((uint64_t)dwords[2]) << 32);
+       /* PRIx64 matches uint64_t on every platform, %lx does not: */
+       printf("%sgpuaddr:%016"PRIx64"\n", levels[level], gpuaddr);
+       void *ptr = hostptr(gpuaddr);
+       if (ptr) {
+               uint32_t cnt = (dwords[0] >> 19) & 0x3ff;
+               dump_hex(ptr, cnt, level + 1);
+       }
+}
+
+struct draw_state {             /* one draw-state group, as recorded by cp_set_draw_state() */
+       uint16_t enable_mask;    /* 4-bit field from the group header; load_group() tests it against the global enable_mask */
+       uint16_t flags;          /* FLAG_* bits from the group header */
+       uint32_t count;          /* size handed to dump_commands(); zero marks an unused slot */
+       uint64_t addr;           /* gpu address of the group's commands */
+};
+
+struct draw_state state[32];    /* indexed by group_id, which is a 5-bit field (hence 32 slots) */
+
+#define FLAG_DIRTY              0x1   /* NOTE(review): no user in the nearby handlers -- confirm meaning */
+#define FLAG_DISABLE            0x2   /* cp_set_draw_state() clears just this group */
+#define FLAG_DISABLE_ALL_GROUPS 0x4   /* cp_set_draw_state() clears every group */
+#define FLAG_LOAD_IMMED         0x8   /* cp_set_draw_state() loads the group right away, then clears it */
+
+static int draw_mode;           /* written by cp_set_mode() from dwords[0] */
+
+/* Reset one draw-state slot to all-zero, which marks it as unused
+ * (load_group() skips slots whose count is zero).
+ */
+static void
+disable_group(unsigned group_id)
+{
+       memset(&state[group_id], 0, sizeof(state[group_id]));
+}
+
+/* Reset every draw-state slot; same effect as disable_group() on each
+ * index of state[].
+ */
+static void
+disable_all_groups(void)
+{
+       memset(state, 0, sizeof(state));
+}
+
+/* Print and decode one recorded draw-state group.
+ *
+ * Unused slots (count == 0) are skipped silently.  On gpu_id >= 600 a
+ * group is also skipped, with a note, when its enable_mask has no bit in
+ * common with the current global enable_mask.  Otherwise the group's
+ * commands are hex-dumped (unless quiet) and decoded one ib level deeper.
+ *
+ * group_id: index into state[]
+ * level:    indentation level, used as an index into levels[]
+ */
+static void
+load_group(unsigned group_id, int level)
+{
+       struct draw_state *ds = &state[group_id];
+
+       if (!ds->count)
+               return;
+
+       printl(2, "%sgroup_id: %u\n", levels[level], group_id);
+       printl(2, "%scount: %d\n", levels[level], ds->count);
+       /* ds->addr is a uint64_t: PRIx64 is the matching conversion, %llx
+        * is only right where uint64_t happens to be unsigned long long.
+        */
+       printl(2, "%saddr: %016"PRIx64"\n", levels[level], ds->addr);
+       printl(2, "%sflags: %x\n", levels[level], ds->flags);
+
+       if (options->gpu_id >= 600) {
+               printl(2, "%senable_mask: 0x%x\n", levels[level], ds->enable_mask);
+
+               if (!(ds->enable_mask & enable_mask)) {
+                       printl(2, "%s\tskipped!\n\n", levels[level]);
+                       return;
+               }
+       }
+
+       void *ptr = hostptr(ds->addr);
+       if (ptr) {
+               if (!quiet(2))
+                       dump_hex(ptr, ds->count, level+1);
+
+               ib++;
+               dump_commands(ptr, ds->count, level+1);
+               ib--;
+       }
+}
+
+/* Load every recorded draw-state group in index order, then (unless in
+ * 'query-compare' mode) clear them all.
+ */
+static void
+load_all_groups(int level)
+{
+       /* sanity check: decoding a group must never lead back here, and bad
+        * things happen if it does, so a re-entrant call is refused:
+        */
+       static bool loading_groups = false;
+       unsigned group;
+
+       if (loading_groups) {
+               printf("ERROR: nothing in draw state should trigger recursively loading groups!\n");
+               return;
+       }
+
+       loading_groups = true;
+       group = 0;
+       while (group < ARRAY_SIZE(state)) {
+               load_group(group, level);
+               group++;
+       }
+       loading_groups = false;
+
+       /* in 'query-compare' mode, defer disabling all groups until we have a
+        * chance to process the query:
+        */
+       if (options->query_compare)
+               return;
+
+       disable_all_groups();
+}
+
+/* CP_SET_DRAW_STATE handler.  The payload is a list of group entries,
+ * each a header dword followed by a gpu address (two dwords with 64b
+ * addressing, one otherwise).  Header layout: count in bits 0..15, flags
+ * in bits 16..19, enable mask in bits 20..23, group id in bits 24..28.
+ */
+static void
+cp_set_draw_state(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint32_t i = 0;
+
+       while (i < sizedwords) {
+               const uint32_t hdr = dwords[i];
+               const uint32_t count = hdr & 0xffff;
+               const uint32_t flags = (hdr >> 16) & 0xf;
+               const uint32_t mask = (hdr >> 20) & 0xf;
+               const uint32_t group_id = (hdr >> 24) & 0x1f;
+               uint64_t addr = dwords[i + 1];
+
+               /* step over the entry, picking up the high address if any: */
+               if (is_64b()) {
+                       addr |= ((uint64_t)dwords[i + 2]) << 32;
+                       i += 3;
+               } else {
+                       i += 2;
+               }
+
+               if (flags & FLAG_DISABLE_ALL_GROUPS) {
+                       disable_all_groups();
+                       continue;
+               }
+
+               if (flags & FLAG_DISABLE) {
+                       disable_group(group_id);
+                       continue;
+               }
+
+               assert(group_id < ARRAY_SIZE(state));
+
+               /* start from a clean slot, then record the new group: */
+               disable_group(group_id);
+
+               struct draw_state *ds = &state[group_id];
+               ds->enable_mask = mask;
+               ds->flags = flags;
+               ds->count = count;
+               ds->addr  = addr;
+
+               /* immediate groups are decoded now and not kept around: */
+               if (flags & FLAG_LOAD_IMMED) {
+                       load_group(group_id, level);
+                       disable_group(group_id);
+               }
+       }
+}
+
+static void
+cp_set_mode(uint32_t *dwords, uint32_t sizedwords, int level)   /* packet handler; only dwords[0] is read */
+{
+       draw_mode = dwords[0];   /* remember the raw mode dword in the file-scope draw_mode */
+}
+
+/* execute compute shader */
+static void
+cp_exec_cs(uint32_t *dwords, uint32_t sizedwords, int level)   /* the payload is not inspected */
+{
+       do_query("compute", 0);          /* query is run under the fixed name "compute" with a count of 0 */
+       dump_register_summary(level);    /* then the register state is dumped, as the draw handlers do */
+}
+
+/* CP_EXEC_CS_INDIRECT handler: print and dump 0x10 bytes at the gpu
+ * address held in dwords[1] (plus dwords[2] as the high part, masked to
+ * 17 bits, with 64b addressing), then run the "compute" query and dump
+ * the register summary.
+ */
+static void
+cp_exec_cs_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint64_t addr;
+
+       if (is_64b()) {
+               addr = (((uint64_t)dwords[2] & 0x1ffff) << 32) | dwords[1];
+       } else {
+               addr = dwords[1];
+       }
+
+       /* addr is a uint64_t: PRIx64 is the matching conversion, %llx is
+        * only right where uint64_t happens to be unsigned long long.
+        */
+       printl(3, "%saddr: %016"PRIx64"\n", levels[level], addr);
+       dump_gpuaddr_size(addr, level, 0x10, 2);
+
+       do_query("compute", 0);
+       dump_register_summary(level);
+}
+
+/* CP_SET_MARKER handler: look up the render mode named by the low four
+ * bits of dwords[0], store the name in the global render_mode, and switch
+ * the global enable_mask for the three modes that have one.
+ */
+static void
+cp_set_marker(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       render_mode = rnn_enumname(rnn, "a6xx_render_mode", dwords[0] & 0xf);
+
+       /* The lookup is treated as possibly returning NULL elsewhere in this
+        * file (see cp_event_write), so do not hand an unnamed mode to
+        * strcmp(); enable_mask simply keeps its previous value.
+        */
+       if (!render_mode)
+               return;
+
+       if (!strcmp(render_mode, "RM6_BINNING")) {
+               enable_mask = MODE_BINNING;
+       } else if (!strcmp(render_mode, "RM6_GMEM")) {
+               enable_mask = MODE_GMEM;
+       } else if (!strcmp(render_mode, "RM6_BYPASS")) {
+               enable_mask = MODE_BYPASS;
+       }
+}
+
+/* CP_SET_RENDER_MODE handler (64b addressing, gpu_id >= 500 only).
+ *
+ * Stores the render mode name and, for the longer packet forms, dumps the
+ * first referenced buffer and decodes the second one as commands.
+ * Accepted payload sizes are 1, 5 and 8 dwords.
+ */
+static void
+cp_set_render_mode(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint64_t addr;
+       uint32_t *ptr, len;
+
+       assert(is_64b());
+
+       /* TODO seems to have two ptrs, 9 dwords total (incl pkt7 hdr)..
+        * not sure if this can come in different sizes.
+        *
+        * First ptr doesn't seem to be cmdstream, second one does.
+        *
+        * Comment from downstream kernel:
+        *
+        * SRM -- set render mode (ex binning, direct render etc)
+        * SRM is set by UMD usually at start of IB to tell CP the type of
+        * preemption.
+        * KMD needs to set SRM to NULL to indicate CP that rendering is
+        * done by IB.
+        * ------------------------------------------------------------------
+        *
+        * Seems to always be one of these two:
+        * 70ec0008 00000001 001c0000 00000000 00000010 00000003 0000000d 001c2000 00000000
+        * 70ec0008 00000001 001c0000 00000000 00000000 00000003 0000000d 001c2000 00000000
+        *
+        */
+
+       assert(options->gpu_id >= 500);
+
+       render_mode = rnn_enumname(rnn, "render_mode_cmd", dwords[0]);
+
+       if (sizedwords == 1)
+               return;
+
+       /* first pointer: dwords[1] low, dwords[2] high */
+       addr = dwords[1];
+       addr |= ((uint64_t)dwords[2]) << 32;
+
+       mode = dwords[3];
+
+       dump_gpuaddr(addr, level+1);
+
+       if (sizedwords == 5)
+               return;
+
+       assert(sizedwords == 8);
+
+       /* second pointer: length in dwords[5], address in dwords[6..7] */
+       len = dwords[5];
+       addr = dwords[6];
+       addr |= ((uint64_t)dwords[7]) << 32;
+
+       /* PRIx64 matches uint64_t on every platform, %lx does not: */
+       printl(3, "%saddr: 0x%016"PRIx64"\n", levels[level], addr);
+       printl(3, "%slen:  0x%x\n", levels[level], len);
+
+       ptr = hostptr(addr);
+
+       if (ptr) {
+               if (!quiet(2)) {
+                       ib++;
+                       dump_commands(ptr, len, level+1);
+                       ib--;
+                       dump_hex(ptr, len, level+1);
+               }
+       }
+}
+
+/* CP_COMPUTE_CHECKPOINT handler (64b addressing, gpu_id >= 500, payload
+ * of exactly 8 dwords).  The buffer at the gpu address in dwords[5..6],
+ * with its length in dwords[7], is decoded as commands and hex-dumped.
+ */
+static void
+cp_compute_checkpoint(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint64_t addr;
+       uint32_t *ptr, len;
+
+       assert(is_64b());
+       assert(options->gpu_id >= 500);
+
+       assert(sizedwords == 8);
+
+       addr = dwords[5];
+       addr |= ((uint64_t)dwords[6]) << 32;
+       len = dwords[7];
+
+       /* PRIx64 matches uint64_t on every platform, %lx does not: */
+       printl(3, "%saddr: 0x%016"PRIx64"\n", levels[level], addr);
+       printl(3, "%slen:  0x%x\n", levels[level], len);
+
+       ptr = hostptr(addr);
+
+       if (ptr) {
+               if (!quiet(2)) {
+                       ib++;
+                       dump_commands(ptr, len, level+1);
+                       ib--;
+                       dump_hex(ptr, len, level+1);
+               }
+       }
+}
+
+/* CP_BLIT handler: run the query under the blit command's name (looked up
+ * from dwords[0]), print the mode and dump the register summary.
+ */
+static void
+cp_blit(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const char *cmd = rnn_enumname(rnn, "cp_blit_cmd", dwords[0]);
+
+       do_query(cmd, 0);
+       print_mode(level);
+       dump_register_summary(level);
+}
+
+/* CP_CONTEXT_REG_BUNCH handler: the payload is a list of (register, value)
+ * dword pairs; each pair is printed and applied to the tracked register
+ * state.  The global 'summary' is forced to false while doing so and
+ * restored afterwards.
+ */
+static void
+cp_context_reg_bunch(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint32_t i;
+
+       /* NOTE: seems to write same reg multiple times.. not sure if different parts of
+        * these are triggered by the FLUSH_SO_n events?? (if that is what they actually
+        * are?)
+        */
+       bool saved_summary = summary;
+       summary = false;
+
+       /* only whole pairs are consumed, so an odd-sized payload does not
+        * make us read a value dword past the end of the packet:
+        */
+       for (i = 0; i + 1 < sizedwords; i += 2) {
+               dump_register(dwords[i+0], dwords[i+1], level+1);
+               reg_set(dwords[i+0], dwords[i+1]);
+       }
+
+       summary = saved_summary;
+}
+
+/* CP_REG_WRITE handler: the low 16 bits of payload dword 1 are the register
+ * offset and payload dword 2 is the value.  The write is printed with
+ * dump_register() and then passed to reg_set().
+ */
+static void
+cp_reg_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       const uint32_t offset = dwords[1] & 0xffff;
+       const uint32_t value = dwords[2];
+
+       (void)sizedwords;
+
+       dump_register(offset, value, level+1);
+       reg_set(offset, value);
+}
+
+/**
+ * Decode a CP_SET_CTXSWITCH_IB packet.
+ *
+ * dwords[0]/dwords[1] are the low/high halves of a GPU address and the low
+ * 16 bits of dwords[2] are a size that is handed to dump_commands() as a
+ * dword count.  The address is printed unconditionally; if hostptr() can
+ * resolve it, the referenced memory is decoded as a command stream.
+ *
+ * @param dwords      packet payload
+ * @param sizedwords  number of payload dwords (not checked here)
+ * @param level       indentation level for the output
+ */
+static void
+cp_set_ctxswitch_ib(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       uint64_t addr;
+       uint32_t size = dwords[2] & 0xffff;
+       void *ptr;
+
+       addr = dwords[0] | ((uint64_t)dwords[1] << 32);
+
+       /* uint64_t is not 'unsigned long' on every ABI, so widen explicitly
+        * to the type the conversion specifier expects:
+        */
+       printf("addr=%llx\n", (unsigned long long)addr);
+       ptr = hostptr(addr);
+       if (ptr) {
+               dump_commands(ptr, size, level+1);
+       }
+}
+
+/* CP_SKIP_IB2_ENABLE_GLOBAL handler: latch the first payload dword into the
+ * file-level skip_ib2_enable_global state.  Nothing is printed.
+ */
+static void
+cp_skip_ib2_enable_global(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       (void)sizedwords;
+       (void)level;
+
+       skip_ib2_enable_global = *dwords;
+}
+
+/* CP_SKIP_IB2_ENABLE_LOCAL handler: latch the first payload dword into the
+ * file-level skip_ib2_enable_local state.  Nothing is printed.
+ */
+static void
+cp_skip_ib2_enable_local(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       (void)sizedwords;
+       (void)level;
+
+       skip_ib2_enable_local = *dwords;
+}
+
+/* Build one table entry: the packet name is "CP_" followed by the token x,
+ * fxn is the handler, and any further arguments initialize the trailing
+ * 'options' member.
+ */
+#define CP(x, fxn, ...)   { "CP_" #x, fxn, ##__VA_ARGS__ }
+/* Packet handlers, looked up by name in get_type3_op().  Despite the name,
+ * dump_commands() uses this table for both type3 and type7 packets.
+ */
+static const struct type3_op {
+       /* packet name, compared against the string returned by pktname() */
+       const char *name;
+       /* handler; receives the payload following the packet header */
+       void (*fxn)(uint32_t *dwords, uint32_t sizedwords, int level);
+       struct {
+               /* when set, dump_commands() calls load_all_groups() before
+                * decoding the packet
+                */
+               bool load_all_groups;
+       } options;
+} type3_op[] = {
+               CP(NOP, cp_nop),
+               CP(INDIRECT_BUFFER, cp_indirect),
+               CP(INDIRECT_BUFFER_PFD, cp_indirect),
+               CP(WAIT_FOR_IDLE, cp_wfi),
+               CP(REG_RMW, cp_rmw),
+               CP(REG_TO_MEM, cp_reg_mem),
+               CP(MEM_TO_REG, cp_reg_mem),  /* same layout as CP_REG_TO_MEM */
+               CP(MEM_WRITE, cp_mem_write),
+               CP(EVENT_WRITE, cp_event_write),
+               CP(RUN_OPENCL, cp_run_cl),
+               CP(DRAW_INDX, cp_draw_indx, {.load_all_groups=true}),
+               CP(DRAW_INDX_2, cp_draw_indx_2, {.load_all_groups=true}),
+               CP(SET_CONSTANT, cp_set_const),
+               CP(IM_LOAD_IMMEDIATE, cp_im_loadi),
+               CP(WIDE_REG_WRITE, cp_wide_reg_write),
+
+               /* for a3xx */
+               CP(LOAD_STATE, cp_load_state),
+               CP(SET_BIN, cp_set_bin),
+
+               /* for a4xx */
+               CP(LOAD_STATE4, cp_load_state),
+               CP(SET_DRAW_STATE, cp_set_draw_state),
+               CP(DRAW_INDX_OFFSET, cp_draw_indx_offset, {.load_all_groups=true}),
+               CP(EXEC_CS, cp_exec_cs, {.load_all_groups=true}),
+               CP(EXEC_CS_INDIRECT, cp_exec_cs_indirect, {.load_all_groups=true}),
+
+               /* for a5xx */
+               CP(SET_RENDER_MODE, cp_set_render_mode),
+               CP(COMPUTE_CHECKPOINT, cp_compute_checkpoint),
+               CP(BLIT, cp_blit),
+               CP(CONTEXT_REG_BUNCH, cp_context_reg_bunch),
+               CP(DRAW_INDIRECT, cp_draw_indirect, {.load_all_groups=true}),
+               CP(DRAW_INDX_INDIRECT, cp_draw_indx_indirect, {.load_all_groups=true}),
+               CP(SKIP_IB2_ENABLE_GLOBAL, cp_skip_ib2_enable_global),
+               CP(SKIP_IB2_ENABLE_LOCAL, cp_skip_ib2_enable_local),
+
+               /* for a6xx */
+               CP(LOAD_STATE6_GEOM, cp_load_state),
+               CP(LOAD_STATE6_FRAG, cp_load_state),
+               CP(LOAD_STATE6, cp_load_state),
+               CP(SET_MODE, cp_set_mode),
+               CP(SET_MARKER, cp_set_marker),
+               CP(REG_WRITE, cp_reg_write),
+
+               CP(SET_CTXSWITCH_IB, cp_set_ctxswitch_ib),
+};
+
+/* Handler that ignores its packet; used by get_type3_op() for the fallback
+ * entry so callers can always invoke op->fxn().
+ */
+static void
+noop_fxn(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       (void)dwords;
+       (void)sizedwords;
+       (void)level;
+}
+
+/* Find the type3_op[] entry whose name equals pktname(opc).
+ *
+ * Never returns NULL: when the opcode has no name, or no table entry
+ * matches, a static fallback entry is returned whose handler is noop_fxn
+ * and whose options are all zero.
+ */
+static const struct type3_op *
+get_type3_op(unsigned opc)
+{
+       static const struct type3_op dummy_op = {
+               .fxn = noop_fxn,
+       };
+       const struct type3_op *found = &dummy_op;
+       const char *name = pktname(opc);
+
+       if (name) {
+               for (unsigned idx = 0; idx < ARRAY_SIZE(type3_op); idx++) {
+                       if (strcmp(name, type3_op[idx].name) == 0) {
+                               found = &type3_op[idx];
+                               break;
+                       }
+               }
+       }
+
+       return found;
+}
+
+void
+dump_commands(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+       int dwords_left = sizedwords;
+       uint32_t count = 0; /* dword count including packet header */
+       uint32_t val;
+
+//     assert(dwords);
+       if (!dwords) {
+               printf("NULL cmd buffer!\n");
+               return;
+       }
+
+       draws[ib] = 0;
+
+       while (dwords_left > 0) {
+
+               current_draw_count = draw_count;
+
+               /* hack, this looks like a -1 underflow, in some versions
+                * when it tries to write zero registers via pkt0
+                */
+//             if ((dwords[0] >> 16) == 0xffff)
+//                     goto skip;
+
+               if (pkt_is_type0(dwords[0])) {
+                       printl(3, "t0");
+                       count = type0_pkt_size(dwords[0]) + 1;
+                       val = type0_pkt_offset(dwords[0]);
+                       assert(val < regcnt());
+                       printl(3, "%swrite %s%s (%04x)\n", levels[level+1], regname(val, 1),
+                                       (dwords[0] & 0x8000) ? " (same register)" : "", val);
+                       dump_registers(val, dwords+1, count-1, level+2);
+                       if (!quiet(3))
+                               dump_hex(dwords, count, level+1);
+               } else if (pkt_is_type4(dwords[0])) {
+                       /* basically the same(ish) as type0 prior to a5xx */
+                       printl(3, "t4");
+                       count = type4_pkt_size(dwords[0]) + 1;
+                       val = type4_pkt_offset(dwords[0]);
+                       assert(val < regcnt());
+                       printl(3, "%swrite %s (%04x)\n", levels[level+1], regname(val, 1), val);
+                       dump_registers(val, dwords+1, count-1, level+2);
+                       if (!quiet(3))
+                               dump_hex(dwords, count, level+1);
+#if 0
+               } else if (pkt_is_type1(dwords[0])) {
+                       printl(3, "t1");
+                       count = 3;
+                       val = dwords[0] & 0xfff;
+                       printl(3, "%swrite %s\n", levels[level+1], regname(val, 1));
+                       dump_registers(val, dwords+1, 1, level+2);
+                       val = (dwords[0] >> 12) & 0xfff;
+                       printl(3, "%swrite %s\n", levels[level+1], regname(val, 1));
+                       dump_registers(val, dwords+2, 1, level+2);
+                       if (!quiet(3))
+                               dump_hex(dwords, count, level+1);
+               } else if (pkt_is_type2(dwords[0])) {
+                       printl(3, "t2");
+                       printf("%sNOP\n", levels[level+1]);
+                       count = 1;
+                       if (!quiet(3))
+                               dump_hex(dwords, count, level+1);
+#endif
+               } else if (pkt_is_type3(dwords[0])) {
+                       count = type3_pkt_size(dwords[0]) + 1;
+                       val = cp_type3_opcode(dwords[0]);
+                       const struct type3_op *op = get_type3_op(val);
+                       if (op->options.load_all_groups)
+                               load_all_groups(level+1);
+                       printl(3, "t3");
+                       const char *name = pktname(val);
+                       if (!quiet(2)) {
+                               printf("\t%sopcode: %s%s%s (%02x) (%d dwords)%s\n", levels[level],
+                                               rnn->vc->colors->bctarg, name, rnn->vc->colors->reset,
+                                               val, count, (dwords[0] & 0x1) ? " (predicated)" : "");
+                       }
+                       if (name)
+                               dump_domain(dwords+1, count-1, level+2, name);
+                       op->fxn(dwords+1, count-1, level+1);
+                       if (!quiet(2))
+                               dump_hex(dwords, count, level+1);
+               } else if (pkt_is_type7(dwords[0])) {
+                       count = type7_pkt_size(dwords[0]) + 1;
+                       val = cp_type7_opcode(dwords[0]);
+                       const struct type3_op *op = get_type3_op(val);
+                       if (op->options.load_all_groups)
+                               load_all_groups(level+1);
+                       printl(3, "t7");
+                       const char *name = pktname(val);
+                       if (!quiet(2)) {
+                               printf("\t%sopcode: %s%s%s (%02x) (%d dwords)\n", levels[level],
+                                               rnn->vc->colors->bctarg, name, rnn->vc->colors->reset,
+                                               val, count);
+                       }
+                       if (name) {
+                               /* special hack for two packets that decode the same way
+                                * on a6xx:
+                                */
+                               if (!strcmp(name, "CP_LOAD_STATE6_FRAG") ||
+                                               !strcmp(name, "CP_LOAD_STATE6_GEOM"))
+                                       name = "CP_LOAD_STATE6";
+                               dump_domain(dwords+1, count-1, level+2, name);
+                       }
+                       op->fxn(dwords+1, count-1, level+1);
+                       if (!quiet(2))
+                               dump_hex(dwords, count, level+1);
+               } else if (pkt_is_type2(dwords[0])) {
+                       printl(3, "t2");
+                       printl(3, "%snop\n", levels[level+1]);
+               } else {
+                       /* for 5xx+ we can do a passable job of looking for start of next valid packet: */
+                       if (options->gpu_id >= 500) {
+                               while (dwords_left > 0) {
+                                       if (pkt_is_type7(dwords[0]) || pkt_is_type4(dwords[0]))
+                                               break;
+                                       printf("bad type! %08x\n", dwords[0]);
+                                       dwords++;
+                                       dwords_left--;
+                               }
+                       } else {
+                               printf("bad type! %08x\n", dwords[0]);
+                               return;
+                       }
+               }
+
+               dwords += count;
+               dwords_left -= count;
+
+       }
+
+       if (dwords_left < 0)
+               printf("**** this ain't right!! dwords_left=%d\n", dwords_left);
+}
diff --git a/src/freedreno/decode/cffdec.h b/src/freedreno/decode/cffdec.h
new file mode 100644 (file)
index 0000000..695aec3
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CFFDEC_H__
+#define __CFFDEC_H__
+
+#include <stdbool.h>
+
+/* Selects on which draws the queried registers are dumped in query mode.
+ * Stored in cffdec_options::query_mode.
+ */
+enum query_mode {
+       /* default mode, dump all queried regs on each draw: */
+       QUERY_ALL = 0,
+
+       /* only dump if any of the queried regs were written
+        * since last draw:
+        */
+       QUERY_WRITTEN,
+
+       /* only dump if any of the queried regs changed since
+        * last draw:
+        */
+       QUERY_DELTA,
+};
+
+/* Decoder configuration, handed to cffdec_init().  The int members are
+ * used as boolean flags unless noted otherwise.
+ */
+struct cffdec_options {
+       /* numeric GPU id; the decoder compares it against e.g. 500 to select
+        * generation specific behaviour
+        */
+       unsigned gpu_id;
+       /* draw number to restrict decoding to (cffdump passes -1 when no
+        * --draw option was given)
+        */
+       int draw_filter;
+       int color;
+       int dump_shaders;
+       int summary;
+       int allregs;
+       int dump_textures;
+       int decode_markers;
+       /* path of the lua script to run, if any */
+       char *script;
+
+       int query_compare;  /* binning vs SYSMEM/GMEM compare mode */
+       int query_mode;     /* enum query_mode */
+       /* registers requested for query mode (name or numeric offset, as
+        * given by the user) and how many there are
+        */
+       char **querystrs;
+       int nquery;
+
+       /* In "once" mode, only decode a cmdstream buffer once (per draw
+        * mode, in the case of a6xx+ where a single cmdstream buffer can
+        * be used for both binning and draw pass), rather than each time
+        * encountered (ie. once per tile/bin in GMEM draw passes)
+        */
+       int once;
+
+       /* for crashdec, where we know CP_IBx_REM_SIZE, we can use this
+        * to highlight the cmdstream not parsed yet, to make it easier
+        * to see how far along the CP is.
+        */
+       struct {
+               uint64_t base;
+               uint32_t rem;
+       } ibs[4];
+};
+
+/*
+ * Decoder API shared by the tools that include this header.
+ *
+ * NOTE(review): only dump_commands() is defined in the part of the decoder
+ * visible alongside this header; the one-line descriptions of the other
+ * functions are inferred from their use there and should be confirmed
+ * against their definitions.
+ */
+
+/* printf-style output, called with a level (1..3 in the visible callers) */
+void printl(int lvl, const char *fmt, ...);
+/* name of a packet opcode; callers check the result for NULL */
+const char * pktname(unsigned opc);
+uint32_t regbase(const char *name);
+const char * regname(uint32_t regbase, int color);
+bool reg_written(uint32_t regbase);
+uint32_t reg_lastval(uint32_t regbase);
+uint32_t reg_val(uint32_t regbase);
+void reg_set(uint32_t regbase, uint32_t val);
+void reset_regs(void);
+/* (re)initialize the decoder with the given options */
+void cffdec_init(const struct cffdec_options *options);
+void dump_register_val(uint32_t regbase, uint32_t dword, int level);
+/* decode and print 'sizedwords' dwords of cmdstream starting at 'dwords' */
+void dump_commands(uint32_t *dwords, uint32_t sizedwords, int level);
+
+/*
+ * Helpers for packet parsing:
+ */
+
+
+/* Packet type tags found in the top bits of a packet header dword: */
+#define CP_TYPE0_PKT 0x00000000
+#define CP_TYPE2_PKT 0x80000000
+#define CP_TYPE3_PKT 0xc0000000
+#define CP_TYPE4_PKT 0x40000000
+#define CP_TYPE7_PKT 0x70000000
+
+/* type0: tag in bits [31:30], dword count minus one in bits [29:16],
+ * register offset in bits [14:0]
+ */
+#define pkt_is_type0(pkt) (((pkt) & 0xC0000000) == CP_TYPE0_PKT)
+#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
+#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF)
+
+/* type2: matched by exact header value only */
+#define pkt_is_type2(pkt) ((pkt) == CP_TYPE2_PKT)
+
+/*
+ * Parity bit used by the type4/type7 header checks below: folds val down
+ * to 4 bits by XOR-ing its eight nibbles together, then looks the result
+ * up in the 0x9669 bit table.  The result is 1 when val has an even number
+ * of set bits and 0 when it has an odd number, ie. val plus the returned
+ * bit always has odd parity.
+ *
+ * Declared with plain 'unsigned int' rather than the non-standard 'uint'
+ * typedef, which is only available when the includer happens to pull it
+ * in from <sys/types.h>.
+ */
+static inline unsigned int pm4_calc_odd_parity_bit(unsigned int val)
+{
+       return (0x9669 >> (0xf & ((val) ^
+                       ((val) >> 4) ^ ((val) >> 8) ^ ((val) >> 12) ^
+                       ((val) >> 16) ^ ((val) >> 20) ^ ((val) >> 24) ^
+                       ((val) >> 28)))) & 1;
+}
+
+/*
+ * Check both for the type3 opcode and make sure that the reserved bits [1:7]
+ * and 15 are 0
+ */
+#define pkt_is_type3(pkt) \
+        ((((pkt) & 0xC0000000) == CP_TYPE3_PKT) && \
+         (((pkt) & 0x80FE) == 0))
+
+/* type3: opcode in bits [15:8], dword count minus one in bits [29:16] */
+#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF)
+#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
+
+/* type4: tag in bits [31:28], bit 27 is the parity bit of the register
+ * offset, bit 7 the parity bit of the dword count
+ */
+#define pkt_is_type4(pkt) \
+        ((((pkt) & 0xF0000000) == CP_TYPE4_PKT) && \
+         ((((pkt) >> 27) & 0x1) == \
+         pm4_calc_odd_parity_bit(type4_pkt_offset(pkt))) \
+         && ((((pkt) >> 7) & 0x1) == \
+         pm4_calc_odd_parity_bit(type4_pkt_size(pkt))))
+
+#define type4_pkt_offset(pkt) (((pkt) >> 8) & 0x7FFFF)
+#define type4_pkt_size(pkt) ((pkt) & 0x7F)
+
+/* type7: tag in bits [31:28], bits [27:24] must be zero, bit 23 is the
+ * parity bit of the opcode, bit 15 the parity bit of the dword count
+ */
+#define pkt_is_type7(pkt) \
+        ((((pkt) & 0xF0000000) == CP_TYPE7_PKT) && \
+         (((pkt) & 0x0F000000) == 0) && \
+         ((((pkt) >> 23) & 0x1) == \
+         pm4_calc_odd_parity_bit(cp_type7_opcode(pkt))) \
+         && ((((pkt) >> 15) & 0x1) == \
+         pm4_calc_odd_parity_bit(type7_pkt_size(pkt))))
+
+#define cp_type7_opcode(pkt) (((pkt) >> 16) & 0x7F)
+#define type7_pkt_size(pkt) ((pkt) & 0x3FFF)
+
+#endif /* __CFFDEC_H__ */
diff --git a/src/freedreno/decode/cffdump.c b/src/freedreno/decode/cffdump.c
new file mode 100644 (file)
index 0000000..7fec7dc
--- /dev/null
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <err.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <errno.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "script.h"
+#include "io.h"
+#include "rnnutil.h"
+#include "pager.h"
+#include "buffers.h"
+#include "cffdec.h"
+
+/* Decoder options, filled in from the command line.  gpu_id starts out as
+ * 220 and is replaced by the value of the first RD_GPU_ID section seen in
+ * each file.
+ */
+static struct cffdec_options options = {
+       .gpu_id = 220,
+};
+
+/* NOTE(review): needs_wfi and is_blob are only written, and vertices only
+ * read, in the code visible here -- confirm whether they are still needed.
+ */
+static bool needs_wfi = false;
+static bool is_blob = false;
+/* set by --show-compositor; when clear, handle_file() skips submits whose
+ * cmd name starts with fdperf, chrome, surfaceflinger or 'X'
+ */
+static int show_comp = false;
+/* whether to pipe the output through the pager; defaults to whether stdout
+ * is a tty, overridden by --pager/--no-pager and cleared by --query
+ */
+static int interactive;
+static int vertices;
+
+/* Decode one capture file; defined after main() */
+static int handle_file(const char *filename, int start, int end, int draw);
+
+/**
+ * Print the command line help to stderr and terminate the process with
+ * exit status 2.  Does not return.
+ *
+ * @param name  program name to show in the usage line (argv[0])
+ */
+static void print_usage(const char *name)
+{
+       fprintf(stderr, "Usage:\n\n"
+                       "\t%s [OPTIONS]... FILE...\n\n"
+                       "Options:\n"
+                       "\t-v, --verbose    - more verbose disassembly\n"
+                       "\t--dump-shaders   - dump each shader to a raw file\n"
+                       "\t--no-color       - disable colorized output (default for non-console\n"
+                       "\t                   output)\n"
+                       "\t--color          - enable colorized output (default for tty output)\n"
+                       "\t--no-pager       - disable pager (default for non-console output)\n"
+                       "\t--pager          - enable pager (default for tty output)\n"
+                       "\t-s, --summary    - don't show individual register writes, but just\n"
+                       "\t                   register values on draws\n"
+                       "\t-a, --allregs    - show all registers (including ones not written\n"
+                       "\t                   since previous draw) on each draw\n"
+                       "\t-S, --start=N    - start decoding from frame N\n"
+                       "\t-E, --end=N      - stop decoding after frame N\n"
+                       "\t-F, --frame=N    - decode only frame N\n"
+                       "\t-D, --draw=N     - decode only draw N\n"
+                       "\t--textures       - dump texture contents (if possible)\n"
+                       "\t--show-compositor - also decode cmdstream from fdperf, chrome,\n"
+                       "\t                    surfaceflinger and X (skipped by default)\n"
+                       "\t-L, --script=LUA - run specified lua script to analyze state\n"
+                       "\t-q, --query=REG  - query mode, dump only specified query registers on\n"
+                       "\t                   each draw; multiple --query/-q args can be given to\n"
+                       "\t                   dump multiple registers; register can be specified\n"
+                       "\t                   either by name or numeric offset\n"
+                       "\t--query-all      - in query mode, show all queried regs on each draw\n"
+                       "\t                   (default query mode)\n"
+                       "\t--query-written  - in query mode, show queried regs on draws if any of\n"
+                       "\t                   them have been written since previous draw\n"
+                       "\t--query-delta    - in query mode, show queried regs on draws if any of\n"
+                       "\t                   them have changed since previous draw\n"
+                       "\t--query-compare  - dump registers for BINNING vs GMEM/BYPASS per draw;\n"
+                       "\t                   only applicable for regs set via SDS group (a6xx+),\n"
+                       "\t                   implies --once, can be combined with --query-all,\n"
+                       "\t                   --query-written, or --query-delta\n"
+                       "\t--once           - decode cmdstream only once (per draw mode); if same\n"
+                       "\t                   cmdstream is executed for each tile, this will decode\n"
+                       "\t                   it only for the first tile and skip the remainder,\n"
+                       "\t                   which can be useful when looking at state that does\n"
+                       "\t                   not change per tile\n"
+                       "\t--not-once       - decode cmdstream for each IB (default)\n"
+                       "\t-h, --help       - show this message\n"
+                       , name);
+       exit(2);
+}
+
+/* Long option table for getopt_long().  Entries with a non-NULL flag
+ * pointer store their value directly into the referenced variable (and make
+ * getopt_long() return 0); the others return the short option character
+ * handled in main().
+ */
+static const struct option opts[] = {
+       /* Long opts that simply set a flag (no corresponding short alias): */
+       { "dump-shaders",    no_argument, &options.dump_shaders,  1 },
+       { "no-color",        no_argument, &options.color,         0 },
+       { "color",           no_argument, &options.color,         1 },
+       { "no-pager",        no_argument, &interactive,           0 },
+       { "pager",           no_argument, &interactive,           1 },
+       { "textures",        no_argument, &options.dump_textures, 1 },
+       { "show-compositor", no_argument, &show_comp,             1 },
+       { "query-all",       no_argument, &options.query_mode,    QUERY_ALL },
+       { "query-written",   no_argument, &options.query_mode,    QUERY_WRITTEN },
+       { "query-delta",     no_argument, &options.query_mode,    QUERY_DELTA },
+       { "query-compare",   no_argument, &options.query_compare, 1 },
+       { "once",            no_argument, &options.once,          1 },
+       { "not-once",        no_argument, &options.once,          0 },
+
+       /* Long opts with short alias: */
+       { "verbose",   no_argument,       0, 'v' },
+       { "summary",   no_argument,       0, 's' },
+       { "allregs",   no_argument,       0, 'a' },
+       { "start",     required_argument, 0, 'S' },
+       { "end",       required_argument, 0, 'E' },
+       { "frame",     required_argument, 0, 'F' },
+       { "draw",      required_argument, 0, 'D' },
+       { "script",    required_argument, 0, 'L' },
+       { "query",     required_argument, 0, 'q' },
+       { "help",      no_argument,       0, 'h' },
+
+       /* getopt_long() scans until it finds an all-zero element, so the
+        * table must be explicitly terminated:
+        */
+       { 0, 0, 0, 0 },
+};
+
+/**
+ * cffdump entry point: parse the command line, then decode every FILE
+ * argument in order with handle_file().
+ *
+ * Returns the result of the last handle_file() call.  If no file was given
+ * or the last file could not be opened, or if query options are used
+ * without any --query register, the usage text is printed and the process
+ * exits with status 2.
+ */
+int main(int argc, char **argv)
+{
+       int ret = -1;
+       int start = 0, end = 0x7ffffff, draw = -1;
+       int c;
+
+       interactive = isatty(STDOUT_FILENO);
+
+       options.color = interactive;
+
+       while ((c = getopt_long(argc, argv, "vsaS:E:F:D:L:q:h", opts, NULL)) != -1) {
+               switch (c) {
+               case 0:
+                       /* option that set a flag, nothing to do */
+                       break;
+               case 'v':
+                       disasm_set_debug(PRINT_RAW | EXPAND_REPEAT | PRINT_VERBOSE);
+                       break;
+               case 's':
+                       options.summary = true;
+                       break;
+               case 'a':
+                       options.allregs = true;
+                       break;
+               case 'S':
+                       start = atoi(optarg);
+                       break;
+               case 'E':
+                       end = atoi(optarg);
+                       break;
+               case 'F':
+                       start = end = atoi(optarg);
+                       break;
+               case 'D':
+                       draw = atoi(optarg);
+                       break;
+               case 'L':
+                       options.script = optarg;
+                       if (script_load(options.script)) {
+                               errx(-1, "error loading %s\n", options.script);
+                       }
+                       break;
+               case 'q': {
+                       /* grow via a temporary so that a failed realloc() is
+                        * detected instead of dereferencing NULL below:
+                        */
+                       char **querystrs = realloc(options.querystrs,
+                                       (options.nquery + 1) * sizeof(*options.querystrs));
+                       if (!querystrs)
+                               errx(-1, "out of memory");
+                       options.querystrs = querystrs;
+                       options.querystrs[options.nquery] = optarg;
+                       options.nquery++;
+                       /* query mode output is not piped through the pager */
+                       interactive = 0;
+                       break;
+               }
+               case 'h':
+               default:
+                       print_usage(argv[0]);
+               }
+       }
+
+       /* Reject inconsistent options up front, before the pager is started
+        * and before any file has been decoded:
+        */
+       if ((options.query_mode || options.query_compare) && !options.nquery) {
+               fprintf(stderr, "query options only valid in query mode!\n");
+               print_usage(argv[0]);
+       }
+
+       if (interactive) {
+               pager_open();
+       }
+
+       while (optind < argc) {
+               ret = handle_file(argv[optind], start, end, draw);
+               if (ret) {
+                       fprintf(stderr, "error reading: %s\n", argv[optind]);
+                       fprintf(stderr, "continuing..\n");
+               }
+               optind++;
+       }
+
+       /* ret is still -1 if no file was given at all */
+       if (ret)
+               print_usage(argv[0]);
+
+       script_finish();
+
+       if (interactive) {
+               pager_close();
+       }
+
+       return ret;
+}
+
+/* Unpack an address record: dword 0 is the low half of the GPU address,
+ * dword 1 the length, and -- only for records larger than 8 bytes -- dword 2
+ * the high half of the GPU address.
+ */
+static void parse_addr(uint32_t *buf, int sz, unsigned int *len, uint64_t *gpuaddr)
+{
+       uint64_t addr = buf[0];
+
+       if (sz > 8) {
+               uint64_t hi = buf[2];
+               addr |= hi << 32;
+       }
+
+       *gpuaddr = addr;
+       *len = buf[1];
+}
+
+/**
+ * Decode one capture file.
+ *
+ * The file is read as a sequence of sections, each introduced by two
+ * 32 bit words (section type, payload size in bytes) followed by the
+ * payload; header pairs of 0xffffffff/0xffffffff are skipped.  Buffer
+ * contents are registered with add_buffer() and each RD_CMDSTREAM_ADDR
+ * section whose submit index lies in [start, end] is decoded with
+ * dump_commands().
+ *
+ * @param filename  file to read, or "-" for stdin
+ * @param start     first submit index to decode
+ * @param end       last submit index to decode
+ * @param draw      stored as options.draw_filter for the decoder
+ * @return -1 if the file could not be opened, 0 otherwise; note that a
+ *         corrupt file is reported on stdout but still returns 0
+ */
+static int handle_file(const char *filename, int start, int end, int draw)
+{
+       enum rd_sect_type type = RD_NONE;
+       void *buf = NULL;
+       struct io *io;
+       int submit = 0, got_gpu_id = 0;
+       int sz, ret = 0;
+       bool needs_reset = false;
+       bool skip = false;
+
+       options.draw_filter = draw;
+
+       cffdec_init(&options);
+
+       printf("Reading %s...\n", filename);
+
+       script_start_cmdstream(filename);
+
+       if (!strcmp(filename, "-"))
+               io = io_openfd(0);
+       else
+               io = io_open(filename);
+
+       if (!io) {
+               fprintf(stderr, "could not open: %s\n", filename);
+               return -1;
+       }
+
+       /* address/length announced by the most recent RD_GPUADDR section */
+       struct {
+               unsigned int len;
+               uint64_t gpuaddr;
+       } gpuaddr = {0};
+
+       while (true) {
+               uint32_t arr[2];
+
+               ret = io_readn(io, arr, 8);
+               if (ret <= 0)
+                       goto end;
+
+               while ((arr[0] == 0xffffffff) && (arr[1] == 0xffffffff)) {
+                       ret = io_readn(io, arr, 8);
+                       if (ret <= 0)
+                               goto end;
+               }
+
+               type = arr[0];
+               sz = arr[1];
+
+               if (sz < 0) {
+                       ret = -1;
+                       goto end;
+               }
+
+               free(buf);
+
+               needs_wfi = false;
+
+               /* Zero-initialized with one spare byte, so that the payload is
+                * always NUL terminated when printed as a string, even if
+                * fewer than sz bytes could be read:
+                */
+               buf = calloc(1, (size_t)sz + 1);
+               if (!buf) {
+                       fprintf(stderr, "could not allocate %d byte section\n", sz);
+                       ret = -1;
+                       goto end;
+               }
+               ret = io_readn(io, buf, sz);
+               if (ret < 0)
+                       goto end;
+
+               switch(type) {
+               case RD_TEST:
+                       printl(1, "test: %s\n", (char *)buf);
+                       break;
+               case RD_CMD:
+                       is_blob = true;
+                       printl(2, "cmd: %s\n", (char *)buf);
+                       skip = false;
+                       if (!show_comp) {
+                               /* skip submits from processes whose name starts
+                                * with one of these:
+                                */
+                               skip |= (strstr(buf, "fdperf") == buf);
+                               skip |= (strstr(buf, "chrome") == buf);
+                               skip |= (strstr(buf, "surfaceflinger") == buf);
+                               skip |= ((char *)buf)[0] == 'X';
+                       }
+                       break;
+               case RD_VERT_SHADER:
+                       printl(2, "vertex shader:\n%s\n", (char *)buf);
+                       break;
+               case RD_FRAG_SHADER:
+                       printl(2, "fragment shader:\n%s\n", (char *)buf);
+                       break;
+               case RD_GPUADDR:
+                       /* parse_addr() reads at least two dwords */
+                       if (sz < 8) {
+                               ret = -1;
+                               goto end;
+                       }
+                       if (needs_reset) {
+                               reset_buffers();
+                               needs_reset = false;
+                       }
+                       parse_addr(buf, sz, &gpuaddr.len, &gpuaddr.gpuaddr);
+                       break;
+               case RD_BUFFER_CONTENTS:
+                       /* the buffer table keeps the pointer, so it must not be
+                        * freed here:
+                        */
+                       add_buffer(gpuaddr.gpuaddr, gpuaddr.len, buf);
+                       buf = NULL;
+                       break;
+               case RD_CMDSTREAM_ADDR:
+                       /* parse_addr() reads at least two dwords */
+                       if (sz < 8) {
+                               ret = -1;
+                               goto end;
+                       }
+                       if ((start <= submit) && (submit <= end)) {
+                               unsigned int sizedwords;
+                               uint64_t cmd_addr;
+                               parse_addr(buf, sz, &sizedwords, &cmd_addr);
+                               printl(2, "############################################################\n");
+                               printl(2, "cmdstream: %d dwords\n", sizedwords);
+                               if (!skip) {
+                                       script_start_submit();
+                                       dump_commands(hostptr(cmd_addr), sizedwords, 0);
+                                       script_end_submit();
+                               }
+                               printl(2, "############################################################\n");
+                               printl(2, "vertices: %d\n", vertices);
+                       }
+                       needs_reset = true;
+                       submit++;
+                       break;
+               case RD_GPU_ID:
+                       if (sz < (int)sizeof(unsigned int)) {
+                               ret = -1;
+                               goto end;
+                       }
+                       /* only the first gpu id in a file is honoured */
+                       if (!got_gpu_id) {
+                               options.gpu_id = *((unsigned int *)buf);
+                               printl(2, "gpu_id: %d\n", options.gpu_id);
+                               cffdec_init(&options);
+                               got_gpu_id = 1;
+                       }
+                       break;
+               default:
+                       break;
+               }
+       }
+
+end:
+       script_end_cmdstream();
+
+       /* release the last section payload that was not handed to add_buffer() */
+       free(buf);
+
+       io_close(io);
+       fflush(stdout);
+
+       if (ret < 0) {
+               printf("corrupt file\n");
+       }
+       return 0;
+}
diff --git a/src/freedreno/decode/crashdec.c b/src/freedreno/decode/crashdec.c
new file mode 100644 (file)
index 0000000..3b17d83
--- /dev/null
@@ -0,0 +1,1114 @@
+/*
+ * Copyright © 2020 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Decoder for devcoredump traces from drm/msm.  In case of a gpu crash/hang,
+ * the coredump should be found in:
+ *
+ *    /sys/class/devcoredump/devcd<n>/data
+ *
+ * The crashdump will hang around for 5min, it can be cleared by writing to
+ * the file, ie:
+ *
+ *    echo 1 > /sys/class/devcoredump/devcd<n>/data
+ *
+ * (the driver won't log any new crashdumps until the previous one is cleared
+ * or times out after 5min)
+ */
+
+
+#include <assert.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <setjmp.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "buffers.h"
+#include "cffdec.h"
+#include "disasm.h"
+#include "pager.h"
+#include "rnnutil.h"
+#include "util.h"
+#include "instr-a3xx.h"
+
+
+/* Stream the crashdump text is read from; popline() pulls lines from it. */
+static FILE *in;
+/* When set, sections that are skipped by default get dumped as well. */
+static bool verbose;
+
+/* Register databases used to name/decode GMU registers, SQE control
+ * registers and pipe registers respectively (see dump_register() and
+ * dump_mem_pool_reg_write()).
+ */
+static struct rnn *rnn_gmu;
+static struct rnn *rnn_control;
+static struct rnn *rnn_pipe;
+
+/* Decoder options; gpu_id selects the generation checks below and the
+ * ibs[] entries are filled in by dump_cmdstream().
+ */
+static struct cffdec_options options = {
+       .draw_filter = -1,
+};
+
+/* true when options.gpu_id is in the 6xx range */
+static inline bool
+is_a6xx(void)
+{
+       return (options.gpu_id >= 600) && (options.gpu_id < 700);
+}
+
+/* true when options.gpu_id is in the 5xx range */
+static inline bool
+is_a5xx(void)
+{
+       return (options.gpu_id >= 500) && (options.gpu_id < 600);
+}
+
+/* true for generations that use 64b addresses (gpu_id of 500 and up) */
+static inline bool
+is_64b(void)
+{
+       return 500 <= options.gpu_id;
+}
+
+/*
+ * Helpers to read register values:
+ */
+
+/* Read a register that is 64b wide on 64b GPUs (ie. a5xx+).  The named
+ * register supplies the low dword; on 64b GPUs the register following it
+ * supplies the high dword.
+ */
+static uint64_t
+regval64(const char *name)
+{
+       unsigned reg = regbase(name);
+       assert(reg);
+
+       uint64_t lo = reg_val(reg);
+       if (!is_64b())
+               return lo;
+
+       uint64_t hi = reg_val(reg + 1);
+       return lo | (hi << 32);
+}
+
+/* Read the current value of the register called name.  The name is looked
+ * up with regbase(); a lookup result of zero trips the assert.
+ */
+static uint32_t
+regval(const char *name)
+{
+       unsigned reg = regbase(name);
+       assert(reg);
+       return reg_val(reg);
+}
+
+/*
+ * Line reading and string helpers:
+ */
+
+/* Most recently read line (owned here, released on the next read) and the
+ * line handed back by pushline(), if any.
+ */
+static char *lastline;
+static char *pushedline;
+
+/* Return the next line of input.  A line returned through pushline() is
+ * delivered first; otherwise a fresh line is read from 'in'.  The process
+ * exits with status 0 once the input is exhausted.
+ */
+static const char *
+popline(void)
+{
+       if (pushedline) {
+               char *again = pushedline;
+               pushedline = NULL;
+               return again;
+       }
+
+       /* the previous line is no longer referenced by anyone */
+       free(lastline);
+
+       char *fresh = NULL;
+       size_t cap = 0;
+       if (getline(&fresh, &cap, in) < 0)
+               exit(0);
+
+       lastline = fresh;
+       return fresh;
+}
+
+/* Un-read the line most recently returned by popline(), so that the next
+ * popline() call returns it again.  Only one line can be pushed back at a
+ * time (asserted).
+ */
+static void
+pushline(void)
+{
+       assert(!pushedline);
+       pushedline = lastline;
+}
+
+/*
+ * Read the next input line and decode it as ascii85 data.
+ *
+ * 'z' stands for one all-zero dword; every other dword is encoded as a
+ * group of (up to) five characters.
+ *
+ * Returns a zero-initialized buffer holding sizedwords dwords, which the
+ * caller is responsible for.  Decoding stops at the end of the line or
+ * once sizedwords dwords have been produced, whichever comes first, so a
+ * line carrying more data than announced cannot write past the buffer.
+ */
+static uint32_t *
+popline_ascii85(uint32_t sizedwords)
+{
+       const char *line = popline();
+
+       /* At this point we expect the ascii85 data to be indented *some*
+        * amount, and to terminate at the end of the line.  So just eat
+        * up the leading whitespace.
+        */
+       assert(*line == ' ');
+       while (*line == ' ')
+               line++;
+
+       /* allocate at least one dword so that a zero-sized section still
+        * yields a valid buffer:
+        */
+       uint32_t *buf = calloc(sizedwords ? sizedwords : 1, sizeof(*buf));
+       if (!buf) {
+               fprintf(stderr, "out of memory\n");
+               exit(1);
+       }
+
+       uint32_t idx = 0;
+
+       /* also stop at the NUL, in case the last line of the file has no
+        * trailing newline:
+        */
+       while ((idx < sizedwords) && (*line != '\n') && (*line != '\0')) {
+               if (*line == 'z') {
+                       buf[idx++] = 0;
+                       line++;
+                       continue;
+               }
+
+               uint32_t accum = 0;
+               for (int i = 0; (i < 5) && (*line != '\n') && (*line != '\0'); i++) {
+                       accum *= 85;
+                       accum += *line - '!';
+                       line++;
+               }
+
+               buf[idx++] = accum;
+       }
+
+       return buf;
+}
+
+/* Return true if line begins with the string start (an empty start
+ * matches everything).  Only the first strlen(start) characters of line
+ * are examined, rather than searching the whole line for a match.
+ */
+static bool
+startswith(const char *line, const char *start)
+{
+       return strncmp(line, start, strlen(start)) == 0;
+}
+
+/*
+ * sscanf() style helper that insists on every conversion in fmt being
+ * matched: on a mismatch an error is printed to stderr and the process
+ * exits with status 1.
+ *
+ * The expected number of assignments is derived from fmt: each '%'
+ * introduces one, except for a literal "%%" and for assignment-suppressed
+ * conversions ("%*...") which do not store anything.
+ */
+static void
+parseline(const char *line, const char *fmt, ...)
+{
+       int expected = 0;
+
+       /* scan fmt string to extract expected # of conversions: */
+       for (const char *p = fmt; *p; p++) {
+               if (*p != '%')
+                       continue;
+
+               if (p[1] == '%') {
+                       /* literal percent sign, skip both characters */
+                       p++;
+                       continue;
+               }
+
+               if (p[1] != '*')
+                       expected++;
+       }
+
+       va_list ap;
+       va_start(ap, fmt);
+       int matched = vsscanf(line, fmt, ap);
+       va_end(ap);
+
+       if (matched != expected) {
+               fprintf(stderr, "parse error scanning: '%s'\n", fmt);
+               exit(1);
+       }
+}
+
+/* Iterate over the lines of the current section, binding each one to
+ * _line for the statement that follows the macro.  A section ends at the
+ * first line that does not start with a space; that line is handed back
+ * via pushline() so the next popline() sees it again.  Because the macro
+ * ends in a dangling 'else', the statement that follows is the loop body,
+ * and 'continue' inside it moves on to the next line.
+ */
+#define foreach_line_in_section(_line) \
+       for (const char *_line = popline(); _line; _line = popline()) \
+               /* check for start of next section */                     \
+               if (_line[0] != ' ') {                                    \
+                       pushline();                                           \
+                       break;                                                \
+               } else
+
+/*
+ * Provide our own disasm assert() handler, so that we can recover
+ * after attempting to disassemble things that might not be valid
+ * instructions:
+ */
+
+/* jmp_env is only a valid longjmp() target while jmp_env_valid is set,
+ * which is the case for the duration of a TRY() body.
+ */
+static bool jmp_env_valid;
+static jmp_buf jmp_env;
+
+/* Report a failed assertion on stdout, then either unwind to the
+ * enclosing TRY() (if there is one) or abort the process.
+ */
+void
+ir3_assert_handler(const char *expr, const char *file, int line,
+               const char *func)
+{
+       printf("%s:%u: %s: Assertion `%s' failed.\n", file, line, func, expr);
+       if (jmp_env_valid)
+               longjmp(jmp_env, 1);
+       abort();
+}
+
+/* Evaluate x such that an assertion failure routed through
+ * ir3_assert_handler() abandons x and resumes after the TRY() instead of
+ * aborting.  TRY() blocks must not be nested (asserted).
+ */
+#define TRY(x) do { \
+               assert(!jmp_env_valid); \
+               if (setjmp(jmp_env) == 0) { \
+                       jmp_env_valid = true; \
+                       x; \
+               } \
+               jmp_env_valid = false; \
+       } while (0)
+
+/*
+ * Decode ringbuffer section:
+ */
+
+/* Per-ring state collected by decode_ringbuffer(), indexed by the ring's
+ * "id:" value, and consumed by dump_cmdstream():
+ */
+static struct {
+       uint64_t iova;      /* from "iova:", compared against CP_RB_BASE */
+       uint32_t rptr;      /* from "rptr:", used as a dword index into buf */
+       uint32_t wptr;      /* from "wptr:", used as a dword index into buf */
+       uint32_t size;      /* from "size:", in bytes */
+       uint32_t *buf;      /* decoded "data:" contents, NULL if none was seen */
+} ringbuffers[5];
+
+/*
+ * Parse the ringbuffer section: record iova/rptr/wptr/size for each ring
+ * in ringbuffers[], decode its contents and register them with
+ * add_buffer().  All lines other than the data lines are echoed to stdout.
+ *
+ * A ring id outside of ringbuffers[] is a fatal error, since it would
+ * otherwise be used to index the table.
+ */
+static void
+decode_ringbuffer(void)
+{
+       int id = 0;
+
+       foreach_line_in_section (line) {
+               if (startswith(line, "  - id:")) {
+                       parseline(line, "  - id: %d", &id);
+                       /* checked unconditionally, an assert() would vanish
+                        * in NDEBUG builds:
+                        */
+                       if ((id < 0) || ((size_t)id >= ARRAY_SIZE(ringbuffers))) {
+                               fprintf(stderr, "invalid ringbuffer id: %d\n", id);
+                               exit(1);
+                       }
+               } else if (startswith(line, "    iova:")) {
+                       parseline(line, "    iova: %"SCNx64, &ringbuffers[id].iova);
+               } else if (startswith(line, "    rptr:")) {
+                       /* the fields are uint32_t, so scan them as unsigned: */
+                       parseline(line, "    rptr: %u", &ringbuffers[id].rptr);
+               } else if (startswith(line, "    wptr:")) {
+                       parseline(line, "    wptr: %u", &ringbuffers[id].wptr);
+               } else if (startswith(line, "    size:")) {
+                       parseline(line, "    size: %u", &ringbuffers[id].size);
+               } else if (startswith(line, "    data: !!ascii85 |")) {
+                       ringbuffers[id].buf = popline_ascii85(ringbuffers[id].size / 4);
+                       add_buffer(ringbuffers[id].iova, ringbuffers[id].size, ringbuffers[id].buf);
+                       continue;
+               }
+
+               printf("%s", line);
+       }
+}
+
+/* Could pkt be a packet header?  Only gpu_id 500 and up are actually
+ * checked (type4 or type7 packet); anything is accepted for older ones.
+ */
+static bool
+valid_header(uint32_t pkt)
+{
+       if (options.gpu_id < 500) {
+               /* TODO maybe we can check validish looking pkt3 opc or pkt0
+                * register offset.. the cmds sent by kernel are usually
+                * fairly limited (other than initialization) which confines
+                * the search space a bit..
+                */
+               return true;
+       }
+
+       return pkt_is_type4(pkt) || pkt_is_type7(pkt);
+}
+
+/*
+ * Decode the not yet consumed part of the kernel ringbuffer the CP was
+ * working on.
+ *
+ * The ring is picked by matching CP_RB_BASE against the iova of the rings
+ * collected by decode_ringbuffer().  IB1/IB2 base and remaining size are
+ * captured into options.ibs[] before the register state is reset.  Rings
+ * that are smaller than one dword, or for which no data was captured, are
+ * skipped since there is nothing that could be decoded.
+ */
+static void
+dump_cmdstream(void)
+{
+       uint64_t rb_base = regval64("CP_RB_BASE");
+
+       printf("got rb_base=%"PRIx64"\n", rb_base);
+
+       options.ibs[1].base = regval64("CP_IB1_BASE");
+       options.ibs[1].rem  = regval("CP_IB1_REM_SIZE");
+       options.ibs[2].base = regval64("CP_IB2_BASE");
+       options.ibs[2].rem  = regval("CP_IB2_REM_SIZE");
+
+       /* Adjust remaining size to account for cmdstream slurped into ROQ
+        * but not yet consumed by SQE
+        *
+        * TODO add support for earlier GPUs once we tease out the needed
+        * registers.. see crashit.c in msmtest for hints.
+        *
+        * TODO it would be nice to be able to extract out register bitfields
+        * by name rather than hard-coding this.
+        */
+       if (is_a6xx()) {
+               options.ibs[1].rem += regval("CP_CSQ_IB1_STAT") >> 16;
+               options.ibs[2].rem += regval("CP_CSQ_IB2_STAT") >> 16;
+       }
+
+       printf("IB1: %"PRIx64", %u\n", options.ibs[1].base, options.ibs[1].rem);
+       printf("IB2: %"PRIx64", %u\n", options.ibs[2].base, options.ibs[2].rem);
+
+       /* now that we've got the regvals we want, reset register state
+        * so we aren't seeing values from decode_registers();
+        */
+       reset_regs();
+
+       for (unsigned id = 0; id < ARRAY_SIZE(ringbuffers); id++) {
+               if (ringbuffers[id].iova != rb_base)
+                       continue;
+
+               /* The kernel level ringbuffer (RB) wraps around, which
+                * cffdec doesn't really deal with.. so figure out how
+                * many dwords are unread
+                */
+               unsigned ringszdw = ringbuffers[id].size >> 2;  /* in dwords */
+
+               /* ringszdw is the modulus below, so it must not be zero,
+                * and without captured data there is nothing to look at:
+                */
+               if (!ringszdw || !ringbuffers[id].buf)
+                       continue;
+
+               printf("found ring!\n");
+
+/* helper macro to deal with modulo size math: */
+#define mod_add(b, v)  ((ringszdw + (int)(b) + (int)(v)) % ringszdw)
+
+               /* The rptr will (most likely) have moved past the IB to
+                * userspace cmdstream, so back up a bit, and then advance
+                * until we find a valid start of a packet.. this is going
+                * to be less reliable on a4xx and before (pkt0/pkt3),
+                * compared to pkt4/pkt7 with parity bits
+                */
+               const int lookback = 12;
+               unsigned rptr = mod_add(ringbuffers[id].rptr, -lookback);
+
+               for (int idx = 0; idx < lookback; idx++) {
+                       if (valid_header(ringbuffers[id].buf[rptr]))
+                               break;
+                       rptr = mod_add(rptr, 1);
+               }
+
+               unsigned cmdszdw = mod_add(ringbuffers[id].wptr, -rptr);
+
+               printf("got cmdszdw=%d\n", cmdszdw);
+
+               /* linearize the (possibly wrapped) unread part of the ring: */
+               uint32_t *buf = malloc(cmdszdw * sizeof(*buf));
+               if (!buf && cmdszdw) {
+                       fprintf(stderr, "out of memory\n");
+                       exit(1);
+               }
+
+               for (unsigned idx = 0; idx < cmdszdw; idx++) {
+                       unsigned p = mod_add(rptr, idx);
+                       buf[idx] = ringbuffers[id].buf[p];
+               }
+
+               dump_commands(buf, cmdszdw, 0);
+               free(buf);
+       }
+}
+
+/*
+ * Decode 'bos' (buffers) section:
+ */
+
+/* Parse the buffers section: for every buffer remember its iova and size,
+ * decode the data line and register the contents with add_buffer().  All
+ * lines except the data lines are echoed to stdout.
+ */
+static void
+decode_bos(void)
+{
+       uint64_t bo_iova = 0;
+       uint32_t bo_size = 0;
+
+       foreach_line_in_section (entry) {
+               if (startswith(entry, "    data: !!ascii85 |")) {
+                       /* the encoded contents are on the following line */
+                       uint32_t *contents = popline_ascii85(bo_size / 4);
+
+                       if (verbose)
+                               dump_hex_ascii(contents, bo_size, 1);
+
+                       add_buffer(bo_iova, bo_size, contents);
+
+                       continue;
+               }
+
+               if (startswith(entry, "  - iova:"))
+                       parseline(entry, "  - iova: %"PRIx64, &bo_iova);
+               else if (startswith(entry, "    size:"))
+                       parseline(entry, "    size: %u", &bo_size);
+
+               printf("%s", entry);
+       }
+}
+
+/*
+ * Decode registers section:
+ */
+
+/* Print "name: value" for the register at offset in the given database.
+ * The value is shown decoded when type information is available, as raw
+ * hex when only the name is known, and unknown registers are shown by
+ * their offset.
+ */
+static void
+dump_register(struct rnn *rnn, uint32_t offset, uint32_t value)
+{
+       struct rnndecaddrinfo *info = rnn_reginfo(rnn, offset);
+
+       if (!info) {
+               printf("<%04x>: %08x\n", offset, value);
+               return;
+       }
+
+       if (!info->typeinfo) {
+               printf("%s: %08x\n", info->name, value);
+               return;
+       }
+
+       char *decoded = rnndec_decodeval(rnn->vc, info->typeinfo, value);
+       printf("%s: %s\n", info->name, decoded);
+}
+
+/* Parse the GMU registers section and print each value, raw and decoded
+ * against the GMU register database.  Offsets in the dump are divided by
+ * four before the lookup.
+ */
+static void
+decode_gmu_registers(void)
+{
+       foreach_line_in_section (entry) {
+               uint32_t byteoff, val;
+               parseline(entry, "  - { offset: %x, value: %x }", &byteoff, &val);
+
+               const uint32_t reg = byteoff / 4;
+
+               printf("\t%08x\t", val);
+               dump_register(rnn_gmu, reg, val);
+       }
+}
+
+/* Parse the registers section: every value is recorded with reg_set()
+ * (so it can be looked up later on) and printed raw and decoded.  Offsets
+ * in the dump are divided by four to get the register.
+ */
+static void
+decode_registers(void)
+{
+       foreach_line_in_section (entry) {
+               uint32_t byteoff, val;
+               parseline(entry, "  - { offset: %x, value: %x }", &byteoff, &val);
+
+               const uint32_t reg = byteoff / 4;
+
+               reg_set(reg, val);
+               printf("\t%08x", val);
+               dump_register_val(reg, val, 0);
+       }
+}
+
+/* Like the registers section, but for banked context regs: the cluster
+ * and context header lines are echoed, every other line is a register
+ * value which gets printed raw and decoded (without being recorded).
+ */
+static void
+decode_clusters(void)
+{
+       foreach_line_in_section (entry) {
+               const bool is_header =
+                       startswith(entry, "  - cluster-name:") ||
+                       startswith(entry, "    - context:");
+
+               if (is_header) {
+                       printf("%s", entry);
+                       continue;
+               }
+
+               uint32_t byteoff, val;
+               parseline(entry, "      - { offset: %x, value: %x }", &byteoff, &val);
+
+               printf("\t%08x", val);
+               dump_register_val(byteoff / 4, val, 0);
+       }
+}
+
+/*
+ * Decode indexed-registers.. these aren't like normal registers, but a
+ * sort of FIFO where successive reads pop out associated debug state.
+ */
+
+/* Print the CP_SEQ_STAT contents: the first dword is shown as the PC,
+ * the 32 dwords after it as $01..$20 in two columns.  On a6xx, if the
+ * first of those looks like a type7 packet header, its name is printed
+ * as well.
+ */
+static void
+dump_cp_seq_stat(uint32_t *stat)
+{
+       const uint32_t *regs = &stat[1];
+
+       printf("\t PC: %04x\n", stat[0]);
+
+       /* A valid header that is not type7 is not reported; not sure if
+        * that case can happen.
+        */
+       if (is_a6xx() && valid_header(regs[0]) && pkt_is_type7(regs[0])) {
+               const char *name = pktname(cp_type7_opcode(regs[0]));
+               if (name)
+                       printf("\tPKT: %s\n", name);
+       }
+
+       for (int lo = 0, hi = 16; lo < 16; lo++, hi++) {
+               printf("\t$%02x: %08x\t\t$%02x: %08x\n",
+                               lo + 1, regs[lo], hi + 1, regs[hi]);
+       }
+}
+
+/* Print the 0x80 values in regs as control registers 0x100-0x17f.  Does
+ * nothing when no control register database is available.
+ */
+static void
+dump_control_regs(uint32_t *regs)
+{
+       if (!rnn_control)
+               return;
+
+       /* Control regs 0x100-0x17f are a scratch space to be used by the
+        * firmware however it wants, unlike lower regs which involve some
+        * fixed-function units. Therefore only these registers get dumped
+        * directly.
+        */
+       for (uint32_t reg = 0x100; reg < 0x180; reg++) {
+               const uint32_t val = regs[reg - 0x100];
+
+               printf("\t%08x\t", val);
+               dump_register(rnn_control, reg, val);
+       }
+}
+
+/* Dump the six sections of the CP_UCODE_DBG_DATA contents. */
+static void
+dump_cp_ucode_dbg(uint32_t *dbg)
+{
+       /* Notes on the data:
+        * There seems to be a section every 4096 DWORD's. The sections aren't
+        * all the same size, so the rest of the 4096 DWORD's are filled with
+        * mirrors of the actual data.
+        */
+       enum { SECTION_STRIDE = 0x1000 };
+
+       /* sections 0-3 are always shown as a plain hexdump: */
+       static const struct {
+               const char *heading;
+               int sizedwords;
+       } hexdumps[] = {
+               /* Contains scattered data from a630_sqe.fw: */
+               { "\tSQE instruction cache:\n", 0x400 },
+               { "\tUnknown 1:\n", 0x80 },
+               { "\tUnknown 2:\n", 0x200 },
+               { "\tUnknown 3:\n", 0x80 },
+       };
+       const unsigned num_hexdumps = sizeof(hexdumps) / sizeof(hexdumps[0]);
+
+       for (unsigned s = 0; s < num_hexdumps; s++) {
+               fputs(hexdumps[s].heading, stdout);
+               dump_hex_ascii(dbg + s * SECTION_STRIDE, 4 * hexdumps[s].sizedwords, 1);
+       }
+
+       /* section 4: don't bother printing this normally */
+       if (verbose) {
+               printf("\tSQE packet jumptable contents:\n");
+               dump_hex_ascii(dbg + 4 * SECTION_STRIDE, 4 * 0x80, 1);
+       }
+
+       /* section 5: */
+       printf("\tSQE scratch control regs:\n");
+       dump_control_regs(dbg + 5 * SECTION_STRIDE);
+}
+
+/*
+ * Print one register write found in the mem pool.
+ *
+ * reg/data are the register and the value written.  When pipe is set the
+ * register is looked up in the pipe register database, otherwise it is a
+ * normal register written in the given context.
+ *
+ * Pipe registers that are unknown to the database, or that carry no type
+ * information, are still printed (dump_register() copes with both)
+ * instead of being dereferenced blindly.
+ */
+static void
+dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context, bool pipe)
+{
+       if (!pipe) {
+               printf("\t\twrite %s (%05x) context %d\n", regname(reg, 1), reg, context);
+               dump_register_val(reg, data, 2);
+               return;
+       }
+
+       struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
+       printf("\t\twrite %s (%02x) pipe\n", info ? info->name : "<unknown>", reg);
+
+       if (info && info->typeinfo && !strcmp(info->typeinfo->name, "void")) {
+               /* registers that ignore their payload */
+               return;
+       }
+
+       printf("\t\t\t");
+       dump_register(rnn_pipe, reg, data);
+}
+
+/* Decode one 128-bit mem pool chunk (four dwords at chunk) and print the
+ * register writes it holds.  A chunk carries up to two writes, each with
+ * its own enable bit, 32b payload, 18b register, pipe flag and context
+ * bit.
+ */
+static void
+dump_mem_pool_chunk(const uint32_t *chunk)
+{
+       /* NOTE(review): the decode relies on how the compiler lays out
+        * packed bit-fields, presumably low bits first with no padding so
+        * that the struct is exactly 16 bytes -- confirm for other
+        * compilers/ABIs.
+        */
+       struct __attribute__((packed)) {
+               bool reg0_enabled : 1;
+               bool reg1_enabled : 1;
+               uint32_t data0 : 32;
+               uint32_t data1 : 32;
+               uint32_t reg0 : 18;
+               uint32_t reg1 : 18;
+               bool reg0_pipe : 1;
+               bool reg1_pipe : 1;
+               uint32_t reg0_context : 1;
+               uint32_t reg1_context : 1;
+               uint32_t padding : 22;
+       } fields;
+
+       /* copy the raw chunk over the bit-field view */
+       memcpy(&fields, chunk, 4 * sizeof(uint32_t));
+
+       if (fields.reg0_enabled) {
+               dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context, fields.reg0_pipe);
+       }
+
+       if (fields.reg1_enabled) {
+               dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context, fields.reg1_pipe);
+       }
+}
+
+/*
+ * Decode the CP_MEMPOOL contents: for each of the six cluster queues walk
+ * the linked list of blocks from the read pointer to the write pointer,
+ * printing every chunk (raw, and decoded via dump_mem_pool_chunk()).
+ *
+ * A block index outside of the pool, or a queue whose block list does not
+ * reach the write pointer within the number of chunks the pool can hold
+ * (ie. a cyclic list), is a fatal error.
+ *
+ * NOTE(review): mempool is indexed at fixed offsets (up to 0x2000 + 0x60
+ * dwords); the size of the buffer is not known here, so the caller has to
+ * provide a large enough one -- confirm against the dump producer.
+ */
+static void
+dump_cp_mem_pool(uint32_t *mempool)
+{
+       /* The mem pool is a shared pool of memory used for storing in-flight
+        * register writes. There are 6 different queues, one for each
+        * cluster. Writing to $data (or for some special registers, $addr)
+        * pushes data onto the appropriate queue, and each queue is pulled
+        * from by the appropriate cluster. The queues are thus written to
+        * in-order, but may be read out-of-order.
+        *
+        * The queues are conceptually divided into 128-bit "chunks", and the
+        * read and write pointers are in units of chunks.  These chunks are
+        * organized internally into 8-chunk "blocks", and memory is allocated
+        * dynamically in terms of blocks. Each queue is represented as a
+        * singly-linked list of blocks, as well as 3-bit start/end chunk
+        * pointers that point within the first/last block.  The next pointers
+        * are located in a separate array, rather than inline.
+        */
+
+       /* TODO: The firmware CP_MEM_POOL save/restore routines do something
+        * like:
+        *
+        * cread $02, [ $00 + 0 ]
+        * and $02, $02, 0x118
+        * ...
+        * brne $02, 0, #label
+        * mov $03, 0x2000
+        * mov $03, 0x1000
+        * label:
+        * ...
+        *
+        * I think that control register 0 is the GPU version, and some
+        * versions have a smaller mem pool. It seems some models have a mem
+        * pool that's half the size, and a bunch of offsets are shifted
+        * accordingly. Unfortunately the kernel driver's dumping code doesn't
+        * seem to take this into account, even the downstream android driver,
+        * and we don't know which versions 0x8, 0x10, or 0x100 correspond
+        * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
+        */
+       bool small_mem_pool = false;
+
+       /* The array of next pointers for each block. */
+       const uint32_t *next_pointers = small_mem_pool ? &mempool[0x800] : &mempool[0x1000];
+
+       /* Maximum number of blocks in the pool, also the size of the pointers
+        * array.
+        */
+       const unsigned num_blocks = small_mem_pool ? 0x30 : 0x80;
+
+       /* Each block holds 8 chunks, so no queue can be longer than this: */
+       const unsigned max_chunks = num_blocks * 8;
+
+       /* Number of queues */
+       const unsigned num_queues = 6;
+
+       /* Unfortunately the per-queue state is a little more complicated than
+        * a simple pair of begin/end pointers. Instead of a single beginning
+        * block, there are *two*, with the property that either the two are
+        * equal or the second is the "next" of the first. Similarly there are
+        * two end blocks. Thus the queue either looks like this:
+        *
+        * A -> B -> ... -> C -> D
+        *
+        * Or like this, or some combination:
+        *
+        * A/B -> ... -> C/D
+        *
+        * However, there's only one beginning/end chunk offset. Now the
+        * question is, which of A or B is the actual start? I.e. is the chunk
+        * offset an offset inside A or B? It depends. I'll show a typical read
+        * cycle, starting here (read pointer marked with a *) with a chunk
+        * offset of 0:
+        *
+        *        A                    B
+        *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+        * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
+        *
+        * Once the pointer advances far enough, the hardware decides to free
+        * A, after which the read-side state looks like:
+        *
+        *      (free)                A/B
+        *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+        * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
+        *
+        * Then after advancing the pointer a bit more, the hardware fetches
+        * the "next" pointer for A and stores it in B:
+        *
+        *      (free)                 A                     B
+        *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+        * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
+        *
+        * Then the read pointer advances into B, at which point we've come
+        * back to the first state having advanced a whole block:
+        *
+        *      (free)                 A                     B
+        *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+        * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
+        *
+        *
+        * There is a similar cycle for the write pointer. Now, the question
+        * is, how do we know which state we're in? We need to know this to
+        * know whether the pointer (*) is in A or B if they're different. It
+        * seems like there should be some bit somewhere describing this, but
+        * after lots of experimentation I've come up empty-handed. For now we
+        * assume that if the pointer is in the first half, then we're in
+        * either the first or second state and use B, and otherwise we're in
+        * the second or third state and use A. So far I haven't seen anything
+        * that violates this assumption.
+        */
+
+       struct {
+               uint32_t unk0;
+               uint32_t padding0[7]; /* Mirrors of unk0 */
+
+               struct {
+                       uint32_t chunk : 3;
+                       uint32_t first_block : 32 - 3;
+               } writer[6];
+               uint32_t padding1[2]; /* Mirrors of writer[4], writer[5] */
+
+               uint32_t unk1;
+               uint32_t padding2[7]; /* Mirrors of unk1 */
+
+               uint32_t writer_second_block[6];
+               uint32_t padding3[2];
+
+               uint32_t unk2[6];
+               uint32_t padding4[2];
+
+               struct {
+                       uint32_t chunk : 3;
+                       uint32_t first_block : 32 - 3;
+               } reader[6];
+               uint32_t padding5[2]; /* Mirrors of reader[4], reader[5] */
+
+               uint32_t unk3;
+               uint32_t padding6[7]; /* Mirrors of unk3 */
+
+               uint32_t reader_second_block[6];
+               uint32_t padding7[2];
+
+               uint32_t block_count[6];
+               uint32_t padding8[2];
+
+               uint32_t unk4;
+               uint32_t padding9[7]; /* Mirrors of unk4 */
+       } data1;
+
+       const uint32_t *data1_ptr = small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
+       memcpy(&data1, data1_ptr, sizeof(data1));
+
+       /* Based on the kernel, the first dword is the mem pool size (in
+        * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
+        */
+       const uint32_t *data2_ptr = small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
+       const int data2_size = 0x60;
+
+       /* This seems to be the size of each queue in chunks. */
+       const uint32_t *queue_sizes = &data2_ptr[0x18];
+
+       printf("\tdata2:\n");
+       dump_hex_ascii(data2_ptr, 4 * data2_size, 1);
+
+       /* These seem to be some kind of counter of allocated/deallocated blocks */
+       if (verbose) {
+               printf("\tunk0: %x\n", data1.unk0);
+               printf("\tunk1: %x\n", data1.unk1);
+               printf("\tunk3: %x\n", data1.unk3);
+               printf("\tunk4: %x\n\n", data1.unk4);
+       }
+
+       for (unsigned queue = 0; queue < num_queues; queue++) {
+               const char *cluster_names[6] = {
+                       "FE", "SP_VS", "PC_VS", "GRAS", "SP_PS", "PS"
+               };
+               printf("\tCLUSTER_%s:\n\n", cluster_names[queue]);
+
+               if (verbose) {
+                       printf("\t\twriter_first_block: 0x%x\n", data1.writer[queue].first_block);
+                       printf("\t\twriter_second_block: 0x%x\n", data1.writer_second_block[queue]);
+                       printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
+                       printf("\t\treader_first_block: 0x%x\n", data1.reader[queue].first_block);
+                       printf("\t\treader_second_block: 0x%x\n", data1.reader_second_block[queue]);
+                       printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
+                       printf("\t\tblock_count: %d\n", data1.block_count[queue]);
+                       printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
+                       printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
+               }
+
+               /* pointer in the second half of the block -> use the first
+                * block, otherwise the second one (see above):
+                */
+               uint32_t cur_chunk = data1.reader[queue].chunk;
+               uint32_t cur_block = cur_chunk > 3 ?
+                       data1.reader[queue].first_block :
+                       data1.reader_second_block[queue];
+               uint32_t last_chunk = data1.writer[queue].chunk;
+               uint32_t last_block = last_chunk > 3 ?
+                       data1.writer[queue].first_block :
+                       data1.writer_second_block[queue];
+
+               if (verbose)
+                       printf("\tblock %x\n", cur_block);
+               if (cur_block >= num_blocks) {
+                       fprintf(stderr, "block %x too large\n", cur_block);
+                       exit(1);
+               }
+               unsigned calculated_queue_size = 0;
+               while (cur_block != last_block || cur_chunk != last_chunk) {
+                       calculated_queue_size++;
+
+                       /* a corrupt (cyclic) list would never reach the
+                        * write pointer, so bail instead of spinning:
+                        */
+                       if (calculated_queue_size > max_chunks) {
+                               fprintf(stderr, "queue %u does not terminate\n", queue);
+                               exit(1);
+                       }
+
+                       /* a block is 0x20 dwords, a chunk is 4 dwords: */
+                       const uint32_t chunk_idx = cur_block * 0x20 + cur_chunk * 4;
+                       uint32_t *chunk_ptr = &mempool[chunk_idx];
+
+                       dump_mem_pool_chunk(chunk_ptr);
+
+                       /* label the chunk with its byte offset in mempool: */
+                       printf("\t%05x: %08x %08x %08x %08x\n",
+                              4 * chunk_idx,
+                              chunk_ptr[0], chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);
+
+                       cur_chunk++;
+                       if (cur_chunk == 8) {
+                               cur_block = next_pointers[cur_block];
+                               if (verbose)
+                                       printf("\tblock %x\n", cur_block);
+                               if (cur_block >= num_blocks) {
+                                       fprintf(stderr, "block %x too large\n", cur_block);
+                                       exit(1);
+                               }
+                               cur_chunk = 0;
+                       }
+               }
+               if (calculated_queue_size != queue_sizes[queue]) {
+                       printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n", calculated_queue_size);
+               }
+               printf("\n");
+       }
+}
+
+/*
+ * Parse the indexed-registers section.  For each block the name and size
+ * (in dwords) are remembered, the data line is decoded, handed to the
+ * matching specialized dumper (if any) and hexdumped when it is one of
+ * the interesting blocks or verbose mode is on.  All lines except the
+ * data lines are echoed to stdout.
+ */
+static void
+decode_indexed_registers(void)
+{
+       char *name = NULL;
+       uint32_t sizedwords = 0;
+
+       foreach_line_in_section (line) {
+               if (startswith(line, "  - regs-name:")) {
+                       free(name);
+                       parseline(line, "  - regs-name: %ms", &name);
+               } else if (startswith(line, "    dwords:")) {
+                       parseline(line, "    dwords: %u", &sizedwords);
+               } else if (startswith(line, "    data: !!ascii85 |")) {
+                       uint32_t *buf = popline_ascii85(sizedwords);
+
+                       /* a data line that is not preceded by a regs-name
+                        * line simply doesn't match any known block:
+                        */
+                       const char *regs = name ? name : "";
+
+                       /* some of the sections are pretty large, and are (at least
+                        * so far) not useful, so skip them if not in verbose mode:
+                        */
+                       bool dump = verbose ||
+                               !strcmp(regs, "CP_SEQ_STAT") ||
+                               !strcmp(regs, "CP_DRAW_STATE") ||
+                               !strcmp(regs, "CP_ROQ") ||
+                               0;
+
+                       if (!strcmp(regs, "CP_SEQ_STAT"))
+                               dump_cp_seq_stat(buf);
+
+                       if (!strcmp(regs, "CP_UCODE_DBG_DATA"))
+                               dump_cp_ucode_dbg(buf);
+
+                       /* note that name was typo'd in earlier kernels: */
+                       if (!strcmp(regs, "CP_MEMPOOL") || !strcmp(regs, "CP_MEMPOOOL"))
+                               dump_cp_mem_pool(buf);
+
+                       if (dump)
+                               dump_hex_ascii(buf, 4 * sizedwords, 1);
+                       free(buf);
+
+                       continue;
+               }
+
+               printf("%s", line);
+       }
+
+       /* the last name parsed is still ours to release */
+       free(name);
+}
+
+/*
+ * Decode shader-blocks:
+ */
+
+/*
+ * Parse the shader-blocks section.  For each block the type and size (in
+ * dwords) are remembered and the data line is decoded; instruction blocks
+ * are run through the disassembler, and blocks are hexdumped when they
+ * are instruction blocks or verbose mode is on.  All lines except the
+ * data lines are echoed to stdout.
+ */
+static void
+decode_shader_blocks(void)
+{
+       char *type = NULL;
+       uint32_t sizedwords = 0;
+
+       foreach_line_in_section (line) {
+               if (startswith(line, "  - type:")) {
+                       free(type);
+                       parseline(line, "  - type: %ms", &type);
+               } else if (startswith(line, "      size:")) {
+                       parseline(line, "      size: %u", &sizedwords);
+               } else if (startswith(line, "    data: !!ascii85 |")) {
+                       uint32_t *buf = popline_ascii85(sizedwords);
+
+                       /* a data line that is not preceded by a type line
+                        * simply doesn't match any known block type:
+                        */
+                       const char *blocktype = type ? type : "";
+
+                       const bool is_inst =
+                               !strcmp(blocktype, "A6XX_SP_INST_DATA") ||
+                               !strcmp(blocktype, "A6XX_HLSQ_INST_RAM");
+
+                       /* some of the sections are pretty large, and are (at least
+                        * so far) not useful, so skip them if not in verbose mode:
+                        */
+                       bool dump = verbose || is_inst;
+
+                       if (is_inst) {
+                               /* TODO this section actually contains multiple shaders
+                                * (or parts of shaders?), so perhaps we should search
+                                * for ends of shaders and decode each?
+                                */
+                               TRY(disasm_a3xx(buf, sizedwords, 1, stdout, options.gpu_id));
+                       }
+
+                       if (dump)
+                               dump_hex_ascii(buf, 4 * sizedwords, 1);
+
+                       free(buf);
+
+                       continue;
+               }
+
+               printf("%s", line);
+       }
+
+       free(type);
+}
+
+/*
+ * Decode debugbus section:
+ */
+
+/*
+ * Decode the "debugbus:" section: lines are echoed unchanged, and the
+ * ascii85 "data:" payloads are hex-dumped in verbose mode only.
+ */
+static void
+decode_debugbus(void)
+{
+       char *block = NULL;
+       uint32_t sizedwords = 0;
+
+       foreach_line_in_section (line) {
+               if (startswith(line, "  - debugbus-block:")) {
+                       /* NULL the pointer so that a failed parse cannot leave
+                        * it dangling:
+                        */
+                       free(block);
+                       block = NULL;
+                       parseline(line, "  - debugbus-block: %ms", &block);
+               } else if (startswith(line, "    count:")) {
+                       parseline(line, "    count: %u", &sizedwords);
+               } else if (startswith(line, "    data: !!ascii85 |")) {
+                       uint32_t *buf = popline_ascii85(sizedwords);
+
+                       /* some of the sections are pretty large, and are (at least
+                        * so far) not useful, so skip them if not in verbose mode:
+                        */
+                       if (verbose)
+                               dump_hex_ascii(buf, 4 * sizedwords, 1);
+
+                       free(buf);
+
+                       continue;
+               }
+
+               printf("%s", line);
+       }
+
+       /* release the name of the last block (as decode_shader_blocks() does
+        * for its type string):
+        */
+       free(block);
+}
+
+/*
+ * Main crashdump decode loop:
+ */
+
+/*
+ * Top-level decode loop: echo each line, and when it opens a section we
+ * know how to decode, call that section's decoder before moving on.
+ */
+static void
+decode(void)
+{
+       const char *line;
+
+       for (line = popline(); line; line = popline()) {
+               printf("%s", line);
+
+               if (startswith(line, "revision:")) {
+                       parseline(line, "revision: %u", &options.gpu_id);
+                       printf("Got gpu_id=%u\n", options.gpu_id);
+
+                       cffdec_init(&options);
+
+                       /* extra register databases, depending on the generation: */
+                       const int nocolor = !options.color;
+
+                       if (is_a6xx()) {
+                               rnn_gmu = rnn_new(nocolor);
+                               rnn_load_file(rnn_gmu, "adreno/a6xx_gmu.xml", "A6XX");
+                               rnn_control = rnn_new(nocolor);
+                               rnn_load_file(rnn_control, "adreno/adreno_control_regs.xml", "A6XX_CONTROL_REG");
+                               rnn_pipe = rnn_new(nocolor);
+                               rnn_load_file(rnn_pipe, "adreno/adreno_pipe_regs.xml", "A6XX_PIPE_REG");
+                       } else if (is_a5xx()) {
+                               rnn_control = rnn_new(nocolor);
+                               rnn_load_file(rnn_control, "adreno/adreno_control_regs.xml", "A5XX_CONTROL_REG");
+                       } else {
+                               rnn_control = NULL;
+                       }
+               } else if (startswith(line, "bos:")) {
+                       decode_bos();
+               } else if (startswith(line, "ringbuffer:")) {
+                       decode_ringbuffer();
+               } else if (startswith(line, "registers:")) {
+                       /* once buffer contents and CP register values have been
+                        * recorded, we can take a stab at decoding the cmdstream:
+                        */
+                       decode_registers();
+                       dump_cmdstream();
+               } else if (startswith(line, "registers-gmu:")) {
+                       decode_gmu_registers();
+               } else if (startswith(line, "indexed-registers:")) {
+                       decode_indexed_registers();
+               } else if (startswith(line, "shader-blocks:")) {
+                       decode_shader_blocks();
+               } else if (startswith(line, "clusters:")) {
+                       decode_clusters();
+               } else if (startswith(line, "debugbus:")) {
+                       decode_debugbus();
+               }
+       }
+}
+
+/*
+ * Usage and argument parsing:
+ */
+
+/*
+ * Print the command line help to stderr and exit with status 2.
+ */
+static void
+usage(void)
+{
+       static const char help[] =
+               "Usage:\n\n"
+               "\tcrashdec [-achmsv] [-f FILE]\n\n"
+               "Options:\n"
+               "\t-a, --allregs   - show all registers (including ones not written since\n"
+               "\t                  previous draw) at each draw\n"
+               "\t-c, --color     - use colors\n"
+               "\t-f, --file=FILE - read input from specified file (rather than stdin)\n"
+               "\t-h, --help      - this usage message\n"
+               "\t-m, --markers   - try to decode CP_NOP string markers\n"
+               "\t-s, --summary   - don't show individual register writes, but just show\n"
+               "\t                  register values on draws\n"
+               "\t-v, --verbose   - dump more verbose output, including contents of\n"
+               "\t                  less interesting buffers\n"
+               "\n";
+
+       fputs(help, stderr);
+       exit(2);
+}
+
+/* long options for getopt_long(); each one maps to its short option letter: */
+static const struct option opts[] = {
+       { .name = "allregs", .has_arg = no_argument,       .flag = NULL, .val = 'a' },
+       { .name = "color",   .has_arg = no_argument,       .flag = NULL, .val = 'c' },
+       { .name = "file",    .has_arg = required_argument, .flag = NULL, .val = 'f' },
+       { .name = "help",    .has_arg = no_argument,       .flag = NULL, .val = 'h' },
+       { .name = "markers", .has_arg = no_argument,       .flag = NULL, .val = 'm' },
+       { .name = "summary", .has_arg = no_argument,       .flag = NULL, .val = 's' },
+       { .name = "verbose", .has_arg = no_argument,       .flag = NULL, .val = 'v' },
+       {}   /* all-zero terminator */
+};
+
+/* true when stdout is a tty; main() then opens the pager for the output: */
+static bool interactive;
+
+/*
+ * Flush whatever is still buffered on stdout and, if the pager is in use,
+ * close it.
+ */
+static void
+cleanup(void)
+{
+       fflush(stdout);
+
+       if (!interactive)
+               return;
+
+       pager_close();
+}
+
+/*
+ * Entry point: parse the options, open the pager when writing to a tty,
+ * then decode the crash dump read from stdin or from the -f FILE.
+ */
+int
+main(int argc, char **argv)
+{
+       int c;
+
+       interactive = isatty(STDOUT_FILENO);
+       options.color = interactive;
+
+       /* default to read from stdin: */
+       in = stdin;
+
+       while ((c = getopt_long(argc, argv, "acf:hmsv", opts, NULL)) != -1) {
+               switch (c) {
+               case 'a':
+                       options.allregs = true;
+                       break;
+               case 'c':
+                       options.color = true;
+                       break;
+               case 'f':
+                       in = fopen(optarg, "r");
+                       if (!in) {
+                               /* don't carry on with a NULL input stream: */
+                               perror(optarg);
+                               exit(1);
+                       }
+                       break;
+               case 'm':
+                       options.decode_markers = true;
+                       break;
+               case 's':
+                       options.summary = true;
+                       break;
+               case 'v':
+                       verbose = true;
+                       break;
+               case 'h':
+               default:
+                       usage();   /* exits, does not return */
+               }
+       }
+
+       if (interactive) {
+               pager_open();
+       }
+
+       atexit(cleanup);
+
+       decode();
+
+       /* NOTE(review): cleanup() is also registered with atexit() above, so it
+        * runs a second time once main() returns - confirm that is intended.
+        */
+       cleanup();
+
+       return 0;
+}
diff --git a/src/freedreno/decode/disasm-a2xx.c b/src/freedreno/decode/disasm-a2xx.c
new file mode 100644 (file)
index 0000000..314c9c1
--- /dev/null
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "disasm.h"
+#include "instr-a2xx.h"
+#include "rnnutil.h"
+
+/* indentation prefix for an output line, indexed by nesting level: */
+static const char *levels[] = {
+               "",
+               "\t",
+               "\t\t",
+               "\t\t\t",
+               "\t\t\t\t",
+               "\t\t\t\t\t",
+               "\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t\t",
+               /* NOTE(review): levels 10..15 print "x" instead of tabs,
+                * presumably to flag unexpectedly deep nesting - confirm:
+                */
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+};
+
+/* output flags (tested against PRINT_RAW below), set via disasm_set_debug(): */
+enum debug_t debug;
+
+/* rnn handle, created and loaded with "a2xx" on the first disasm_a2xx() call: */
+static struct rnn *rnn;
+
+/*
+ * ALU instructions:
+ */
+
+/* channel letters, indexed by a swizzle / component selector value: */
+static const char chan_names[] = {
+               'x', 'y', 'z', 'w',
+               /* these only apply to FETCH dst's: */
+               '0', '1', '?', '_',
+};
+
+/*
+ * Print one ALU source operand as [-][|]<R|C><num>[.swizzle][|].  Each 2-bit
+ * swizzle field is added to its own channel index (mod 4), so an all-zero
+ * swizzle selects xyzw in order and is not printed at all.
+ */
+static void print_srcreg(uint32_t num, uint32_t type,
+               uint32_t swiz, uint32_t negate, uint32_t abs)
+{
+       const char *bar = abs ? "|" : "";
+
+       printf("%s%s%c%u", negate ? "-" : "", bar, type ? 'R' : 'C', num);
+
+       if (swiz) {
+               putchar('.');
+               for (unsigned chan = 0; chan < 4; chan++, swiz >>= 2)
+                       putchar(chan_names[(swiz + chan) & 0x3]);
+       }
+
+       printf("%s", bar);
+}
+
+/*
+ * Print a destination as "export<num>" or "R<num>".  Unless all four mask
+ * bits are set, a ".xyzw" suffix follows in which every channel whose mask
+ * bit is clear is shown as '_'.
+ */
+static void print_dstreg(uint32_t num, uint32_t mask, uint32_t dst_exp)
+{
+       printf("%s%u", dst_exp ? "export" : "R", num);
+
+       if (mask == 0xf)
+               return;
+
+       putchar('.');
+       for (unsigned bit = 0; bit < 4; bit++)
+               putchar(((mask >> bit) & 0x1) ? chan_names[bit] : '_');
+}
+
+/*
+ * For exports with a well-known meaning, append the matching GLSL builtin
+ * name as a trailing "; name" comment: exports 62 and 63 of a vertex shader,
+ * export 0 of a fragment shader.  Nothing is printed for any other export.
+ */
+static void print_export_comment(uint32_t num, enum shader_t type)
+{
+       const char *name = NULL;
+
+       if (type == SHADER_VERTEX) {
+               if (num == 62)
+                       name = "gl_Position";
+               else if (num == 63)
+                       name = "gl_PointSize";
+       } else if (type == SHADER_FRAGMENT) {
+               if (num == 0)
+                       name = "gl_FragColor";
+       }
+
+       /* if we had a symbol table here, we could look
+        * up the name of the varying..
+        */
+       if (!name)
+               return;
+
+       printf("\t; %s", name);
+}
+
+/*
+ * Opcode tables for the vector and the scalar ALU ops, indexed by opcode
+ * value.  Slots that have no INSTR() entry stay zero-initialized, i.e. their
+ * name is NULL.
+ */
+struct {
+       uint32_t num_srcs;   /* number of source operands */
+       const char *name;    /* mnemonic, stringified from the opcode identifier */
+} vector_instructions[0x20] = {
+#define INSTR(opc, num_srcs) [opc] = { num_srcs, #opc }
+               INSTR(ADDv, 2),
+               INSTR(MULv, 2),
+               INSTR(MAXv, 2),
+               INSTR(MINv, 2),
+               INSTR(SETEv, 2),
+               INSTR(SETGTv, 2),
+               INSTR(SETGTEv, 2),
+               INSTR(SETNEv, 2),
+               INSTR(FRACv, 1),
+               INSTR(TRUNCv, 1),
+               INSTR(FLOORv, 1),
+               INSTR(MULADDv, 3),
+               INSTR(CNDEv, 3),
+               INSTR(CNDGTEv, 3),
+               INSTR(CNDGTv, 3),
+               INSTR(DOT4v, 2),
+               INSTR(DOT3v, 2),
+               INSTR(DOT2ADDv, 3),  // ???
+               INSTR(CUBEv, 2),
+               INSTR(MAX4v, 1),
+               INSTR(PRED_SETE_PUSHv, 2),
+               INSTR(PRED_SETNE_PUSHv, 2),
+               INSTR(PRED_SETGT_PUSHv, 2),
+               INSTR(PRED_SETGTE_PUSHv, 2),
+               INSTR(KILLEv, 2),
+               INSTR(KILLGTv, 2),
+               INSTR(KILLGTEv, 2),
+               INSTR(KILLNEv, 2),
+               INSTR(DSTv, 2),
+               INSTR(MOVAv, 1),
+}, scalar_instructions[0x40] = {
+               /* every scalar op is listed with a single source operand: */
+               INSTR(ADDs, 1),
+               INSTR(ADD_PREVs, 1),
+               INSTR(MULs, 1),
+               INSTR(MUL_PREVs, 1),
+               INSTR(MUL_PREV2s, 1),
+               INSTR(MAXs, 1),
+               INSTR(MINs, 1),
+               INSTR(SETEs, 1),
+               INSTR(SETGTs, 1),
+               INSTR(SETGTEs, 1),
+               INSTR(SETNEs, 1),
+               INSTR(FRACs, 1),
+               INSTR(TRUNCs, 1),
+               INSTR(FLOORs, 1),
+               INSTR(EXP_IEEE, 1),
+               INSTR(LOG_CLAMP, 1),
+               INSTR(LOG_IEEE, 1),
+               INSTR(RECIP_CLAMP, 1),
+               INSTR(RECIP_FF, 1),
+               INSTR(RECIP_IEEE, 1),
+               INSTR(RECIPSQ_CLAMP, 1),
+               INSTR(RECIPSQ_FF, 1),
+               INSTR(RECIPSQ_IEEE, 1),
+               INSTR(MOVAs, 1),
+               INSTR(MOVA_FLOORs, 1),
+               INSTR(SUBs, 1),
+               INSTR(SUB_PREVs, 1),
+               INSTR(PRED_SETEs, 1),
+               INSTR(PRED_SETNEs, 1),
+               INSTR(PRED_SETGTs, 1),
+               INSTR(PRED_SETGTEs, 1),
+               INSTR(PRED_SET_INVs, 1),
+               INSTR(PRED_SET_POPs, 1),
+               INSTR(PRED_SET_CLRs, 1),
+               INSTR(PRED_SET_RESTOREs, 1),
+               INSTR(KILLEs, 1),
+               INSTR(KILLGTs, 1),
+               INSTR(KILLGTEs, 1),
+               INSTR(KILLNEs, 1),
+               INSTR(KILLONEs, 1),
+               INSTR(SQRT_IEEE, 1),
+               INSTR(MUL_CONST_0, 1),
+               INSTR(MUL_CONST_1, 1),
+               INSTR(ADD_CONST_0, 1),
+               INSTR(ADD_CONST_1, 1),
+               INSTR(SUB_CONST_0, 1),
+               INSTR(SUB_CONST_1, 1),
+               INSTR(SIN, 1),
+               INSTR(COS, 1),
+               INSTR(RETAIN_PREV, 1),
+#undef INSTR
+};
+
+/*
+ * Print one ALU instruction: the vector op, followed by the optional second
+ * (scalar) op.
+ *
+ * dwords:  the three dwords of the instruction
+ * alu_off: instruction offset, only shown in PRINT_RAW mode
+ * level:   nesting level, selects the indentation prefix
+ * sync:    non-zero to tag the instruction with "(S)"
+ * type:    shader stage, used to annotate well-known exports
+ *
+ * Opcodes that have no entry in the opcode tables are printed as OP(n).
+ * Always returns 0.
+ */
+static int disasm_alu(uint32_t *dwords, uint32_t alu_off,
+               int level, int sync, enum shader_t type)
+{
+       instr_alu_t *alu = (instr_alu_t *)dwords;
+       const char *vec_name = vector_instructions[alu->vector_opc].name;
+       uint32_t vec_srcs = vector_instructions[alu->vector_opc].num_srcs;
+
+       printf("%s", levels[level]);
+       if (debug & PRINT_RAW) {
+               printf("%02x: %08x %08x %08x\t", alu_off,
+                               dwords[0], dwords[1], dwords[2]);
+       }
+
+       printf("   %sALU:\t", sync ? "(S)" : "   ");
+
+       /* table slots without an INSTR() entry have a NULL name; print those
+        * numerically (like the scalar op below) rather than passing NULL to
+        * "%s", which is undefined behaviour:
+        */
+       if (vec_name) {
+               printf("%s", vec_name);
+       } else {
+               printf("OP(%u)", alu->vector_opc);
+       }
+
+       if (alu->pred_select & 0x2) {
+               /* seems to work similar to conditional execution in ARM instruction
+                * set, so let's use a similar syntax for now:
+                */
+               printf((alu->pred_select & 0x1) ? "EQ" : "NE");
+       }
+
+       printf("\t");
+
+       print_dstreg(alu->vector_dest, alu->vector_write_mask, alu->export_data);
+       printf(" = ");
+
+       /* operands are printed in the order src3 (3-source ops only), src1,
+        * src2 (ops with more than one source):
+        */
+       if (vec_srcs == 3) {
+               print_srcreg(alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+                               alu->src3_reg_negate, alu->src3_reg_abs);
+               printf(", ");
+       }
+       print_srcreg(alu->src1_reg, alu->src1_sel, alu->src1_swiz,
+                       alu->src1_reg_negate, alu->src1_reg_abs);
+       if (vec_srcs > 1) {
+               printf(", ");
+               print_srcreg(alu->src2_reg, alu->src2_sel, alu->src2_swiz,
+                               alu->src2_reg_negate, alu->src2_reg_abs);
+       }
+
+       if (alu->vector_clamp)
+               printf(" CLAMP");
+
+       if (alu->export_data)
+               print_export_comment(alu->vector_dest, type);
+
+       printf("\n");
+
+       if (alu->scalar_write_mask || !alu->vector_write_mask) {
+               /* 2nd optional scalar op: */
+
+               printf("%s", levels[level]);
+               if (debug & PRINT_RAW)
+                       printf("                          \t");
+
+               if (scalar_instructions[alu->scalar_opc].name) {
+                       printf("\t    \t%s\t", scalar_instructions[alu->scalar_opc].name);
+               } else {
+                       printf("\t    \tOP(%u)\t", alu->scalar_opc);
+               }
+
+               print_dstreg(alu->scalar_dest, alu->scalar_write_mask, alu->export_data);
+               printf(" = ");
+               print_srcreg(alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+                               alu->src3_reg_negate, alu->src3_reg_abs);
+               // TODO ADD/MUL must have another src?!?
+               if (alu->scalar_clamp)
+                       printf(" CLAMP");
+               if (alu->export_data)
+                       print_export_comment(alu->scalar_dest, type);
+               printf("\n");
+       }
+
+       return 0;
+}
+
+
+/*
+ * FETCH instructions:
+ */
+
+/*
+ * Print a fetch destination: a tab, "R<dst_reg>." and then four channel
+ * names, one for each 3-bit field of dst_swiz (lowest bits first).
+ */
+static void print_fetch_dst(uint32_t dst_reg, uint32_t dst_swiz)
+{
+       printf("\tR%u.", dst_reg);
+
+       for (unsigned chan = 0; chan < 4; chan++, dst_swiz >>= 3)
+               putchar(chan_names[dst_swiz & 0x7]);
+}
+
+/*
+ * Print the operands and attributes of a vertex fetch instruction.  The
+ * format is shown by name when rnn_enumname() knows it, and as TYPE(0x..)
+ * otherwise.
+ */
+static void print_fetch_vtx(instr_fetch_t *fetch)
+{
+       instr_fetch_vtx_t *vtx = &fetch->vtx;
+
+       /* seems to work similar to conditional execution in ARM instruction
+        * set, so let's use a similar syntax for now:
+        */
+       if (vtx->pred_select)
+               fputs(vtx->pred_condition ? "EQ" : "NE", stdout);
+
+       print_fetch_dst(vtx->dst_reg, vtx->dst_swiz);
+       printf(" = R%u.%c", vtx->src_reg, chan_names[vtx->src_swiz & 0x3]);
+
+       const char *fmt_name = rnn_enumname(rnn, "a2xx_sq_surfaceformat", vtx->format);
+       if (!fmt_name)
+               printf(" TYPE(0x%x)", vtx->format);
+       else
+               printf(" %s", fmt_name);
+
+       fputs(vtx->format_comp_all ? " SIGNED" : " UNSIGNED", stdout);
+       if (!vtx->num_format_all)
+               fputs(" NORMALIZED", stdout);
+       printf(" STRIDE(%u)", vtx->stride);
+       if (vtx->offset)
+               printf(" OFFSET(%u)", vtx->offset);
+       printf(" CONST(%u, %u)", vtx->const_index, vtx->const_index_sel);
+
+       if (0) {
+               // XXX disabled: extra fields that are currently not printed
+               printf(" src_reg_am=%u", vtx->src_reg_am);
+               printf(" dst_reg_am=%u", vtx->dst_reg_am);
+               printf(" num_format_all=%u", vtx->num_format_all);
+               printf(" signed_rf_mode_all=%u", vtx->signed_rf_mode_all);
+               printf(" exp_adjust_all=%u", vtx->exp_adjust_all);
+       }
+}
+
+/*
+ * Print the operands and attributes of a texture fetch instruction.  Filter
+ * fields holding their *_USE_FETCH_CONST value are left out of the output.
+ */
+static void print_fetch_tex(instr_fetch_t *fetch)
+{
+       static const char *filter_name[] = {
+                       [TEX_FILTER_POINT] = "POINT",
+                       [TEX_FILTER_LINEAR] = "LINEAR",
+                       [TEX_FILTER_BASEMAP] = "BASEMAP",
+       };
+       static const char *aniso_name[] = {
+                       [ANISO_FILTER_DISABLED] = "DISABLED",
+                       [ANISO_FILTER_MAX_1_1] = "MAX_1_1",
+                       [ANISO_FILTER_MAX_2_1] = "MAX_2_1",
+                       [ANISO_FILTER_MAX_4_1] = "MAX_4_1",
+                       [ANISO_FILTER_MAX_8_1] = "MAX_8_1",
+                       [ANISO_FILTER_MAX_16_1] = "MAX_16_1",
+       };
+       static const char *arbitrary_name[] = {
+                       [ARBITRARY_FILTER_2X4_SYM] = "2x4_SYM",
+                       [ARBITRARY_FILTER_2X4_ASYM] = "2x4_ASYM",
+                       [ARBITRARY_FILTER_4X2_SYM] = "4x2_SYM",
+                       [ARBITRARY_FILTER_4X2_ASYM] = "4x2_ASYM",
+                       [ARBITRARY_FILTER_4X4_SYM] = "4x4_SYM",
+                       [ARBITRARY_FILTER_4X4_ASYM] = "4x4_ASYM",
+       };
+       static const char *location_name[] = {
+                       [SAMPLE_CENTROID] = "CENTROID",
+                       [SAMPLE_CENTER] = "CENTER",
+       };
+       instr_fetch_tex_t *tex = &fetch->tex;
+       uint32_t src_swiz = tex->src_swiz;
+
+       /* seems to work similar to conditional execution in ARM instruction
+        * set, so let's use a similar syntax for now:
+        */
+       if (tex->pred_select)
+               fputs(tex->pred_condition ? "EQ" : "NE", stdout);
+
+       print_fetch_dst(tex->dst_reg, tex->dst_swiz);
+       printf(" = R%u.", tex->src_reg);
+       /* three source channels, one per 2-bit swizzle field: */
+       for (unsigned chan = 0; chan < 3; chan++, src_swiz >>= 2)
+               putchar(chan_names[src_swiz & 0x3]);
+
+       printf(" CONST(%u)", tex->const_idx);
+       if (tex->fetch_valid_only)
+               fputs(" VALID_ONLY", stdout);
+       if (tex->tx_coord_denorm)
+               fputs(" DENORM", stdout);
+
+       if (tex->mag_filter != TEX_FILTER_USE_FETCH_CONST)
+               printf(" MAG(%s)", filter_name[tex->mag_filter]);
+       if (tex->min_filter != TEX_FILTER_USE_FETCH_CONST)
+               printf(" MIN(%s)", filter_name[tex->min_filter]);
+       if (tex->mip_filter != TEX_FILTER_USE_FETCH_CONST)
+               printf(" MIP(%s)", filter_name[tex->mip_filter]);
+       if (tex->aniso_filter != ANISO_FILTER_USE_FETCH_CONST)
+               printf(" ANISO(%s)", aniso_name[tex->aniso_filter]);
+       if (tex->arbitrary_filter != ARBITRARY_FILTER_USE_FETCH_CONST)
+               printf(" ARBITRARY(%s)", arbitrary_name[tex->arbitrary_filter]);
+       if (tex->vol_mag_filter != TEX_FILTER_USE_FETCH_CONST)
+               printf(" VOL_MAG(%s)", filter_name[tex->vol_mag_filter]);
+       if (tex->vol_min_filter != TEX_FILTER_USE_FETCH_CONST)
+               printf(" VOL_MIN(%s)", filter_name[tex->vol_min_filter]);
+
+       if (!tex->use_comp_lod) {
+               /* NOTE(review): use_comp_lod is 0 in this branch, so this always
+                * prints "LOD(0)" - confirm whether another field was meant:
+                */
+               printf(" LOD(%u)", tex->use_comp_lod);
+               printf(" LOD_BIAS(%u)", tex->lod_bias);
+       }
+       if (tex->use_reg_lod)
+               printf(" REG_LOD(%u)", tex->use_reg_lod);
+       if (tex->use_reg_gradients)
+               fputs(" USE_REG_GRADIENTS", stdout);
+
+       printf(" LOCATION(%s)", location_name[tex->sample_location]);
+       if (tex->offset_x || tex->offset_y || tex->offset_z)
+               printf(" OFFSET(%u,%u,%u)", tex->offset_x, tex->offset_y, tex->offset_z);
+}
+
+/*
+ * Fetch opcode table, indexed by opcode value.  The table is only as long as
+ * its highest listed opcode requires, and any lower index without an INSTR()
+ * entry is a zeroed slot (NULL name and NULL fxn).
+ */
+struct {
+       const char *name;                  /* mnemonic printed after "FETCH:" */
+       void (*fxn)(instr_fetch_t *cf);    /* prints the rest of the instruction */
+} fetch_instructions[] = {
+#define INSTR(opc, name, fxn) [opc] = { name, fxn }
+               INSTR(VTX_FETCH, "VERTEX", print_fetch_vtx),
+               INSTR(TEX_FETCH, "SAMPLE", print_fetch_tex),
+               INSTR(TEX_GET_BORDER_COLOR_FRAC, "?", print_fetch_tex),
+               INSTR(TEX_GET_COMP_TEX_LOD, "?", print_fetch_tex),
+               INSTR(TEX_GET_GRADIENTS, "?", print_fetch_tex),
+               INSTR(TEX_GET_WEIGHTS, "?", print_fetch_tex),
+               INSTR(TEX_SET_TEX_LOD, "SET_TEX_LOD", print_fetch_tex),
+               INSTR(TEX_SET_GRADIENTS_H, "?", print_fetch_tex),
+               INSTR(TEX_SET_GRADIENTS_V, "?", print_fetch_tex),
+               INSTR(TEX_RESERVED_4, "?", print_fetch_tex),
+#undef INSTR
+};
+
+/*
+ * Print one FETCH instruction.
+ *
+ * dwords:  the three dwords of the instruction
+ * alu_off: instruction offset, only shown in PRINT_RAW mode
+ * level:   nesting level, selects the indentation prefix
+ * sync:    non-zero to tag the instruction with "(S)"
+ *
+ * An opcode that has no decoder in fetch_instructions[] is printed as OP(n).
+ * Always returns 0.
+ */
+static int disasm_fetch(uint32_t *dwords, uint32_t alu_off, int level, int sync)
+{
+       instr_fetch_t *fetch = (instr_fetch_t *)dwords;
+       const size_t num_opcs =
+               sizeof(fetch_instructions) / sizeof(fetch_instructions[0]);
+       uint32_t opc = fetch->opc;
+
+       printf("%s", levels[level]);
+       if (debug & PRINT_RAW) {
+               printf("%02x: %08x %08x %08x\t", alu_off,
+                               dwords[0], dwords[1], dwords[2]);
+       }
+
+       printf("   %sFETCH:\t", sync ? "(S)" : "   ");
+
+       /* the opcode comes straight from the (possibly corrupt) shader, so it
+        * can point past the table or at a slot with no decoder; neither may
+        * be dereferenced:
+        */
+       if (opc < num_opcs && fetch_instructions[opc].fxn) {
+               printf("%s", fetch_instructions[opc].name);
+               fetch_instructions[opc].fxn(fetch);
+       } else {
+               printf("OP(%u)", opc);
+       }
+       printf("\n");
+
+       return 0;
+}
+
+/*
+ * CF instructions:
+ */
+
+/* Returns 1 if cf is any of the EXEC-type CF opcodes, 0 otherwise. */
+static int cf_exec(instr_cf_t *cf)
+{
+       switch (cf->opc) {
+       case EXEC:
+       case EXEC_END:
+       case COND_EXEC:
+       case COND_EXEC_END:
+       case COND_PRED_EXEC:
+       case COND_PRED_EXEC_END:
+       case COND_EXEC_PRED_CLEAN:
+       case COND_EXEC_PRED_CLEAN_END:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Returns 1 if cf is one of the conditional (COND_*) EXEC opcodes, 0 otherwise. */
+static int cf_cond_exec(instr_cf_t *cf)
+{
+       switch (cf->opc) {
+       case COND_EXEC:
+       case COND_EXEC_END:
+       case COND_PRED_EXEC:
+       case COND_PRED_EXEC_END:
+       case COND_EXEC_PRED_CLEAN:
+       case COND_EXEC_PRED_CLEAN_END:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Table callback for CF opcodes that print nothing beyond their name. */
+static void print_cf_nop(instr_cf_t *cf)
+{
+}
+
+/*
+ * Print the fields of an EXEC-type CF instruction: address and count always,
+ * the remaining attributes only when set.  COND() is shown for the
+ * conditional opcodes only.
+ */
+static void print_cf_exec(instr_cf_t *cf)
+{
+       printf(" ADDR(0x%x) CNT(0x%x)", cf->exec.address, cf->exec.count);
+
+       if (cf->exec.yeild)
+               fputs(" YIELD", stdout);
+
+       if (cf->exec.vc)
+               printf(" VC(0x%x)", cf->exec.vc);
+
+       if (cf->exec.bool_addr)
+               printf(" BOOL_ADDR(0x%x)", cf->exec.bool_addr);
+
+       if (cf->exec.address_mode == ABSOLUTE_ADDR)
+               fputs(" ABSOLUTE_ADDR", stdout);
+
+       if (cf_cond_exec(cf))
+               printf(" COND(%d)", cf->exec.condition);
+}
+
+/* Print the address, loop id and (if absolute) addressing mode of a LOOP CF. */
+static void print_cf_loop(instr_cf_t *cf)
+{
+       printf(" ADDR(0x%x) LOOP_ID(%d)", cf->loop.address, cf->loop.loop_id);
+
+       if (cf->loop.address_mode != ABSOLUTE_ADDR)
+               return;
+
+       fputs(" ABSOLUTE_ADDR", stdout);
+}
+
+/*
+ * Print the fields of a jump/call-type CF instruction: address and direction
+ * always, the remaining attributes only when set.
+ */
+static void print_cf_jmp_call(instr_cf_t *cf)
+{
+       printf(" ADDR(0x%x) DIR(%d)", cf->jmp_call.address, cf->jmp_call.direction);
+
+       if (cf->jmp_call.force_call)
+               fputs(" FORCE_CALL", stdout);
+
+       if (cf->jmp_call.predicated_jmp)
+               printf(" COND(%d)", cf->jmp_call.condition);
+
+       if (cf->jmp_call.bool_addr)
+               printf(" BOOL_ADDR(0x%x)", cf->jmp_call.bool_addr);
+
+       if (cf->jmp_call.address_mode == ABSOLUTE_ADDR)
+               fputs(" ABSOLUTE_ADDR", stdout);
+}
+
+/*
+ * Print the fields of an ALLOC CF instruction: the name of the selected
+ * buffer and the size, plus the NO_SERIAL / ALLOC_MODE flags when set.
+ */
+static void print_cf_alloc(instr_cf_t *cf)
+{
+       static const char *buffer_name[] = {
+                       [SQ_NO_ALLOC] = "NO ALLOC",
+                       [SQ_POSITION] = "POSITION",
+                       [SQ_PARAMETER_PIXEL] = "PARAM/PIXEL",
+                       [SQ_MEMORY] = "MEMORY",
+       };
+
+       printf(" %s SIZE(0x%x)", buffer_name[cf->alloc.buffer_select], cf->alloc.size);
+
+       if (cf->alloc.no_serial)
+               fputs(" NO_SERIAL", stdout);
+
+       if (cf->alloc.alloc_mode) // ???
+               fputs(" ALLOC_MODE", stdout);
+}
+
+/*
+ * CF opcode table, indexed by opcode value: the mnemonic (stringified from
+ * the opcode identifier) and the function that prints the opcode's fields.
+ */
+struct {
+       const char *name;
+       void (*fxn)(instr_cf_t *cf);
+} cf_instructions[] = {
+#define INSTR(opc, fxn) [opc] = { #opc, fxn }
+               INSTR(NOP, print_cf_nop),
+               INSTR(EXEC, print_cf_exec),
+               INSTR(EXEC_END, print_cf_exec),
+               INSTR(COND_EXEC, print_cf_exec),
+               INSTR(COND_EXEC_END, print_cf_exec),
+               INSTR(COND_PRED_EXEC, print_cf_exec),
+               INSTR(COND_PRED_EXEC_END, print_cf_exec),
+               INSTR(LOOP_START, print_cf_loop),
+               INSTR(LOOP_END, print_cf_loop),
+               INSTR(COND_CALL, print_cf_jmp_call),
+               INSTR(RETURN, print_cf_jmp_call),
+               INSTR(COND_JMP, print_cf_jmp_call),
+               INSTR(ALLOC, print_cf_alloc),
+               INSTR(COND_EXEC_PRED_CLEAN, print_cf_exec),
+               INSTR(COND_EXEC_PRED_CLEAN_END, print_cf_exec),
+               INSTR(MARK_VS_FETCH_DONE, print_cf_nop),  // ??
+#undef INSTR
+};
+
+/*
+ * Print one CF instruction on a line of its own: indentation, the raw
+ * 16-bit words in PRINT_RAW mode, the mnemonic, and then whatever the
+ * opcode's table callback prints.
+ */
+static void print_cf(instr_cf_t *cf, int level)
+{
+       fputs(levels[level], stdout);
+
+       if (debug & PRINT_RAW) {
+               const uint16_t *raw = (const uint16_t *)cf;
+
+               printf("    %04x %04x %04x            \t", raw[0], raw[1], raw[2]);
+       }
+
+       printf("%s", cf_instructions[cf->opc].name);
+       cf_instructions[cf->opc].fxn(cf);
+       putchar('\n');
+}
+
+/*
+ * The adreno shader microcode consists of two parts:
+ *   1) A CF (control-flow) program, at the header of the compiled shader,
+ *      which refers to ALU/FETCH instructions that follow it by address.
+ *   2) ALU and FETCH instructions
+ */
+
+/*
+ * Disassemble an a2xx shader to stdout.
+ *
+ * dwords:     the shader binary
+ * sizedwords: its size in dwords; nothing beyond it is read
+ * level:      nesting level, selects the indentation prefix of each line
+ * type:       shader stage, used to annotate well-known exports
+ *
+ * Returns 0 on success, or -1 if the buffer is empty or holds no EXEC-type
+ * CF instruction (which is what tells how many CF instructions there are).
+ */
+int disasm_a2xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type)
+{
+       instr_cf_t *cfs = (instr_cf_t *)dwords;
+       int idx, num_cfs, max_idx = -1;
+
+       if (!rnn) {
+               rnn = rnn_new(1);
+               rnn_load(rnn, "a2xx");
+       }
+
+       if (sizedwords <= 0)
+               return -1;
+
+       /* number of whole CF instructions that fit in the buffer: */
+       num_cfs = (int)((sizedwords * sizeof(uint32_t)) / sizeof(instr_cf_t));
+
+       /* the first exec-type CF gives the number of CF instructions to print
+        * (2 * its address); stop looking at the end of the buffer instead of
+        * scanning past it:
+        */
+       for (idx = 0; idx < num_cfs; idx++) {
+               instr_cf_t *cf = &cfs[idx];
+               if (cf_exec(cf)) {
+                       max_idx = 2 * cf->exec.address;
+                       break;
+               }
+       }
+
+       if (max_idx < 0)
+               return -1;
+
+       /* a bogus address must not make us walk CFs beyond the buffer: */
+       if (max_idx > num_cfs)
+               max_idx = num_cfs;
+
+       for (idx = 0; idx < max_idx; idx++) {
+               instr_cf_t *cf = &cfs[idx];
+
+               print_cf(cf, level);
+
+               if (cf_exec(cf)) {
+                       /* two bits per instruction: bit 0 selects FETCH over
+                        * ALU, bit 1 is passed on as the sync flag:
+                        */
+                       uint32_t sequence = cf->exec.serialize;
+                       uint32_t i;
+                       for (i = 0; i < cf->exec.count; i++) {
+                               uint32_t alu_off = (cf->exec.address + i);
+
+                               /* each ALU/FETCH instruction is 3 dwords; skip the
+                                * rest of this exec if it lies beyond the buffer:
+                                */
+                               if ((uint64_t)alu_off * 3 + 3 > (uint64_t)sizedwords)
+                                       break;
+
+                               if (sequence & 0x1) {
+                                       disasm_fetch(dwords + alu_off * 3, alu_off, level, sequence & 0x2);
+                               } else {
+                                       disasm_alu(dwords + alu_off * 3, alu_off, level, sequence & 0x2, type);
+                               }
+                               sequence >>= 2;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/* Set the global debug flags (e.g. PRINT_RAW) that the disassemblers test. */
+void disasm_set_debug(enum debug_t d)
+{
+       debug = d;
+}
diff --git a/src/freedreno/decode/disasm-a3xx.c b/src/freedreno/decode/disasm-a3xx.c
new file mode 100644 (file)
index 0000000..9645dc5
--- /dev/null
@@ -0,0 +1,1641 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "disasm.h"
+#include "instr-a3xx.h"
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+extern enum debug_t debug;
+
+/* Indentation prefixes, indexed by nesting level (see ctx->level): level N
+ * is N tabs for levels 0..9, the remaining entries are "x" placeholders.
+ */
+static const char *levels[] = {
+               "",
+               "\t",
+               "\t\t",
+               "\t\t\t",
+               "\t\t\t\t",
+               "\t\t\t\t\t",
+               "\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t\t",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+};
+
+/* Register component letters, indexed by the 0..3 component number: */
+static const char *component = "xyzw";
+
+/* Printable names of the data types, indexed by the TYPE_* value: */
+static const char *type[] = {
+               [TYPE_F16] = "f16",
+               [TYPE_F32] = "f32",
+               [TYPE_U16] = "u16",
+               [TYPE_U32] = "u32",
+               [TYPE_S16] = "s16",
+               [TYPE_S32] = "s32",
+               [TYPE_U8]  = "u8",
+               [TYPE_S8]  = "s8",
+};
+
+
+/* Number of register slots tracked per mask; also used by print_regs() as
+ * the "no register seen yet" sentinel.
+ */
+#define MAX_REG 4096
+
+/* One bit per register slot, kept separately for full and half registers.
+ * Slot num lives in bit (num % 8) of byte (num / 8), see regmask_set().
+ */
+typedef struct {
+       uint8_t full[MAX_REG/8];
+       uint8_t half[MAX_REG/8];
+} regmask_t;
+
+/* State shared by the print helpers while disassembling one shader. */
+struct disasm_ctx {
+       /* stream that all disassembly text is written to: */
+       FILE *out;
+       /* indentation level, used as index into levels[]: */
+       int level;
+       /* passed to instr_opc() when decoding the opcode: */
+       unsigned gpu_id;
+
+       /* counters reported by print_reg_stats(); constlen is updated as
+        * const registers are printed:
+        */
+       struct shader_stats *stats;
+
+       /* we have to process the dst register after src to avoid tripping up
+        * the read-before-write detection
+        */
+       unsigned last_dst;
+       bool last_dst_full;
+       bool last_dst_valid;
+
+       /* current instruction repeat flag: */
+       unsigned repeat;
+       /* current instruction repeat indx/offset (for --expand): */
+       unsigned repeatidx;
+
+       /* tracking for register usage */
+       struct {
+               regmask_t used;
+               regmask_t used_merged;
+               regmask_t rbw;      /* read before write */
+               regmask_t war;      /* write after read */
+               regmask_t cnst;     /* used consts */
+       } regs;
+};
+
+/* Names printed by print_reg() for immediate operands of full precision
+ * float sources, indexed by the immediate value.  Immediates outside of
+ * the table are printed as plain integers.
+ */
+static const char *float_imms[] = {
+       "0.0",
+       "0.5",
+       "1.0",
+       "2.0",
+       "e",
+       "pi",
+       "1/pi",
+       "1/log2(e)",
+       "log2(e)",
+       "1/log2(10)",
+       "log2(10)",
+       "4.0",
+};
+
+/* Print one register operand to ctx->out.
+ *
+ * Emitted in order: an optional (neg)/(abs)/(absneg) modifier, an optional
+ * (r) flag, and then exactly one of: an immediate, an a0.x relative
+ * reference, an address register, a predicate register, or a plain
+ * [h]{r,c}N.c register.
+ */
+static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full,
+               bool is_float, bool r,
+               bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+       FILE *out = ctx->out;
+       const char file = c ? 'c' : 'r';
+       const char *half = full ? "" : "h";
+
+       // XXX I prefer - and || for neg/abs, but preserving format used
+       // by libllvm-a3xx for easy diffing..
+
+       if (neg)
+               fputs(abs ? "(absneg)" : "(neg)", out);
+       else if (abs)
+               fputs("(abs)", out);
+
+       if (r)
+               fputs("(r)", out);
+
+       if (im) {
+               /* full float immediates inside the table print by name: */
+               if (is_float && full && reg.iim_val < ARRAY_SIZE(float_imms))
+                       fprintf(out, "(%s)", float_imms[reg.iim_val]);
+               else
+                       fprintf(out, "%d", reg.iim_val);
+               return;
+       }
+
+       if (addr_rel) {
+               /* I would just use %+d but trying to make it diff'able with
+                * libllvm-a3xx...
+                */
+               if (reg.iim_val < 0)
+                       fprintf(out, "%s%c<a0.x - %d>", half, file, -reg.iim_val);
+               else if (reg.iim_val > 0)
+                       fprintf(out, "%s%c<a0.x + %d>", half, file, reg.iim_val);
+               else
+                       fprintf(out, "%s%c<a0.x>", half, file);
+               return;
+       }
+
+       if (!c && (reg.num == REG_A0)) {
+               /* This matches libllvm output, the second (scalar) address
+                * register seems to be called a1.x instead of a0.y.
+                */
+               fprintf(out, "a%d.x", reg.comp);
+               return;
+       }
+
+       if (!c && (reg.num == REG_P0)) {
+               fprintf(out, "p0.%c", component[reg.comp]);
+               return;
+       }
+
+       fprintf(out, "%s%c%d.%c", half, file, reg.num, component[reg.comp]);
+}
+
+/* Tracking for registers used, read-before-write (input), and
+ * write-after-read (output.. but not 100%)..
+ */
+
+/* Replace the bit for register slot num, in the full or the half part of
+ * the mask, by val (callers in this file pass 0 or 1).
+ */
+static void regmask_set(regmask_t *regmask, unsigned num, bool full, unsigned val)
+{
+       uint8_t *bits = full ? regmask->full : regmask->half;
+       const unsigned byte = num / 8;
+       const unsigned shift = num % 8;
+
+       ir3_assert(num < MAX_REG);
+
+       /* clear the old bit, then or in the new value: */
+       bits[byte] = (bits[byte] & ~(1 << shift)) | (val << shift);
+}
+
+/* Return the bit (0 or 1) for register slot num from the full or the half
+ * part of the mask.
+ */
+static unsigned regmask_get(regmask_t *regmask, unsigned num, bool full)
+{
+       const uint8_t *bits = full ? regmask->full : regmask->half;
+       const unsigned byte = num / 8;
+       const unsigned shift = num % 8;
+
+       ir3_assert(num < MAX_REG);
+
+       return (bits[byte] >> shift) & 0x1;
+}
+
+/* Flatten a register into a scalar slot index, four components per
+ * register number (inverse of idxreg()).
+ */
+static unsigned regidx(reg_t reg)
+{
+       unsigned base = 4 * reg.num;
+
+       return base + reg.comp;
+}
+
+/* Turn a scalar slot index back into a register: the low two bits are the
+ * component, the remaining bits the register number (inverse of regidx()).
+ */
+static reg_t idxreg(unsigned idx)
+{
+       reg_t reg = {
+               .comp = idx % 4,
+               .num  = idx / 4,
+       };
+
+       return reg;
+}
+
+/* Print one run of consecutive register slots as " N" or " N-M".  A run
+ * that still starts at the MAX_REG sentinel means that no slot has been
+ * seen yet, and prints nothing.
+ *
+ * This is a file scope helper rather than a function nested inside of
+ * print_regs(), since nested functions are a GNU C extension that other
+ * compilers reject.
+ */
+static void print_regs_sequence(struct disasm_ctx *ctx, int first, int last)
+{
+       if (first == MAX_REG)
+               return;
+
+       if (first == last) {
+               fprintf(ctx->out, " %d", first);
+       } else {
+               fprintf(ctx->out, " %d-%d", first, last);
+       }
+}
+
+/* Print the indices of all slots set in the full or half part of regmask,
+ * with consecutive indices collapsed into ranges, followed by a
+ * " (cnt=N, max=M)" summary.
+ *
+ * Returns the highest set index below 48*4, or 0 if there is none.
+ */
+static int print_regs(struct disasm_ctx *ctx, regmask_t *regmask, bool full)
+{
+       int num, max = 0, cnt = 0;
+       int first, last;
+
+       first = last = MAX_REG;
+
+       for (num = 0; num < MAX_REG; num++) {
+               if (regmask_get(regmask, num, full)) {
+                       /* a gap ends the current run and starts a new one: */
+                       if (num != (last + 1)) {
+                               print_regs_sequence(ctx, first, last);
+                               first = num;
+                       }
+                       last = num;
+                       if (num < (48*4))
+                               max = num;
+                       cnt++;
+               }
+       }
+
+       /* flush the final run: */
+       print_regs_sequence(ctx, first, last);
+
+       fprintf(ctx->out, " (cnt=%d, max=%d)", cnt, max);
+
+       return max;
+}
+
+/* Print the register usage summary gathered while disassembling: one line
+ * per tracked mask, followed by two shaderdb style lines built from
+ * ctx->stats.
+ */
+static void print_reg_stats(struct disasm_ctx *ctx)
+{
+       FILE *out = ctx->out;
+       const char *indent = levels[ctx->level];
+       int halfreg, fullreg;
+
+       fprintf(out, "%sRegister Stats:\n", indent);
+
+       /* the maxima of the "used" masks feed the shaderdb line below: */
+       fprintf(out, "%s- used (half):", indent);
+       halfreg = print_regs(ctx, &ctx->regs.used, false);
+       fprintf(out, "\n");
+
+       fprintf(out, "%s- used (full):", indent);
+       fullreg = print_regs(ctx, &ctx->regs.used, true);
+       fprintf(out, "\n");
+
+       fprintf(out, "%s- used (merged):", indent);
+       print_regs(ctx, &ctx->regs.used_merged, false);
+       fprintf(out, "\n");
+
+       fprintf(out, "%s- input (half):", indent);
+       print_regs(ctx, &ctx->regs.rbw, false);
+       fprintf(out, "\n");
+
+       fprintf(out, "%s- input (full):", indent);
+       print_regs(ctx, &ctx->regs.rbw, true);
+       fprintf(out, "\n");
+
+       fprintf(out, "%s- const (half):", indent);
+       print_regs(ctx, &ctx->regs.cnst, false);
+       fprintf(out, "\n");
+
+       fprintf(out, "%s- const (full):", indent);
+       print_regs(ctx, &ctx->regs.cnst, true);
+       fprintf(out, "\n");
+
+       fprintf(out, "%s- output (half):", indent);
+       print_regs(ctx, &ctx->regs.war, false);
+       fprintf(out, "  (estimated)\n");
+
+       fprintf(out, "%s- output (full):", indent);
+       print_regs(ctx, &ctx->regs.war, true);
+       fprintf(out, "  (estimated)\n");
+
+       /* convert to vec4, which is the granularity that registers are
+        * assigned to shader:
+        */
+       halfreg = (halfreg + 3) / 4;
+       fullreg = (fullreg + 3) / 4;
+
+       // Note this count of instructions includes rptN, which matches
+       // up to how mesa prints this:
+       fprintf(out, "%s- shaderdb: %d instructions, %d nops, %d non-nops, "
+                       "(%d instlen), %d half, %d full\n",
+                       indent, ctx->stats->instructions, ctx->stats->nops,
+                       ctx->stats->instructions - ctx->stats->nops, ctx->stats->instlen,
+                       halfreg, fullreg);
+       fprintf(out, "%s- shaderdb: %d (ss), %d (sy)\n", indent,
+                       ctx->stats->ss, ctx->stats->sy);
+}
+
+/* Account for the destination remembered by print_reg_dst(): mark every
+ * slot written by the (possibly repeated) instruction in the war, used and
+ * used_merged masks, then forget the destination.
+ */
+static void process_reg_dst(struct disasm_ctx *ctx)
+{
+       unsigned off;
+
+       if (ctx->last_dst_valid) {
+               for (off = 0; off <= ctx->repeat; off++) {
+                       const unsigned dst = ctx->last_dst + off;
+
+                       regmask_set(&ctx->regs.war, dst, ctx->last_dst_full, 1);
+                       regmask_set(&ctx->regs.used, dst, ctx->last_dst_full, 1);
+
+                       if (!ctx->last_dst_full) {
+                               regmask_set(&ctx->regs.used_merged, dst, false, 1);
+                               continue;
+                       }
+
+                       /* a full register takes two slots of the merged mask: */
+                       regmask_set(&ctx->regs.used_merged, (dst*2)+0, false, 1);
+                       regmask_set(&ctx->regs.used_merged, (dst*2)+1, false, 1);
+               }
+
+               ctx->last_dst_valid = false;
+       }
+}
+
+/* Print a destination register, offset by the current repeat index, and
+ * remember it in ctx so that process_reg_dst() can account for it once the
+ * sources have been handled.
+ */
+static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
+{
+       const unsigned idx = regidx(reg);
+       /* presumably the special registers a0.c and p0.c don't count.. */
+       const bool special = (reg.num == 61) || (reg.num == 62);
+
+       if (!addr_rel && !special) {
+               ctx->last_dst = idx;
+               ctx->last_dst_full = full;
+               ctx->last_dst_valid = true;
+       }
+
+       /* a destination carries no modifiers, only the addr_rel flag: */
+       print_reg(ctx, idxreg(idx + ctx->repeatidx), full,
+                       false, false, false, false, false, false, addr_rel);
+}
+
+/* Record the use of a source register in the tracking masks, then print
+ * it.
+ *
+ * Const registers are recorded in regs.cnst and may grow stats->constlen.
+ * Other registers (unless relative, immediate or special) are recorded as
+ * used, flagged as read-before-write if not used earlier, and dropped from
+ * the write-after-read mask.  With (r) every register of the repeat is
+ * recorded, otherwise only the first one.
+ */
+static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool f, bool r,
+               bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+       const unsigned base = regidx(reg);
+       /* presumably the special registers a0.c and p0.c don't count.. */
+       const bool special = (reg.num == 61) || (reg.num == 62);
+       unsigned off;
+
+       if (c) {
+               for (off = 0; off <= ctx->repeat; off++) {
+                       regmask_set(&ctx->regs.cnst, base + off, full, 1);
+
+                       if (!r)
+                               break;
+               }
+
+               /* const length in vec4 units, rounded up: */
+               unsigned max = (base + ctx->repeat + 1 + 3) / 4;
+               if (max > ctx->stats->constlen)
+                       ctx->stats->constlen = max;
+       } else if (!addr_rel && !im && !special) {
+               for (off = 0; off <= ctx->repeat; off++) {
+                       const unsigned src = base + off;
+
+                       /* read without an earlier use, so an input: */
+                       if (!regmask_get(&ctx->regs.used, src, full))
+                               regmask_set(&ctx->regs.rbw, src, full, 1);
+
+                       regmask_set(&ctx->regs.war, src, full, 0);
+                       regmask_set(&ctx->regs.used, src, full, 1);
+
+                       if (full) {
+                               regmask_set(&ctx->regs.used_merged, (src*2)+0, false, 1);
+                               regmask_set(&ctx->regs.used_merged, (src*2)+1, false, 1);
+                       } else {
+                               regmask_set(&ctx->regs.used_merged, src, false, 1);
+                       }
+
+                       if (!r)
+                               break;
+               }
+       }
+
+       /* (r) sources advance along with the repeat index: */
+       if (r)
+               reg = idxreg(base + ctx->repeatidx);
+
+       print_reg(ctx, reg, full, f, r, c, im, neg, abs, addr_rel);
+}
+
+/* TODO switch to using reginfo struct everywhere, since more readable
+ * than passing a bunch of bools to print_reg_src
+ */
+
+/* Description of one source operand, as consumed by print_src(). */
+struct reginfo {
+       reg_t reg;
+       bool full;      /* full precision, half registers get an "h" prefix */
+       bool r;         /* print the (r) flag */
+       bool c;         /* const register ("c") rather than "r" */
+       bool f; /* src reg is interpreted as float, used for printing immediates */
+       bool im;        /* operand is an immediate value */
+       bool neg;       /* print the (neg) modifier */
+       bool abs;       /* print the (abs) modifier */
+       bool addr_rel;  /* operand is relative to a0.x */
+};
+
+/* Print (and track) the source operand described by info.
+ *
+ * The register is handed to print_reg_src() unmodified: that function
+ * already applies the ctx->repeatidx offset for (r) sources.  Applying it
+ * here as well would advance the printed register by twice the repeat
+ * index when repeated instructions are expanded, and would shift the range
+ * of registers recorded as used.
+ */
+static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
+{
+       print_reg_src(ctx, info->reg, info->full, info->f, info->r, info->c, info->im,
+                       info->neg, info->abs, info->addr_rel);
+}
+
+//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
+//{
+//     print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
+//}
+
+/* Print the operands of a category 0 instruction: the predicate of
+ * kill/predt/predf, the type suffix, predicates and immediate of a branch,
+ * or the immediate of jump/call/bkt/getone/shps.  Nothing is printed for
+ * other opcodes.
+ */
+static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
+{
+       /* per branch type: mnemonic suffix, number of predicate sources,
+        * and whether the .idx field is printed
+        */
+       static const struct {
+               const char *suffix;
+               int nsrc;
+               bool idx;
+       } brinfo[7] = {
+               [BRANCH_PLAIN] = { "r",   1, false },
+               [BRANCH_OR]    = { "rao", 2, false },
+               [BRANCH_AND]   = { "raa", 2, false },
+               [BRANCH_CONST] = { "rac", 0, true  },
+               [BRANCH_ANY]   = { "any", 1, false },
+               [BRANCH_ALL]   = { "all", 1, false },
+               [BRANCH_X]     = { "rax", 0, false },
+       };
+       instr_cat0_t *cat0 = &instr->cat0;
+
+       switch (instr_opc(instr, ctx->gpu_id)) {
+       case OPC_KILL:
+       case OPC_PREDT:
+       case OPC_PREDF:
+               fprintf(ctx->out, " %sp0.%c", cat0->inv0 ? "!" : "",
+                               component[cat0->comp0]);
+               break;
+       case OPC_B:
+               /* brtype comes straight from the instruction encoding, so
+                * do not use it as index when brinfo[] has no (named)
+                * entry for it:
+                */
+               if ((cat0->brtype >= ARRAY_SIZE(brinfo)) ||
+                               !brinfo[cat0->brtype].suffix) {
+                       fprintf(ctx->out, "?%u? #%d", (unsigned)cat0->brtype,
+                                       cat0->a3xx.immed);
+                       break;
+               }
+               fprintf(ctx->out, "%s", brinfo[cat0->brtype].suffix);
+               if (brinfo[cat0->brtype].idx) {
+                       fprintf(ctx->out, ".%u", cat0->idx);
+               }
+               if (brinfo[cat0->brtype].nsrc >= 1) {
+                       fprintf(ctx->out, " %sp0.%c,", cat0->inv0 ? "!" : "",
+                                       component[cat0->comp0]);
+               }
+               if (brinfo[cat0->brtype].nsrc >= 2) {
+                       fprintf(ctx->out, " %sp0.%c,", cat0->inv1 ? "!" : "",
+                                       component[cat0->comp1]);
+               }
+               fprintf(ctx->out, " #%d", cat0->a3xx.immed);
+               break;
+       case OPC_JUMP:
+       case OPC_CALL:
+       case OPC_BKT:
+       case OPC_GETONE:
+       case OPC_SHPS:
+               fprintf(ctx->out, " #%d", cat0->a3xx.immed);
+               break;
+       default:
+               break;
+       }
+
+       /* in verbose mode, show the dummy fields when they are non-zero: */
+       if ((debug & PRINT_VERBOSE) && (cat0->dummy3|cat0->dummy4))
+               fprintf(ctx->out, "\t{0: %x,%x}", cat0->dummy3, cat0->dummy4);
+}
+
+/* Print a category 1 instruction: the mov/cov/mova mnemonic with its type
+ * suffix, the destination, and the source (immediate, a0.x relative, or a
+ * plain register).
+ */
+static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat1_t *cat1 = &instr->cat1;
+       FILE *out = ctx->out;
+
+       if (cat1->ul)
+               fprintf(out, "(ul)");
+
+       if (cat1->src_type != cat1->dst_type) {
+               /* differing types are printed as a conversion: */
+               fprintf(out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+       } else if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
+               /* special case: s16 move to the address register */
+               fprintf(out, "mova");
+       } else {
+               fprintf(out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+       }
+
+       fprintf(out, " ");
+
+       if (cat1->even)
+               fprintf(out, "(even)");
+
+       if (cat1->pos_inf)
+               fprintf(out, "(pos_infinity)");
+
+       print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
+                       cat1->dst_rel);
+
+       fprintf(out, ", ");
+
+       /* ugg, have to special case this.. vs print_reg().. */
+       if (cat1->src_im) {
+               /* the immediate is shown according to the source type: */
+               if (type_float(cat1->src_type))
+                       fprintf(out, "(%f)", cat1->fim_val);
+               else if (type_uint(cat1->src_type))
+                       fprintf(out, "0x%08x", cat1->uim_val);
+               else
+                       fprintf(out, "%d", cat1->iim_val);
+       } else if (cat1->src_rel && !cat1->src_c) {
+               /* I would just use %+d but trying to make it diff'able with
+                * libllvm-a3xx...
+                */
+               const char file = cat1->src_rel_c ? 'c' : 'r';
+               const char *half = (type_size(cat1->src_type) == 32) ? "" : "h";
+
+               if (cat1->off < 0)
+                       fprintf(out, "%s%c<a0.x - %d>", half, file, -cat1->off);
+               else if (cat1->off > 0)
+                       fprintf(out, "%s%c<a0.x + %d>", half, file, cat1->off);
+               else
+                       fprintf(out, "%s%c<a0.x>", half, file);
+       } else {
+               struct reginfo src = {
+                       .reg = (reg_t)cat1->src,
+                       .full = type_size(cat1->src_type) == 32,
+                       .r = cat1->src_r,
+                       .c = cat1->src_c,
+                       .im = cat1->src_im,
+               };
+
+               print_src(ctx, &src);
+       }
+
+       /* in verbose mode, show the must_be_0 field when it is not zero: */
+       if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
+               fprintf(out, "\t{1: %x}", cat1->must_be_0);
+}
+
+/* Print a category 2 instruction: the condition suffix of the compare
+ * opcodes, the destination, and one or two sources depending on the
+ * opcode.
+ */
+static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat2_t *cat2 = &instr->cat2;
+       int opc = _OPC(2, cat2->opc);
+       static const char *cond[] = {
+                       "lt",
+                       "le",
+                       "gt",
+                       "ge",
+                       "eq",
+                       "ne",
+                       "?6?",
+       };
+
+       switch (opc) {
+       case OPC_CMPS_F:
+       case OPC_CMPS_U:
+       case OPC_CMPS_S:
+       case OPC_CMPV_F:
+       case OPC_CMPV_U:
+       case OPC_CMPV_S:
+               /* cond comes straight from the instruction encoding, so do
+                * not index past the end of cond[]; unknown values are
+                * printed in the same "?N?" form as the table placeholder:
+                */
+               if (cat2->cond < ARRAY_SIZE(cond))
+                       fprintf(ctx->out, ".%s", cond[cat2->cond]);
+               else
+                       fprintf(ctx->out, ".?%u?", (unsigned)cat2->cond);
+               break;
+       }
+
+       fprintf(ctx->out, " ");
+       if (cat2->ei)
+               fprintf(ctx->out, "(ei)");
+       print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
+       fprintf(ctx->out, ", ");
+
+       /* the (r) flag of a source is only honoured on repeated instrs: */
+       struct reginfo src1 = {
+               .full = cat2->full,
+               .r = cat2->repeat ? cat2->src1_r : 0,
+               .f = is_cat2_float(opc),
+               .im = cat2->src1_im,
+               .abs = cat2->src1_abs,
+               .neg = cat2->src1_neg,
+       };
+
+       /* pick the register from the const, relative or plain encoding: */
+       if (cat2->c1.src1_c) {
+               src1.reg = (reg_t)(cat2->c1.src1);
+               src1.c = true;
+       } else if (cat2->rel1.src1_rel) {
+               src1.reg = (reg_t)(cat2->rel1.src1);
+               src1.c = cat2->rel1.src1_c;
+               src1.addr_rel = true;
+       } else {
+               src1.reg = (reg_t)(cat2->src1);
+       }
+       print_src(ctx, &src1);
+
+       struct reginfo src2 = {
+               .r = cat2->repeat ? cat2->src2_r : 0,
+               .full = cat2->full,
+               .f = is_cat2_float(opc),
+               .abs = cat2->src2_abs,
+               .neg = cat2->src2_neg,
+               .im = cat2->src2_im,
+       };
+       switch (opc) {
+       case OPC_ABSNEG_F:
+       case OPC_ABSNEG_S:
+       case OPC_CLZ_B:
+       case OPC_CLZ_S:
+       case OPC_SIGN_F:
+       case OPC_FLOOR_F:
+       case OPC_CEIL_F:
+       case OPC_RNDNE_F:
+       case OPC_RNDAZ_F:
+       case OPC_TRUNC_F:
+       case OPC_NOT_B:
+       case OPC_BFREV_B:
+       case OPC_SETRM:
+       case OPC_CBITS_B:
+               /* these only have one src reg */
+               break;
+       default:
+               fprintf(ctx->out, ", ");
+               if (cat2->c2.src2_c) {
+                       src2.reg = (reg_t)(cat2->c2.src2);
+                       src2.c = true;
+               } else if (cat2->rel2.src2_rel) {
+                       src2.reg = (reg_t)(cat2->rel2.src2);
+                       src2.c = cat2->rel2.src2_c;
+                       src2.addr_rel = true;
+               } else {
+                       src2.reg = (reg_t)(cat2->src2);
+               }
+               print_src(ctx, &src2);
+               break;
+       }
+}
+
+/* Print the operands of a category 3 instruction: the destination
+ * followed by three sources.
+ */
+static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat3_t *cat3 = &instr->cat3;
+       FILE *out = ctx->out;
+       bool full = instr_cat3_full(cat3);
+
+       /* Describe all three sources up front, starting from the plain
+        * register encoding.  The (r) flag of src1 and src2 is only
+        * honoured on repeated instructions:
+        */
+       struct reginfo src1 = {
+               .reg = (reg_t)(cat3->src1),
+               .full = full,
+               .r = cat3->repeat ? cat3->src1_r : 0,
+               .neg = cat3->src1_neg,
+       };
+       struct reginfo src2 = {
+               .reg = (reg_t)cat3->src2,
+               .full = full,
+               .r = cat3->repeat ? cat3->src2_r : 0,
+               .c = cat3->src2_c,
+               .neg = cat3->src2_neg,
+       };
+       struct reginfo src3 = {
+               .reg = (reg_t)(cat3->src3),
+               .full = full,
+               .r = cat3->src3_r,
+               .neg = cat3->src3_neg,
+       };
+
+       /* src1 and src3 may instead use the const or relative encoding: */
+       if (cat3->c1.src1_c) {
+               src1.reg = (reg_t)(cat3->c1.src1);
+               src1.c = true;
+       } else if (cat3->rel1.src1_rel) {
+               src1.reg = (reg_t)(cat3->rel1.src1);
+               src1.c = cat3->rel1.src1_c;
+               src1.addr_rel = true;
+       }
+
+       if (cat3->c2.src3_c) {
+               src3.reg = (reg_t)(cat3->c2.src3);
+               src3.c = true;
+       } else if (cat3->rel2.src3_rel) {
+               src3.reg = (reg_t)(cat3->rel2.src3);
+               src3.c = cat3->rel2.src3_c;
+               src3.addr_rel = true;
+       }
+
+       fprintf(out, " ");
+       print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false);
+       fprintf(out, ", ");
+       print_src(ctx, &src1);
+       fprintf(out, ", ");
+       print_src(ctx, &src2);
+       fprintf(out, ", ");
+       print_src(ctx, &src3);
+}
+
+/* Print the operands of a category 4 instruction: the destination and a
+ * single source.
+ */
+static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat4_t *cat4 = &instr->cat4;
+       FILE *out = ctx->out;
+
+       /* start from the plain register encoding: */
+       struct reginfo src = {
+               .reg = (reg_t)(cat4->src),
+               .full = cat4->full,
+               .r = cat4->src_r,
+               .im = cat4->src_im,
+               .neg = cat4->src_neg,
+               .abs = cat4->src_abs,
+       };
+
+       /* the const and relative encodings take precedence, in that order: */
+       if (cat4->c.src_c) {
+               src.reg = (reg_t)(cat4->c.src);
+               src.c = true;
+       } else if (cat4->rel.src_rel) {
+               src.reg = (reg_t)(cat4->rel.src);
+               src.c = cat4->rel.src_c;
+               src.addr_rel = true;
+       }
+
+       fprintf(out, " ");
+       print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
+       fprintf(out, ", ");
+       print_src(ctx, &src);
+
+       /* in verbose mode, show the dummy fields when they are non-zero: */
+       if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
+               fprintf(out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
+}
+
+/* Print a category 5 instruction: the flag suffixes, type and write mask,
+ * the destination, and whichever of src1/src2/sampler/texture operands the
+ * opcode and descriptor mode call for.
+ */
+static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
+{
+       /* which operands each opcode takes, indexed by opc_op(): */
+       static const struct {
+               bool src1, src2, samp, tex;
+       } info[0x1f] = {
+                       [opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
+                       [opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
+                       [opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
+                       [opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
+                       [opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
+                       [opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
+                       [opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
+                       [opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
+                       [opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
+                       [opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
+                       [opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
+                       [opc_op(OPC_DSX)]      = { true,  false, false, false, },
+                       [opc_op(OPC_DSY)]      = { true,  false, false, false, },
+                       [opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
+                       [opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
+                       [opc_op(OPC_RGETPOS)]  = { true,  false, false, false, },
+                       [opc_op(OPC_RGETINFO)] = { false, false, false, false, },
+       };
+
+       /* properties of each descriptor mode, indexed by desc_mode: */
+       static const struct {
+               bool indirect;
+               bool bindless;
+               bool use_a1;
+               bool uniform;
+       } desc_features[8] = {
+               [CAT5_NONUNIFORM] = { .indirect = true, },
+               [CAT5_UNIFORM] = { .indirect = true, .uniform = true, },
+               [CAT5_BINDLESS_IMM] = { .bindless = true, },
+               [CAT5_BINDLESS_UNIFORM] = {
+                       .bindless = true,
+                       .indirect = true,
+                       .uniform = true,
+               },
+               [CAT5_BINDLESS_NONUNIFORM] = {
+                       .bindless = true,
+                       .indirect = true,
+               },
+               [CAT5_BINDLESS_A1_IMM] = {
+                       .bindless = true,
+                       .use_a1 = true,
+               },
+               [CAT5_BINDLESS_A1_UNIFORM] = {
+                       .bindless = true,
+                       .indirect = true,
+                       .uniform = true,
+                       .use_a1 = true,
+               },
+               [CAT5_BINDLESS_A1_NONUNIFORM] = {
+                       .bindless = true,
+                       .indirect = true,
+                       .use_a1 = true,
+               },
+       };
+
+       instr_cat5_t *cat5 = &instr->cat5;
+       int i;
+
+       /* cat5->opc comes straight from the instruction encoding; treat an
+        * opcode that info[] has no slot for as taking no operands, rather
+        * than indexing past the end of the table:
+        */
+       bool known_opc = cat5->opc < ARRAY_SIZE(info);
+       bool has_src1 = known_opc && info[cat5->opc].src1;
+       bool has_src2 = known_opc && info[cat5->opc].src2;
+       bool has_samp = known_opc && info[cat5->opc].samp;
+       bool has_tex  = known_opc && info[cat5->opc].tex;
+
+       /* the descriptor mode only applies to the s2en/bindless encoding: */
+       bool desc_indirect =
+               cat5->is_s2en_bindless &&
+               desc_features[cat5->s2en_bindless.desc_mode].indirect;
+       bool bindless =
+               cat5->is_s2en_bindless &&
+               desc_features[cat5->s2en_bindless.desc_mode].bindless;
+       bool use_a1 =
+               cat5->is_s2en_bindless &&
+               desc_features[cat5->s2en_bindless.desc_mode].use_a1;
+       bool uniform =
+               cat5->is_s2en_bindless &&
+               desc_features[cat5->s2en_bindless.desc_mode].uniform;
+
+       if (cat5->is_3d)   fprintf(ctx->out, ".3d");
+       if (cat5->is_a)    fprintf(ctx->out, ".a");
+       if (cat5->is_o)    fprintf(ctx->out, ".o");
+       if (cat5->is_p)    fprintf(ctx->out, ".p");
+       if (cat5->is_s)    fprintf(ctx->out, ".s");
+       if (desc_indirect) fprintf(ctx->out, ".s2en");
+       if (uniform)       fprintf(ctx->out, ".uniform");
+
+       if (bindless) {
+               unsigned base = (cat5->s2en_bindless.base_hi << 1) | cat5->base_lo;
+               fprintf(ctx->out, ".base%d", base);
+       }
+
+       fprintf(ctx->out, " ");
+
+       /* every opcode except dsxpp.1/dsypp.1 prints its type: */
+       switch (_OPC(5, cat5->opc)) {
+       case OPC_DSXPP_1:
+       case OPC_DSYPP_1:
+               break;
+       default:
+               fprintf(ctx->out, "(%s)", type[cat5->type]);
+               break;
+       }
+
+       /* write mask, one letter per enabled component: */
+       fprintf(ctx->out, "(");
+       for (i = 0; i < 4; i++)
+               if (cat5->wrmask & (1 << i))
+                       fprintf(ctx->out, "%c", "xyzw"[i]);
+       fprintf(ctx->out, ")");
+
+       print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+
+       if (has_src1) {
+               fprintf(ctx->out, ", ");
+               struct reginfo src = { .reg = (reg_t)(cat5->src1), .full = cat5->full };
+               print_src(ctx, &src);
+       }
+
+       if (cat5->is_o || has_src2) {
+               fprintf(ctx->out, ", ");
+               struct reginfo src = { .reg = (reg_t)(cat5->src2), .full = cat5->full };
+               print_src(ctx, &src);
+       }
+       if (cat5->is_s2en_bindless) {
+               if (!desc_indirect) {
+                       /* immediate sampler/texture packed into src3: */
+                       if (has_samp) {
+                               if (use_a1)
+                                       fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3);
+                               else
+                                       fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3 & 0xf);
+                       }
+
+                       if (has_tex && !use_a1) {
+                               fprintf(ctx->out, ", t#%d", cat5->s2en_bindless.src3 >> 4);
+                       }
+               }
+       } else {
+               if (has_samp)
+                       fprintf(ctx->out, ", s#%d", cat5->norm.samp);
+               if (has_tex)
+                       fprintf(ctx->out, ", t#%d", cat5->norm.tex);
+       }
+
+       if (desc_indirect) {
+               /* in the indirect modes src3 is printed as a register: */
+               fprintf(ctx->out, ", ");
+               struct reginfo src = { .reg = (reg_t)(cat5->s2en_bindless.src3), .full = bindless };
+               print_src(ctx, &src);
+       }
+
+       if (use_a1)
+               fprintf(ctx->out, ", a1.x");
+
+       /* in verbose mode, show the dummy field when it is non-zero: */
+       if (debug & PRINT_VERBOSE) {
+               if (cat5->is_s2en_bindless) {
+                       if (cat5->s2en_bindless.dummy1)
+                               fprintf(ctx->out, "\t{5: %x}", cat5->s2en_bindless.dummy1);
+               } else {
+                       if (cat5->norm.dummy1)
+                               fprintf(ctx->out, "\t{5: %x}", cat5->norm.dummy1);
+               }
+       }
+}
+
+/*
+ * Print the suffixes and operands of a category 6 instruction using the
+ * legacy cat6 layout (print_instr_cat6() selects this path when
+ * is_cat6_legacy() is true).  The mnemonic itself has already been written
+ * by the caller; this function appends ".suffix" parts, a space, and then
+ * the operand list to ctx->out.
+ *
+ * Several opcode families (stgb/stib, atomics, resinfo, ldgb, and ldg with
+ * both src immediates flagged) have their own operand layout and return
+ * early; everything else falls through to the generic dst/src printing at
+ * the bottom.
+ */
+static void print_instr_cat6_a3xx(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat6_t *cat6 = &instr->cat6;
+       char sd = 0, ss = 0;  /* dst/src address space */
+       bool nodst = false;
+       struct reginfo dst, src1, src2;
+       int src1off = 0, dstoff = 0;
+
+       memset(&dst, 0, sizeof(dst));
+       memset(&src1, 0, sizeof(src1));
+       memset(&src2, 0, sizeof(src2));
+
+       /* Step 1: decide which operands are full vs half precision registers. */
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_RESINFO:
+       case OPC_RESFMT:
+               dst.full  = type_size(cat6->type) == 32;
+               src1.full = type_size(cat6->type) == 32;
+               src2.full = type_size(cat6->type) == 32;
+               break;
+       case OPC_L2G:
+       case OPC_G2L:
+               dst.full = true;
+               src1.full = true;
+               src2.full = true;
+               break;
+       case OPC_STG:
+       case OPC_STL:
+       case OPC_STP:
+       case OPC_STLW:
+       case OPC_STIB:
+               dst.full  = type_size(cat6->type) == 32;
+               src1.full = type_size(cat6->type) == 32;
+               src2.full = type_size(cat6->type) == 32;
+               break;
+       default:
+               /* only the dst follows the instruction type; srcs are full */
+               dst.full  = type_size(cat6->type) == 32;
+               src1.full = true;
+               src2.full = true;
+               break;
+       }
+
+       /* Step 2: print the opcode-specific ".suffix" list. */
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_PREFETCH:
+               break;
+       case OPC_RESINFO:
+               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+               break;
+       case OPC_LDGB:
+               fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+               break;
+       case OPC_STGB:
+       case OPC_STIB:
+               fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
+               fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
+               break;
+       case OPC_ATOMIC_ADD:
+       case OPC_ATOMIC_SUB:
+       case OPC_ATOMIC_XCHG:
+       case OPC_ATOMIC_INC:
+       case OPC_ATOMIC_DEC:
+       case OPC_ATOMIC_CMPXCHG:
+       case OPC_ATOMIC_MIN:
+       case OPC_ATOMIC_MAX:
+       case OPC_ATOMIC_AND:
+       case OPC_ATOMIC_OR:
+       case OPC_ATOMIC_XOR:
+               /* atomics take their address space from the 'g' bit; it is
+                * printed as a suffix and reused below to pick the operand layout
+                */
+               ss = cat6->g ? 'g' : 'l';
+               fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+               fprintf(ctx->out, ".%c", ss);
+               break;
+       default:
+               dst.im = cat6->g && !cat6->dst_off;
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               break;
+       }
+       fprintf(ctx->out, " ");
+
+       /* Step 3: pick the address-space letters used to wrap dst ("sd") and
+        * src ("ss") as e.g. "g[...]".  Opcodes not listed leave both at 0
+        * (or, for atomics, keep the 'ss' chosen above).
+        */
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_STG:
+               sd = 'g';
+               break;
+       case OPC_STP:
+               sd = 'p';
+               break;
+       case OPC_STL:
+       case OPC_STLW:
+               sd = 'l';
+               break;
+
+       case OPC_LDG:
+       case OPC_LDC:
+               ss = 'g';
+               break;
+       case OPC_LDP:
+               ss = 'p';
+               break;
+       case OPC_LDL:
+       case OPC_LDLW:
+       case OPC_LDLV:
+               ss = 'l';
+               break;
+
+       case OPC_L2G:
+               ss = 'l';
+               sd = 'g';
+               break;
+
+       case OPC_G2L:
+               ss = 'g';
+               sd = 'l';
+               break;
+
+       case OPC_PREFETCH:
+               ss = 'g';
+               nodst = true;
+               break;
+       }
+
+       /* stgb/stib: "g[ssbo], src1, src2, src3", decoded from the stgb view */
+       if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
+               struct reginfo src3;
+
+               memset(&src3, 0, sizeof(src3));
+
+               src1.reg = (reg_t)(cat6->stgb.src1);
+               src2.reg = (reg_t)(cat6->stgb.src2);
+               src2.im  = cat6->stgb.src2_im;
+               src3.reg = (reg_t)(cat6->stgb.src3);
+               src3.im  = cat6->stgb.src3_im;
+               src3.full = true;
+
+               fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
+               print_src(ctx, &src1);
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src2);
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src3);
+
+               if (debug & PRINT_VERBOSE)
+                       fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
+
+               return;
+       }
+
+       /* atomics: decoded from the ldgb view; layout depends on 'ss' */
+       if (is_atomic(_OPC(6, cat6->opc))) {
+
+               src1.reg = (reg_t)(cat6->ldgb.src1);
+               src1.im  = cat6->ldgb.src1_im;
+               src2.reg = (reg_t)(cat6->ldgb.src2);
+               src2.im  = cat6->ldgb.src2_im;
+               dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+               print_src(ctx, &dst);
+               fprintf(ctx->out, ", ");
+               if (ss == 'g') {
+                       struct reginfo src3;
+                       memset(&src3, 0, sizeof(src3));
+
+                       src3.reg = (reg_t)(cat6->ldgb.src3);
+                       src3.full = true;
+
+                       /* For images, the ".typed" variant is used and src2 is
+                        * the ivecN coordinates, ie ivec2 for 2d.
+                        *
+                        * For SSBOs, the ".untyped" variant is used and src2 is
+                        * a simple dword offset..  src3 appears to be
+                        * uvec2(offset * 4, 0).  Not sure the point of that.
+                        */
+
+                       fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+                       print_src(ctx, &src1);  /* value */
+                       fprintf(ctx->out, ", ");
+                       print_src(ctx, &src2);  /* offset/coords */
+                       fprintf(ctx->out, ", ");
+                       print_src(ctx, &src3);  /* 64b byte offset.. */
+
+                       if (debug & PRINT_VERBOSE) {
+                               fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
+                                               cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+                       }
+               } else { /* ss == 'l' */
+                       fprintf(ctx->out, "l[");
+                       print_src(ctx, &src1);  /* simple byte offset */
+                       fprintf(ctx->out, "], ");
+                       print_src(ctx, &src2);  /* value */
+
+                       if (debug & PRINT_VERBOSE) {
+                               fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
+                                               cat6->ldgb.src3, cat6->ldgb.pad0,
+                                               cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+                       }
+               }
+
+               return;
+       } else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
+               /* resinfo: "dst, g[ssbo]" */
+               dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+               print_src(ctx, &dst);
+               fprintf(ctx->out, ", ");
+               fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
+
+               return;
+       } else if (_OPC(6, cat6->opc) == OPC_LDGB) {
+               /* ldgb: "dst, g[ssbo], src1, src2" */
+
+               src1.reg = (reg_t)(cat6->ldgb.src1);
+               src1.im  = cat6->ldgb.src1_im;
+               src2.reg = (reg_t)(cat6->ldgb.src2);
+               src2.im  = cat6->ldgb.src2_im;
+               dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+               print_src(ctx, &dst);
+               fprintf(ctx->out, ", ");
+               fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+               print_src(ctx, &src1);
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src2);
+
+               if (debug & PRINT_VERBOSE)
+                       fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+
+               return;
+       } else if (_OPC(6, cat6->opc) == OPC_LDG && cat6->a.src1_im && cat6->a.src2_im) {
+               /* ldg with both immediate flags set: "dst, g[src1+off], src2",
+                * where the 'off' field is printed as a full register and src1
+                * is printed as a register (src1.im is left at 0 here)
+                */
+               struct reginfo src3;
+
+               memset(&src3, 0, sizeof(src3));
+               src1.reg = (reg_t)(cat6->a.src1);
+               src2.reg = (reg_t)(cat6->a.src2);
+               src2.im  = cat6->a.src2_im;
+               src3.reg = (reg_t)(cat6->a.off);
+               src3.full = true;
+               dst.reg  = (reg_t)(cat6->d.dst);
+
+               print_src(ctx, &dst);
+               fprintf(ctx->out, ", g[");
+               print_src(ctx, &src1);
+               fprintf(ctx->out, "+");
+               print_src(ctx, &src3);
+               fprintf(ctx->out, "], ");
+               print_src(ctx, &src2);
+
+               return;
+       }
+       /* Generic path: the dst_off/src_off bits select which view of the
+        * instruction (c vs d for dst, a vs b for srcs) holds the operands.
+        */
+       if (cat6->dst_off) {
+               dst.reg = (reg_t)(cat6->c.dst);
+               dstoff  = cat6->c.off;
+       } else {
+               dst.reg = (reg_t)(cat6->d.dst);
+       }
+
+       if (cat6->src_off) {
+               src1.reg = (reg_t)(cat6->a.src1);
+               src1.im  = cat6->a.src1_im;
+               src2.reg = (reg_t)(cat6->a.src2);
+               src2.im  = cat6->a.src2_im;
+               src1off  = cat6->a.off;
+       } else {
+               src1.reg = (reg_t)(cat6->b.src1);
+               src1.im  = cat6->b.src1_im;
+               src2.reg = (reg_t)(cat6->b.src2);
+               src2.im  = cat6->b.src2_im;
+       }
+
+       if (!nodst) {
+               if (sd)
+                       fprintf(ctx->out, "%c[", sd);
+               /* note: dst might actually be a src (ie. address to store to) */
+               print_src(ctx, &dst);
+               if (cat6->dst_off && cat6->g) {
+                       /* with the 'g' bit set the offset field is printed as a
+                        * full register rather than as a signed constant
+                        */
+                       struct reginfo dstoff_reg = {0};
+                       dstoff_reg.reg = (reg_t) cat6->c.off;
+                       dstoff_reg.full  = true;
+                       fprintf(ctx->out, "+");
+                       print_src(ctx, &dstoff_reg);
+               } else if (dstoff)
+                       fprintf(ctx->out, "%+d", dstoff);
+               if (sd)
+                       fprintf(ctx->out, "]");
+               fprintf(ctx->out, ", ");
+       }
+
+       if (ss)
+               fprintf(ctx->out, "%c[", ss);
+
+       /* can have a larger than normal immed, so hack: */
+       if (src1.im) {
+               fprintf(ctx->out, "%u", src1.reg.dummy13);
+       } else {
+               print_src(ctx, &src1);
+       }
+
+       /* NOTE(review): in the src_off && g case src2 is printed here with no
+        * separator after src1, and is printed again after the closing bracket
+        * by the default case of the switch below -- confirm this is intended.
+        */
+       if (cat6->src_off && cat6->g)
+               print_src(ctx, &src2);
+       else if (src1off)
+               fprintf(ctx->out, "%+d", src1off);
+       if (ss)
+               fprintf(ctx->out, "]");
+
+       /* NOTE(review): OPC_RESINFO already returned above, so only the
+        * OPC_RESFMT case can actually suppress the trailing src2 here.
+        */
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_RESINFO:
+       case OPC_RESFMT:
+               break;
+       default:
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src2);
+               break;
+       }
+}
+
+/*
+ * Print the suffixes and operands of a category 6 instruction using the
+ * newer cat6 layout (print_instr_cat6() selects this path when
+ * is_cat6_legacy() is false).  The mnemonic has already been written by the
+ * caller.
+ *
+ * Output shape:
+ *   .typed|.untyped .<N>d .<type> .<type_size+1> .<desc mode> [.base<N>]
+ *   followed by " src2, src1, ssbo"
+ * (for OPC_LDC the typed/dim/type part is replaced by ".offset<d>").
+ */
+static void print_instr_cat6_a6xx(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
+       struct reginfo src1, src2, ssbo;
+       bool uses_type = _OPC(6, cat6->opc) != OPC_LDC;
+
+       /* Per descriptor-mode properties, indexed by cat6->desc_mode.  Slots
+        * without a designated initializer are zero-filled, ie. their name is
+        * NULL and both flags are false.
+        */
+       static const struct {
+               bool indirect;
+               bool bindless;
+               const char *name;
+       } desc_features[8] = {
+               [CAT6_IMM] = {
+                       .name = "imm"
+               },
+               [CAT6_UNIFORM] = {
+                       .indirect = true,
+                       .name = "uniform"
+               },
+               [CAT6_NONUNIFORM] = {
+                       .indirect = true,
+                       .name = "nonuniform"
+               },
+               [CAT6_BINDLESS_IMM] = {
+                       .bindless = true,
+                       .name = "imm"
+               },
+               [CAT6_BINDLESS_UNIFORM] = {
+                       .bindless = true,
+                       .indirect = true,
+                       .name = "uniform"
+               },
+               [CAT6_BINDLESS_NONUNIFORM] = {
+                       .bindless = true,
+                       .indirect = true,
+                       .name = "nonuniform"
+               },
+       };
+
+       bool indirect_ssbo = desc_features[cat6->desc_mode].indirect;
+       bool bindless = desc_features[cat6->desc_mode].bindless;
+       bool type_full = cat6->type != TYPE_U16;
+       const char *desc_name = desc_features[cat6->desc_mode].name;
+
+       /* An unassigned descriptor mode (possible when decoding garbage or a
+        * not-yet-understood encoding) has no name; passing NULL to "%s" is
+        * undefined behaviour, so substitute a placeholder instead.
+        */
+       if (!desc_name)
+               desc_name = "unknown";
+
+       memset(&src1, 0, sizeof(src1));
+       memset(&src2, 0, sizeof(src2));
+       memset(&ssbo, 0, sizeof(ssbo));
+
+       if (uses_type) {
+               fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped");
+               fprintf(ctx->out, ".%dd", cat6->d + 1);
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+       } else {
+               fprintf(ctx->out, ".offset%d", cat6->d);
+       }
+       fprintf(ctx->out, ".%u", cat6->type_size + 1);
+
+       fprintf(ctx->out, ".%s", desc_name);
+       if (bindless)
+               fprintf(ctx->out, ".base%d", cat6->base);
+       fprintf(ctx->out, " ");
+
+       /* src2 is printed first; its precision follows the instruction type */
+       src2.reg = (reg_t)(cat6->src2);
+       src2.full = type_full;
+       print_src(ctx, &src2);
+       fprintf(ctx->out, ", ");
+
+       src1.reg = (reg_t)(cat6->src1);
+       src1.full = true; // XXX
+       print_src(ctx, &src1);
+       fprintf(ctx->out, ", ");
+
+       /* the ssbo operand is an immediate unless the mode is indirect */
+       ssbo.reg = (reg_t)(cat6->ssbo);
+       ssbo.im = !indirect_ssbo;
+       ssbo.full = true;
+       print_src(ctx, &ssbo);
+
+       if (debug & PRINT_VERBOSE) {
+               fprintf(ctx->out, " (pad1=%x, pad2=%x, pad3=%x, pad4=%x, pad5=%x)",
+                               cat6->pad1, cat6->pad2, cat6->pad3, cat6->pad4, cat6->pad5);
+       }
+}
+
+/*
+ * Category 6 entry point: pick the legacy or the newer cat6 printer based
+ * on is_cat6_legacy(), and in verbose mode append a tag naming the decoder
+ * that was used.
+ */
+static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
+{
+       const bool legacy = is_cat6_legacy(instr, ctx->gpu_id);
+
+       if (legacy)
+               print_instr_cat6_a3xx(ctx, instr);
+       else
+               print_instr_cat6_a6xx(ctx, instr);
+
+       if (!(debug & PRINT_VERBOSE))
+               return;
+
+       if (legacy)
+               fprintf(ctx->out, " LEGACY");
+       else
+               fprintf(ctx->out, " NEW");
+}
+
+/*
+ * Print the flag suffixes of a category 7 instruction: ".g" and ".l" for
+ * any cat7 opcode, plus ".r" and ".w" when the opcode is OPC_FENCE.
+ */
+static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat7_t *c7 = &instr->cat7;
+       const bool is_fence = (_OPC(7, c7->opc) == OPC_FENCE);
+
+       if (c7->g)
+               fprintf(ctx->out, ".g");
+       if (c7->l)
+               fprintf(ctx->out, ".l");
+
+       /* read/write qualifiers only apply to fence */
+       if (is_fence && c7->r)
+               fprintf(ctx->out, ".r");
+       if (is_fence && c7->w)
+               fprintf(ctx->out, ".w");
+}
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+/*
+ * Opcode description table, looked up through GETINFO() with the index
+ * (opc_cat << NOPC_BITS) | opc; it has room for 3 category bits plus
+ * NOPC_BITS opcode bits.  Each entry is placed at index [(opc)] of its
+ * OPC_* constant, so those constants are presumably already the combined
+ * category+opcode value -- confirm against the _OPC() definition.
+ *
+ * Slots with no OPC() entry are zero-initialized, so their name is NULL;
+ * print_single_instr() uses that to detect unknown opcodes.
+ */
+static const struct opc_info {
+       uint16_t cat;
+       uint16_t opc;
+       const char *name;
+       void (*print)(struct disasm_ctx *ctx, instr_t *instr);
+} opcs[1 << (3+NOPC_BITS)] = {
+/* OPC(cat, opc, name): name is stringified, print is print_instr_cat<cat> */
+#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+       /* category 0: */
+       OPC(0, OPC_NOP,          nop),
+       OPC(0, OPC_B,            b),
+       OPC(0, OPC_JUMP,         jump),
+       OPC(0, OPC_CALL,         call),
+       OPC(0, OPC_RET,          ret),
+       OPC(0, OPC_KILL,         kill),
+       OPC(0, OPC_END,          end),
+       OPC(0, OPC_EMIT,         emit),
+       OPC(0, OPC_CUT,          cut),
+       OPC(0, OPC_CHMASK,       chmask),
+       OPC(0, OPC_CHSH,         chsh),
+       OPC(0, OPC_FLOW_REV,     flow_rev),
+       OPC(0, OPC_PREDT,        predt),
+       OPC(0, OPC_PREDF,        predf),
+       OPC(0, OPC_PREDE,        prede),
+       OPC(0, OPC_BKT,          bkt),
+       OPC(0, OPC_STKS,         stks),
+       OPC(0, OPC_STKR,         stkr),
+       OPC(0, OPC_XSET,         xset),
+       OPC(0, OPC_XCLR,         xclr),
+       OPC(0, OPC_GETONE,       getone),
+       OPC(0, OPC_DBG,          dbg),
+       OPC(0, OPC_SHPS,         shps),
+       OPC(0, OPC_SHPE,         shpe),
+
+       /* category 1: */
+       /* empty name stringifies to "", so print_single_instr() emits no
+        * mnemonic itself (presumably print_instr_cat1 prints it -- confirm)
+        */
+       OPC(1, OPC_MOV, ),
+
+       /* category 2: */
+       OPC(2, OPC_ADD_F,        add.f),
+       OPC(2, OPC_MIN_F,        min.f),
+       OPC(2, OPC_MAX_F,        max.f),
+       OPC(2, OPC_MUL_F,        mul.f),
+       OPC(2, OPC_SIGN_F,       sign.f),
+       OPC(2, OPC_CMPS_F,       cmps.f),
+       OPC(2, OPC_ABSNEG_F,     absneg.f),
+       OPC(2, OPC_CMPV_F,       cmpv.f),
+       OPC(2, OPC_FLOOR_F,      floor.f),
+       OPC(2, OPC_CEIL_F,       ceil.f),
+       OPC(2, OPC_RNDNE_F,      rndne.f),
+       OPC(2, OPC_RNDAZ_F,      rndaz.f),
+       OPC(2, OPC_TRUNC_F,      trunc.f),
+       OPC(2, OPC_ADD_U,        add.u),
+       OPC(2, OPC_ADD_S,        add.s),
+       OPC(2, OPC_SUB_U,        sub.u),
+       OPC(2, OPC_SUB_S,        sub.s),
+       OPC(2, OPC_CMPS_U,       cmps.u),
+       OPC(2, OPC_CMPS_S,       cmps.s),
+       OPC(2, OPC_MIN_U,        min.u),
+       OPC(2, OPC_MIN_S,        min.s),
+       OPC(2, OPC_MAX_U,        max.u),
+       OPC(2, OPC_MAX_S,        max.s),
+       OPC(2, OPC_ABSNEG_S,     absneg.s),
+       OPC(2, OPC_AND_B,        and.b),
+       OPC(2, OPC_OR_B,         or.b),
+       OPC(2, OPC_NOT_B,        not.b),
+       OPC(2, OPC_XOR_B,        xor.b),
+       OPC(2, OPC_CMPV_U,       cmpv.u),
+       OPC(2, OPC_CMPV_S,       cmpv.s),
+       OPC(2, OPC_MUL_U24,      mul.u24),
+       OPC(2, OPC_MUL_S24,      mul.s24),
+       OPC(2, OPC_MULL_U,       mull.u),
+       OPC(2, OPC_BFREV_B,      bfrev.b),
+       OPC(2, OPC_CLZ_S,        clz.s),
+       OPC(2, OPC_CLZ_B,        clz.b),
+       OPC(2, OPC_SHL_B,        shl.b),
+       OPC(2, OPC_SHR_B,        shr.b),
+       OPC(2, OPC_ASHR_B,       ashr.b),
+       OPC(2, OPC_BARY_F,       bary.f),
+       OPC(2, OPC_MGEN_B,       mgen.b),
+       OPC(2, OPC_GETBIT_B,     getbit.b),
+       OPC(2, OPC_SETRM,        setrm),
+       OPC(2, OPC_CBITS_B,      cbits.b),
+       OPC(2, OPC_SHB,          shb),
+       OPC(2, OPC_MSAD,         msad),
+
+       /* category 3: */
+       OPC(3, OPC_MAD_U16,      mad.u16),
+       OPC(3, OPC_MADSH_U16,    madsh.u16),
+       OPC(3, OPC_MAD_S16,      mad.s16),
+       OPC(3, OPC_MADSH_M16,    madsh.m16),
+       OPC(3, OPC_MAD_U24,      mad.u24),
+       OPC(3, OPC_MAD_S24,      mad.s24),
+       OPC(3, OPC_MAD_F16,      mad.f16),
+       OPC(3, OPC_MAD_F32,      mad.f32),
+       OPC(3, OPC_SEL_B16,      sel.b16),
+       OPC(3, OPC_SEL_B32,      sel.b32),
+       OPC(3, OPC_SEL_S16,      sel.s16),
+       OPC(3, OPC_SEL_S32,      sel.s32),
+       OPC(3, OPC_SEL_F16,      sel.f16),
+       OPC(3, OPC_SEL_F32,      sel.f32),
+       OPC(3, OPC_SAD_S16,      sad.s16),
+       OPC(3, OPC_SAD_S32,      sad.s32),
+
+       /* category 4: */
+       OPC(4, OPC_RCP,          rcp),
+       OPC(4, OPC_RSQ,          rsq),
+       OPC(4, OPC_LOG2,         log2),
+       OPC(4, OPC_EXP2,         exp2),
+       OPC(4, OPC_SIN,          sin),
+       OPC(4, OPC_COS,          cos),
+       OPC(4, OPC_SQRT,         sqrt),
+       OPC(4, OPC_HRSQ,         hrsq),
+       OPC(4, OPC_HLOG2,        hlog2),
+       OPC(4, OPC_HEXP2,        hexp2),
+
+       /* category 5: */
+       OPC(5, OPC_ISAM,         isam),
+       OPC(5, OPC_ISAML,        isaml),
+       OPC(5, OPC_ISAMM,        isamm),
+       OPC(5, OPC_SAM,          sam),
+       OPC(5, OPC_SAMB,         samb),
+       OPC(5, OPC_SAML,         saml),
+       OPC(5, OPC_SAMGQ,        samgq),
+       OPC(5, OPC_GETLOD,       getlod),
+       OPC(5, OPC_CONV,         conv),
+       OPC(5, OPC_CONVM,        convm),
+       OPC(5, OPC_GETSIZE,      getsize),
+       OPC(5, OPC_GETBUF,       getbuf),
+       OPC(5, OPC_GETPOS,       getpos),
+       OPC(5, OPC_GETINFO,      getinfo),
+       OPC(5, OPC_DSX,          dsx),
+       OPC(5, OPC_DSY,          dsy),
+       OPC(5, OPC_GATHER4R,     gather4r),
+       OPC(5, OPC_GATHER4G,     gather4g),
+       OPC(5, OPC_GATHER4B,     gather4b),
+       OPC(5, OPC_GATHER4A,     gather4a),
+       OPC(5, OPC_SAMGP0,       samgp0),
+       OPC(5, OPC_SAMGP1,       samgp1),
+       OPC(5, OPC_SAMGP2,       samgp2),
+       OPC(5, OPC_SAMGP3,       samgp3),
+       OPC(5, OPC_DSXPP_1,      dsxpp.1),
+       OPC(5, OPC_DSYPP_1,      dsypp.1),
+       OPC(5, OPC_RGETPOS,      rgetpos),
+       OPC(5, OPC_RGETINFO,     rgetinfo),
+
+
+       /* category 6: */
+       OPC(6, OPC_LDG,          ldg),
+       OPC(6, OPC_LDL,          ldl),
+       OPC(6, OPC_LDP,          ldp),
+       OPC(6, OPC_STG,          stg),
+       OPC(6, OPC_STL,          stl),
+       OPC(6, OPC_STP,          stp),
+       OPC(6, OPC_LDIB,         ldib),
+       OPC(6, OPC_G2L,          g2l),
+       OPC(6, OPC_L2G,          l2g),
+       OPC(6, OPC_PREFETCH,     prefetch),
+       OPC(6, OPC_LDLW,         ldlw),
+       OPC(6, OPC_STLW,         stlw),
+       OPC(6, OPC_RESFMT,       resfmt),
+       OPC(6, OPC_RESINFO,      resinfo),
+       OPC(6, OPC_ATOMIC_ADD,     atomic.add),
+       OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
+       OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
+       OPC(6, OPC_ATOMIC_INC,     atomic.inc),
+       OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
+       OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
+       OPC(6, OPC_ATOMIC_MIN,     atomic.min),
+       OPC(6, OPC_ATOMIC_MAX,     atomic.max),
+       OPC(6, OPC_ATOMIC_AND,     atomic.and),
+       OPC(6, OPC_ATOMIC_OR,      atomic.or),
+       OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
+       OPC(6, OPC_LDGB,         ldgb),
+       OPC(6, OPC_STGB,         stgb),
+       OPC(6, OPC_STIB,         stib),
+       OPC(6, OPC_LDC,          ldc),
+       OPC(6, OPC_LDLV,         ldlv),
+
+       /* category 7: */
+       OPC(7, OPC_BAR,          bar),
+       OPC(7, OPC_FENCE,        fence),
+
+
+#undef OPC
+};
+
+/*
+ * Look up the opcs[] entry for an instruction: the index is the category in
+ * the bits above NOPC_BITS and the opcode in the low bits.  Note the macro
+ * relies on a variable named 'ctx' being in scope at the point of use (for
+ * ctx->gpu_id), and it evaluates 'instr' more than once.
+ */
+#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))
+
+/*
+ * Print one instruction's mnemonic followed by its category-specific
+ * operands.  Opcodes with a name in opcs[] use the printer recorded there;
+ * opcodes without one are printed as "unknown(cat,opc)" and still handed to
+ * the printer for their category.
+ */
+static void print_single_instr(struct disasm_ctx *ctx, instr_t *instr)
+{
+       typedef void (*cat_printer_t)(struct disasm_ctx *ctx, instr_t *instr);
+       /* fallback printers, indexed by instruction category */
+       static const cat_printer_t by_category[] = {
+               print_instr_cat0,
+               print_instr_cat1,
+               print_instr_cat2,
+               print_instr_cat3,
+               print_instr_cat4,
+               print_instr_cat5,
+               print_instr_cat6,
+               print_instr_cat7,
+       };
+       const struct opc_info *info = GETINFO(instr);
+       uint32_t opc = instr_opc(instr, ctx->gpu_id);
+       unsigned cat = instr->opc_cat;
+
+       if (info->name) {
+               fprintf(ctx->out, "%s", info->name);
+               info->print(ctx, instr);
+               return;
+       }
+
+       fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
+
+       /* categories outside the table are silently skipped */
+       if (cat < sizeof(by_category) / sizeof(by_category[0]))
+               by_category[cat](ctx, instr);
+}
+
+/*
+ * Decode and print a single instruction, and account for it in ctx->stats.
+ *
+ * dwords: the two 32-bit words of the instruction (dwords[0] and dwords[1]
+ *         are both read).
+ * n:      instruction index, only used for the printed line prefix.
+ *
+ * Returns true when the instruction is a category 0 OPC_END or OPC_CHSH,
+ * false otherwise.
+ */
+static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
+{
+       instr_t *instr = (instr_t *)dwords;
+       uint32_t opc = instr_opc(instr, ctx->gpu_id);
+       unsigned nop = 0;
+       /* instruction count before this one; printed as the running cycle
+        * column and bumped once per printed line
+        */
+       unsigned cycles = ctx->stats->instructions;
+
+       /* line prefix: level, category, index, cycle, then the raw encoding
+        * with the high dword (dwords[1]) first
+        */
+       fprintf(ctx->out, "%s:%d:%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
+                       instr->opc_cat, n, cycles++, dwords[1], dwords[0]);
+
+#if 0
+       /* print unknown bits: */
+       if (debug & PRINT_RAW)
+               fprintf(ctx->out, "[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000);
+
+       if (debug & PRINT_VERBOSE)
+               fprintf(ctx->out, "%d,%02d ", instr->opc_cat, opc);
+#endif
+
+       /* NOTE: order flags are printed is a bit fugly.. but for now I
+        * try to match the order in llvm-a3xx disassembler for easy
+        * diff'ing..
+        */
+
+       /* 'instructions' counts the repeats too, 'instlen' counts encoded
+        * instructions only
+        */
+       ctx->repeat = instr_repeat(instr);
+       ctx->stats->instructions += 1 + ctx->repeat;
+       ctx->stats->instlen++;
+
+       if (instr->sync) {
+               fprintf(ctx->out, "(sy)");
+               ctx->stats->sy++;
+       }
+       /* (ss) is only reported for categories 0-4 and 7 */
+       if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7))) {
+               fprintf(ctx->out, "(ss)");
+               ctx->stats->ss++;
+       }
+       if (instr->jmp_tgt)
+               fprintf(ctx->out, "(jp)");
+       if ((instr->opc_cat == 0) && instr->cat0.eq)
+               fprintf(ctx->out, "(eq)");
+       if (instr_sat(instr))
+               fprintf(ctx->out, "(sat)");
+       /* When there is no repeat count, cat2/cat3 instructions use the
+        * src1_r/src2_r bits as a 2-bit nop count (src2_r is the high bit).
+        */
+       if (ctx->repeat)
+               fprintf(ctx->out, "(rpt%d)", ctx->repeat);
+       else if ((instr->opc_cat == 2) && (instr->cat2.src1_r || instr->cat2.src2_r))
+               nop = (instr->cat2.src2_r * 2) + instr->cat2.src1_r;
+       else if ((instr->opc_cat == 3) && (instr->cat3.src1_r || instr->cat3.src2_r))
+               nop = (instr->cat3.src2_r * 2) + instr->cat3.src1_r;
+       /* embedded nops count both as instructions and as nops */
+       ctx->stats->instructions += nop;
+       ctx->stats->nops += nop;
+       /* NOTE(review): opc is compared without checking opc_cat here --
+        * confirm instr_opc() yields a value that is unique across categories.
+        */
+       if (opc == OPC_NOP)
+               ctx->stats->nops += 1 + ctx->repeat;
+       if (nop)
+               fprintf(ctx->out, "(nop%d) ", nop);
+
+       if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
+               fprintf(ctx->out, "(ul)");
+
+       print_single_instr(ctx, instr);
+       fprintf(ctx->out, "\n");
+
+       process_reg_dst(ctx);
+
+       /* With EXPAND_REPEAT, emit one extra line per embedded nop and per
+        * repeat (categories 0-4 only); the raw-encoding column is left blank
+        * and ctx->repeatidx tells the operand printers which repeat this is.
+        */
+       if ((instr->opc_cat <= 4) && (debug & EXPAND_REPEAT)) {
+               int i;
+               for (i = 0; i < nop; i++) {
+                       fprintf(ctx->out, "%s:%d:%04d:%04d[                   ] ",
+                                       levels[ctx->level], instr->opc_cat, n, cycles++);
+                       fprintf(ctx->out, "nop\n");
+               }
+               for (i = 0; i < ctx->repeat; i++) {
+                       ctx->repeatidx = i + 1;
+                       fprintf(ctx->out, "%s:%d:%04d:%04d[                   ] ",
+                                       levels[ctx->level], instr->opc_cat, n, cycles++);
+
+                       print_single_instr(ctx, instr);
+                       fprintf(ctx->out, "\n");
+               }
+               ctx->repeatidx = 0;
+       }
+
+       return (instr->opc_cat == 0) &&
+               ((opc == OPC_END) || (opc == OPC_CHSH));
+}
+
+/*
+ * Disassemble a shader without reporting statistics: forwards to
+ * disasm_a3xx_stat() with a scratch stats struct that is thrown away, and
+ * returns that function's result.
+ */
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id)
+{
+       /* left uninitialized on purpose: disasm_a3xx_stat() clears it */
+       struct shader_stats scratch;
+       int ret;
+
+       ret = disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &scratch);
+       return ret;
+}
+
+/*
+ * Disassemble a shader to 'out' and collect statistics.
+ *
+ * dwords/sizedwords: the shader binary; instructions are two dwords each.
+ *                    A trailing unpaired dword (odd sizedwords) is ignored
+ *                    rather than read past the end of the buffer.
+ * level:             index into the indentation/prefix table used for each
+ *                    printed line.
+ * gpu_id:            passed through to the per-instruction decoders.
+ * stats:             must be non-NULL; cleared here and then filled in.
+ *
+ * Once an END/CHSH instruction has been seen, decoding continues until more
+ * than three consecutive all-zero instructions are encountered.
+ *
+ * Always returns 0.
+ */
+int disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
+               unsigned gpu_id, struct shader_stats *stats)
+{
+       struct disasm_ctx ctx;
+       int i;
+       int nop_count = 0;
+       bool has_end = false;
+
+       memset(&ctx, 0, sizeof(ctx));
+       ctx.out = out;
+       ctx.level = level;
+       ctx.gpu_id = gpu_id;
+       ctx.stats = stats;
+       memset(ctx.stats, 0, sizeof(*ctx.stats));
+
+       /* only step through complete (lo, hi) dword pairs, so dwords[i + 1]
+        * is always inside the buffer
+        */
+       for (i = 0; i + 1 < sizedwords; i += 2) {
+               has_end |= print_instr(&ctx, &dwords[i], i/2);
+               if (!has_end)
+                       continue;
+               /* after the end of the program, stop at a run of zero padding */
+               if (dwords[i] == 0 && dwords[i + 1] == 0)
+                       nop_count++;
+               else
+                       nop_count = 0;
+               if (nop_count > 3)
+                       break;
+       }
+
+       print_reg_stats(&ctx);
+
+       return 0;
+}
diff --git a/src/freedreno/decode/disasm.h b/src/freedreno/decode/disasm.h
new file mode 100644 (file)
index 0000000..21ae5a1
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2012 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef DISASM_H_
+#define DISASM_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+/* Shader stage selector; taken by disasm_a2xx() as its 'type' argument. */
+enum shader_t {
+       SHADER_VERTEX,
+       SHADER_TCS,
+       SHADER_TES,
+       SHADER_GEOM,
+       SHADER_FRAGMENT,
+       SHADER_COMPUTE,
+};
+
+/* bitmask of debug flags (values are distinct bits and may be OR'd) */
+enum debug_t {
+       PRINT_RAW      = 0x1,    /* dump raw hexdump */
+       PRINT_VERBOSE  = 0x2,    /* also print padding/unknown fields */
+       EXPAND_REPEAT  = 0x4,    /* print an extra line per (rptN)/(nopN) expansion */
+};
+
+/* Per-shader counters filled in by disasm_a3xx_stat(). */
+struct shader_stats {
+       /* 'instructions' counts (rptN) repeats and embedded nops, 'instlen'
+        * counts encoded instructions only
+        */
+       int instructions, instlen;
+       /* nop instructions, including embedded (nopN) counts */
+       int nops;
+       /* number of instructions with the (ss) / (sy) flag printed */
+       int ss, sy;
+       int constlen;
+};
+
+/* Disassemble an a2xx shader of 'sizedwords' dwords for the given stage. */
+int disasm_a2xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type);
+/* Disassemble an a3xx+ shader to 'out'; statistics are discarded. */
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id);
+/* Same as disasm_a3xx(), additionally filling in *stats (must be non-NULL,
+ * it is cleared first).
+ */
+int disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
+               unsigned gpu_id, struct shader_stats *stats);
+/* Presumably selects the debug_t flags used by the disassemblers -- the
+ * definition is not in this header.
+ */
+void disasm_set_debug(enum debug_t debug);
+
+#endif /* DISASM_H_ */
diff --git a/src/freedreno/decode/instr-a2xx.h b/src/freedreno/decode/instr-a2xx.h
new file mode 100644 (file)
index 0000000..03d1991
--- /dev/null
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A2XX_H_
+#define INSTR_A2XX_H_
+
+#define PACKED __attribute__((__packed__))   /* GCC-style packed attribute; applied to every bitfield struct/union in this header */
+
+
+/*
+ * ALU instructions:
+ */
+
+typedef enum {          /* scalar ALU opcodes; stored in the 6-bit instr_alu_t.scalar_opc field */
+       ADDs = 0,
+       ADD_PREVs = 1,
+       MULs = 2,
+       MUL_PREVs = 3,
+       MUL_PREV2s = 4,
+       MAXs = 5,
+       MINs = 6,
+       SETEs = 7,
+       SETGTs = 8,
+       SETGTEs = 9,
+       SETNEs = 10,
+       FRACs = 11,
+       TRUNCs = 12,
+       FLOORs = 13,
+       EXP_IEEE = 14,
+       LOG_CLAMP = 15,
+       LOG_IEEE = 16,
+       RECIP_CLAMP = 17,
+       RECIP_FF = 18,
+       RECIP_IEEE = 19,
+       RECIPSQ_CLAMP = 20,
+       RECIPSQ_FF = 21,
+       RECIPSQ_IEEE = 22,
+       MOVAs = 23,
+       MOVA_FLOORs = 24,
+       SUBs = 25,
+       SUB_PREVs = 26,
+       PRED_SETEs = 27,
+       PRED_SETNEs = 28,
+       PRED_SETGTs = 29,
+       PRED_SETGTEs = 30,
+       PRED_SET_INVs = 31,
+       PRED_SET_POPs = 32,
+       PRED_SET_CLRs = 33,
+       PRED_SET_RESTOREs = 34,
+       KILLEs = 35,
+       KILLGTs = 36,
+       KILLGTEs = 37,
+       KILLNEs = 38,
+       KILLONEs = 39,
+       SQRT_IEEE = 40,          /* note: value 41 is not assigned in this enum */
+       MUL_CONST_0 = 42,
+       MUL_CONST_1 = 43,
+       ADD_CONST_0 = 44,
+       ADD_CONST_1 = 45,
+       SUB_CONST_0 = 46,
+       SUB_CONST_1 = 47,
+       SIN = 48,
+       COS = 49,
+       RETAIN_PREV = 50,        /* highest assigned value; 51..63 are unassigned */
+} instr_scalar_opc_t;
+
+typedef enum {          /* vector ALU opcodes; stored in the 5-bit instr_alu_t.vector_opc field */
+       ADDv = 0,
+       MULv = 1,
+       MAXv = 2,
+       MINv = 3,
+       SETEv = 4,
+       SETGTv = 5,
+       SETGTEv = 6,
+       SETNEv = 7,
+       FRACv = 8,
+       TRUNCv = 9,
+       FLOORv = 10,
+       MULADDv = 11,
+       CNDEv = 12,
+       CNDGTEv = 13,
+       CNDGTv = 14,
+       DOT4v = 15,
+       DOT3v = 16,
+       DOT2ADDv = 17,
+       CUBEv = 18,
+       MAX4v = 19,
+       PRED_SETE_PUSHv = 20,
+       PRED_SETNE_PUSHv = 21,
+       PRED_SETGT_PUSHv = 22,
+       PRED_SETGTE_PUSHv = 23,
+       KILLEv = 24,
+       KILLGTv = 25,
+       KILLGTEv = 26,
+       KILLNEv = 27,
+       DSTv = 28,
+       MOVAv = 29,              /* highest assigned value; 30 and 31 are unassigned */
+} instr_vector_opc_t;
+
+typedef struct PACKED {          /* ALU instruction: three dwords of bitfields carrying one vector and one scalar opcode */
+       /* dword0: (field widths sum to 32 bits) */
+       uint8_t             vector_dest              : 6;
+       uint8_t             vector_dest_rel          : 1;
+       uint8_t             low_precision_16b_fp     : 1;
+       uint8_t             scalar_dest              : 6;
+       uint8_t             scalar_dest_rel          : 1;
+       uint8_t             export_data              : 1;
+       uint8_t             vector_write_mask        : 4;
+       uint8_t             scalar_write_mask        : 4;
+       uint8_t             vector_clamp             : 1;
+       uint8_t             scalar_clamp             : 1;
+       instr_scalar_opc_t  scalar_opc               : 6;   /* one of instr_scalar_opc_t */
+       /* dword1: (field widths sum to 32 bits) */
+       uint8_t             src3_swiz                : 8;
+       uint8_t             src2_swiz                : 8;
+       uint8_t             src1_swiz                : 8;
+       uint8_t             src3_reg_negate          : 1;
+       uint8_t             src2_reg_negate          : 1;
+       uint8_t             src1_reg_negate          : 1;
+       uint8_t             pred_select              : 2;
+       uint8_t             relative_addr            : 1;
+       uint8_t             const_1_rel_abs          : 1;
+       uint8_t             const_0_rel_abs          : 1;
+       /* dword2: (field widths sum to 32 bits) */
+       uint8_t             src3_reg                 : 6;
+       uint8_t             src3_reg_select          : 1;
+       uint8_t             src3_reg_abs             : 1;
+       uint8_t             src2_reg                 : 6;
+       uint8_t             src2_reg_select          : 1;
+       uint8_t             src2_reg_abs             : 1;
+       uint8_t             src1_reg                 : 6;
+       uint8_t             src1_reg_select          : 1;
+       uint8_t             src1_reg_abs             : 1;
+       instr_vector_opc_t  vector_opc               : 5;   /* one of instr_vector_opc_t */
+       uint8_t             src3_sel                 : 1;
+       uint8_t             src2_sel                 : 1;
+       uint8_t             src1_sel                 : 1;
+} instr_alu_t;
+
+
+
+/*
+ * CF instructions:
+ */
+
+typedef enum {          /* CF opcodes; stored in the 4-bit 'opc' field of every instr_cf_*_t, all 16 values assigned */
+       NOP = 0,
+       EXEC = 1,
+       EXEC_END = 2,
+       COND_EXEC = 3,
+       COND_EXEC_END = 4,
+       COND_PRED_EXEC = 5,
+       COND_PRED_EXEC_END = 6,
+       LOOP_START = 7,
+       LOOP_END = 8,
+       COND_CALL = 9,
+       RETURN = 10,
+       COND_JMP = 11,
+       ALLOC = 12,
+       COND_EXEC_PRED_CLEAN = 13,
+       COND_EXEC_PRED_CLEAN_END = 14,
+       MARK_VS_FETCH_DONE = 15,
+} instr_cf_opc_t;
+
+typedef enum {          /* value of the 1-bit 'address_mode' field in the CF instruction structs */
+       RELATIVE_ADDR = 0,
+       ABSOLUTE_ADDR = 1,
+} instr_addr_mode_t;
+
+typedef enum {          /* value of the 2-bit instr_cf_alloc_t.buffer_select field */
+       SQ_NO_ALLOC = 0,
+       SQ_POSITION = 1,
+       SQ_PARAMETER_PIXEL = 2,
+       SQ_MEMORY = 3,
+} instr_alloc_type_t;
+
+typedef struct PACKED {          /* the 'exec' view of a CF instruction (instr_cf_t.exec); field widths sum to 48 bits */
+       uint16_t            address                  : 9;
+       uint8_t             reserved0                : 3;
+       uint8_t             count                    : 3;
+       uint8_t             yeild                    : 1;   /* (sic) spelling of "yield" */
+       uint16_t            serialize                : 12;
+       uint8_t             vc                       : 6;   /* vertex cache? */
+       uint8_t             bool_addr                : 8;
+       uint8_t             condition                : 1;
+       instr_addr_mode_t   address_mode             : 1;
+       instr_cf_opc_t      opc                      : 4;   /* last 4 of the 48 bits, as in every CF variant */
+} instr_cf_exec_t;
+
+typedef struct PACKED {          /* the 'loop' view of a CF instruction (instr_cf_t.loop); field widths sum to 48 bits */
+       uint16_t            address                  : 10;
+       uint8_t             reserved0                : 6;
+       uint8_t             loop_id                  : 5;
+       uint32_t            reserved1                : 22;
+       instr_addr_mode_t   address_mode             : 1;
+       instr_cf_opc_t      opc                      : 4;   /* last 4 of the 48 bits, as in every CF variant */
+} instr_cf_loop_t;
+
+typedef struct PACKED {          /* the 'jmp_call' view of a CF instruction (instr_cf_t.jmp_call); field widths sum to 48 bits */
+       uint16_t            address                  : 10;
+       uint8_t             reserved0                : 3;
+       uint8_t             force_call               : 1;
+       uint8_t             predicated_jmp           : 1;
+       uint32_t            reserved1                : 18;
+       uint8_t             direction                : 1;
+       uint8_t             bool_addr                : 8;
+       uint8_t             condition                : 1;
+       instr_addr_mode_t   address_mode             : 1;
+       instr_cf_opc_t      opc                      : 4;   /* last 4 of the 48 bits, as in every CF variant */
+} instr_cf_jmp_call_t;
+
+typedef struct PACKED {          /* the 'alloc' view of a CF instruction (instr_cf_t.alloc); field widths sum to 48 bits */
+       uint8_t             size                     : 4;
+       uint64_t            reserved0                : 36;
+       uint8_t             no_serial                : 1;
+       instr_alloc_type_t  buffer_select            : 2;   /* one of instr_alloc_type_t */
+       uint8_t             alloc_mode               : 1;
+       instr_cf_opc_t      opc                      : 4;   /* last 4 of the 48 bits, as in every CF variant */
+} instr_cf_alloc_t;
+
+typedef union PACKED {          /* one 48-bit CF instruction, viewed through whichever variant matches 'opc' */
+       instr_cf_exec_t     exec;
+       instr_cf_loop_t     loop;
+       instr_cf_jmp_call_t jmp_call;
+       instr_cf_alloc_t    alloc;
+       struct PACKED {          /* anonymous view: reach 'opc' without picking a variant first */
+               uint64_t        dummy                    : 44;
+               instr_cf_opc_t  opc                      : 4;   /* same position as 'opc' in all four variants above */
+       };
+} instr_cf_t;
+
+
+
+/*
+ * FETCH instructions:
+ */
+
+typedef enum {          /* fetch opcodes; stored in the 5-bit 'opc' field of instr_fetch_*_t (values are sparse) */
+       VTX_FETCH = 0,
+       TEX_FETCH = 1,           /* 2..15 are not assigned in this enum */
+       TEX_GET_BORDER_COLOR_FRAC = 16,
+       TEX_GET_COMP_TEX_LOD = 17,
+       TEX_GET_GRADIENTS = 18,
+       TEX_GET_WEIGHTS = 19,    /* 20..23 are not assigned in this enum */
+       TEX_SET_TEX_LOD = 24,
+       TEX_SET_GRADIENTS_H = 25,
+       TEX_SET_GRADIENTS_V = 26,
+       TEX_RESERVED_4 = 27,
+} instr_fetch_opc_t;
+
+typedef enum {          /* value of the 2-bit mag/min/mip/vol_mag/vol_min filter fields of instr_fetch_tex_t */
+       TEX_FILTER_POINT = 0,
+       TEX_FILTER_LINEAR = 1,
+       TEX_FILTER_BASEMAP = 2,            /* only applicable for mip-filter */
+       TEX_FILTER_USE_FETCH_CONST = 3,
+} instr_tex_filter_t;
+
+typedef enum {          /* value of the 3-bit instr_fetch_tex_t.aniso_filter field */
+       ANISO_FILTER_DISABLED = 0,
+       ANISO_FILTER_MAX_1_1 = 1,
+       ANISO_FILTER_MAX_2_1 = 2,
+       ANISO_FILTER_MAX_4_1 = 3,
+       ANISO_FILTER_MAX_8_1 = 4,
+       ANISO_FILTER_MAX_16_1 = 5,         /* note: value 6 is not assigned */
+       ANISO_FILTER_USE_FETCH_CONST = 7,
+} instr_aniso_filter_t;
+
+typedef enum {          /* value of the 3-bit instr_fetch_tex_t.arbitrary_filter field */
+       ARBITRARY_FILTER_2X4_SYM = 0,
+       ARBITRARY_FILTER_2X4_ASYM = 1,
+       ARBITRARY_FILTER_4X2_SYM = 2,
+       ARBITRARY_FILTER_4X2_ASYM = 3,
+       ARBITRARY_FILTER_4X4_SYM = 4,
+       ARBITRARY_FILTER_4X4_ASYM = 5,     /* note: value 6 is not assigned */
+       ARBITRARY_FILTER_USE_FETCH_CONST = 7,
+} instr_arbitrary_filter_t;
+
+typedef enum {          /* value of the 1-bit instr_fetch_tex_t.sample_location field */
+       SAMPLE_CENTROID = 0,
+       SAMPLE_CENTER = 1,
+} instr_sample_loc_t;
+
+typedef unsigned instr_surf_fmt_t;   /* type of the 6-bit instr_fetch_vtx_t.format field; no enumerators are defined here */
+
+typedef struct PACKED {          /* texture-fetch view of a fetch instruction (instr_fetch_t.tex): three dwords of bitfields */
+       /* dword0: (field widths sum to 32 bits) */
+       instr_fetch_opc_t   opc                      : 5;
+       uint8_t             src_reg                  : 6;
+       uint8_t             src_reg_am               : 1;
+       uint8_t             dst_reg                  : 6;
+       uint8_t             dst_reg_am               : 1;
+       uint8_t             fetch_valid_only         : 1;
+       uint8_t             const_idx                : 5;
+       uint8_t             tx_coord_denorm          : 1;
+       uint8_t             src_swiz                 : 6;
+       /* dword1: (field widths sum to 32 bits) */
+       uint16_t            dst_swiz                 : 12;
+       instr_tex_filter_t  mag_filter               : 2;
+       instr_tex_filter_t  min_filter               : 2;
+       instr_tex_filter_t  mip_filter               : 2;
+       instr_aniso_filter_t aniso_filter            : 3;
+       instr_arbitrary_filter_t arbitrary_filter    : 3;
+       instr_tex_filter_t  vol_mag_filter           : 2;
+       instr_tex_filter_t  vol_min_filter           : 2;
+       uint8_t             use_comp_lod             : 1;
+       uint8_t             use_reg_lod              : 2;
+       uint8_t             pred_select              : 1;
+       /* dword2: (field widths sum to 32 bits) */
+       uint8_t             use_reg_gradients        : 1;
+       instr_sample_loc_t  sample_location          : 1;
+       uint8_t             lod_bias                 : 7;
+       uint8_t             unused                   : 7;
+       uint8_t             offset_x                 : 5;
+       uint8_t             offset_y                 : 5;
+       uint8_t             offset_z                 : 5;
+       uint8_t             pred_condition           : 1;
+} instr_fetch_tex_t;
+
+typedef struct PACKED {          /* vertex-fetch view of a fetch instruction (instr_fetch_t.vtx): three dwords of bitfields */
+       /* dword0: (field widths sum to 32 bits) */
+       instr_fetch_opc_t   opc                      : 5;
+       uint8_t             src_reg                  : 6;
+       uint8_t             src_reg_am               : 1;
+       uint8_t             dst_reg                  : 6;
+       uint8_t             dst_reg_am               : 1;
+       uint8_t             must_be_one              : 1;
+       uint8_t             const_index              : 5;
+       uint8_t             const_index_sel          : 2;
+       uint8_t             reserved0                : 3;
+       uint8_t             src_swiz                 : 2;
+       /* dword1: (field widths sum to 32 bits) */
+       uint16_t            dst_swiz                 : 12;
+       uint8_t             format_comp_all          : 1;   /* '1' for signed, '0' for unsigned? */
+       uint8_t             num_format_all           : 1;   /* '0' for normalized, '1' for unnormalized */
+       uint8_t             signed_rf_mode_all       : 1;
+       uint8_t             reserved1                : 1;
+       instr_surf_fmt_t    format                   : 6;
+       uint8_t             reserved2                : 1;
+       uint8_t             exp_adjust_all           : 7;
+       uint8_t             reserved3                : 1;
+       uint8_t             pred_select              : 1;
+       /* dword2: (field widths sum to 32 bits) */
+       uint8_t             stride                   : 8;
+       /* possibly offset and reserved4 are swapped on a200? */
+       uint8_t             offset                   : 8;
+       uint8_t             reserved4                : 8;
+       uint8_t             reserved5                : 7;
+       uint8_t             pred_condition           : 1;
+} instr_fetch_vtx_t;
+
+typedef union PACKED {          /* one 3-dword fetch instruction, viewed as texture or vertex fetch */
+       instr_fetch_tex_t   tex;
+       instr_fetch_vtx_t   vtx;
+       struct PACKED {          /* anonymous view: reach 'opc' without picking a variant first */
+               /* dword0: */
+               instr_fetch_opc_t opc                    : 5;   /* first 5 bits, same position as tex.opc and vtx.opc */
+               uint32_t        dummy0                   : 27;
+               /* dword1: */
+               uint32_t        dummy1                   : 32;
+               /* dword2: */
+               uint32_t        dummy2                   : 32;
+       };
+} instr_fetch_t;
+
+#endif /* INSTR_A2XX_H_ */
diff --git a/src/freedreno/decode/instr-a3xx.h b/src/freedreno/decode/instr-a3xx.h
new file mode 100644 (file)
index 0000000..218bdc3
--- /dev/null
@@ -0,0 +1,1115 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A3XX_H_
+#define INSTR_A3XX_H_
+
+#define PACKED __attribute__((__packed__))   /* GCC-style packed attribute; applied to every bitfield struct/union in this header */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <assert.h>
+
+void ir3_assert_handler(const char *expr, const char *file, int line,
+               const char *func) __attribute__((weak)) __attribute__ ((__noreturn__));
+
+/* A wrapper for assert() that allows overriding handling of a failed
+ * assert.  This is needed for tools like crashdec which may want to
+ * attempt to disassemble memory that might not actually be valid
+ * instructions.
+ */
+#define ir3_assert(expr) do { \
+               if (!(expr)) { \
+                       if (ir3_assert_handler) { /* weak symbol: non-NULL only when a handler is actually defined */ \
+                               ir3_assert_handler(#expr, __FILE__, __LINE__, __func__); /* declared noreturn */ \
+                       } \
+                       assert(expr); /* no handler: plain assert(); note expr is evaluated a second time here */ \
+               } \
+       } while (0)
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+/* Build an opc_t value from an instruction category and the opcode within
+ * that category: the category is placed above the low NOPC_BITS bits and
+ * the opcode is OR'd in below it.  Both arguments are parenthesized in the
+ * expansion so that expression arguments (e.g. a ?: or a + b) cannot
+ * re-associate with the surrounding '|' operator.
+ */
+#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | (opc))
+
+typedef enum {          /* every value is _OPC(category, opcode within category); split again with opc_cat()/opc_op() */
+       /* category 0: */
+       OPC_NOP             = _OPC(0, 0),
+       OPC_B               = _OPC(0, 1),
+       OPC_JUMP            = _OPC(0, 2),
+       OPC_CALL            = _OPC(0, 3),
+       OPC_RET             = _OPC(0, 4),
+       OPC_KILL            = _OPC(0, 5),
+       OPC_END             = _OPC(0, 6),
+       OPC_EMIT            = _OPC(0, 7),
+       OPC_CUT             = _OPC(0, 8),
+       OPC_CHMASK          = _OPC(0, 9),
+       OPC_CHSH            = _OPC(0, 10),
+       OPC_FLOW_REV        = _OPC(0, 11),
+
+       OPC_BKT             = _OPC(0, 16),
+       OPC_STKS            = _OPC(0, 17),
+       OPC_STKR            = _OPC(0, 18),
+       OPC_XSET            = _OPC(0, 19),
+       OPC_XCLR            = _OPC(0, 20),
+       OPC_GETONE          = _OPC(0, 21),
+       OPC_DBG             = _OPC(0, 22),
+       OPC_SHPS            = _OPC(0, 23),   /* shader prologue start */
+       OPC_SHPE            = _OPC(0, 24),   /* shader prologue end */
+
+       OPC_PREDT           = _OPC(0, 29),   /* predicated true */
+       OPC_PREDF           = _OPC(0, 30),   /* predicated false */
+       OPC_PREDE           = _OPC(0, 31),   /* predicated end */
+
+       /* category 1: */
+       OPC_MOV             = _OPC(1, 0),
+
+       /* category 2: */
+       OPC_ADD_F           = _OPC(2, 0),
+       OPC_MIN_F           = _OPC(2, 1),
+       OPC_MAX_F           = _OPC(2, 2),
+       OPC_MUL_F           = _OPC(2, 3),
+       OPC_SIGN_F          = _OPC(2, 4),
+       OPC_CMPS_F          = _OPC(2, 5),
+       OPC_ABSNEG_F        = _OPC(2, 6),
+       OPC_CMPV_F          = _OPC(2, 7),
+       /* 8 - invalid */
+       OPC_FLOOR_F         = _OPC(2, 9),
+       OPC_CEIL_F          = _OPC(2, 10),
+       OPC_RNDNE_F         = _OPC(2, 11),
+       OPC_RNDAZ_F         = _OPC(2, 12),
+       OPC_TRUNC_F         = _OPC(2, 13),
+       /* 14-15 - invalid */
+       OPC_ADD_U           = _OPC(2, 16),
+       OPC_ADD_S           = _OPC(2, 17),
+       OPC_SUB_U           = _OPC(2, 18),
+       OPC_SUB_S           = _OPC(2, 19),
+       OPC_CMPS_U          = _OPC(2, 20),
+       OPC_CMPS_S          = _OPC(2, 21),
+       OPC_MIN_U           = _OPC(2, 22),
+       OPC_MIN_S           = _OPC(2, 23),
+       OPC_MAX_U           = _OPC(2, 24),
+       OPC_MAX_S           = _OPC(2, 25),
+       OPC_ABSNEG_S        = _OPC(2, 26),
+       /* 27 - invalid */
+       OPC_AND_B           = _OPC(2, 28),
+       OPC_OR_B            = _OPC(2, 29),
+       OPC_NOT_B           = _OPC(2, 30),
+       OPC_XOR_B           = _OPC(2, 31),
+       /* 32 - invalid */
+       OPC_CMPV_U          = _OPC(2, 33),
+       OPC_CMPV_S          = _OPC(2, 34),
+       /* 35-47 - invalid */
+       OPC_MUL_U24         = _OPC(2, 48), /* 24b mul into 32b result */
+       OPC_MUL_S24         = _OPC(2, 49), /* 24b mul into 32b result with sign extension */
+       OPC_MULL_U          = _OPC(2, 50),
+       OPC_BFREV_B         = _OPC(2, 51),
+       OPC_CLZ_S           = _OPC(2, 52),
+       OPC_CLZ_B           = _OPC(2, 53),
+       OPC_SHL_B           = _OPC(2, 54),
+       OPC_SHR_B           = _OPC(2, 55),
+       OPC_ASHR_B          = _OPC(2, 56),
+       OPC_BARY_F          = _OPC(2, 57),
+       OPC_MGEN_B          = _OPC(2, 58),
+       OPC_GETBIT_B        = _OPC(2, 59),
+       OPC_SETRM           = _OPC(2, 60),
+       OPC_CBITS_B         = _OPC(2, 61),
+       OPC_SHB             = _OPC(2, 62),
+       OPC_MSAD            = _OPC(2, 63),   /* 63 is the largest opcode that fits in NOPC_BITS (6) bits */
+
+       /* category 3: */
+       OPC_MAD_U16         = _OPC(3, 0),
+       OPC_MADSH_U16       = _OPC(3, 1),
+       OPC_MAD_S16         = _OPC(3, 2),
+       OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
+       OPC_MAD_U24         = _OPC(3, 4),
+       OPC_MAD_S24         = _OPC(3, 5),
+       OPC_MAD_F16         = _OPC(3, 6),
+       OPC_MAD_F32         = _OPC(3, 7),
+       OPC_SEL_B16         = _OPC(3, 8),
+       OPC_SEL_B32         = _OPC(3, 9),
+       OPC_SEL_S16         = _OPC(3, 10),
+       OPC_SEL_S32         = _OPC(3, 11),
+       OPC_SEL_F16         = _OPC(3, 12),
+       OPC_SEL_F32         = _OPC(3, 13),
+       OPC_SAD_S16         = _OPC(3, 14),
+       OPC_SAD_S32         = _OPC(3, 15),
+
+       /* category 4: */
+       OPC_RCP             = _OPC(4, 0),
+       OPC_RSQ             = _OPC(4, 1),
+       OPC_LOG2            = _OPC(4, 2),
+       OPC_EXP2            = _OPC(4, 3),
+       OPC_SIN             = _OPC(4, 4),
+       OPC_COS             = _OPC(4, 5),
+       OPC_SQRT            = _OPC(4, 6),
+       /* NOTE that these are 8+opc from their highp equivs, so it's possible
+        * that the high order bit in the opc field has been repurposed for
+        * half-precision use?  But note that other ops (rcp/sin/cos/sqrt)
+        * still use the same opc as highp
+        */
+       OPC_HRSQ            = _OPC(4, 9),
+       OPC_HLOG2           = _OPC(4, 10),
+       OPC_HEXP2           = _OPC(4, 11),
+
+       /* category 5: */
+       OPC_ISAM            = _OPC(5, 0),
+       OPC_ISAML           = _OPC(5, 1),
+       OPC_ISAMM           = _OPC(5, 2),
+       OPC_SAM             = _OPC(5, 3),
+       OPC_SAMB            = _OPC(5, 4),
+       OPC_SAML            = _OPC(5, 5),
+       OPC_SAMGQ           = _OPC(5, 6),
+       OPC_GETLOD          = _OPC(5, 7),
+       OPC_CONV            = _OPC(5, 8),
+       OPC_CONVM           = _OPC(5, 9),
+       OPC_GETSIZE         = _OPC(5, 10),
+       OPC_GETBUF          = _OPC(5, 11),
+       OPC_GETPOS          = _OPC(5, 12),
+       OPC_GETINFO         = _OPC(5, 13),
+       OPC_DSX             = _OPC(5, 14),
+       OPC_DSY             = _OPC(5, 15),
+       OPC_GATHER4R        = _OPC(5, 16),
+       OPC_GATHER4G        = _OPC(5, 17),
+       OPC_GATHER4B        = _OPC(5, 18),
+       OPC_GATHER4A        = _OPC(5, 19),
+       OPC_SAMGP0          = _OPC(5, 20),
+       OPC_SAMGP1          = _OPC(5, 21),
+       OPC_SAMGP2          = _OPC(5, 22),
+       OPC_SAMGP3          = _OPC(5, 23),
+       OPC_DSXPP_1         = _OPC(5, 24),
+       OPC_DSYPP_1         = _OPC(5, 25),
+       OPC_RGETPOS         = _OPC(5, 26),
+       OPC_RGETINFO        = _OPC(5, 27),
+
+       /* category 6: */
+       OPC_LDG             = _OPC(6, 0),        /* load-global */
+       OPC_LDL             = _OPC(6, 1),
+       OPC_LDP             = _OPC(6, 2),
+       OPC_STG             = _OPC(6, 3),        /* store-global */
+       OPC_STL             = _OPC(6, 4),
+       OPC_STP             = _OPC(6, 5),
+       OPC_LDIB            = _OPC(6, 6),
+       OPC_G2L             = _OPC(6, 7),
+       OPC_L2G             = _OPC(6, 8),
+       OPC_PREFETCH        = _OPC(6, 9),
+       OPC_LDLW            = _OPC(6, 10),
+       OPC_STLW            = _OPC(6, 11),
+       OPC_RESFMT          = _OPC(6, 14),
+       OPC_RESINFO         = _OPC(6, 15),
+       OPC_ATOMIC_ADD      = _OPC(6, 16),
+       OPC_ATOMIC_SUB      = _OPC(6, 17),
+       OPC_ATOMIC_XCHG     = _OPC(6, 18),
+       OPC_ATOMIC_INC      = _OPC(6, 19),
+       OPC_ATOMIC_DEC      = _OPC(6, 20),
+       OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
+       OPC_ATOMIC_MIN      = _OPC(6, 22),
+       OPC_ATOMIC_MAX      = _OPC(6, 23),
+       OPC_ATOMIC_AND      = _OPC(6, 24),
+       OPC_ATOMIC_OR       = _OPC(6, 25),
+       OPC_ATOMIC_XOR      = _OPC(6, 26),
+       OPC_LDGB            = _OPC(6, 27),
+       OPC_STGB            = _OPC(6, 28),
+       OPC_STIB            = _OPC(6, 29),
+       OPC_LDC             = _OPC(6, 30),
+       OPC_LDLV            = _OPC(6, 31),
+
+       /* category 7: */
+       OPC_BAR             = _OPC(7, 0),
+       OPC_FENCE           = _OPC(7, 1),
+} opc_t;
+
+#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))   /* category part of an opc_t: the bits above the low NOPC_BITS */
+#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))   /* opcode within the category: the low NOPC_BITS bits */
+
+typedef enum {          /* operand data types (values 0..7); bit width is given by type_size() */
+       TYPE_F16 = 0,
+       TYPE_F32 = 1,
+       TYPE_U16 = 2,
+       TYPE_U32 = 3,
+       TYPE_S16 = 4,
+       TYPE_S32 = 5,
+       TYPE_U8  = 6,
+       TYPE_S8  = 7,  // XXX I assume?
+} type_t;
+
+static inline uint32_t type_size(type_t type)
+{
+       /* full-width (32 bit) types */
+       if ((type == TYPE_F32) || (type == TYPE_U32) || (type == TYPE_S32))
+               return 32;
+
+       /* half-width (16 bit) types */
+       if ((type == TYPE_F16) || (type == TYPE_U16) || (type == TYPE_S16))
+               return 16;
+
+       /* byte-sized (8 bit) types */
+       if ((type == TYPE_U8) || (type == TYPE_S8))
+               return 8;
+
+       /* Not a known type_t value: report through ir3_assert(), and hand
+        * back a width of zero should that return.
+        */
+       ir3_assert(0); /* invalid type */
+       return 0;
+}
+
+static inline int type_float(type_t type)
+{
+       /* 1 for the two floating point types, 0 for everything else */
+       switch (type) {
+       case TYPE_F16:
+       case TYPE_F32:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+static inline int type_uint(type_t type)
+{
+       /* 1 for the three unsigned integer types, 0 for everything else */
+       switch (type) {
+       case TYPE_U8:
+       case TYPE_U16:
+       case TYPE_U32:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+static inline int type_sint(type_t type)
+{
+       /* 1 for the three signed integer types, 0 for everything else */
+       switch (type) {
+       case TYPE_S8:
+       case TYPE_S16:
+       case TYPE_S32:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+typedef union PACKED {          /* one register operand: either a (num, comp) pair or a signed immediate */
+       /* normal gpr or const src register: */
+       struct PACKED {
+               uint32_t comp  : 2;    /* low 2 bits -- presumably the x/y/z/w component, TODO confirm */
+               uint32_t num   : 10;   /* register number; reg_special() compares it to REG_A0/REG_P0 */
+       };
+       /* for immediate val: */
+       int32_t  iim_val   : 11;       /* signed, so reading it sign-extends from 11 bits */
+       /* to make compiler happy: */
+       uint32_t dummy32;
+       uint32_t dummy10   : 10;
+       int32_t  idummy10  : 10;
+       uint32_t dummy11   : 11;
+       uint32_t dummy12   : 12;
+       uint32_t dummy13   : 13;
+       uint32_t dummy8    : 8;
+       int32_t  idummy13  : 13;
+       int32_t  idummy8   : 8;
+} reg_t;
+
+/* special registers (values of reg_t.num, tested by reg_special()): */
+#define REG_A0 61       /* address register */
+#define REG_P0 62       /* predicate register */
+
+static inline int reg_special(reg_t reg)
+{
+       /* only the address and predicate registers count as special */
+       switch (reg.num) {
+       case REG_A0:
+       case REG_P0:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+typedef enum {          /* branch types; stored in the 3-bit instr_cat0_t.brtype field (value 7 is not assigned) */
+       BRANCH_PLAIN = 0,   /* br */
+       BRANCH_OR    = 1,   /* brao */
+       BRANCH_AND   = 2,   /* braa */
+       BRANCH_CONST = 3,   /* brac */
+       BRANCH_ANY   = 4,   /* bany */
+       BRANCH_ALL   = 5,   /* ball */
+       BRANCH_X     = 6,   /* brax ??? */
+} brtype_t;
+
+typedef struct PACKED {          /* category 0 instruction encoding: two dwords */
+       /* dword0: signed immediate; its width differs per generation view (16 / 20 / 32 bits) */
+       union PACKED {
+               struct PACKED {
+                       int16_t  immed    : 16;
+                       uint32_t dummy1   : 16;
+               } a3xx;
+               struct PACKED {
+                       int32_t  immed    : 20;
+                       uint32_t dummy1   : 12;
+               } a4xx;
+               struct PACKED {
+                       int32_t immed     : 32;
+               } a5xx;
+       };
+
+       /* dword1: (field widths sum to 32 bits) */
+       uint32_t idx      : 5;  /* brac.N index */
+       uint32_t brtype   : 3;  /* branch type, see brtype_t */
+       uint32_t repeat   : 3;
+       uint32_t dummy3   : 1;
+       uint32_t ss       : 1;
+       uint32_t inv1     : 1;
+       uint32_t comp1    : 2;
+       uint32_t eq       : 1;
+       uint32_t opc_hi   : 1;  /* at least one bit */
+       uint32_t dummy4   : 2;
+       uint32_t inv0     : 1;
+       uint32_t comp0    : 2;  /* component for first src */
+       uint32_t opc      : 4;  /* only 4 bits here; presumably extended by opc_hi above -- TODO confirm */
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;  /* top 3 bits of dword1, as in the other instr_catN_t structs */
+} instr_cat0_t;
+
+typedef struct PACKED {          /* category 1 instruction encoding: two dwords */
+       /* dword0: one 32-bit source, interpreted through one of the views below */
+       union PACKED {
+               /* for normal src register: */
+               struct PACKED {
+                       uint32_t src : 11;
+                       /* at least low bit of pad must be zero or it will
+                        * look like a address relative src
+                        */
+                       uint32_t pad : 21;
+               };
+               /* for address relative: */
+               struct PACKED {
+                       int32_t  off : 10;
+                       uint32_t src_rel_c : 1;
+                       uint32_t src_rel : 1;   /* bit 11: overlaps the low bit of 'pad' in the normal view */
+                       uint32_t unknown : 20;
+               };
+               /* for immediate: */
+               int32_t  iim_val;
+               uint32_t uim_val;
+               float    fim_val;
+       };
+
+       /* dword1: (field widths sum to 32 bits) */
+       uint32_t dst        : 8;
+       uint32_t repeat     : 3;
+       uint32_t src_r      : 1;
+       uint32_t ss         : 1;
+       uint32_t ul         : 1;
+       uint32_t dst_type   : 3;   /* 3 bits: wide enough for every type_t value (0..7) */
+       uint32_t dst_rel    : 1;
+       uint32_t src_type   : 3;   /* 3 bits: wide enough for every type_t value (0..7) */
+       uint32_t src_c      : 1;
+       uint32_t src_im     : 1;
+       uint32_t even       : 1;
+       uint32_t pos_inf    : 1;
+       uint32_t must_be_0  : 2;
+       uint32_t jmp_tgt    : 1;
+       uint32_t sync       : 1;
+       uint32_t opc_cat    : 3;   /* top 3 bits of dword1, as in the other instr_catN_t structs */
+} instr_cat1_t;
+
+typedef struct PACKED {          /* category 2 instruction encoding: two dwords, two sources */
+       /* dword0: two 16-bit halves (src1, then src2), each with three alternative views */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src1         : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src1_im      : 1;   /* immediate */
+                       uint32_t src1_neg     : 1;   /* negate */
+                       uint32_t src1_abs     : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src1         : 10;
+                       uint32_t src1_c       : 1;   /* relative-const */
+                       uint32_t src1_rel     : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel1;
+               struct PACKED {
+                       uint32_t src1         : 12;
+                       uint32_t src1_c       : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c1;
+       };
+
+       union PACKED {
+               struct PACKED {
+                       uint32_t src2         : 11;
+                       uint32_t must_be_zero2: 2;
+                       uint32_t src2_im      : 1;   /* immediate */
+                       uint32_t src2_neg     : 1;   /* negate */
+                       uint32_t src2_abs     : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src2         : 10;
+                       uint32_t src2_c       : 1;   /* relative-const */
+                       uint32_t src2_rel     : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel2;
+               struct PACKED {
+                       uint32_t src2         : 12;
+                       uint32_t src2_c       : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c2;
+       };
+
+       /* dword1: (field widths sum to 32 bits) */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 2;
+       uint32_t sat      : 1;
+       uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;   /* dunno */
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t ei       : 1;
+       uint32_t cond     : 3;
+       uint32_t src2_r   : 1;   /* doubles as nop1 if repeat==0 */
+       uint32_t full     : 1;   /* not half */
+       uint32_t opc      : 6;   /* the full NOPC_BITS (6) wide opcode field */
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;   /* top 3 bits of dword1, as in the other instr_catN_t structs */
+} instr_cat2_t;
+
+typedef struct PACKED {          /* category 3 instruction encoding: two dwords, three sources */
+       /* dword0: two 16-bit halves (src1, then src3); note src2 itself lives in dword1 */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src1         : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src2_c       : 1;
+                       uint32_t src1_neg     : 1;
+                       uint32_t src2_r       : 1;  /* doubles as nop1 if repeat==0 */
+               };
+               struct PACKED {
+                       uint32_t src1         : 10;
+                       uint32_t src1_c       : 1;
+                       uint32_t src1_rel     : 1;
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel1;
+               struct PACKED {
+                       uint32_t src1         : 12;
+                       uint32_t src1_c       : 1;
+                       uint32_t dummy        : 3;
+               } c1;
+       };
+
+       union PACKED {
+               struct PACKED {
+                       uint32_t src3         : 11;
+                       uint32_t must_be_zero2: 2;
+                       uint32_t src3_r       : 1;
+                       uint32_t src2_neg     : 1;
+                       uint32_t src3_neg     : 1;
+               };
+               struct PACKED {
+                       uint32_t src3         : 10;
+                       uint32_t src3_c       : 1;
+                       uint32_t src3_rel     : 1;
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel2;
+               struct PACKED {
+                       uint32_t src3         : 12;
+                       uint32_t src3_c       : 1;
+                       uint32_t dummy        : 3;
+               } c2;
+       };
+
+       /* dword1: (field widths sum to 32 bits) */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 2;
+       uint32_t sat      : 1;
+       uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t src2     : 8;   /* only 8 bits, narrower than the src1/src3 fields in dword0 */
+       uint32_t opc      : 4;   /* 4 bits: category 3 opcodes in opc_t run 0..15 */
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;   /* top 3 bits of dword1, as in the other instr_catN_t structs */
+} instr_cat3_t;
+
+/* Reports the "full" flag of a cat3 instruction, derived from its opcode
+ * alone: the opcodes singled out in the switch below yield false, every
+ * other cat3 opcode yields true.
+ */
+static inline bool instr_cat3_full(instr_cat3_t *cat3)
+{
+       bool full = true;
+
+       switch (_OPC(3, cat3->opc)) {
+       case OPC_MAD_F16:
+       case OPC_MAD_U16:
+       case OPC_MAD_S16:
+       case OPC_SEL_B16:
+       case OPC_SEL_S16:
+       case OPC_SEL_F16:
+       case OPC_SAD_S16:
+       case OPC_SAD_S32:  // really??
+               full = false;
+               break;
+       default:
+               break;
+       }
+
+       return full;
+}
+
+/* Bit-field view of a category 4 instruction (two 32-bit dwords).
+ *
+ * The single source operand occupies the low 16 bits of dword0 and can
+ * be read through the anonymous default layout, the relative-addressing
+ * layout (rel) or the const layout (c).  The upper 16 bits of dword0 are
+ * marked as ignored.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src          : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src_im       : 1;   /* immediate */
+                       uint32_t src_neg      : 1;   /* negate */
+                       uint32_t src_abs      : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src          : 10;
+                       uint32_t src_c        : 1;   /* relative-const */
+                       uint32_t src_rel      : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel;
+               struct PACKED {
+                       uint32_t src          : 12;
+                       uint32_t src_c        : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c;
+       };
+       uint32_t dummy1   : 16;  /* seem to be ignored */
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 2;
+       uint32_t sat      : 1;
+       uint32_t src_r    : 1;
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t dummy2   : 5;   /* seem to be ignored */
+       uint32_t full     : 1;   /* not half */
+       uint32_t opc      : 6;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat4_t;
+
+/* With is_s2en_bindless = 1, this determines whether bindless is enabled and
+ * if so, how to get the (base, index) pair for both sampler and texture.
+ * There is a single base embedded in the instruction, which is always used
+ * for the texture.
+ */
+typedef enum {
+       /* Use traditional GL binding model, get texture and sampler index
+        * from src3 which is not presumed to be uniform. This is
+        * backwards-compatible with earlier generations, where this field was
+        * always 0 and nonuniform-indexed sampling always worked.
+        */
+       CAT5_NONUNIFORM = 0,
+
+       /* The sampler base comes from the low 3 bits of a1.x, and the sampler
+        * and texture index come from src3 which is presumed to be uniform.
+        */
+       CAT5_BINDLESS_A1_UNIFORM = 1,
+
+       /* The texture and sampler share the same base, and the sampler and
+        * texture index come from src3 which is *not* presumed to be uniform.
+        */
+       CAT5_BINDLESS_NONUNIFORM = 2,
+
+       /* The sampler base comes from the low 3 bits of a1.x, and the sampler
+        * and texture index come from src3 which is *not* presumed to be
+        * uniform.
+        */
+       CAT5_BINDLESS_A1_NONUNIFORM = 3,
+
+       /* Use traditional GL binding model, get texture and sampler index
+        * from src3 which is presumed to be uniform.
+        */
+       CAT5_UNIFORM = 4,
+
+       /* The texture and sampler share the same base, and the sampler and
+        * texture index come from src3 which is presumed to be uniform.
+        */
+       CAT5_BINDLESS_UNIFORM = 5,
+
+       /* The texture and sampler share the same base, get sampler index from low
+        * 4 bits of src3 and texture index from high 4 bits.
+        */
+       CAT5_BINDLESS_IMM = 6,
+
+       /* The sampler base comes from the low 3 bits of a1.x, and the texture
+        * index comes from the next 8 bits of a1.x. The sampler index is an
+        * immediate in src3.
+        */
+       CAT5_BINDLESS_A1_IMM = 7,
+
+       /* The values above span 0..7, i.e. exactly what fits in the 3-bit
+        * s2en_bindless.desc_mode field of instr_cat5_t.
+        */
+} cat5_desc_mode_t;
+
+/* Bit-field view of a category 5 instruction (two 32-bit dwords).
+ *
+ * dword0 is a 32-bit union with three layouts: "norm" carries immediate
+ * samp/tex fields, "s2en_bindless" carries src3, desc_mode and the upper
+ * two bits of the bindless base (base_hi), and the anonymous layout exposes
+ * only the fields the other two have in common (full, src1, src2).
+ * base_lo in dword1 presumably supplies the remaining base bit -- confirm
+ * against the decoder.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               /* normal case: */
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t src2     : 8;
+                       uint32_t dummy1   : 4;   /* seem to be ignored */
+                       uint32_t samp     : 4;
+                       uint32_t tex      : 7;
+               } norm;
+               /* s2en case: */
+               struct PACKED {
+                       uint32_t full         : 1;   /* not half */
+                       uint32_t src1         : 8;
+                       uint32_t src2         : 8;
+                       uint32_t dummy1       : 2;
+                       uint32_t base_hi      : 2;
+                       uint32_t src3         : 8;
+                       uint32_t desc_mode    : 3;   /* presumably a cat5_desc_mode_t value */
+               } s2en_bindless;
+               /* same in either case: */
+               // XXX I think, confirm this
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t src2     : 8;
+                       uint32_t pad      : 15;
+               };
+       };
+
+       /* dword1: */
+       uint32_t dst              : 8;
+       uint32_t wrmask           : 4;   /* write-mask */
+       uint32_t type             : 3;
+       uint32_t base_lo          : 1;   /* used with bindless */
+       uint32_t is_3d            : 1;
+
+       uint32_t is_a             : 1;
+       uint32_t is_s             : 1;
+       uint32_t is_s2en_bindless : 1;
+       uint32_t is_o             : 1;
+       uint32_t is_p             : 1;
+
+       uint32_t opc              : 5;
+       uint32_t jmp_tgt          : 1;
+       uint32_t sync             : 1;
+       uint32_t opc_cat          : 3;
+} instr_cat5_t;
+
+/* dword0 encoding for src_off: [src1 + off], src2: */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t mustbe1  : 1;   /* same bit as src_off in instr_cat6_t */
+       int32_t  off      : 13;  /* declared signed, unlike the other fields */
+       uint32_t src1     : 8;
+       uint32_t src1_im  : 1;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t dword1;         /* not decoded by this view */
+} instr_cat6a_t;
+
+/* dword0 encoding for !src_off: [src1], src2 */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t mustbe0  : 1;   /* same bit as src_off in instr_cat6_t */
+       uint32_t src1     : 13;
+       uint32_t ignore0  : 8;
+       uint32_t src1_im  : 1;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t dword1;         /* not decoded by this view */
+} instr_cat6b_t;
+
+/* dword1 encoding for dst_off: */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t dword0;         /* not decoded by this view */
+
+       /* note: there is some weird stuff going on where sometimes
+        * cat6->a.off is involved.. but that seems like a bug in
+        * the blob, since it is used even if !cat6->src_off
+        * It would make sense for there to be some more bits to
+        * bring us to 11 bits worth of offset, but not sure..
+        */
+       int32_t off       : 8;   /* declared signed */
+       uint32_t mustbe1  : 1;   /* same bit as dst_off in instr_cat6_t */
+       uint32_t dst      : 8;
+       uint32_t pad1     : 15;
+} instr_cat6c_t;
+
+/* dword1 encoding for !dst_off: */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t dword0;         /* not decoded by this view */
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t mustbe0  : 1;   /* same bit as dst_off in instr_cat6_t */
+       uint32_t idx      : 8;
+       uint32_t pad0     : 15;
+} instr_cat6d_t;
+
+/* ldgb and atomics..
+ *
+ * ldgb:      pad0=0, pad3=1
+ * atomic .g: pad0=1, pad3=1
+ *        .l: pad0=1, pad3=0
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t pad0     : 1;
+       uint32_t src3     : 8;
+       uint32_t d        : 2;
+       uint32_t typed    : 1;
+       uint32_t type_size : 2;
+       uint32_t src1     : 8;
+       uint32_t src1_im  : 1;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t mustbe0  : 1;
+       uint32_t src_ssbo : 8;
+       uint32_t pad2     : 3;  // type
+       uint32_t g        : 1;
+       uint32_t pad3     : 1;
+       /* the top 10 bits line up with opc (5), jmp_tgt, sync and opc_cat (3)
+        * of the generic instr_cat6_t view:
+        */
+       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6ldgb_t;
+
+/* stgb, pad0=0, pad3=2
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t mustbe1  : 1;  // ???
+       uint32_t src1     : 8;
+       uint32_t d        : 2;
+       uint32_t typed    : 1;
+       uint32_t type_size : 2;
+       uint32_t pad0     : 9;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t src3     : 8;
+       uint32_t src3_im  : 1;
+       uint32_t dst_ssbo : 8;
+       uint32_t pad2     : 3;  // type
+       /* two bits here, covering the g and pad4 bits of instr_cat6_t: */
+       uint32_t pad3     : 2;
+       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6stgb_t;
+
+/* All views of a (legacy-encoding) category 6 instruction overlaid on the
+ * same two dwords.  The anonymous struct holds the fields common to every
+ * variant: src_off is bit 0 of dword0 (the mustbe1/mustbe0 bit of the a/b
+ * views) and dst_off is bit 8 of dword1 (the mustbe1/mustbe0 bit of the
+ * c/d views).
+ */
+typedef union PACKED {
+       instr_cat6a_t a;
+       instr_cat6b_t b;
+       instr_cat6c_t c;
+       instr_cat6d_t d;
+       instr_cat6ldgb_t ldgb;
+       instr_cat6stgb_t stgb;
+       struct PACKED {
+               /* dword0: */
+               uint32_t src_off  : 1;
+               uint32_t pad1     : 31;
+
+               /* dword1: */
+               uint32_t pad2     : 8;
+               uint32_t dst_off  : 1;
+               uint32_t pad3     : 8;
+               uint32_t type     : 3;
+               uint32_t g        : 1;  /* or in some cases it means dst immed */
+               uint32_t pad4     : 1;
+               uint32_t opc      : 5;
+               uint32_t jmp_tgt  : 1;
+               uint32_t sync     : 1;
+               uint32_t opc_cat  : 3;
+       };
+} instr_cat6_t;
+
+/* Similar to cat5_desc_mode_t, describes how the descriptor is loaded.
+ */
+typedef enum {
+       /* Use old GL binding model with an immediate index. */
+       CAT6_IMM = 0,
+
+       /* NOTE(review): undocumented in the original; by analogy with
+        * CAT6_BINDLESS_UNIFORM this is presumably the old GL binding model
+        * with a uniform register index -- confirm.
+        */
+       CAT6_UNIFORM = 1,
+
+       /* NOTE(review): undocumented in the original; by analogy with
+        * CAT6_BINDLESS_NONUNIFORM this is presumably the old GL binding
+        * model with a possibly non-uniform register index -- confirm.
+        */
+       CAT6_NONUNIFORM = 2,
+
+       /* Use the bindless model, with an immediate index.
+        */
+       CAT6_BINDLESS_IMM = 4,
+
+       /* Use the bindless model, with a uniform register index.
+        */
+       CAT6_BINDLESS_UNIFORM = 5,
+
+       /* Use the bindless model, with a register index that isn't guaranteed
+        * to be uniform. This presumably checks if the indices are equal and
+        * splits up the load/store, because it works the way you would
+        * expect.
+        */
+       CAT6_BINDLESS_NONUNIFORM = 6,
+
+       /* Values 3 and 7 are not assigned here. */
+} cat6_desc_mode_t;
+
+/**
+ * For atomic ops (which return a value):
+ *
+ *    pad1=1, pad3=c, pad5=3
+ *    src1    - vecN offset/coords
+ *    src2.x  - is actually dest register
+ *    src2.y  - is 'data' except for cmpxchg where src2.y is 'compare'
+ *              and src2.z is 'data'
+ *
+ * For stib (which does not return a value):
+ *    pad1=0, pad3=c, pad5=2
+ *    src1    - vecN offset/coords
+ *    src2    - value to store
+ *
+ * For ldib:
+ *    pad1=1, pad3=c, pad5=2
+ *    src1    - vecN offset/coords
+ *
+ * for ldc (load from UBO using descriptor):
+ *    pad1=0, pad3=8, pad5=2
+ *
+ * pad2 and pad4 are only observed to be 0.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t pad1     : 1;
+       uint32_t base     : 3;
+       uint32_t pad2     : 2;
+       uint32_t desc_mode : 3;  /* presumably a cat6_desc_mode_t value */
+       uint32_t d        : 2;
+       uint32_t typed    : 1;
+       uint32_t type_size : 2;
+       uint32_t opc      : 5;
+       uint32_t pad3     : 5;   /* bit 0x8 is tested by is_cat6_legacy() */
+       uint32_t src1     : 8;  /* coordinate/offset */
+
+       /* dword1: */
+       uint32_t src2     : 8;  /* or the dst for load instructions */
+       uint32_t pad4     : 1;  //mustbe0 ??
+       uint32_t ssbo     : 8;  /* ssbo/image binding point */
+       uint32_t type     : 3;
+       uint32_t pad5     : 7;   /* bit 0x2 is tested by is_cat6_legacy() */
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat6_a6xx_t;
+
+/* Bit-field view of a category 7 instruction (two 32-bit dwords).  All of
+ * dword0 is padding in this view; every decoded field is in dword1.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t pad1     : 32;
+
+       /* dword1: */
+       uint32_t pad2     : 12;
+       uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
+       uint32_t pad3     : 6;
+       uint32_t w        : 1;  /* write */
+       uint32_t r        : 1;  /* read */
+       uint32_t l        : 1;  /* local */
+       uint32_t g        : 1;  /* global */
+       uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
+       uint32_t jmp_tgt  : 1;  /* (jp) */
+       uint32_t sync     : 1;  /* (sy) */
+       uint32_t opc_cat  : 3;
+} instr_cat7_t;
+
+/* One instruction, viewable through any of the per-category layouts.
+ * The anonymous struct exposes the bits that sit at the same position in
+ * the per-category layouts; in particular opc_cat (the top 3 bits of
+ * dword1) is what the helpers below switch on to pick a view.
+ */
+typedef union PACKED {
+       instr_cat0_t cat0;
+       instr_cat1_t cat1;
+       instr_cat2_t cat2;
+       instr_cat3_t cat3;
+       instr_cat4_t cat4;
+       instr_cat5_t cat5;
+       instr_cat6_t cat6;
+       instr_cat6_a6xx_t cat6_a6xx;
+       instr_cat7_t cat7;
+       struct PACKED {
+               /* dword0: */
+               uint32_t pad1     : 32;
+
+               /* dword1: */
+               uint32_t pad2     : 12;
+               uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
+               uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+               uint32_t pad3     : 13;
+               uint32_t jmp_tgt  : 1;
+               uint32_t sync     : 1;
+               uint32_t opc_cat  : 3;
+
+       };
+} instr_t;
+
+/* Fetches the repeat field of an instruction.  Only categories 0-4 are
+ * consulted; any other category reports 0.
+ */
+static inline uint32_t instr_repeat(instr_t *instr)
+{
+       uint32_t rpt = 0;
+
+       switch (instr->opc_cat) {
+       case 0:
+               rpt = instr->cat0.repeat;
+               break;
+       case 1:
+               rpt = instr->cat1.repeat;
+               break;
+       case 2:
+               rpt = instr->cat2.repeat;
+               break;
+       case 3:
+               rpt = instr->cat3.repeat;
+               break;
+       case 4:
+               rpt = instr->cat4.repeat;
+               break;
+       default:
+               break;
+       }
+
+       return rpt;
+}
+
+/* Fetches the sat bit of an instruction.  Only categories 2-4 are
+ * consulted; any other category reports false.
+ */
+static inline bool instr_sat(instr_t *instr)
+{
+       bool sat = false;
+
+       switch (instr->opc_cat) {
+       case 2:
+               sat = instr->cat2.sat;
+               break;
+       case 3:
+               sat = instr->cat3.sat;
+               break;
+       case 4:
+               sat = instr->cat4.sat;
+               break;
+       default:
+               break;
+       }
+
+       return sat;
+}
+
+/* We can probably drop the gpu_id arg, but keeping it for now so we can
+ * assert if we see something we think should be new encoding on an older
+ * gpu.
+ */
+static inline bool is_cat6_legacy(instr_t *instr, unsigned gpu_id)
+{
+       instr_cat6_a6xx_t *a6xx = &instr->cat6_a6xx;
+
+       /* In every possible "legacy" cat6 encoding at least one of these
+        * two bits is padding, and an analysis of all the pre-a6xx
+        * cmdstream traces available to the author showed the pad bit to
+        * be zero in all cases.  Both bits being set therefore identifies
+        * the new encoding:
+        */
+       bool new_encoding = (a6xx->pad3 & 0x8) && (a6xx->pad5 & 0x2);
+
+       if (!new_encoding)
+               return true;
+
+       /* sanity-check what we believe about the new encoding: */
+       ir3_assert(gpu_id >= 600);
+       ir3_assert(instr->cat6.opc == 0);
+
+       return false;
+}
+
+/* Extracts the category-local opcode number of an instruction.
+ *
+ * cat0 assembles it from the opc and opc_hi fields, cat6 reads it from
+ * either the legacy or the a6xx layout depending on is_cat6_legacy(), and
+ * cat1 (like any unknown category) always reports 0.
+ */
+static inline uint32_t instr_opc(instr_t *instr, unsigned gpu_id)
+{
+       switch (instr->opc_cat) {
+       case 0:
+               return instr->cat0.opc | instr->cat0.opc_hi << 4;
+       case 2:
+               return instr->cat2.opc;
+       case 3:
+               return instr->cat3.opc;
+       case 4:
+               return instr->cat4.opc;
+       case 5:
+               return instr->cat5.opc;
+       case 6:
+               return is_cat6_legacy(instr, gpu_id) ?
+                       instr->cat6.opc : instr->cat6_a6xx.opc;
+       case 7:
+               return instr->cat7.opc;
+       case 1:
+       default:
+               return 0;
+       }
+}
+
+/* True for the six OPC_MAD_* opcodes (u16, s16, u24, s24, f16, f32). */
+static inline bool is_mad(opc_t opc)
+{
+       return opc == OPC_MAD_U16 || opc == OPC_MAD_S16 ||
+              opc == OPC_MAD_U24 || opc == OPC_MAD_S24 ||
+              opc == OPC_MAD_F16 || opc == OPC_MAD_F32;
+}
+
+/* True for OPC_MADSH_U16 and OPC_MADSH_M16 only. */
+static inline bool is_madsh(opc_t opc)
+{
+       return opc == OPC_MADSH_U16 || opc == OPC_MADSH_M16;
+}
+
+/* True for any of the eleven OPC_ATOMIC_* opcodes listed below. */
+static inline bool is_atomic(opc_t opc)
+{
+       return opc == OPC_ATOMIC_ADD ||
+              opc == OPC_ATOMIC_SUB ||
+              opc == OPC_ATOMIC_XCHG ||
+              opc == OPC_ATOMIC_INC ||
+              opc == OPC_ATOMIC_DEC ||
+              opc == OPC_ATOMIC_CMPXCHG ||
+              opc == OPC_ATOMIC_MIN ||
+              opc == OPC_ATOMIC_MAX ||
+              opc == OPC_ATOMIC_AND ||
+              opc == OPC_ATOMIC_OR ||
+              opc == OPC_ATOMIC_XOR;
+}
+
+/* True for OPC_RESFMT, OPC_RESINFO, OPC_LDGB, OPC_STGB and OPC_STIB. */
+static inline bool is_ssbo(opc_t opc)
+{
+       return opc == OPC_RESFMT || opc == OPC_RESINFO ||
+              opc == OPC_LDGB || opc == OPC_STGB || opc == OPC_STIB;
+}
+
+/* True for OPC_ISAM, OPC_ISAML and OPC_ISAMM. */
+static inline bool is_isam(opc_t opc)
+{
+       return opc == OPC_ISAM || opc == OPC_ISAML || opc == OPC_ISAMM;
+}
+
+
+/* True for the cat2 opcodes carrying an _F suffix that are listed below. */
+static inline bool is_cat2_float(opc_t opc)
+{
+       return opc == OPC_ADD_F ||
+              opc == OPC_MIN_F ||
+              opc == OPC_MAX_F ||
+              opc == OPC_MUL_F ||
+              opc == OPC_SIGN_F ||
+              opc == OPC_CMPS_F ||
+              opc == OPC_ABSNEG_F ||
+              opc == OPC_CMPV_F ||
+              opc == OPC_FLOOR_F ||
+              opc == OPC_CEIL_F ||
+              opc == OPC_RNDNE_F ||
+              opc == OPC_RNDAZ_F ||
+              opc == OPC_TRUNC_F;
+}
+
+/* True for OPC_MAD_F16, OPC_MAD_F32, OPC_SEL_F16 and OPC_SEL_F32. */
+static inline bool is_cat3_float(opc_t opc)
+{
+       return opc == OPC_MAD_F16 || opc == OPC_MAD_F32 ||
+              opc == OPC_SEL_F16 || opc == OPC_SEL_F32;
+}
+
+/* Disassembler entry point; only declared here, the definition lives in
+ * another file.  Presumably decodes sizedwords dwords starting at dwords
+ * and prints them to out, with level as an indentation/verbosity hint --
+ * confirm against the implementation.
+ */
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id);
+
+#endif /* INSTR_A3XX_H_ */
diff --git a/src/freedreno/decode/io.c b/src/freedreno/decode/io.c
new file mode 100644 (file)
index 0000000..5fc5752
--- /dev/null
@@ -0,0 +1,163 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <archive.h>
+#include <archive_entry.h>
+
+#include "io.h"
+
+/* State of one open input stream. */
+struct io {
+       struct archive *a;            /* libarchive reader, created in io_new() */
+       struct archive_entry *entry;  /* header filled in by archive_read_next_header() */
+       unsigned offset;              /* bytes handed out so far by io_readn() */
+};
+
+/* Reports libarchive's current error string for this handle on stderr and
+ * then destroys the handle with io_close().  The pointer must not be used
+ * by the caller afterwards.
+ */
+static void io_error(struct io *io)
+{
+       const char *msg = archive_error_string(io->a);
+
+       fprintf(stderr, "%s\n", msg);
+       io_close(io);
+}
+
+/* Allocates an io handle and configures its libarchive reader to accept
+ * gzip-compressed or uncompressed input, in any supported archive format
+ * as well as raw (non-archive) data.
+ *
+ * Returns the new handle, or NULL on failure; on failure everything that
+ * was allocated has already been released.
+ */
+static struct io * io_new(void)
+{
+       /* reader setup steps, applied in order; each must return ARCHIVE_OK */
+       int (* const setup[])(struct archive *) = {
+               archive_read_support_filter_gzip,
+               archive_read_support_filter_none,
+               archive_read_support_format_all,
+               archive_read_support_format_raw,
+       };
+       struct io *io = calloc(1, sizeof(*io));
+       unsigned i;
+
+       if (!io)
+               return NULL;
+
+       io->a = archive_read_new();
+       if (!io->a) {
+               /* Without an archive object there is no error string to
+                * print and nothing for io_close() to free, so clean up by
+                * hand instead of going through io_error().
+                */
+               fprintf(stderr, "failed to allocate archive reader\n");
+               free(io);
+               return NULL;
+       }
+
+       for (i = 0; i < sizeof(setup) / sizeof(setup[0]); i++) {
+               if (setup[i](io->a) != ARCHIVE_OK) {
+                       io_error(io);   /* also frees io */
+                       return NULL;
+               }
+       }
+
+       return io;
+}
+
+/* Opens the named file for reading and positions the reader on its first
+ * archive entry.  Returns NULL if the handle cannot be created, the file
+ * cannot be opened, or the first header cannot be read (in the latter two
+ * cases the libarchive error is printed to stderr).
+ */
+struct io * io_open(const char *filename)
+{
+       struct io *io = io_new();
+
+       if (!io)
+               return NULL;
+
+       /* the second call is only attempted when the open succeeded */
+       if (archive_read_open_filename(io->a, filename, 10240) != ARCHIVE_OK ||
+           archive_read_next_header(io->a, &io->entry) != ARCHIVE_OK) {
+               io_error(io);
+               return NULL;
+       }
+
+       return io;
+}
+
+/* Same as io_open(), but reads from an already-open file descriptor.
+ * Returns NULL if the handle cannot be created, the descriptor cannot be
+ * attached, or the first header cannot be read (in the latter two cases
+ * the libarchive error is printed to stderr).
+ */
+struct io * io_openfd(int fd)
+{
+       struct io *io = io_new();
+
+       if (!io)
+               return NULL;
+
+       /* the second call is only attempted when the open succeeded */
+       if (archive_read_open_fd(io->a, fd, 10240) != ARCHIVE_OK ||
+           archive_read_next_header(io->a, &io->entry) != ARCHIVE_OK) {
+               io_error(io);
+               return NULL;
+       }
+
+       return io;
+}
+
+/* Releases the libarchive reader and then the handle itself.  io must be
+ * a valid handle; it is dangling once this returns.
+ */
+void io_close(struct io *io)
+{
+       struct archive *reader = io->a;
+
+       archive_read_free(reader);
+       free(io);
+}
+
+/* Returns the running count of bytes delivered by io_readn() on this
+ * handle (zero for a freshly opened handle, since it is calloc'ed).
+ */
+unsigned io_offset(struct io *io)
+{
+       return io->offset;
+}
+
+#include <assert.h>
+/* Reads up to nbytes bytes of entry data into buf, looping until the
+ * request is satisfied or the data runs out.
+ *
+ * Returns the number of bytes stored, which is less than nbytes only when
+ * the end of the data was reached.  If libarchive reports an error, its
+ * message is printed to stderr and the negative libarchive code is
+ * returned; bytes read before the error stay counted in io->offset.
+ */
+int io_readn(struct io *io, void *buf, int nbytes)
+{
+       char *dst = buf;
+       int remaining = nbytes;
+       int total = 0;
+
+       while (remaining > 0) {
+               int got = archive_read_data(io->a, dst, remaining);
+
+               if (got < 0) {
+                       fprintf(stderr, "%s\n", archive_error_string(io->a));
+                       return got;
+               }
+
+               if (got == 0)
+                       break;   /* end of data */
+
+               dst += got;
+               remaining -= got;
+               total += got;
+               io->offset += got;
+       }
+
+       return total;
+}
diff --git a/src/freedreno/decode/io.h b/src/freedreno/decode/io.h
new file mode 100644 (file)
index 0000000..d26ba4b
--- /dev/null
@@ -0,0 +1,51 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IO_H_
+#define IO_H_
+
+/* Simple API to abstract reading from file which might be compressed.
+ * Maybe someday I'll add writing..
+ */
+
+/* Opaque reader handle; the layout is private to io.c. */
+struct io;
+
+/* Open by file name / by file descriptor.  Both return NULL on failure. */
+struct io * io_open(const char *filename);
+struct io * io_openfd(int fd);
+/* Destroy a handle returned by io_open()/io_openfd(). */
+void io_close(struct io *io);
+/* Number of bytes returned so far by io_readn() on this handle. */
+unsigned io_offset(struct io *io);
+/* Read up to nbytes bytes into buf; returns the byte count (short only at
+ * end of data) or a negative value on error.
+ */
+int io_readn(struct io *io, void *buf, int nbytes);
+
+
+/* Returns nonzero when path ends with the suffix ext, zero otherwise.
+ *
+ * A path shorter than ext can never match; that case is rejected up front
+ * so that no pointer before the start of path is ever formed.  An empty
+ * ext matches every path.  Both arguments must be NUL-terminated strings.
+ */
+static inline int
+check_extension(const char *path, const char *ext)
+{
+       size_t path_len = strlen(path);
+       size_t ext_len = strlen(ext);
+
+       if (ext_len > path_len)
+               return 0;
+
+       return strcmp(path + (path_len - ext_len), ext) == 0;
+}
+
+#endif /* IO_H_ */
diff --git a/src/freedreno/decode/meson.build b/src/freedreno/decode/meson.build
new file mode 100644 (file)
index 0000000..0ec9995
--- /dev/null
@@ -0,0 +1,144 @@
+# Copyright © 2020 Google, Inc
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Optional dependencies.  Lua is looked up under three pkg names in turn
+# (lua53, lua52, then plain lua); the first one found wins.  Neither Lua
+# nor libarchive is required: the targets that need them are simply
+# skipped further down when they are missing.
+dep_lua = dependency('lua53', required: false)
+if not dep_lua.found()
+  dep_lua = dependency('lua52', required: false)
+endif
+if not dep_lua.found()
+  dep_lua = dependency('lua', required: false)
+endif
+
+dep_libarchive = dependency('libarchive', required: false)
+
+# Shared cmdstream decoding:
+# (static library linked into every tool below; not built by default, so
+# it is only compiled when one of those tools is)
+libfreedreno_cffdec = static_library(
+  'freedreno_cffdec',
+  [
+    'buffers.c',
+    'buffers.h',
+    'cffdec.c',
+    'cffdec.h',
+    'disasm-a2xx.c',
+    'disasm-a3xx.c',
+    'disasm.h',
+    'instr-a2xx.h',
+    'instr-a3xx.h',
+    'pager.c',
+    'pager.h',
+    'rnnutil.c',
+    'rnnutil.h',
+    'util.h',
+  ],
+  include_directories: [
+    inc_freedreno_rnn,
+  ],
+  c_args : [ no_override_init_args ],
+  gnu_symbol_visibility: 'hidden',
+  dependencies: [],
+  link_with: libfreedreno_rnn,
+  build_by_default: false,
+)
+
+# File-reading helper (io.c), built on libarchive and therefore only
+# defined when libarchive was found.  Every user of libfreedreno_io below
+# is guarded by the same condition.
+if dep_libarchive.found()
+  libfreedreno_io = static_library(
+    'libfreedreno_io',
+    [
+      'io.c',
+      'io.h',
+    ],
+    include_directories: [],
+    c_args : [no_override_init_args],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [
+      dep_libarchive,
+    ],
+    build_by_default: false,
+  )
+endif
+
+# cffdump needs both Lua (script.c) and libarchive (via libfreedreno_io).
+# It is built and installed only when 'freedreno' is among the requested
+# tools.
+if dep_lua.found() and dep_libarchive.found()
+  cffdump = executable(
+    'cffdump',
+    [
+      'cffdump.c',
+      'script.c',
+      'script.h'
+    ],
+    include_directories: [
+      inc_freedreno_rnn,
+    ],
+    c_args : [no_override_init_args],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [
+      dep_lua,
+    ],
+    link_with: [
+      libfreedreno_cffdec,
+      libfreedreno_io,
+    ],
+    build_by_default: with_tools.contains('freedreno'),
+    install : with_tools.contains('freedreno'),
+  )
+endif
+
+# crashdec links only against the shared decoder, so it has no optional
+# dependency guard.  Built and installed when 'freedreno' is among the
+# requested tools.
+crashdec = executable(
+  'crashdec',
+  'crashdec.c',
+  include_directories: [
+    inc_freedreno_rnn,
+  ],
+  gnu_symbol_visibility: 'hidden',
+  dependencies: [],
+  link_with: [
+    libfreedreno_cffdec,
+  ],
+  build_by_default: with_tools.contains('freedreno'),
+  install : with_tools.contains('freedreno'),
+)
+
+# pgmdump and pgmdump2 read their input through libfreedreno_io, hence the
+# libarchive guard.  They are built together with the other freedreno
+# tools but never installed.
+if dep_libarchive.found()
+  pgmdump = executable(
+    'pgmdump',
+    'pgmdump.c',
+    include_directories: [],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [],
+    link_with: [
+      libfreedreno_cffdec,
+      libfreedreno_io,
+    ],
+    build_by_default: with_tools.contains('freedreno'),
+    install: false,
+  )
+  pgmdump2 = executable(
+    'pgmdump2',
+    'pgmdump2.c',
+    include_directories: [],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [],
+    link_with: [
+      libfreedreno_cffdec,
+      libfreedreno_io,
+    ],
+    build_by_default: with_tools.contains('freedreno'),
+    install: false,
+  )
+endif
diff --git a/src/freedreno/decode/pager.c b/src/freedreno/decode/pager.c
new file mode 100644 (file)
index 0000000..fa07c10
--- /dev/null
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pager.h"
+
+/* pid of the forked pager child: set by pager_open(), waited on by pager_close() */
+static pid_t pager_pid;
+
+
+/* SIGCHLD handler installed by pager_open(): ends this process with exit
+ * status 0.  The signal number n is not used.
+ * NOTE(review): exit() is not on POSIX's list of async-signal-safe
+ * functions -- confirm whether _exit() would be acceptable here.
+ */
+static void
+pager_death(int n)
+{
+       exit(0);
+}
+
+/* Forks a pager and redirects this process's stdout into it.
+ *
+ * The child connects the read end of a pipe to its stdin and execs "less"
+ * (with LESS=FRSMKX).  The parent connects the write end to its stdout and
+ * installs a SIGCHLD handler so that it terminates when the pager exits.
+ *
+ * Exits the process with status -1 if the pipe or the fork fails.
+ */
+void
+pager_open(void)
+{
+       int fd[2];
+
+       if (pipe(fd) < 0) {
+               fprintf(stderr, "Failed to create pager pipe: %m\n");
+               exit(-1);
+       }
+
+       pager_pid = fork();
+       if (pager_pid < 0) {
+               fprintf(stderr, "Failed to fork pager: %m\n");
+               exit(-1);
+       }
+
+       if (pager_pid == 0) {
+               /* child: read the parent's output from the pipe */
+               dup2(fd[0], STDIN_FILENO);
+               close(fd[0]);
+               close(fd[1]);
+
+               setenv("LESS", "FRSMKX", 1);
+
+               /* the variadic sentinel must be a null *pointer* */
+               execlp("less", "less", (char *)NULL);
+
+               /* exec only returns on failure (e.g. less is not installed).
+                * Fall back to a plain pass-through so the output is still
+                * shown, and never let the child return into the caller's
+                * code as a second copy of the program.
+                */
+               execlp("cat", "cat", (char *)NULL);
+
+               fprintf(stderr, "Failed to exec pager: %m\n");
+               _exit(127);
+       }
+
+       /* parent: we want to kill the parent process when pager exits: */
+       signal(SIGCHLD, pager_death);
+       dup2(fd[1], STDOUT_FILENO);
+       close(fd[0]);
+       close(fd[1]);
+}
+
+/* Finishes the output to the pager and waits for the pager to exit.
+ *
+ * Anything still sitting in stdout's stdio buffer is flushed first, since
+ * it could no longer be written once the descriptor is closed.  Closing
+ * the descriptor then closes our write end of the pipe.
+ *
+ * Returns 0 once the pager child has exited, or a negative errno value if
+ * waiting failed for a reason other than EINTR.
+ */
+int
+pager_close(void)
+{
+       siginfo_t status;
+
+       fflush(stdout);
+       close(STDOUT_FILENO);
+
+       while (true) {
+               memset(&status, 0, sizeof(status));
+               if (waitid(P_PID, pager_pid, &status, WEXITED) < 0) {
+                       if (errno == EINTR)
+                               continue;   /* interrupted by a signal: retry */
+                       return -errno;
+               }
+
+               return 0;
+       }
+}
diff --git a/src/freedreno/decode/pager.h b/src/freedreno/decode/pager.h
new file mode 100644 (file)
index 0000000..022786e
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __PAGER_H__
+#define __PAGER_H__
+
+void pager_open(void);  /* fork "less" and redirect this process's stdout into it */
+int pager_close(void);  /* close stdout and wait for the pager; returns 0 or -errno */
+
+#endif /* __PAGER_H__ */
diff --git a/src/freedreno/decode/pgmdump.c b/src/freedreno/decode/pgmdump.c
new file mode 100644 (file)
index 0000000..b8d7cd3
--- /dev/null
@@ -0,0 +1,1054 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "io.h"
+
+#define ASCII_XOR 0xff
+#include "util.h"
+
+struct pgm_header {  /* first section of the input; dump_program() stores it in state->hdr */
+       uint32_t size;
+       uint32_t unknown1;
+       uint32_t unknown2;
+       uint32_t revision;  /* compared against fixed values by the parsers to pick header sizes and extra sections */
+       uint32_t unknown4;
+       uint32_t unknown5;
+       uint32_t unknown6;
+       uint32_t unknown7;
+       uint32_t unknown8;
+       uint32_t num_attribs;  /* number of attribute sections dump_program() reads */
+       uint32_t num_uniforms;  /* number of uniform sections dump_program() reads */
+       uint32_t num_samplers;  /* number of sampler sections dump_program() reads */
+       uint32_t num_varyings;  /* number of varying sections dump_program() reads */
+       uint32_t num_uniformblocks;  /* number of uniform block headers dump_program() reads */
+};
+
+struct vs_header {  /* section read before each vertex shader by dump_shaders_a2xx() */
+       uint32_t unknown1;  /* seems to be # of sections up to and including shader */
+       uint32_t unknown2;  /* seems to be low byte or so of SQ_PROGRAM_CNTL */
+       uint32_t unknown3;
+       uint32_t unknown4;
+       uint32_t unknown5;
+       uint32_t unknown6;
+       uint32_t unknown7;
+       uint32_t unknown8;
+       uint32_t unknown9;  /* seems to be # of sections following shader */
+};
+
+struct fs_header {  /* section read before the fragment shader by dump_shaders_a2xx() */
+       uint32_t unknown1;  /* dump_shaders_a2xx() reads unknown1 - 1 constant sections after this header */
+};
+/*
+       // Covers a lot of type_info
+       // varying, attribute, uniform, sampler
+       type_info & 0xFF
+       if ((type_info >> 8) == 0x8b) // vector
+               0x50 = vec2
+               0x51 = vec3
+               0x52 = vec4
+               0x53 = ivec2
+               0x54 = ivec3
+               0x55 = ivec4
+               0x56 = bool // Why is this in vector?
+               0x57 = bvec2
+               0x58 = bvec3
+               0x59 = bvec4
+               0x5a = mat2
+               0x5b = mat3
+               0x5c = mat4
+               0x5a = mat2x2 // Same as mat2
+               0x65 = mat2x3
+               0x66 = mat2x4
+               0x67 = mat3x2
+               0x5b = mat3x3 // Same as mat3
+               0x68 = mat3x4
+               0x69 = mat4x2
+               0x6a = mat4x3
+               0x5c = mat4x4 // same as mat4
+               0x5e = sampler2D
+               0x5f = sampler3D
+               0x60 = samplerCube // XXX: Doesn't work
+               0x62 = sampler2DShadow
+               0xc6 = uvec2
+               0xc7 = uvec3
+               0xc8 = uvec4
+       else if ((type_info >> 8) == 0x8d) // GLES3 samplers
+               0xC1 = sampler2DArray
+               0xC4 = sampler2DArrayShadow
+               0xC5 = samplerCubeShadow
+               0xCA = isampler2D
+               0xCB = isampler3D
+               0xCC = isamplerCube
+               0xD2 = usampler2D
+               0xD3 = usampler3D
+               0xD4 = usamplerCube
+               0xD7 = isampler2DArray
+               0xD7 = usampler2DArray // Is the same as isampler2DArray?
+       else // 0x14 = single
+               0x04 = int
+               0x05 = uint
+               0x06 = float
+*/
+struct attribute {  /* layout of one attribute section, printed by dump_attribute() */
+       uint32_t type_info;
+       uint32_t reg;       /* seems to be the register the fetch instruction loads to */
+       uint32_t const_idx; /* the CONST() indx value for sampler */
+       uint32_t unknown2;
+       uint32_t unknown3;
+       uint32_t unknown4;
+       uint32_t unknown5;
+       char name[];  /* rest of the section; dump_program() runs clean_ascii() over it before printing */
+};
+
+struct uniform {  /* layout of one uniform section, printed by dump_uniform() */
+       uint32_t type_info;
+       uint32_t unknown2;
+       uint32_t unknown3;
+       uint32_t unknown4;
+       uint32_t const_base; /* const base register (for uniforms that take more than one const reg, ie. matrices) */
+       uint32_t unknown6;
+       uint32_t const_reg; /* the const register holding the value */
+       uint32_t unknown7;
+       uint32_t unknown8;
+       uint32_t unknown9;
+       union {  /* two observed layouts; is_uniform_v2() guesses which one applies */
+               struct {
+                       char name[1];
+               } v1;
+               struct {
+                       uint32_t unknown10;  /* is_uniform_v2() treats a zero here as the v2 layout */
+                       uint32_t unknown11;
+                       uint32_t unknown12;
+                       char name[];
+               } v2;
+       };
+};
+
+struct uniformblockmember {  /* layout of one uniform block member section, printed by dump_uniformblockmember() */
+       uint32_t type_info;
+       uint32_t is_array;
+       uint32_t array_size; /* elements in the array */
+       uint32_t unknown2; /* Same as array_size */
+       uint32_t unknown3; /* Seems to be a offset within UBO in vertex (by components) */
+       uint32_t unknown4;
+       uint32_t unknown5; /* Seems to be a offset within UBO in fragment (by vec4) */
+       uint32_t unknown6;
+       uint32_t unknown7;
+       uint32_t unknown8;
+       uint32_t unknown9; /* UBO block index? */
+       uint32_t unknown10;
+       uint32_t unknown11;
+       uint32_t unknown12;
+       char name[];  /* rest of the section; dump_program() runs clean_ascii() over it before printing */
+};
+
+struct uniformblock  /* layout of one uniform block header section, printed by dump_uniformblock() */
+{
+       uint32_t type_info;
+       uint32_t unknown1;
+       uint32_t unknown2;
+       uint32_t unknown3;
+       uint32_t unknown4;
+       uint32_t num_members;  /* dump_program() reads this many member sections, twice, after the header */
+       uint32_t num_members2;
+       uint32_t unknown5;
+       uint32_t unknown6;
+       uint32_t unknown7;
+       char name[];  /* rest of the section; dump_program() runs clean_ascii() over it before printing */
+};
+
+
+struct sampler {  /* layout of one sampler section, printed by dump_sampler() */
+       uint32_t type_info;
+       uint32_t is_array;
+       uint32_t array_size; /* elements in the array */
+       uint32_t unknown4; /* same as array_size */
+       uint32_t unknown5;
+       uint32_t unknown6;
+       uint32_t const_idx; /* the CONST() indx value for the sampler */
+       uint32_t unknown7;
+       char name[];  /* rest of the section; dump_program() runs clean_ascii() over it before printing */
+};
+
+struct varying {  /* layout of one varying section, printed by dump_varying() */
+       uint32_t type_info;
+       uint32_t unknown2;
+       uint32_t unknown3;
+       uint32_t reg;       /* the register holding the value (on entry to the shader) */
+       char name[];  /* rest of the section; dump_program() runs clean_ascii() over it before printing */
+};
+
+struct output {  /* layout of the output section dump_program() reads when revision >= 7 */
+       uint32_t type_info;
+       uint32_t unknown2;
+       uint32_t unknown3;
+       uint32_t unknown4;
+       uint32_t unknown5;
+       uint32_t unknown6;
+       uint32_t unknown7;
+       uint32_t unknown8;
+       char name[];  /* rest of the section; dump_program() runs clean_ascii() over it before printing */
+};
+
+struct constant {  /* layout of one compiler-constant section, printed by dump_constant() */
+       uint32_t unknown1;
+       uint32_t unknown2;  /* dump_short_summary() only prints constants where this is zero */
+       uint32_t unknown3;
+       uint32_t const_idx;  /* printed as the C<n> register number */
+       float val[];  /* dump_constant() prints the first four values */
+};
+
+/*
+ * Parser state: the not-yet-consumed input plus pointers to the sections
+ * decoded so far (each one a heap copy returned by next_sect()).
+ */
+struct state {
+       char *buf;  /* next unread byte of the input; advanced by next_sect() */
+       int sz;     /* bytes left at buf; reduced by next_sect() */
+       struct pgm_header *hdr;
+       struct attribute *attribs[32];  /* don't really know the upper limit.. */
+       struct uniform *uniforms[32];
+       struct sampler *samplers[32];
+       struct varying *varyings[32];
+       struct {
+               struct uniformblock *header;
+               struct uniformblockmember **members; /* GL ES 3.0 spec mandates minimum 16K support. a3xx supports 65K */
+       } uniformblocks[24]; /* Maximum a330 supports */
+       /*
+        * One slot, not a zero-length array: dump_program() stores into
+        * outputs[0], which with a [0] declaration lands past the end of
+        * the struct.
+        */
+       struct output  *outputs[1];  /* I guess only one?? */
+};
+
+static const char *infile;  /* input path; dump_raw_shader() derives its output file names from it */
+static int full_dump = 1;  /* non-zero: parsers print section headers and hex dumps; zero: short summaries */
+static int dump_shaders = 0;  /* non-zero: dump_raw_shader() writes each shader to a file */
+static int gpu_id;  /* dump_program() takes the a3xx path when this is >= 300 */
+
+/*
+ * Locate the first section terminator in buf[0..sz).
+ *
+ * The terminator is the 32-bit little-endian word 0xba5eba11 and may sit at
+ * any byte offset.  Returns a pointer to its first byte, or NULL when the
+ * buffer holds no complete terminator (including sz < 4).
+ */
+static char *find_sect_end(char *buf, int sz)
+{
+       int off;
+
+       /* need one whole word; also avoids forming a pointer before buf */
+       if (!buf || sz < 4)
+               return NULL;
+
+       for (off = 0; off <= sz - 4; off++) {
+               const uint8_t *p = (const uint8_t *)buf + off;
+               uint32_t d;
+
+               /*
+                * widen each byte before shifting: a promoted int shifted
+                * into bit 31 would be signed overflow
+                */
+               d  = (uint32_t)p[0] <<  0;
+               d |= (uint32_t)p[1] <<  8;
+               d |= (uint32_t)p[2] << 16;
+               d |= (uint32_t)p[3] << 24;
+
+               /* someone at QC likes baseball */
+               if (d == 0xba5eba11)
+                       return buf + off;
+       }
+       return NULL;
+}
+
+/*
+ * Pop the next section off the input.
+ *
+ * A section is everything up to the next terminator found by
+ * find_sect_end().  Its bytes are copied into a fresh zero-padded heap
+ * buffer (caller frees), its length is stored in *sect_size, and
+ * state->buf / state->sz are advanced past the section and terminator.
+ *
+ * Returns NULL, with *sect_size set to 0 and the state untouched, when no
+ * terminator is left.  Exits with a message if memory cannot be allocated.
+ */
+static void *next_sect(struct state *state, int *sect_size)
+{
+       char *end = find_sect_end(state->buf, state->sz);
+       size_t alloc_sz;
+       void *sect;
+
+       if (!end) {
+               /* give callers that print the size something defined */
+               *sect_size = 0;
+               return NULL;
+       }
+
+       *sect_size = end - state->buf;
+
+       /*
+        * copy the section to keep things nicely 32b aligned.  Never ask
+        * for zero bytes (an empty section would otherwise be
+        * indistinguishable from allocation failure), and zero the padding
+        * so readers of whole dwords do not see heap garbage:
+        */
+       alloc_sz = ALIGN(*sect_size, 4);
+       if (alloc_sz == 0)
+               alloc_sz = 4;
+       sect = calloc(1, alloc_sz);
+       if (!sect) {
+               fprintf(stderr, "out of memory copying %d byte section\n", *sect_size);
+               exit(1);
+       }
+       memcpy(sect, state->buf, *sect_size);
+
+       state->sz -= *sect_size + 4;
+       state->buf = end + 4;
+
+       return sect;
+}
+
+/*
+ * Tell whether bits 8..15 of type_info hold one of the known type classes.
+ * Returns 1 for a known class, 0 otherwise.
+ */
+static int valid_type(uint32_t type_info)
+{
+       const uint32_t type_class = (type_info >> 8) & 0xff;
+
+       if (type_class == 0x8b)         /* vector */
+               return 1;
+       if (type_class == 0x8d)         /* GLES3 samplers */
+               return 1;
+       if (type_class == 0x14)         /* float */
+               return 1;
+
+       return 0;
+}
+
+#if 0  /* compiled out */
+static int valid_uniformblock(uint32_t type_info)
+{
+       if (type_info == 0x128)  /* the only type_info value accepted */
+               return 1;
+       return 0;
+}
+#endif
+
+/* Print one attribute as "R<reg>, CONST(<const_idx>): <name>". */
+static void dump_attribute(struct attribute *attrib)
+{
+       const uint32_t dst_reg = attrib->reg;
+       const uint32_t fetch_const = attrib->const_idx;
+
+       printf("\tR%d, CONST(%d): %s\n", dst_reg, fetch_const, attrib->name);
+}
+
+/*
+ * Guess which of the two uniform layouts a section uses: returns 1 (v2)
+ * when the v2 view's unknown10 word is zero, 0 (v1) otherwise.
+ */
+static inline int is_uniform_v2(struct uniform *uniform)
+{
+       /* TODO maybe this should be based on revision #? */
+       return uniform->v2.unknown10 == 0;
+}
+
+/*
+ * Print one uniform with its const register.  When const_reg is all-ones
+ * (-1) the base register is printed instead, marked with a '+'.
+ */
+static void dump_uniform(struct uniform *uniform)
+{
+       char *label;
+
+       if (is_uniform_v2(uniform))
+               label = uniform->v2.name;
+       else
+               label = uniform->v1.name;
+
+       if (uniform->const_reg != -1) {
+               printf("\tC%d: %s\n", uniform->const_reg, label);
+               return;
+       }
+
+       printf("\tC%d+: %s\n", uniform->const_base, label);
+}
+
+/* Print one sampler as "CONST(<const_idx>): <name>". */
+static void dump_sampler(struct sampler *sampler)
+{
+       const uint32_t slot = sampler->const_idx;
+
+       printf("\tCONST(%d): %s\n", slot, sampler->name);
+}
+
+/* Print one varying as "R<reg>: <name>". */
+static void dump_varying(struct varying *varying)
+{
+       const uint32_t src_reg = varying->reg;
+
+       printf("\tR%d: %s\n", src_reg, varying->name);
+}
+
+/* Print a uniform block's name followed by its member count in parentheses. */
+static void dump_uniformblock(struct uniformblock *uniformblock)
+{
+       const uint32_t count = uniformblock->num_members;
+
+       printf("\tUniform Block: %s(%d)\n", uniformblock->name, count);
+}
+
+/* Print the name of one uniform block member. */
+static void dump_uniformblockmember(struct uniformblockmember *member)
+{
+       const char *label = member->name;
+
+       printf("Uniform Block member: %s\n", label);
+}
+
+/* Print the name of a shader output; its register is not known ("R?"). */
+static void dump_output(struct output *output)
+{
+       const char *label = output->name;
+
+       printf("\tR?: %s\n", label);
+}
+
+/* Print a constant section as "C<const_idx>:" followed by its first four floats. */
+static void dump_constant(struct constant *constant)
+{
+       const float *v = constant->val;
+
+       printf("\tC%d: %f, %f, %f, %f\n", constant->const_idx,
+                       v[0], v[1], v[2], v[3]);
+}
+
+/*
+ * Short-form listing used when full_dump is off: varyings, attributes,
+ * uniforms and samplers recorded in state (as many as the program header
+ * counts), then the entries of constants[] below index nconsts - 1 whose
+ * unknown2 word is zero, then a blank line.
+ */
+static void dump_short_summary(struct state *state, int nconsts,
+               struct constant **constants)
+{
+       struct pgm_header *hdr = state->hdr;
+       int idx;
+
+       for (idx = 0; idx < hdr->num_varyings; idx++)
+               dump_varying(state->varyings[idx]);
+
+       for (idx = 0; idx < hdr->num_attribs; idx++)
+               dump_attribute(state->attribs[idx]);
+
+       for (idx = 0; idx < hdr->num_uniforms; idx++)
+               dump_uniform(state->uniforms[idx]);
+
+       for (idx = 0; idx < hdr->num_samplers; idx++)
+               dump_sampler(state->samplers[idx]);
+
+       for (idx = 0; idx < nconsts - 1; idx++) {
+               if (constants[idx]->unknown2 != 0)
+                       continue;
+               dump_constant(constants[idx]);
+       }
+
+       printf("\n");
+}
+
+/*
+ * When dump_shaders is set, write a shader's raw dwords to a file named
+ * "<infile minus its last 3 characters>-<n>.<ext>".
+ *
+ * dwords/sizedwords: the shader words and their count
+ * n:                 index used in the file name
+ * ext:               file name extension
+ *
+ * Failures (name too long, open or write error) are reported on stderr and
+ * the dump is skipped; nothing is returned to the caller.
+ */
+static void dump_raw_shader(uint32_t *dwords, uint32_t sizedwords, int n, char *ext)
+{
+       static char filename[256];
+       size_t stem_len, nbytes;
+       ssize_t written;
+       int fd, len;
+
+       if (!dump_shaders)
+               return;
+
+       /* drop the 3-character suffix of the input name when it has one */
+       stem_len = strlen(infile);
+       if (stem_len >= 3)
+               stem_len -= 3;
+
+       /* bounded: infile is caller-supplied and may be arbitrarily long */
+       len = snprintf(filename, sizeof(filename), "%.*s-%d.%s",
+                       (int)stem_len, infile, n, ext);
+       if (len < 0 || (size_t)len >= sizeof(filename)) {
+               fprintf(stderr, "shader dump file name too long for %s\n", infile);
+               return;
+       }
+
+       fd = open(filename, O_WRONLY | O_TRUNC | O_CREAT, 0644);
+       if (fd < 0) {
+               perror(filename);
+               return;
+       }
+
+       nbytes = (size_t)sizedwords * 4;
+       written = write(fd, dwords, nbytes);
+       if (written < 0)
+               perror(filename);
+       else if ((size_t)written != nbytes)
+               fprintf(stderr, "%s: short write\n", filename);
+
+       /* the descriptor used to be leaked on every call */
+       close(fd);
+}
+
+/* Capacity of the per-shader constants[] arrays in dump_shaders_a2xx(). */
+enum { A2XX_MAX_CONSTS = 32 };
+
+/* Bytes skipped at the start of an a2xx shader section before disassembly. */
+enum { A2XX_SHADER_SKIP = 32 };
+
+/*
+ * Turn a header's "sections up to and including the shader" word into the
+ * number of constant sections before the shader.  Returns -1 when the count
+ * does not fit in constants[].
+ */
+static int const_count_a2xx(uint32_t nsections)
+{
+       if (nsections == 0)
+               return 0;
+       if (nsections - 1 > A2XX_MAX_CONSTS)
+               return -1;
+       return (int)(nsections - 1);
+}
+
+/* Free the first 'count' sections stored in constants[]. */
+static void free_consts_a2xx(struct constant **constants, int count)
+{
+       int j;
+
+       for (j = 0; j < count; j++)
+               free(constants[j]);
+}
+
+/*
+ * Read up to 'want' constant sections for shader <tag><idx> into
+ * constants[], printing each one in full_dump mode.  Returns how many were
+ * read, which is fewer than 'want' only if the input ran out.
+ */
+static int read_consts_a2xx(struct state *state, struct constant **constants,
+               int want, const char *tag, int idx)
+{
+       int j, sect_size;
+
+       for (j = 0; j < want; j++) {
+               constants[j] = next_sect(state, &sect_size);
+               if (!constants[j])
+                       break;
+               if (full_dump) {
+                       printf("######## %s%d CONST: (size=%d)\n", tag, idx, sect_size);
+                       dump_constant(constants[j]);
+                       dump_hex((char *)constants[j], sect_size);
+               }
+       }
+       return j;
+}
+
+/*
+ * Decode and print the a2xx shader sections: three vertex shaders followed
+ * by one fragment shader.  Each consists of a header section, the constant
+ * sections the header announces, and the shader section itself, which is
+ * disassembled and optionally written out by dump_raw_shader().  Vertex
+ * shaders are followed by vs_header.unknown9 further sections.
+ *
+ * Stops with a message on stderr when a section is missing or too short
+ * for the way it is about to be read.
+ */
+static void dump_shaders_a2xx(struct state *state)
+{
+       int i, sect_size;
+       uint8_t *ptr;
+
+       /* dump vertex shaders: */
+       for (i = 0; i < 3; i++) {
+               struct vs_header *vs_hdr = next_sect(state, &sect_size);
+               struct constant *constants[A2XX_MAX_CONSTS];
+               uint32_t k;
+               int nconsts, level = 0;
+
+               if (!vs_hdr || sect_size < (int)sizeof(*vs_hdr)) {
+                       fprintf(stderr, "VS%d: missing or truncated header\n", i);
+                       free(vs_hdr);
+                       return;
+               }
+
+               printf("\n");
+
+               if (full_dump) {
+                       printf("#######################################################\n");
+                       printf("######## VS%d HEADER: (size %d)\n", i, sect_size);
+                       dump_hex((void *)vs_hdr, sect_size);
+               }
+
+               /*
+                * Computed once and kept signed: redoing "unknown1 - 1" in
+                * unsigned arithmetic wraps to a huge count when unknown1
+                * is zero.
+                */
+               nconsts = const_count_a2xx(vs_hdr->unknown1);
+               if (nconsts < 0) {
+                       fprintf(stderr, "VS%d: implausible section count %u\n",
+                                       i, vs_hdr->unknown1);
+                       free(vs_hdr);
+                       return;
+               }
+               nconsts = read_consts_a2xx(state, constants, nconsts, "VS", i);
+
+               ptr = next_sect(state, &sect_size);
+               if (!ptr || sect_size < A2XX_SHADER_SKIP) {
+                       fprintf(stderr, "VS%d: missing or truncated shader section\n", i);
+                       free(ptr);
+                       free_consts_a2xx(constants, nconsts);
+                       free(vs_hdr);
+                       return;
+               }
+               printf("######## VS%d SHADER: (size=%d)\n", i, sect_size);
+               if (full_dump) {
+                       dump_hex(ptr, sect_size);
+                       level = 1;
+               } else {
+                       dump_short_summary(state, nconsts, constants);
+               }
+               disasm_a2xx((uint32_t *)(ptr + A2XX_SHADER_SKIP),
+                               (sect_size - A2XX_SHADER_SKIP) / 4, level+1, SHADER_VERTEX);
+               dump_raw_shader((uint32_t *)(ptr + A2XX_SHADER_SKIP),
+                               (sect_size - A2XX_SHADER_SKIP) / 4, i, "vo");
+               free(ptr);
+
+               for (k = 0; k < vs_hdr->unknown9; k++) {
+                       ptr = next_sect(state, &sect_size);
+                       if (!ptr)
+                               break;
+                       if (full_dump) {
+                               printf("######## VS%d CONST?: (size=%d)\n", i, sect_size);
+                               dump_hex(ptr, sect_size);
+                       }
+                       free(ptr);
+               }
+
+               free_consts_a2xx(constants, nconsts);
+               free(vs_hdr);
+       }
+
+       /* dump fragment shaders: */
+       for (i = 0; i < 1; i++) {
+               struct fs_header *fs_hdr = next_sect(state, &sect_size);
+               struct constant *constants[A2XX_MAX_CONSTS];
+               int nconsts, level = 0;
+
+               if (!fs_hdr || sect_size < (int)sizeof(*fs_hdr)) {
+                       fprintf(stderr, "FS%d: missing or truncated header\n", i);
+                       free(fs_hdr);
+                       return;
+               }
+
+               printf("\n");
+
+               if (full_dump) {
+                       printf("#######################################################\n");
+                       printf("######## FS%d HEADER: (size %d)\n", i, sect_size);
+                       dump_hex((void *)fs_hdr, sect_size);
+               }
+
+               nconsts = const_count_a2xx(fs_hdr->unknown1);
+               if (nconsts < 0) {
+                       fprintf(stderr, "FS%d: implausible section count %u\n",
+                                       i, fs_hdr->unknown1);
+                       free(fs_hdr);
+                       return;
+               }
+               nconsts = read_consts_a2xx(state, constants, nconsts, "FS", i);
+
+               ptr = next_sect(state, &sect_size);
+               if (!ptr || sect_size < A2XX_SHADER_SKIP) {
+                       fprintf(stderr, "FS%d: missing or truncated shader section\n", i);
+                       free(ptr);
+                       free_consts_a2xx(constants, nconsts);
+                       free(fs_hdr);
+                       return;
+               }
+               printf("######## FS%d SHADER: (size=%d)\n", i, sect_size);
+               if (full_dump) {
+                       dump_hex(ptr, sect_size);
+                       level = 1;
+               } else {
+                       dump_short_summary(state, nconsts, constants);
+               }
+               disasm_a2xx((uint32_t *)(ptr + A2XX_SHADER_SKIP),
+                               (sect_size - A2XX_SHADER_SKIP) / 4, level+1, SHADER_FRAGMENT);
+               dump_raw_shader((uint32_t *)(ptr + A2XX_SHADER_SKIP),
+                               (sect_size - A2XX_SHADER_SKIP) / 4, i, "fo");
+               free(ptr);
+
+               free_consts_a2xx(constants, nconsts);
+               free(fs_hdr);
+       }
+}
+
+/* Capacity of the per-shader constants[] arrays in dump_shaders_a3xx(). */
+enum { A3XX_MAX_CONSTS = 32 };
+
+/* Free the first 'count' sections stored in constants[]. */
+static void free_consts_a3xx(struct constant **constants, int count)
+{
+       int j;
+
+       for (j = 0; j < count; j++)
+               free(constants[j]);
+}
+
+/*
+ * Consume sections until one shows up whose size is neither 32 nor 44
+ * bytes; that section holds the instructions and is returned (caller
+ * frees) with its size in *instrs_size.  The 32/44 byte sections before it
+ * are echoed with dump_hex_ascii() and recorded in constants[] and
+ * const_sizes[].
+ *
+ * Returns NULL when the input runs out or more than A3XX_MAX_CONSTS
+ * constant sections turn up; *nconsts is valid either way.
+ */
+static uint8_t *read_consts_a3xx(struct state *state,
+               struct constant **constants, int *const_sizes,
+               int *nconsts, int *instrs_size)
+{
+       int sect_size;
+
+       *nconsts = 0;
+       while (1) {
+               void *ptr = next_sect(state, &sect_size);
+
+               if (!ptr)
+                       return NULL;
+
+               if ((sect_size != 32) && (sect_size != 44)) {
+                       /* end of constants: */
+                       *instrs_size = sect_size;
+                       return ptr;
+               }
+
+               if (*nconsts == A3XX_MAX_CONSTS) {
+                       fprintf(stderr, "more than %d compiler const sections\n",
+                                       A3XX_MAX_CONSTS);
+                       free(ptr);
+                       return NULL;
+               }
+
+               dump_hex_ascii(ptr, sect_size, 0);
+               const_sizes[*nconsts] = sect_size;
+               constants[(*nconsts)++] = ptr;
+       }
+}
+
+/*
+ * full_dump listing of the constant sections of shader <tag><idx>, using
+ * each section's recorded size (the code used to print and dump
+ * sizeof(pointer) bytes instead).
+ */
+static void dump_consts_a3xx(const char *tag, int idx,
+               struct constant **constants, const int *const_sizes, int nconsts)
+{
+       int j;
+
+       for (j = 0; j < nconsts; j++) {
+               printf("######## %s%d CONST: (size=%d)\n", tag, idx, const_sizes[j]);
+               dump_constant(constants[j]);
+               dump_hex((char *)constants[j], const_sizes[j]);
+       }
+}
+
+/*
+ * Decode and print the a3xx shader sections: two vertex shaders followed by
+ * one fragment shader.  Each starts with a header section whose expected
+ * size depends on the program revision.  A larger section is "compact":
+ * the instructions follow the header inside the same section.  Otherwise
+ * zero or more compiler-constant sections and then a separate instruction
+ * section follow.  The instructions are disassembled and optionally written
+ * out by dump_raw_shader().
+ *
+ * Stops with a message on stderr when a section is missing.
+ */
+static void dump_shaders_a3xx(struct state *state)
+{
+       int i;
+
+       /* dump vertex shaders: */
+       for (i = 0; i < 2; i++) {
+               int instrs_size = 0, hdr_size, nconsts = 0, level = 0, compact = 0;
+               int const_sizes[A3XX_MAX_CONSTS];
+               struct constant *constants[A3XX_MAX_CONSTS];
+               uint8_t *vs_hdr;
+               uint8_t *instrs = NULL;
+               uint8_t *instrs_sect = NULL;  /* separately allocated instruction section, if any */
+               int n;
+
+               vs_hdr = next_sect(state, &hdr_size);
+               if (!vs_hdr) {
+                       fprintf(stderr, "VS%d: missing header section\n", i);
+                       return;
+               }
+               printf("hdr_size=%d\n", hdr_size);
+
+               /* seems like there are two cases, either:
+                *  1) 152 byte header,
+                *  2) zero or more 32 byte compiler const sections
+                *  3) followed by shader instructions
+                * or, if there are no compiler consts, this can be
+                * all smashed in one large section
+                */
+               if (state->hdr->revision >= 0xb)
+                       n = 160;
+               else if (state->hdr->revision >= 7)
+                       n = 156;
+               else
+                       n = 152;
+               if (hdr_size > n) {
+                       instrs = &vs_hdr[n];
+                       instrs_size = hdr_size - n;
+                       hdr_size = n;
+                       compact = 1;
+               } else {
+                       instrs_sect = read_consts_a3xx(state, constants,
+                                       const_sizes, &nconsts, &instrs_size);
+                       if (!instrs_sect) {
+                               fprintf(stderr, "VS%d: missing instruction section\n", i);
+                               free_consts_a3xx(constants, nconsts);
+                               free(vs_hdr);
+                               return;
+                       }
+                       instrs = instrs_sect;
+               }
+
+               printf("\n");
+
+               if (full_dump) {
+                       printf("#######################################################\n");
+                       printf("######## VS%d HEADER: (size %d)\n", i, hdr_size);
+                       dump_hex((void *)vs_hdr, hdr_size);
+                       dump_consts_a3xx("VS", i, constants, const_sizes, nconsts);
+               }
+
+               printf("######## VS%d SHADER: (size=%d)\n", i, instrs_size);
+               if (full_dump) {
+                       dump_hex(instrs, instrs_size);
+                       level = 1;
+               } else {
+                       dump_short_summary(state, nconsts, constants);
+               }
+
+               if (!compact) {
+                       /*
+                        * NOTE(review): for revision >= 7 this both advances
+                        * the pointer by the alignment padding and rounds the
+                        * size up, so the range can extend past the section.
+                        * Kept as found; confirm against real dumps.
+                        */
+                       if (state->hdr->revision >= 7) {
+                               instrs += ALIGN(instrs_size, 8) - instrs_size;
+                               instrs_size = ALIGN(instrs_size, 8);
+                       }
+                       instrs += 32;
+                       instrs_size -= 32;
+               }
+
+               if (instrs_size >= 0) {
+                       /*
+                        * NOTE(review): this call passes SHADER_VERTEX in the
+                        * argument position where the fragment path below
+                        * passes stdout; one of the two is presumably wrong.
+                        * Check disasm_a3xx()'s prototype in disasm.h.
+                        */
+                       disasm_a3xx((uint32_t *)instrs, instrs_size / 4, level+1, SHADER_VERTEX, gpu_id);
+                       dump_raw_shader((uint32_t *)instrs, instrs_size / 4, i, "vo3");
+               } else {
+                       fprintf(stderr, "VS%d: instruction section too short\n", i);
+               }
+
+               free_consts_a3xx(constants, nconsts);
+               free(instrs_sect);
+               free(vs_hdr);
+       }
+
+       /* dump fragment shaders: */
+       for (i = 0; i < 1; i++) {
+               int instrs_size = 0, hdr_size, nconsts = 0, level = 0, compact = 0;
+               int const_sizes[A3XX_MAX_CONSTS];
+               struct constant *constants[A3XX_MAX_CONSTS];
+               uint8_t *fs_hdr;
+               uint8_t *instrs = NULL;
+               uint8_t *instrs_sect = NULL;  /* separately allocated instruction section, if any */
+               int n;
+
+               fs_hdr = next_sect(state, &hdr_size);
+               if (!fs_hdr) {
+                       fprintf(stderr, "FS%d: missing header section\n", i);
+                       return;
+               }
+               printf("hdr_size=%d\n", hdr_size);
+
+               /* two cases, similar to vertex shader, but magic # is 200
+                * (or 208 for newer?)..
+                */
+               if (state->hdr->revision >= 0xb)
+                       n = 256;
+               else if (state->hdr->revision >= 8)
+                       n = 208;
+               else if (state->hdr->revision == 7)
+                       n = 204;
+               else
+                       n = 200;
+
+               if (hdr_size > n) {
+                       instrs = &fs_hdr[n];
+                       instrs_size = hdr_size - n;
+                       hdr_size = n;
+                       compact = 1;
+               } else {
+                       instrs_sect = read_consts_a3xx(state, constants,
+                                       const_sizes, &nconsts, &instrs_size);
+                       if (!instrs_sect) {
+                               fprintf(stderr, "FS%d: missing instruction section\n", i);
+                               free_consts_a3xx(constants, nconsts);
+                               free(fs_hdr);
+                               return;
+                       }
+                       instrs = instrs_sect;
+               }
+
+               printf("\n");
+
+               if (full_dump) {
+                       printf("#######################################################\n");
+                       printf("######## FS%d HEADER: (size %d)\n", i, hdr_size);
+                       dump_hex((void *)fs_hdr, hdr_size);
+                       dump_consts_a3xx("FS", i, constants, const_sizes, nconsts);
+               }
+
+               printf("######## FS%d SHADER: (size=%d)\n", i, instrs_size);
+               if (full_dump) {
+                       dump_hex(instrs, instrs_size);
+                       level = 1;
+               } else {
+                       dump_short_summary(state, nconsts, constants);
+               }
+
+               if (!compact) {
+                       if (state->hdr->revision >= 7) {
+                               instrs += 44;
+                               instrs_size -= 44;
+                       } else {
+                               instrs += 32;
+                               instrs_size -= 32;
+                       }
+               }
+
+               if (instrs_size >= 0) {
+                       disasm_a3xx((uint32_t *)instrs, instrs_size / 4, level+1, stdout, gpu_id);
+                       dump_raw_shader((uint32_t *)instrs, instrs_size / 4, i, "fo3");
+               } else {
+                       fprintf(stderr, "FS%d: instruction section too short\n", i);
+               }
+
+               free_consts_a3xx(constants, nconsts);
+               free(instrs_sect);
+               free(fs_hdr);
+       }
+}
+
+static void dump_program(struct state *state)
+{
+       int i, sect_size;
+       uint8_t *ptr;
+
+       state->hdr = next_sect(state, &sect_size);
+
+       printf("######## HEADER: (size %d)\n", sect_size);
+       printf("\tsize:           %d\n", state->hdr->size);
+       printf("\trevision:       %d\n", state->hdr->revision);
+       printf("\tattributes:     %d\n", state->hdr->num_attribs);
+       printf("\tuniforms:       %d\n", state->hdr->num_uniforms);
+       printf("\tsamplers:       %d\n", state->hdr->num_samplers);
+       printf("\tvaryings:       %d\n", state->hdr->num_varyings);
+       printf("\tuniform blocks: %d\n", state->hdr->num_uniformblocks);
+       if (full_dump)
+               dump_hex((void *)state->hdr, sect_size);
+       printf("\n");
+
+       /* there seems to be two 0xba5eba11's at the end of the header, possibly
+        * with some other stuff between them:
+        */
+       ptr = next_sect(state, &sect_size);
+       if (full_dump) {
+               dump_hex_ascii(ptr, sect_size, 0);
+       }
+
+       for (i = 0; (i < state->hdr->num_attribs) && (state->sz > 0); i++) {
+               state->attribs[i] = next_sect(state, &sect_size);
+
+               /* hmm, for a3xx (or maybe just newer driver version), we have some
+                * extra sections that don't seem useful, so skip these:
+                */
+               while (!valid_type(state->attribs[i]->type_info)) {
+                       dump_hex_ascii(state->attribs[i], sect_size, 0);
+                       state->attribs[i] = next_sect(state, &sect_size);
+               }
+
+               clean_ascii(state->attribs[i]->name, sect_size - 28);
+               if (full_dump) {
+                       printf("######## ATTRIBUTE: (size %d)\n", sect_size);
+                       dump_attribute(state->attribs[i]);
+                       dump_hex((char *)state->attribs[i], sect_size);
+               }
+       }
+
+       for (i = 0; (i < state->hdr->num_uniforms) && (state->sz > 0); i++) {
+               state->uniforms[i] = next_sect(state, &sect_size);
+
+               /* hmm, for a3xx (or maybe just newer driver version), we have some
+                * extra sections that don't seem useful, so skip these:
+                */
+               while (!valid_type(state->uniforms[i]->type_info)) {
+                       dump_hex_ascii(state->uniforms[i], sect_size, 0);
+                       state->uniforms[i] = next_sect(state, &sect_size);
+               }
+
+               if (is_uniform_v2(state->uniforms[i])) {
+                       clean_ascii(state->uniforms[i]->v2.name, sect_size - 53);
+               } else {
+                       clean_ascii(state->uniforms[i]->v1.name, sect_size - 41);
+               }
+
+               if (full_dump) {
+                       printf("######## UNIFORM: (size %d)\n", sect_size);
+                       dump_uniform(state->uniforms[i]);
+                       dump_hex((char *)state->uniforms[i], sect_size);
+               }
+       }
+
+       for (i = 0; (i < state->hdr->num_samplers) && (state->sz > 0); i++) {
+               state->samplers[i] = next_sect(state, &sect_size);
+
+               /* hmm, for a3xx (or maybe just newer driver version), we have some
+                * extra sections that don't seem useful, so skip these:
+                */
+               while (!valid_type(state->samplers[i]->type_info)) {
+                       dump_hex_ascii(state->samplers[i], sect_size, 0);
+                       state->samplers[i] = next_sect(state, &sect_size);
+               }
+
+               clean_ascii(state->samplers[i]->name, sect_size - 33);
+               if (full_dump) {
+                       printf("######## SAMPLER: (size %d)\n", sect_size);
+                       dump_sampler(state->samplers[i]);
+                       dump_hex((char *)state->samplers[i], sect_size);
+               }
+
+       }
+
+       // These sections show up after all of the other sampler sections
+       // Loops through them all since we don't deal with them
+       if (state->hdr->revision >= 7) {
+               for (i = 0; (i < state->hdr->num_samplers) && (state->sz > 0); i++) {
+                       ptr = next_sect(state, &sect_size);
+                       dump_hex_ascii(ptr, sect_size, 0);
+               }
+       }
+
+
+       for (i = 0; (i < state->hdr->num_varyings) && (state->sz > 0); i++) {
+               state->varyings[i] = next_sect(state, &sect_size);
+
+               /* hmm, for a3xx (or maybe just newer driver version), we have some
+                * extra sections that don't seem useful, so skip these:
+                */
+               while (!valid_type(state->varyings[i]->type_info)) {
+                       dump_hex_ascii(state->varyings[i], sect_size, 0);
+                       state->varyings[i] = next_sect(state, &sect_size);
+               }
+
+               clean_ascii(state->varyings[i]->name, sect_size - 16);
+               if (full_dump) {
+                       printf("######## VARYING: (size %d)\n", sect_size);
+                       dump_varying(state->varyings[i]);
+                       dump_hex((char *)state->varyings[i], sect_size);
+               }
+       }
+
+       /* show up again for revision >= 14?? */
+       if (state->hdr->revision >= 14) {
+               for (i = 0; (i < state->hdr->num_varyings) && (state->sz > 0); i++) {
+                       ptr = next_sect(state, &sect_size);
+                       dump_hex_ascii(ptr, sect_size, 0);
+               }
+       }
+
+       /* not sure exactly which revision started this, but seems at least
+        * rev7 and rev8 implicitly include a new section for gl_FragColor:
+        */
+       if (state->hdr->revision >= 7) {
+               /* I guess only one? */
+               state->outputs[0] = next_sect(state, &sect_size);
+
+               clean_ascii(state->outputs[0]->name, sect_size - 32);
+               if (full_dump) {
+                       printf("######## OUTPUT: (size %d)\n", sect_size);
+                       dump_output(state->outputs[0]);
+                       dump_hex((char *)state->outputs[0], sect_size);
+               }
+       }
+
+       for (i = 0; (i < state->hdr->num_uniformblocks) && (state->sz > 0); i++) {
+               state->uniformblocks[i].header = next_sect(state, &sect_size);
+
+               clean_ascii(state->uniformblocks[i].header->name, sect_size - 40);
+               if (full_dump) {
+                       printf("######## UNIFORM BLOCK: (size %d)\n", sect_size);
+                       dump_uniformblock(state->uniformblocks[i].header);
+                       dump_hex((char *)state->uniformblocks[i].header, sect_size);
+               }
+
+               /*
+                * OpenGL ES 3.0 spec mandates a minimum amount of 16K members supported
+                * a330 supports a minimum of 65K
+                */
+               state->uniformblocks[i].members = malloc(state->uniformblocks[i].header->num_members * sizeof(void*));
+
+               int member = 0;
+               for (member = 0; (member < state->uniformblocks[i].header->num_members) && (state->sz > 0); member++) {
+                       state->uniformblocks[i].members[member] = next_sect(state, &sect_size);
+
+                       clean_ascii(state->uniformblocks[i].members[member]->name, sect_size - 56);
+                       if (full_dump) {
+                               printf("######## UNIFORM BLOCK MEMBER: (size %d)\n", sect_size);
+                               dump_uniformblockmember(state->uniformblocks[i].members[member]);
+                               dump_hex((char *)state->uniformblocks[i].members[member], sect_size);
+                       }
+               }
+               /*
+                * Qualcomm saves the UBO members twice for each UBO
+                * Don't ask me why
+                */
+               for (member = 0; (member < state->uniformblocks[i].header->num_members) && (state->sz > 0); member++) {
+                       state->uniformblocks[i].members[member] = next_sect(state, &sect_size);
+
+                       clean_ascii(state->uniformblocks[i].members[member]->name, sect_size - 56);
+                       if (full_dump) {
+                               printf("######## UNIFORM BLOCK MEMBER2: (size %d)\n", sect_size);
+                               dump_uniformblockmember(state->uniformblocks[i].members[member]);
+                               dump_hex((char *)state->uniformblocks[i].members[member], sect_size);
+                       }
+               }
+       }
+
+       if (gpu_id >= 300) {
+               dump_shaders_a3xx(state);
+       } else {
+               dump_shaders_a2xx(state);
+       }
+
+       if (!full_dump)
+               return;
+
+       /* dump ascii version of shader program: */
+       ptr = next_sect(state, &sect_size);
+       printf("\n#######################################################\n");
+       printf("######## SHADER SRC: (size=%d)\n", sect_size);
+       dump_ascii(ptr, sect_size);
+       free(ptr);
+
+       /* dump remaining sections (there shouldn't be any): */
+       while (state->sz > 0) {
+               ptr = next_sect(state, &sect_size);
+               printf("######## section (size=%d)\n", sect_size);
+               printf("as hex:\n");
+               dump_hex(ptr, sect_size);
+               printf("as float:\n");
+               dump_float(ptr, sect_size);
+               printf("as ascii:\n");
+               dump_ascii(ptr, sect_size);
+               free(ptr);
+       }
+       /* cleanup the uniform buffer members we allocated */
+       if (state->hdr->num_uniformblocks > 0)
+               free (state->uniformblocks[i].members);
+}
+
+/*
+ * pgmdump entry point.
+ *
+ * Leading "--" options are consumed first; exactly one input file name must
+ * remain.  The input is then handled in one of three ways:
+ *
+ *   - with --raw: the file is a 4-byte size followed by a program binary,
+ *     which is handed to dump_program()
+ *   - without a .rd/.rd.gz extension: the file is a bare shader binary
+ *     (.vo/.fo go to disasm_a2xx(), .vo3/.fo3/.co3 go to disasm_a3xx())
+ *   - otherwise: the file is a sequence of (type, size, payload) sections
+ *
+ * Returns 0 on success and -1 on usage or I/O errors; for a bare shader
+ * binary the return value of the disassembler is passed through.
+ */
+int main(int argc, char **argv)
+{
+       enum rd_sect_type type = RD_NONE;
+       enum debug_t debug = 0;
+       void *buf = NULL;
+       int sz;
+       struct io *io;
+       int raw_program = 0;
+
+       /* lame argument parsing: strip recognized options off the front */
+       while (argc > 1) {
+               const char *arg = argv[1];
+
+               if (!strcmp(arg, "--verbose")) {
+                       debug |= PRINT_RAW | PRINT_VERBOSE;
+               } else if (!strcmp(arg, "--expand")) {
+                       debug |= EXPAND_REPEAT;
+               } else if (!strcmp(arg, "--short")) {
+                       /* only short dump, original shader, symbol table, and disassembly */
+                       full_dump = 0;
+               } else if (!strcmp(arg, "--dump-shaders")) {
+                       dump_shaders = 1;
+               } else if (!strcmp(arg, "--raw")) {
+                       raw_program = 1;
+               } else if (!strcmp(arg, "--gpu300")) {
+                       gpu_id = 320;
+               } else {
+                       break;
+               }
+               argv++;
+               argc--;
+       }
+
+       if (argc != 2) {
+               fprintf(stderr, "usage: pgmdump [--verbose] [--expand] [--short] "
+                               "[--dump-shaders] [--raw] [--gpu300] testlog.rd\n");
+               return -1;
+       }
+
+       disasm_set_debug(debug);
+
+       infile = argv[1];
+
+       io = io_open(infile);
+       if (!io) {
+               fprintf(stderr, "could not open: %s\n", infile);
+               return -1;
+       }
+
+       if (raw_program) {
+               /* a failed read would leave sz uninitialized, and a negative
+                * size would turn into a huge allocation request:
+                */
+               if ((io_readn(io, &sz, 4) <= 0) || (sz < 0)) {
+                       fprintf(stderr, "could not read program size: %s\n", infile);
+                       return -1;
+               }
+
+               /* The payload is stored after a 4-byte copy of the size, so the
+                * buffer needs sz + 4 bytes.
+                *
+                * note: allow hex dumps to go a bit past the end of the buffer..
+                * might see some garbage, but better than missing the last few bytes..
+                */
+               buf = calloc(1, (size_t)sz + 4 + 3);
+               if (!buf) {
+                       fprintf(stderr, "could not allocate %d bytes\n", sz);
+                       return -1;
+               }
+               io_readn(io, (char *)buf + 4, sz);
+               (*(int*)buf) = sz;
+
+               struct state state = {
+                               .buf = buf,
+                               .sz = sz,
+               };
+               printf("############################################################\n");
+               printf("program:\n");
+               dump_program(&state);
+               printf("############################################################\n");
+               io_close(io);
+               return 0;
+       }
+
+       /* figure out what sort of input we are dealing with: */
+       if (!(check_extension(infile, ".rd") || check_extension(infile, ".rd.gz"))) {
+               enum shader_t shader = ~0;
+               int ret;
+               if (check_extension(infile, ".vo")) {
+                       shader = SHADER_VERTEX;
+               } else if (check_extension(infile, ".fo")) {
+                       shader = SHADER_FRAGMENT;
+               } else if (check_extension(infile, ".vo3") ||
+                               check_extension(infile, ".fo3") ||
+                               check_extension(infile, ".co3")) {
+                       /* leave shader as ~0, which selects disasm_a3xx() below */
+               } else {
+                       fprintf(stderr, "invalid input file: %s\n", infile);
+                       return -1;
+               }
+               buf = calloc(1, 100 * 1024);
+               if (!buf) {
+                       fprintf(stderr, "could not allocate read buffer\n");
+                       return -1;
+               }
+               ret = io_readn(io, buf, 100 * 1024);
+               if (ret < 0) {
+                       fprintf(stderr, "error: %m");
+                       return -1;
+               }
+               if (shader != ~0) {
+                       return disasm_a2xx(buf, ret/4, 0, shader);
+               } else {
+                       /* disassembly does not depend on shader stage on a3xx+: */
+                       return disasm_a3xx(buf, ret/4, 0, stdout, gpu_id);
+               }
+       }
+
+       while ((io_readn(io, &type, sizeof(type)) > 0) && (io_readn(io, &sz, 4) > 0)) {
+               free(buf);
+               buf = NULL;
+
+               if (sz < 0) {
+                       fprintf(stderr, "invalid section size: %d\n", sz);
+                       break;
+               }
+
+               /* note: allow hex dumps to go a bit past the end of the buffer..
+                * might see some garbage, but better than missing the last few bytes..
+                */
+               buf = calloc(1, (size_t)sz + 3);
+               if (!buf) {
+                       fprintf(stderr, "could not allocate %d bytes\n", sz);
+                       break;
+               }
+               io_readn(io, buf, sz);
+
+               switch(type) {
+               case RD_TEST:
+                       if (full_dump)
+                               printf("test: %s\n", (char *)buf);
+                       break;
+               case RD_VERT_SHADER:
+                       printf("vertex shader:\n%s\n", (char *)buf);
+                       break;
+               case RD_FRAG_SHADER:
+                       printf("fragment shader:\n%s\n", (char *)buf);
+                       break;
+               case RD_PROGRAM: {
+                       struct state state = {
+                                       .buf = buf,
+                                       .sz = sz,
+                       };
+                       printf("############################################################\n");
+                       printf("program:\n");
+                       dump_program(&state);
+                       printf("############################################################\n");
+                       break;
+               }
+               case RD_GPU_ID:
+                       gpu_id = *((unsigned int *)buf);
+                       printf("gpu_id: %d\n", gpu_id);
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       io_close(io);
+
+       return 0;
+}
+
diff --git a/src/freedreno/decode/pgmdump2.c b/src/freedreno/decode/pgmdump2.c
new file mode 100644 (file)
index 0000000..7410bcd
--- /dev/null
@@ -0,0 +1,585 @@
+/*
+ * Copyright (c) 2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Decoder for "new" GL_OES_get_program_binary format.
+ *
+ * Overall structure is:
+ *
+ *   - header at top, contains, amongst other things, offsets of
+ *     per shader stage sections.
+ *   - per shader stage section (shader_info) starts with a header,
+ *     followed by a variable-length list of descriptors.  Each
+ *     descriptor has a type/count/size plus offset from the start
+ *     of shader_info section where the data is found
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stddef.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "io.h"
+#include "util.h"
+
+/* Tool-wide settings, filled in by main() from the command line / input: */
+const char *infile;          /* path of the input file (last command line argument) */
+static int dump_full = 0;    /* --full: also dump the whole program as raw dwords */
+static int dump_offsets = 0; /* --dump-offsets: prefix decoded fields with their buffer offset */
+static int gpu_id = 320;     /* default; replaced by the value of an RD_GPU_ID section */
+static int shaderdb = 0;     /* output shaderdb style traces to stderr */
+
+/* Decoder state threaded through all of the decode_xyz() functions and
+ * implicitly used (as a variable named 'state') by the decode macros.
+ */
+struct state {
+       char *buf;   /* start of the program binary being decoded */
+       int sz;      /* size of the program binary, in bytes */
+       int lvl;     /* current nesting level, passed to tab() to indent output */
+
+       /* current shader_info section, some offsets calculated relative to
+        * this, rather than relative to start of buffer.
+        */
+       void *shader;
+
+       /* size of each entry within a shader_descriptor_blk: */
+       int desc_size;
+
+       /* label for the shader stage being decoded ("FRAG", "VERT", "BVERT"),
+        * printed in the shaderdb trace:
+        */
+       const char *shader_type;
+       /* register counts recorded by decode_shader_config(), reported in the
+        * shaderdb trace:
+        */
+       int full_regs;
+       int half_regs;
+};
+
+/* on-disk structures are described without any compiler inserted padding */
+#define PACKED __attribute__((__packed__))
+
+/* NOTE: all of the decode macros below expect a 'struct state *state' to be
+ * in scope where they are expanded.
+ */
+
+/* if --dump-offsets is set, print offset of field from start of buffer */
+#define OFF(field) do {                                               \
+               if (dump_offsets)                                             \
+                       printf("%08x: ", (uint32_t)((char *)&field - state->buf));\
+       } while (0)
+
+/* decode field as hex */
+#define X(s, field)  do {                                             \
+               OFF(s->field);                                                \
+               printf("%s%12s:\t0x%x\n", tab(state->lvl), #field, s->field); \
+       } while (0)
+
+/* decode field as digit */
+#define D(s, field)  do {                                             \
+               OFF(s->field);                                                \
+               printf("%s%12s:\t%u\n", tab(state->lvl), #field, s->field);   \
+       } while (0)
+
+/* decode field as float/hex */
+#define F(s, field)  do {                                             \
+               OFF(s->field);                                                \
+               printf("%s%12s:\t%f (0x%0x)\n", tab(state->lvl), #field,      \
+                               d2f(s->field), s->field);                             \
+       } while (0)
+
+/* decode field as register: (type is 'r' or 'c')
+ * the low two bits select the component (x/y/z/w), the remaining bits are
+ * the register number
+ */
+#define R(s, field, type) do {                                        \
+               OFF(s->field);                                                \
+               printf("%s%12s:\t%c%u.%c\n", tab(state->lvl), #field, type,   \
+                               (s->field >> 2), "xyzw"[s->field & 0x3]);             \
+       } while (0)
+
+/* decode inline string (presumably null terminated?) */
+#define S(s, field)  do {                                             \
+               OFF(s->field);                                                \
+               printf("%s%12s:\t%s\n", tab(state->lvl), #field, s->field);   \
+       } while (0)
+
+/* decode string-table string (not implemented, expands to a placeholder) */
+#define T(s, field)  TODO
+
+/* decode field as unknown
+ * 'start' and 'end' are the hex byte offsets of the first and last dword
+ * of the field, which must be named unk_<start>_<end>
+ */
+#define U(s, start, end) \
+       dump_unknown(state, s->unk_ ## start ## _ ## end, 0x ## start, (4 + 0x ## end - 0x ## start) / 4)
+
+/* decode field as offset to other section
+ * the offset is relative to the start of the buffer, and the section it
+ * points to is decoded one nesting level deeper by decode_<type>()
+ */
+#define O(s, field, type) do {              \
+               X(s, field);                        \
+               assert(s->field < state->sz);       \
+               void *_p = &state->buf[s->field];   \
+               state->lvl++;                       \
+               decode_ ## type (state, _p);        \
+               state->lvl--;                       \
+       } while (0)
+
+/* forward declarations, since decode_header() reaches decode_shader_info()
+ * (via the O() macro) before it is defined:
+ */
+struct shader_info;
+static void decode_shader_info(struct state *state, struct shader_info *info);
+
+/*
+ * Dump 'n' dwords starting at 'buf', one per line, as hex, as four ascii
+ * characters and as float.  'start' is only used for the offset label that
+ * is printed in front of each dword.
+ *
+ * If a dword looks like it could be an offset to a string within the
+ * program binary, that string is printed as well.
+ */
+static void dump_unknown(struct state *state, void *buf, unsigned start, unsigned n)
+{
+       const uint32_t *dwords = buf;
+       const uint8_t *bytes = buf;
+
+       for (unsigned idx = 0; idx < n; idx++) {
+               const uint32_t val = dwords[idx];
+               const unsigned label = start + idx * 4;
+
+               if (dump_offsets) {
+                       printf("%08x:",
+                               (uint32_t)((const char *)&dwords[idx] - state->buf));
+               }
+
+               printf("%s        %04x:\t%08x", tab(state->lvl), label, val);
+
+               /* same dword again, byte by byte, unprintable bytes shown as '.' */
+               printf("\t|");
+               for (unsigned k = 0; k < 4; k++) {
+                       uint8_t ch = bytes[(idx * 4) + k];
+                       int printable = isascii(ch) && !iscntrl(ch);
+
+                       printf("%c", printable ? ch : '.');
+               }
+               printf("|\t%f", d2f(val));
+
+               /* TODO maybe scan for first non-null and non-ascii char starting from
+                * end of shader binary to (roughly) establish the start of the string
+                * table.. that would be a bit better filter for deciding if something
+                * might be a pointer into the string table.  Also, the previous char
+                * to what it points to should probably be null.
+                */
+               if (val < state->sz) {
+                       const char *str = &state->buf[val];
+
+                       if (isascii(str[0]) && (strlen(str) > 2) && isascii(str[1]))
+                               printf("\t<== %s", str);
+               }
+
+               printf("\n");
+       }
+}
+
+/* Header at the very start of the program binary.  The unk_XXXX_YYYY fields
+ * are not understood; XXXX and YYYY are the hex byte offsets of the first
+ * and last dword of the field (this naming is what the U() macro relies on).
+ */
+struct PACKED header {
+       uint32_t version;   /* I guess, always b10bcace ? */
+       uint32_t unk_0004_0014[5];
+       uint32_t size;
+       uint32_t size2;     /* just to be sure? */
+       uint32_t unk_0020_0020[1];
+       uint32_t chksum;    /* I guess?  Small changes seem to result in big diffs here */
+       uint32_t unk_0028_0050[11];
+       uint32_t fs_info;   /* offset of FS shader_info section */
+       uint32_t unk_0058_0090[15];
+       uint32_t vs_info;   /* offset of VS shader_info section */
+       uint32_t unk_0098_00b0[7];
+       uint32_t vs_info2;  /* offset of VS shader_info section (again?) */
+       uint32_t unk_00b8_0110[23];
+       uint32_t bs_info;   /* offset of binning shader_info section */
+};
+
+static void decode_header(struct state *state, struct header *hdr)
+{
+       X(hdr, version);
+       U(hdr, 0004, 0014);
+       X(hdr, size);
+       X(hdr, size2);
+       U(hdr, 0020, 0020);
+       X(hdr, chksum);
+       U(hdr, 0028, 0050);
+       state->shader_type = "FRAG";
+       O(hdr, fs_info, shader_info);
+       U(hdr, 0058, 0090);
+       state->shader_type = "VERT";
+       O(hdr, vs_info, shader_info);
+       U(hdr, 0098, 00b0);
+       assert(hdr->vs_info == hdr->vs_info2);  /* not sure what this if it is ever different */
+       X(hdr, vs_info2);
+       U(hdr, 00b8, 0110);
+       state->shader_type = "BVERT";
+       O(hdr, bs_info, shader_info);
+
+       /* not sure how much of the rest of contents before start of fs_info
+        * is the header, vs other things.. just dump it all as unknown for
+        * now:
+        */
+       dump_unknown(state, (void *)hdr + sizeof(*hdr),
+               sizeof(*hdr), (hdr->fs_info - sizeof(*hdr)) / 4);
+}
+
+/* Record of an ENTRY_POINT descriptor block: */
+struct PACKED shader_entry_point {
+       /* entry point name, ie. "main" of TBD length, followed by unknown */
+       char name[8];
+};
+
+/* Print the name of a shader entry point record. */
+static void decode_shader_entry_point(struct state *state,
+               struct shader_entry_point *e)
+{
+       S(e, name);
+}
+
+/* Known leading part of a SHADER_CONFIG record (the full record can be
+ * larger, see decode_shader_config()):
+ */
+struct PACKED shader_config {
+       uint32_t unk_0000_0008[3];
+       uint32_t full_regs;
+       uint32_t half_regs;
+};
+
+/*
+ * Decode a shader config record, and remember the register counts in
+ * 'state' for the shaderdb trace.  Whatever follows the known fields, up to
+ * the per-record size of the enclosing descriptor block (state->desc_size),
+ * is dumped as unknown.
+ */
+static void decode_shader_config(struct state *state, struct shader_config *cfg)
+{
+       U(cfg, 0000, 0008);
+       D(cfg, full_regs);
+       D(cfg, half_regs);
+
+       state->full_regs = cfg->full_regs;
+       state->half_regs = cfg->half_regs;
+
+       /* dump rest of unknown (size differs btwn versions)
+        *
+        * (skipped if the record is not larger than the known fields, since
+        * the subtraction is done unsigned and would wrap around to a huge
+        * dword count)
+        */
+       if (state->desc_size > (int)sizeof(*cfg)) {
+               dump_unknown(state, (void *)cfg + sizeof(*cfg), sizeof(*cfg),
+                               (state->desc_size - sizeof(*cfg))/4);
+       }
+}
+
+/* Record type used to decode SHADER_INPUT, SHADER_OUTPUT and INTERNAL
+ * descriptor blocks:
+ */
+struct PACKED shader_io_block {
+       /* name of TBD length followed by unknown.. 42 dwords total */
+       char name[20];
+       uint32_t unk_0014_00a4[37];
+};
+
+/* Print the name of an input/output record, followed by its unknown dwords. */
+static void decode_shader_io_block(struct state *state,
+               struct shader_io_block *io)
+{
+       S(io, name);
+       U(io, 0014, 00a4);
+}
+
+/* Record of a CONSTANTS descriptor block: */
+struct PACKED shader_constant_block {
+       uint32_t value;     /* printed as float (and hex) */
+       uint32_t unk_0004_000c[3];
+       uint32_t regid;     /* printed as a 'c' register plus component */
+       uint32_t unk_0014_0024[5];
+};
+
+/* Print the value and register of a constant record, with the unknown
+ * dwords in between dumped raw.
+ */
+static void decode_shader_constant_block(struct state *state,
+               struct shader_constant_block *c)
+{
+       F(c, value);
+       U(c, 0004, 000c);
+       R(c, regid, 'c');
+       U(c, 0014, 0024);
+}
+
+/* Values of shader_descriptor_block::type, with the record type used to
+ * decode the block's contents.
+ *
+ * (This is an enum tag; it must not trail the closing brace, where it would
+ * instead define a global variable of an anonymous enum type.)
+ */
+enum shader_info_block_type {
+       ENTRY_POINT    =  0,     /* shader_entry_point */
+       SHADER_CONFIG  =  1,     /* XXX placeholder name */
+       SHADER_INPUT   =  2,     /* shader_io_block */
+       SHADER_OUTPUT  =  3,     /* shader_io_block */
+       CONSTANTS      =  6,     /* shader_constant_block */
+       INTERNAL       =  8,     /* internal input, like bary.f coord */
+       SHADER         = 10,
+};
+
+/* Refers to location of some type of records, with an offset relative to
+ * start of shader_info block.
+ *
+ * The size of a single record is size / count.
+ */
+struct PACKED shader_descriptor_block {
+       uint32_t type;      /* block type */
+       uint32_t offset;    /* offset (relative to start of shader_info block) */
+       uint32_t size;      /* size in bytes */
+       uint32_t count;     /* number of records */
+       uint32_t unk_0010_0010[1];
+};
+
+/*
+ * Decode one descriptor block: print the descriptor itself, then walk the
+ * 'count' records it refers to and decode each one according to the block
+ * type.  The records are located at 'offset' from the start of the current
+ * shader_info section (state->shader).  Records of an unrecognized type are
+ * dumped as unknown dwords.
+ *
+ * As a side effect state->desc_size is set to the size of a single record
+ * of this block (or zero for an empty block).
+ */
+static void decode_shader_descriptor_block(struct state *state,
+               struct shader_descriptor_block *blk)
+{
+       D(blk, type);
+       X(blk, offset);
+       D(blk, size);
+       D(blk, count);
+       U(blk, 0010, 0010);
+
+       /* offset relative to current shader block: */
+       void *ptr = state->shader + blk->offset;
+
+       if (blk->count == 0) {
+               assert(blk->size == 0);
+               /* empty block, there is no record size to compute (and
+                * dividing by the count would be a division by zero):
+                */
+               state->desc_size = 0;
+       } else {
+               assert((blk->size % blk->count) == 0);
+               state->desc_size = blk->size / blk->count;
+       }
+
+       state->lvl++;
+       for (unsigned i = 0; i < blk->count; i++) {
+               switch (blk->type) {
+               case ENTRY_POINT:
+                       printf("%sentry point %u:\n", tab(state->lvl-1), i);
+                       decode_shader_entry_point(state, ptr);
+                       break;
+               case SHADER_CONFIG:
+                       printf("%sconfig %u:\n", tab(state->lvl-1), i);
+                       decode_shader_config(state, ptr);
+                       break;
+               case SHADER_INPUT:
+                       printf("%sinput %u:\n", tab(state->lvl-1), i);
+                       decode_shader_io_block(state, ptr);
+                       break;
+               case SHADER_OUTPUT:
+                       printf("%soutput %u:\n", tab(state->lvl-1), i);
+                       decode_shader_io_block(state, ptr);
+                       break;
+               case INTERNAL:
+                       printf("%sinternal input %u:\n", tab(state->lvl-1), i);
+                       decode_shader_io_block(state, ptr);
+                       break;
+               case CONSTANTS:
+                       printf("%sconstant %u:\n", tab(state->lvl-1), i);
+                       decode_shader_constant_block(state, ptr);
+                       break;
+               case SHADER: {
+                       struct shader_stats stats;
+                       printf("%sshader %u:\n", tab(state->lvl-1), i);
+                       disasm_a3xx_stat(ptr, blk->size/4, state->lvl, stdout, gpu_id, &stats);
+                       if (shaderdb) {
+                               unsigned dwords = 2 * stats.instlen;
+
+                               if (gpu_id >= 400) {
+                                       dwords = ALIGN(dwords, 16 * 2);
+                               } else {
+                                       dwords = ALIGN(dwords, 4 * 2);
+                               }
+
+                               unsigned half_regs = state->half_regs;
+                               unsigned full_regs = state->full_regs;
+
+                               /* On a6xx w/ merged/conflicting half and full regs, the
+                                * full_regs footprint will be max of full_regs and half
+                                * of half_regs.. we only care about which value is higher.
+                                */
+                               if (gpu_id >= 600) {
+                                       /* footprint of half_regs in units of full_regs: */
+                                       unsigned half_full = (half_regs + 1) / 2;
+                                       if (half_full > full_regs)
+                                               full_regs = half_full;
+                                       half_regs = 0;
+                               }
+
+                               fprintf(stderr,
+                                               "%s shader: %u inst, %u nops, %u non-nops, %u dwords, "
+                                               "%u half, %u full, %u constlen, "
+                                               "%u (ss), %u (sy), %d max_sun, %d loops\n",
+                                       state->shader_type, stats.instructions,
+                                       stats.nops, stats.instructions - stats.nops,
+                                       dwords, half_regs, full_regs,
+                                       stats.constlen, stats.ss, stats.sy,
+                                       0, 0);  /* max_sun or loops not possible */
+                       }
+                       /* this is a special case in a way, blk->count is # of
+                        * instructions but disasm_a3xx() decodes all instructions,
+                        * so just bail.
+                        */
+                       i = blk->count;
+                       break;
+               }
+               default:
+                       dump_unknown(state, ptr, 0, state->desc_size/4);
+                       break;
+               }
+               ptr += state->desc_size;
+       }
+       state->lvl--;
+}
+
+/* there looks like one of these per shader, followed by "main" and
+ * some more info, and then the shader itself.
+ *
+ * The descriptor blocks that describe the rest of the section form an
+ * array of num_blocks entries, starting desc_off bytes from the start of
+ * this struct.
+ */
+struct PACKED shader_info {
+       uint32_t unk_0000_0010[5];
+       uint32_t desc_off;       /* offset to first descriptor block */
+       uint32_t num_blocks;     /* number of descriptor blocks */
+};
+
+/*
+ * Decode a per-stage shader_info section: its header, the unknown dwords
+ * between the header and the first descriptor, and then each of the
+ * descriptor blocks.
+ *
+ * state->shader is pointed at 'info', since the offsets in the descriptor
+ * blocks are relative to the start of the section.
+ */
+static void decode_shader_info(struct state *state, struct shader_info *info)
+{
+       assert((info->desc_off % 4) == 0);
+
+       U(info, 0000, 0010);
+       X(info, desc_off);
+       D(info, num_blocks);
+
+       /* skipped if the descriptors start inside the header itself, since the
+        * unsigned subtraction would wrap around to a huge dword count:
+        */
+       if (info->desc_off > sizeof(*info))
+               dump_unknown(state, &info[1], 0, (info->desc_off - sizeof(*info))/4);
+
+       state->shader = info;
+
+       struct shader_descriptor_block *blocks = ((void *)info) + info->desc_off;
+       for (unsigned i = 0; i < info->num_blocks; i++) {
+               printf("%sdescriptor %u:\n", tab(state->lvl), i);
+               state->lvl++;
+               decode_shader_descriptor_block(state, &blocks[i]);
+               state->lvl--;
+       }
+}
+
+/*
+ * Decode a complete program binary (state->buf, state->sz), starting with
+ * the header at the beginning of the buffer.  With --full the whole buffer
+ * is first dumped as raw dwords.
+ */
+static void dump_program(struct state *state)
+{
+       if (dump_full) {
+               unsigned ndwords = state->sz/4;
+
+               dump_unknown(state, state->buf, 0, ndwords);
+       }
+
+       decode_header(state, (void *)state->buf);
+}
+
+/*
+ * pgmdump2 entry point.
+ *
+ * Leading "--" options are consumed first; exactly one input file name must
+ * remain.  The input is then handled in one of three ways:
+ *
+ *   - with --raw: the file is a 4-byte size followed by a program binary,
+ *     which is handed to dump_program()
+ *   - without a .rd/.rd.gz extension: the file is passed to disasm_a3xx()
+ *     as a bare shader binary
+ *   - otherwise: the file is a sequence of (type, size, payload) sections
+ *
+ * Returns 0 on success and -1 on usage or I/O errors; for a bare shader
+ * binary the return value of the disassembler is passed through.
+ */
+int main(int argc, char **argv)
+{
+       enum rd_sect_type type = RD_NONE;
+       enum debug_t debug = 0;
+       void *buf = NULL;
+       int sz;
+       struct io *io;
+       int raw_program = 0;
+
+       /* lame argument parsing: strip recognized options off the front */
+       while (argc > 1) {
+               const char *arg = argv[1];
+
+               if (!strcmp(arg, "--verbose")) {
+                       debug |= PRINT_RAW | PRINT_VERBOSE;
+               } else if (!strcmp(arg, "--expand")) {
+                       debug |= EXPAND_REPEAT;
+               } else if (!strcmp(arg, "--full")) {
+                       /* additionally dump the whole program as raw dwords */
+                       dump_full = 1;
+               } else if (!strcmp(arg, "--dump-offsets")) {
+                       dump_offsets = 1;
+               } else if (!strcmp(arg, "--raw")) {
+                       raw_program = 1;
+               } else if (!strcmp(arg, "--shaderdb")) {
+                       shaderdb = 1;
+               } else {
+                       break;
+               }
+               argv++;
+               argc--;
+       }
+
+       if (argc != 2) {
+               fprintf(stderr, "usage: pgmdump2 [--verbose] [--expand] [--full] [--dump-offsets] [--raw] [--shaderdb] testlog.rd\n");
+               return -1;
+       }
+
+       disasm_set_debug(debug);
+
+       infile = argv[1];
+
+       io = io_open(infile);
+       if (!io) {
+               fprintf(stderr, "could not open: %s\n", infile);
+               return -1;
+       }
+
+       if (raw_program) {
+               /* a failed read would leave sz uninitialized, and a negative
+                * size would turn into a huge allocation request:
+                */
+               if ((io_readn(io, &sz, 4) <= 0) || (sz < 0)) {
+                       fprintf(stderr, "could not read program size: %s\n", infile);
+                       return -1;
+               }
+
+               /* The payload is stored after a 4-byte copy of the size, so the
+                * buffer needs sz + 4 bytes.
+                *
+                * note: allow hex dumps to go a bit past the end of the buffer..
+                * might see some garbage, but better than missing the last few bytes..
+                */
+               buf = calloc(1, (size_t)sz + 4 + 3);
+               if (!buf) {
+                       fprintf(stderr, "could not allocate %d bytes\n", sz);
+                       return -1;
+               }
+               io_readn(io, (char *)buf + 4, sz);
+               (*(int*)buf) = sz;
+
+               struct state state = {
+                               .buf = buf,
+                               .sz = sz,
+               };
+               printf("############################################################\n");
+               printf("program:\n");
+               dump_program(&state);
+               printf("############################################################\n");
+               io_close(io);
+               return 0;
+       }
+
+       /* figure out what sort of input we are dealing with: */
+       if (!(check_extension(infile, ".rd") || check_extension(infile, ".rd.gz"))) {
+               int ret;
+               buf = calloc(1, 100 * 1024);
+               if (!buf) {
+                       fprintf(stderr, "could not allocate read buffer\n");
+                       return -1;
+               }
+               ret = io_readn(io, buf, 100 * 1024);
+               if (ret < 0) {
+                       fprintf(stderr, "error: %m");
+                       return -1;
+               }
+               return disasm_a3xx(buf, ret/4, 0, stdout, gpu_id);
+       }
+
+       while ((io_readn(io, &type, sizeof(type)) > 0) && (io_readn(io, &sz, 4) > 0)) {
+               free(buf);
+               buf = NULL;
+
+               if (sz < 0) {
+                       fprintf(stderr, "invalid section size: %d\n", sz);
+                       break;
+               }
+
+               /* note: allow hex dumps to go a bit past the end of the buffer..
+                * might see some garbage, but better than missing the last few bytes..
+                */
+               buf = calloc(1, (size_t)sz + 3);
+               if (!buf) {
+                       fprintf(stderr, "could not allocate %d bytes\n", sz);
+                       break;
+               }
+               io_readn(io, buf, sz);
+
+               switch(type) {
+               case RD_TEST:
+                       if (dump_full)
+                               printf("test: %s\n", (char *)buf);
+                       break;
+               case RD_VERT_SHADER:
+                       printf("vertex shader:\n%s\n", (char *)buf);
+                       break;
+               case RD_FRAG_SHADER:
+                       printf("fragment shader:\n%s\n", (char *)buf);
+                       break;
+               case RD_PROGRAM: {
+                       struct state state = {
+                                       .buf = buf,
+                                       .sz = sz,
+                       };
+                       printf("############################################################\n");
+                       printf("program:\n");
+                       dump_program(&state);
+                       printf("############################################################\n");
+                       break;
+               }
+               case RD_GPU_ID:
+                       gpu_id = *((unsigned int *)buf);
+                       printf("gpu_id: %d\n", gpu_id);
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       io_close(io);
+
+       return 0;
+}
diff --git a/src/freedreno/decode/redump.h b/src/freedreno/decode/redump.h
new file mode 100644 (file)
index 0000000..c77344e
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * Copyright © 2012 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef REDUMP_H_
+#define REDUMP_H_
+
+/* Tag identifying what kind of payload a dump section carries; this is the
+ * 'type' argument of rd_write_section().  The per-value comments describe
+ * the payload layout.
+ */
+enum rd_sect_type {
+       RD_NONE,
+       RD_TEST,       /* ascii text */
+       RD_CMD,        /* ascii text */
+       RD_GPUADDR,    /* u32 gpuaddr, u32 size */
+       RD_CONTEXT,    /* raw dump */
+       RD_CMDSTREAM,  /* raw dump */
+       RD_CMDSTREAM_ADDR, /* gpu addr of cmdstream */
+       RD_PARAM,      /* u32 param_type, u32 param_val, u32 bitlen */
+       RD_FLUSH,      /* empty, clear previous params */
+       RD_PROGRAM,    /* shader program, raw dump */
+       RD_VERT_SHADER,
+       RD_FRAG_SHADER,
+       RD_BUFFER_CONTENTS,
+       RD_GPU_ID,
+};
+
+/* RD_PARAM types (the param_type word of an RD_PARAM section): */
+enum rd_param_type {
+       RD_PARAM_SURFACE_WIDTH,
+       RD_PARAM_SURFACE_HEIGHT,
+       RD_PARAM_SURFACE_PITCH,
+       RD_PARAM_COLOR,
+       RD_PARAM_BLIT_X,
+       RD_PARAM_BLIT_Y,
+       RD_PARAM_BLIT_WIDTH,
+       RD_PARAM_BLIT_HEIGHT,
+       RD_PARAM_BLIT_X2,      /* BLIT_X + BLIT_WIDTH */
+       RD_PARAM_BLIT_Y2,      /* BLIT_Y + BLIT_HEIGHT (NOTE(review): comment previously said BLIT_WIDTH; presumably a copy/paste slip -- confirm) */
+};
+
+/* Capture entry points.  They are declared weak, so the RD_* macros below
+ * can test the function address before calling.
+ */
+void rd_start(const char *name, const char *fmt, ...) __attribute__((weak));
+void rd_end(void) __attribute__((weak));
+void rd_write_section(enum rd_sect_type type, const void *buf, int sz) __attribute__((weak));
+
+/* for code that should run with and without libwrap, use the following
+ * macros which check if the fxns are present before calling
+ * (each expands to a single statement and is a no-op when the fxn is absent)
+ */
+#define RD_START(n,f,...)        do { if (rd_start) rd_start(n,f,##__VA_ARGS__); } while (0)
+#define RD_END()                 do { if (rd_end) rd_end(); } while (0)
+#define RD_WRITE_SECTION(t,b,s)  do { if (rd_write_section) rd_write_section(t,b,s); } while (0)
+
+/* Number of elements in an array; only meaningful for real arrays, not for
+ * pointers.
+ */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#undef ALIGN
+/* Round v up to a multiple of a.  The mask arithmetic is only correct when
+ * a is a power of two.
+ */
+#define ALIGN(v,a) (((v) + (a) - 1) & ~((a) - 1))
+
+/* NOTE: both macros evaluate their arguments more than once, so do not pass
+ * expressions with side effects.
+ */
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+
+#endif /* REDUMP_H_ */
diff --git a/src/freedreno/decode/rnnutil.c b/src/freedreno/decode/rnnutil.c
new file mode 100644 (file)
index 0000000..7891597
--- /dev/null
@@ -0,0 +1,217 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include "rnnutil.h"
+
+/* Choose the domain used to decode regbase: dom[0] when rnndec_checkaddr()
+ * accepts the address there, dom[1] otherwise.
+ */
+static struct rnndomain *finddom(struct rnn *rnn, uint32_t regbase)
+{
+       struct rnndomain *primary = rnn->dom[0];
+
+       return rnndec_checkaddr(rnn->vc, primary, regbase, 0) ?
+                       primary : rnn->dom[1];
+}
+
+/* Initialise an already-allocated struct rnn: create the database and the
+ * decode contexts.  A colorless context is always created; a second context
+ * using the default colors is created unless 'nocolor' is set, in which case
+ * rnn->vc simply aliases the colorless one.
+ */
+void _rnn_init(struct rnn *rnn, int nocolor)
+{
+       struct rnndeccontext *plain;
+
+       rnn_init();
+
+       rnn->db = rnn_newdb();
+
+       plain = rnndec_newcontext(rnn->db);
+       plain->colors = &envy_null_colors;
+       rnn->vc_nocolor = plain;
+
+       if (!nocolor) {
+               struct rnndeccontext *colored = rnndec_newcontext(rnn->db);
+
+               colored->colors = &envy_def_colors;
+               rnn->vc = colored;
+       } else {
+               rnn->vc = plain;
+       }
+}
+
+/* Allocate a zeroed struct rnn and initialise it with _rnn_init().
+ * Returns NULL if the allocation fails.
+ */
+struct rnn *rnn_new(int nocolor)
+{
+       struct rnn *rnn = calloc(1, sizeof(*rnn));
+
+       if (rnn)
+               _rnn_init(rnn, nocolor);
+
+       return rnn;
+}
+
+/* Parse the register database 'file' and select the domains used for
+ * lookups: dom[0] is the domain named 'domain'; dom[1] is the shared "AXXX"
+ * domain for A2XX/A3XX and an alias of dom[0] for every other domain.  Also
+ * records 'domain' as the variant and as the "chip" variable of the decode
+ * context(s).  A missing domain is reported on stderr but is not fatal.
+ */
+static void init(struct rnn *rnn, char *file, char *domain)
+{
+       int shared;
+
+       /* prepare rnn stuff for lookup */
+       rnn_parsefile(rnn->db, file);
+       rnn_prepdb(rnn->db);
+       rnn->dom[0] = rnn_finddomain(rnn->db, domain);
+       shared = (strcmp(domain, "A2XX") == 0) || (strcmp(domain, "A3XX") == 0);
+       if (shared) {
+               rnn->dom[1] = rnn_finddomain(rnn->db, "AXXX");
+       } else {
+               rnn->dom[1] = rnn->dom[0];
+       }
+
+       /* Report each missing domain on its own.  When dom[1] merely aliases
+        * dom[0] it is NULL whenever dom[0] is, so it must not be part of the
+        * test for dom[0].
+        */
+       if (!rnn->dom[0])
+               fprintf(stderr, "Could not find domain %s in %s\n", domain, file);
+       if (shared && !rnn->dom[1])
+               fprintf(stderr, "Could not find domain %s in %s\n", "AXXX", file);
+       rnn->variant = domain;
+
+       rnndec_varadd(rnn->vc, "chip", domain);
+       if (rnn->vc != rnn->vc_nocolor)
+               rnndec_varadd(rnn->vc_nocolor, "chip", domain);
+}
+
+/* Load an explicitly named database file and domain; public wrapper around
+ * init().  'domain' is stored in rnn->variant without being copied.
+ */
+void rnn_load_file(struct rnn *rnn, char *file, char *domain)
+{
+       init(rnn, file, domain);
+}
+
+/* Load the register database matching 'gpuname'.  The name is matched by
+ * substring ("a2" .. "a6", first match in that order wins); a name matching
+ * none of them leaves 'rnn' untouched.
+ */
+void rnn_load(struct rnn *rnn, const char *gpuname)
+{
+       static const struct {
+               const char *tag;
+               char *file;
+               char *domain;
+       } chips[] = {
+               { "a2", "adreno/a2xx.xml", "A2XX" },
+               { "a3", "adreno/a3xx.xml", "A3XX" },
+               { "a4", "adreno/a4xx.xml", "A4XX" },
+               { "a5", "adreno/a5xx.xml", "A5XX" },
+               { "a6", "adreno/a6xx.xml", "A6XX" },
+       };
+
+       for (size_t i = 0; i < sizeof(chips) / sizeof(chips[0]); i++) {
+               if (strstr(gpuname, chips[i].tag)) {
+                       init(rnn, chips[i].file, chips[i].domain);
+                       return;
+               }
+       }
+}
+
+/* Look up a register offset by name, trying dom[0] first and falling back
+ * to dom[1] when the first lookup yields 0.
+ */
+uint32_t rnn_regbase(struct rnn *rnn, const char *name)
+{
+       uint32_t base = rnndec_decodereg(rnn->vc_nocolor, rnn->dom[0], name);
+
+       if (base)
+               return base;
+
+       return rnndec_decodereg(rnn->vc_nocolor, rnn->dom[1], name);
+}
+
+/* Return the name of the register at 'regbase', or NULL if it cannot be
+ * decoded.  'color' selects the colored decode context.
+ *
+ * The result lives in a static buffer: it is overwritten by the next call
+ * and the function is not reentrant.  Names longer than the buffer are
+ * truncated rather than overflowing it.
+ */
+const char *rnn_regname(struct rnn *rnn, uint32_t regbase, int color)
+{
+       static char buf[128];
+       struct rnndecaddrinfo *info;
+
+       info = rnndec_decodeaddr(color ? rnn->vc : rnn->vc_nocolor,
+                       finddom(rnn, regbase), regbase, 0);
+       if (info) {
+               /* bounded copy: info->name has no length limit we can rely on */
+               snprintf(buf, sizeof(buf), "%s", info->name);
+               free(info->name);
+               free(info);
+               return buf;
+       }
+       return NULL;
+}
+
+/* Decode the address 'regbase' with the rnn->vc context.  The callers in
+ * this tree free both info->name and info when they are done with a
+ * non-NULL result.
+ */
+struct rnndecaddrinfo *rnn_reginfo(struct rnn *rnn, uint32_t regbase)
+{
+       return rnndec_decodeaddr(rnn->vc, finddom(rnn, regbase), regbase, 0);
+}
+
+/* Return the name of the value 'val' within the enum called 'name', taking
+ * the context's variant settings into account.  Returns NULL when the enum
+ * is unknown or has no matching value.
+ */
+const char *rnn_enumname(struct rnn *rnn, const char *name, uint32_t val)
+{
+       struct rnndeccontext *ctx = rnn->vc;
+       struct rnnenum *en = rnn_findenum(ctx->db, name);
+
+       if (!en)
+               return NULL;
+
+       for (int idx = 0; idx < en->valsnum; idx++) {
+               struct rnnvalue *cand = en->vals[idx];
+
+               if (!cand->valvalid)
+                       continue;
+               if (cand->value != val)
+                       continue;
+               if (rnndec_varmatch(ctx, &cand->varinfo))
+                       return cand->name;
+       }
+
+       return NULL;
+}
+
+/* Find the direct sub-element of 'domain' called 'name'.
+ * Returns NULL when there is no match, and also when 'domain' or 'name' is
+ * NULL: rnn->dom[] entries are NULL when a domain could not be found, and
+ * script_packet() sets dom[1] to NULL on purpose.
+ */
+static struct rnndelem *regelem(struct rnndomain *domain, const char *name)
+{
+       int i;
+
+       if (!domain || !name)
+               return NULL;
+
+       for (i = 0; i < domain->subelemsnum; i++) {
+               struct rnndelem *elem = domain->subelems[i];
+               /* skip unnamed elements rather than handing NULL to strcmp() */
+               if (elem->name && !strcmp(elem->name, name))
+                       return elem;
+       }
+       return NULL;
+}
+
+/* Look up an element by name: dom[0] is searched first, dom[1] only if
+ * dom[0] has no match.
+ */
+struct rnndelem *rnn_regelem(struct rnn *rnn, const char *name)
+{
+       struct rnndelem *found = regelem(rnn->dom[0], name);
+
+       if (!found)
+               found = regelem(rnn->dom[1], name);
+
+       return found;
+}
+
+/* Find the direct sub-element of 'domain' located at 'offset'.
+ * Returns NULL when there is no match, and also when 'domain' is NULL:
+ * script_packet() sets dom[1] to NULL, so rnn_regoff() reaches this function
+ * with a NULL domain whenever dom[0] has no element at the offset.
+ */
+static struct rnndelem *regoff(struct rnndomain *domain, uint32_t offset)
+{
+       int i;
+
+       if (!domain)
+               return NULL;
+
+       for (i = 0; i < domain->subelemsnum; i++) {
+               struct rnndelem *elem = domain->subelems[i];
+               if (elem->offset == offset)
+                       return elem;
+       }
+       return NULL;
+}
+
+/* Look up an element by offset: dom[0] is searched first, dom[1] only if
+ * dom[0] has no match.
+ */
+struct rnndelem *rnn_regoff(struct rnn *rnn, uint32_t offset)
+{
+       struct rnndelem *found = regoff(rnn->dom[0], offset);
+
+       if (!found)
+               found = regoff(rnn->dom[1], offset);
+
+       return found;
+}
+
+/* Store 'regval' in val->u and report how the caller should interpret it.
+ * Returns info->type for the scalar/enum types listed below and
+ * RNN_TTYPE_INVALID for everything else.  The 'rnn' argument is not used.
+ */
+enum rnnttype rnn_decodelem(struct rnn *rnn, struct rnntypeinfo *info,
+               uint32_t regval, union rnndecval *val)
+{
+       val->u = regval;
+       switch (info->type) {
+       case RNN_TTYPE_INLINE_ENUM:
+       case RNN_TTYPE_ENUM:
+       case RNN_TTYPE_HEX:
+       case RNN_TTYPE_INT:
+       case RNN_TTYPE_UINT:
+       case RNN_TTYPE_FLOAT:
+       case RNN_TTYPE_BOOLEAN:
+               return info->type;
+       case RNN_TTYPE_FIXED:
+       case RNN_TTYPE_UFIXED:
+               /* TODO */
+               /* not decoded yet: falls through and is reported as invalid */
+       default:
+               return RNN_TTYPE_INVALID;
+       }
+}
diff --git a/src/freedreno/decode/rnnutil.h b/src/freedreno/decode/rnnutil.h
new file mode 100644 (file)
index 0000000..ea66747
--- /dev/null
@@ -0,0 +1,66 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef RNNUTIL_H_
+#define RNNUTIL_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include "rnn.h"
+#include "rnndec.h"
+
+/* State needed to decode registers/packets with the rnn database. */
+struct rnn {
+       /* database created by _rnn_init() */
+       struct rnndb *db;
+       /* decode contexts: vc_nocolor never colors its output; vc is either a
+        * separate colored context or the same pointer as vc_nocolor
+        */
+       struct rnndeccontext *vc, *vc_nocolor;
+       /* domains searched in order; may be NULL if a lookup failed */
+       struct rnndomain *dom[2];
+       /* domain name passed when the database was loaded (not copied) */
+       const char *variant;
+};
+
+/* A 32-bit register value viewed as unsigned, signed or float.
+ * rnn_decodelem() writes the .u member; readers pick the member that
+ * matches the type it returned.
+ */
+union rnndecval {
+       uint32_t u;
+       int32_t i;
+       float f;
+};
+
+/* initialise caller-provided storage / allocate and initialise: */
+void _rnn_init(struct rnn *rnn, int nocolor);
+struct rnn *rnn_new(int nocolor);
+/* load a register database, either explicitly or chosen from the gpu name: */
+void rnn_load_file(struct rnn *rnn, char *file, char *domain);
+void rnn_load(struct rnn *rnn, const char *gpuname);
+/* name <-> offset lookups; rnn_regname() returns a static buffer: */
+uint32_t rnn_regbase(struct rnn *rnn, const char *name);
+const char *rnn_regname(struct rnn *rnn, uint32_t regbase, int color);
+struct rnndecaddrinfo *rnn_reginfo(struct rnn *rnn, uint32_t regbase);
+const char *rnn_enumname(struct rnn *rnn, const char *name, uint32_t val);
+
+/* element lookups (NULL when not found) and value classification: */
+struct rnndelem *rnn_regelem(struct rnn *rnn, const char *name);
+struct rnndelem *rnn_regoff(struct rnn *rnn, uint32_t offset);
+enum rnnttype rnn_decodelem(struct rnn *rnn, struct rnntypeinfo *info,
+               uint32_t regval, union rnndecval *val);
+
+#endif /* RNNUTIL_H_ */
diff --git a/src/freedreno/decode/script.c b/src/freedreno/decode/script.c
new file mode 100644 (file)
index 0000000..a882dd2
--- /dev/null
@@ -0,0 +1,775 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#define _GNU_SOURCE
+#define LUA_COMPAT_APIINTCASTS
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <lua.h>
+#include <lauxlib.h>
+#include <lualib.h>
+#include <assert.h>
+
+#include "script.h"
+#include "cffdec.h"
+#include "rnnutil.h"
+
+static lua_State *L;
+
+/* Debug trace macro; change the '#if 0' to '#if 1' to get a printf() of the
+ * function name and line for every DBG() call.  Compiled out by default.
+ */
+#if 0
+#define DBG(fmt, ...) \
+               do { printf(" ** %s:%d ** "fmt "\n", \
+                               __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
+#else
+#define DBG(fmt, ...) do {} while (0)
+#endif
+
+/* An rnn based decoder, which can either be decoding current register
+ * values, or domain based decoding of a pm4 packet.
+ *
+ * 'base' must stay the first member: to_rnndec() converts a struct rnn
+ * pointer back to this type with a plain cast.  sizedwords == 0 selects
+ * register decoding (see rnn_val()).
+ */
+struct rnndec {
+       struct rnn base;
+
+       /* for pm4 packet decoding: */
+       uint32_t sizedwords;
+       uint32_t *dwords;
+};
+
+/* Downcast; only valid for a struct rnn that is the 'base' member of a
+ * struct rnndec.
+ */
+static inline struct rnndec *to_rnndec(struct rnn *rnn)
+{
+       return (struct rnndec *)rnn;
+}
+
+/* Fetch the value to decode at 'regbase': the current register value when
+ * no packet is attached (sizedwords == 0), otherwise dword number 'regbase'
+ * of the packet.  An index beyond the packet yields (uint32_t)-1.
+ */
+static uint32_t rnn_val(struct rnn *rnn, uint32_t regbase)
+{
+       struct rnndec *dec = to_rnndec(rnn);
+
+       if (dec->sizedwords == 0)
+               return reg_val(regbase);
+
+       if (regbase >= dec->sizedwords) {
+               // XXX throw an error
+               return -1;
+       }
+
+       return dec->dwords[regbase];
+}
+
+/* does not return
+ *
+ * Prints 'fmt' to stderr with the value on top of the Lua stack (converted
+ * with lua_tostring()) as its single argument, then exits with status 1.
+ * 'fmt' is therefore expected to contain exactly one %s.
+ */
+static void error(const char *fmt)
+{
+       fprintf(stderr, fmt, lua_tostring(L, -1));
+       exit(1);
+}
+
+/*
+ * An enum type that can be used as string or number:
+ */
+
+/* Userdata payload created by pushenum(). */
+struct rnndenum {
+       const char *str;   /* matching enum value name, or NULL if none matched */
+       int val;           /* raw numeric value */
+};
+
+/* __tostring metamethod for enum userdata: pushes the symbolic name when
+ * one was found, otherwise the numeric value formatted as unsigned decimal.
+ */
+static int l_meta_rnn_enum_tostring(lua_State *L)
+{
+       struct rnndenum *e = lua_touserdata(L, 1);
+       if (e->str) {
+               lua_pushstring(L, e->str);
+       } else {
+               char buf[32];
+               /* val is an int; convert explicitly so the argument type
+                * matches the %u conversion
+                */
+               snprintf(buf, sizeof(buf), "%u", (unsigned)e->val);
+               lua_pushstring(L, buf);
+       }
+       return 1;
+}
+
+/* so, this doesn't actually seem to be implemented yet, but hopefully
+ * some day lua comes to its senses
+ *
+ * Pushes the raw numeric value of the enum userdata.
+ */
+static int l_meta_rnn_enum_tonumber(lua_State *L)
+{
+       struct rnndenum *e = lua_touserdata(L, 1);
+       lua_pushinteger(L, e->val);
+       return 1;
+}
+
+/* metamethods installed on the "rnnmetaenum" metatable by pushenum() */
+static const struct luaL_Reg l_meta_rnn_enum[] = {
+       {"__tostring", l_meta_rnn_enum_tostring},
+       {"__tonumber", l_meta_rnn_enum_tonumber},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* Push an enum userdata (struct rnndenum) for 'val' and attach the
+ * "rnnmetaenum" metatable to it.  The symbolic name is taken from the first
+ * valid entry of 'info' whose value equals 'val'; if 'info' is NULL or
+ * nothing matches, the object only carries the number.
+ */
+static void pushenum(struct lua_State *L, int val, struct rnnenum *info)
+{
+       struct rnndenum *e = lua_newuserdata(L, sizeof(*e));
+
+       e->val = val;
+       e->str = NULL;
+
+       /* NOTE(review): pushdecval() passes info->eenum for inline enums as
+        * well; if that pointer is not set for them this guard is what keeps
+        * the lookup from dereferencing NULL -- confirm against rnn.
+        */
+       for (int i = 0; info && i < info->valsnum; i++) {
+               /* compare as unsigned 32-bit, like rnn_enumname(), so a value
+                * with the top bit set is not sign-extended first
+                */
+               if (info->vals[i]->valvalid &&
+                               (info->vals[i]->value == (uint32_t)val)) {
+                       e->str = info->vals[i]->name;
+                       break;
+               }
+       }
+
+       luaL_newmetatable(L, "rnnmetaenum");
+       luaL_setfuncs(L, l_meta_rnn_enum, 0);
+       lua_pop(L, 1);
+
+       luaL_setmetatable(L, "rnnmetaenum");
+}
+
+/* Expose rnn decode to script environment as "rnn" library:
+ */
+
+/* Userdata payload for array/struct/register objects: an element together
+ * with the offset accumulated while walking down to it.
+ */
+struct rnndoff {
+       struct rnn *rnn;
+       struct rnndelem *elem;
+       uint64_t offset;
+};
+
+/* Push a new struct rnndoff userdata describing (rnn, elem, offset).  The
+ * caller attaches the metatable.
+ */
+static void push_rnndoff(lua_State *L, struct rnn *rnn,
+               struct rnndelem *elem, uint64_t offset)
+{
+       struct rnndoff *ref = lua_newuserdata(L, sizeof(*ref));
+
+       *ref = (struct rnndoff){
+               .rnn = rnn,
+               .elem = elem,
+               .offset = offset,
+       };
+}
+
+static int l_rnn_etype_array(lua_State *L, struct rnn *rnn,
+               struct rnndelem *elem, uint64_t offset);
+static int l_rnn_etype_reg(lua_State *L, struct rnn *rnn,
+               struct rnndelem *elem, uint64_t offset);
+
+/* Push 'regval' onto the Lua stack in the form matching its rnn type.
+ * Returns the number of values pushed: 1 for the handled types, 0 when
+ * rnn_decodelem() reports a type that is not handled here (nothing is
+ * pushed in that case).
+ */
+static int pushdecval(struct lua_State *L, struct rnn *rnn,
+               uint32_t regval, struct rnntypeinfo *info)
+{
+       union rnndecval val;
+       switch (rnn_decodelem(rnn, info, regval, &val)) {
+       case RNN_TTYPE_ENUM:
+       case RNN_TTYPE_INLINE_ENUM:
+               pushenum(L, val.i, info->eenum);
+               return 1;
+       case RNN_TTYPE_INT:
+               lua_pushinteger(L, val.i);
+               return 1;
+       case RNN_TTYPE_UINT:
+       case RNN_TTYPE_HEX:
+               lua_pushunsigned(L, val.u);
+               return 1;
+       case RNN_TTYPE_FLOAT:
+               lua_pushnumber(L, val.f);
+               return 1;
+       case RNN_TTYPE_BOOLEAN:
+               lua_pushboolean(L, val.u);
+               return 1;
+       case RNN_TTYPE_INVALID:
+       default:
+               return 0;
+       }
+
+}
+
+/* Push the Lua representation of 'elem' at 'offset' and return the number
+ * of values pushed.  Registers become a plain value when pushdecval() can
+ * produce one and a register object otherwise; arrays become an array
+ * object; any other element type is reported and pushes nothing.
+ */
+static int l_rnn_etype(lua_State *L, struct rnn *rnn,
+               struct rnndelem *elem, uint64_t offset)
+{
+       int ret;
+       uint32_t regval;
+       DBG("elem=%p (%d), offset=%lu", elem, elem->type, offset);
+       switch (elem->type) {
+       case RNN_ETYPE_REG:
+               /* if a register has no bitfields, just return
+                * the raw value:
+                */
+               regval = rnn_val(rnn, offset);
+               regval <<= elem->typeinfo.shr;
+               ret = pushdecval(L, rnn, regval, &elem->typeinfo);
+               if (ret)
+                       return ret;
+               /* nothing pushed: hand out an object the script can index */
+               return l_rnn_etype_reg(L, rnn, elem, offset);
+       case RNN_ETYPE_ARRAY:
+               return l_rnn_etype_array(L, rnn, elem, offset);
+       default:
+               /* hmm.. */
+               printf("unhandled type: %d\n", elem->type);
+               return 0;
+       }
+}
+
+/*
+ * Struct Object:
+ * To implement stuff like 'RB_MRT[n].CONTROL' we need a struct-object
+ * to represent the current array index (ie. 'RB_MRT[n]')
+ */
+
+/* __index metamethod of a struct object: resolve the member called by the
+ * key among the element's sub-elements.  Pushes nothing (the script sees
+ * nil) when the key is not a string/number or no member matches.
+ */
+static int l_rnn_struct_meta_index(lua_State *L)
+{
+       struct rnndoff *rnndoff = lua_touserdata(L, 1);
+       const char *name = lua_tostring(L, 2);
+       struct rnndelem *elem = rnndoff->elem;
+       int i;
+
+       /* lua_tostring() returns NULL for keys that are neither strings nor
+        * numbers; strcmp() must not see that
+        */
+       if (!name)
+               return 0;
+
+       for (i = 0; i < elem->subelemsnum; i++) {
+               struct rnndelem *subelem = elem->subelems[i];
+               if (subelem->name && !strcmp(name, subelem->name)) {
+                       return l_rnn_etype(L, rnndoff->rnn, subelem,
+                                       rnndoff->offset + subelem->offset);
+               }
+       }
+
+       return 0;
+}
+
+/* metamethods installed on the "rnnmetastruct" metatable */
+static const struct luaL_Reg l_meta_rnn_struct[] = {
+       {"__index", l_rnn_struct_meta_index},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* Push a struct object for 'elem' at 'offset': an rnndoff userdata carrying
+ * the "rnnmetastruct" metatable.  Always pushes exactly one value.
+ */
+static int l_rnn_etype_struct(lua_State *L, struct rnn *rnn,
+               struct rnndelem *elem, uint64_t offset)
+{
+       push_rnndoff(L, rnn, elem, offset);
+
+       /* leaves the registry metatable on top of the userdata ... */
+       luaL_newmetatable(L, "rnnmetastruct");
+       luaL_setfuncs(L, l_meta_rnn_struct, 0);
+       /* ... which is popped again when it is attached to the userdata */
+       lua_setmetatable(L, -2);
+
+       return 1;
+}
+
+/*
+ * Array Object:
+ */
+
+/* __index metamethod of an array object: 'array[idx]'.  The key is read as
+ * an integer and scaled by the element stride to form the new offset.
+ */
+static int l_rnn_array_meta_index(lua_State *L)
+{
+       struct rnndoff *arr = lua_touserdata(L, 1);
+       int idx = lua_tointeger(L, 2);
+       struct rnndelem *elem = arr->elem;
+       uint64_t offset = arr->offset + (elem->stride * idx);
+
+       DBG("rnndoff=%p, idx=%d, numsubelems=%d",
+                       arr, idx, arr->elem->subelemsnum);
+
+       /* With several sub-elements the index has to be remembered in a
+        * struct object until the script names the member it wants ..
+        */
+       if (elem->subelemsnum != 1)
+               return l_rnn_etype_struct(L, arr->rnn, elem, offset);
+
+       /* .. whereas a lone sub-element is the register itself. */
+       return l_rnn_etype(L, arr->rnn, elem->subelems[0], offset);
+}
+
+/* metamethods installed on the "rnnmetaarray" metatable */
+static const struct luaL_Reg l_meta_rnn_array[] = {
+       {"__index", l_rnn_array_meta_index},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* Push an array object for 'elem' at 'offset': an rnndoff userdata carrying
+ * the "rnnmetaarray" metatable.  Always pushes exactly one value.
+ */
+static int l_rnn_etype_array(lua_State *L, struct rnn *rnn,
+               struct rnndelem *elem, uint64_t offset)
+{
+       push_rnndoff(L, rnn, elem, offset);
+
+       /* metatable ends up above the userdata, then gets attached to it */
+       luaL_newmetatable(L, "rnnmetaarray");
+       luaL_setfuncs(L, l_meta_rnn_array, 0);
+       lua_setmetatable(L, -2);
+
+       return 1;
+}
+
+/*
+ * Register element:
+ */
+
+/* __index metamethod of a register object: 'reg.FIELD'.  Looks the key up
+ * among the register's bitfields, extracts that field from the current
+ * value and pushes it via pushdecval().  Pushes nothing when the register
+ * has no bitset type, the key is not a string/number, or no field matches.
+ */
+static int l_rnn_reg_meta_index(lua_State *L)
+{
+       struct rnndoff *rnndoff = lua_touserdata(L, 1);
+       const char *name = lua_tostring(L, 2);
+       struct rnndelem *elem = rnndoff->elem;
+       struct rnntypeinfo *info = &elem->typeinfo;
+       struct rnnbitfield **bitfields;
+       int bitfieldsnum;
+       int i;
+
+       switch (info->type) {
+       case RNN_TTYPE_BITSET:
+               bitfields = info->ebitset->bitfields;
+               bitfieldsnum = info->ebitset->bitfieldsnum;
+               break;
+       case RNN_TTYPE_INLINE_BITSET:
+               bitfields = info->bitfields;
+               bitfieldsnum = info->bitfieldsnum;
+               break;
+       default:
+               printf("invalid register type: %d\n", info->type);
+               return 0;
+       }
+
+       /* lua_tostring() returns NULL for keys that are neither strings nor
+        * numbers; neither strcmp() nor printf("%s") may see that
+        */
+       if (!name) {
+               printf("invalid member: %s\n", "(non-string key)");
+               return 0;
+       }
+
+       for (i = 0; i < bitfieldsnum; i++) {
+               struct rnnbitfield *bf = bitfields[i];
+               if (!strcmp(name, bf->name)) {
+                       uint32_t regval = rnn_val(rnndoff->rnn, rnndoff->offset);
+
+                       /* isolate the field, then apply its declared shift */
+                       regval &= typeinfo_mask(&bf->typeinfo);
+                       regval >>= bf->typeinfo.low;
+                       regval <<= bf->typeinfo.shr;
+
+                       DBG("name=%s, info=%p, subelemsnum=%d, type=%d, regval=%x",
+                                       name, info, rnndoff->elem->subelemsnum,
+                                       bf->typeinfo.type, regval);
+
+                       return pushdecval(L, rnndoff->rnn, regval, &bf->typeinfo);
+               }
+       }
+
+       printf("invalid member: %s\n", name);
+       return 0;
+}
+
+/* __tostring metamethod of a register object: pushes the decoded form of
+ * the current value when type information is available, otherwise the
+ * value as 8 hex digits.  If formatting fails, nil is pushed instead.
+ */
+static int l_rnn_reg_meta_tostring(lua_State *L)
+{
+       struct rnndoff *rnndoff = lua_touserdata(L, 1);
+       uint32_t regval = rnn_val(rnndoff->rnn, rnndoff->offset);
+       struct rnndecaddrinfo *info = rnn_reginfo(rnndoff->rnn, rnndoff->offset);
+       char *decoded = NULL;
+       if (info && info->typeinfo) {
+               decoded = rnndec_decodeval(rnndoff->rnn->vc,
+                               info->typeinfo, regval);
+       } else if (asprintf(&decoded, "%08x", regval) < 0) {
+               /* asprintf() leaves the pointer undefined on failure */
+               decoded = NULL;
+       }
+       /* lua_pushstring() pushes nil for a NULL string */
+       lua_pushstring(L, decoded);
+       free(decoded);
+       if (info) {
+               free(info->name);
+               free(info);
+       }
+       return 1;
+}
+
+/* __tonumber metamethod of a register object: pushes the current value,
+ * shifted left by the register's 'shr', as a Lua number.
+ */
+static int l_rnn_reg_meta_tonumber(lua_State *L)
+{
+       struct rnndoff *reg = lua_touserdata(L, 1);
+       uint32_t raw = rnn_val(reg->rnn, reg->offset);
+       uint32_t shifted = raw << reg->elem->typeinfo.shr;
+
+       lua_pushnumber(L, shifted);
+       return 1;
+}
+
+/* metamethods installed on the "rnnmetareg" metatable */
+static const struct luaL_Reg l_meta_rnn_reg[] = {
+       {"__index", l_rnn_reg_meta_index},
+       {"__tostring", l_rnn_reg_meta_tostring},
+       {"__tonumber", l_rnn_reg_meta_tonumber},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* Push a register object for 'elem' at 'offset': an rnndoff userdata
+ * carrying the "rnnmetareg" metatable.  Always pushes exactly one value.
+ */
+static int l_rnn_etype_reg(lua_State *L, struct rnn *rnn,
+               struct rnndelem *elem, uint64_t offset)
+{
+       push_rnndoff(L, rnn, elem, offset);
+
+       /* metatable ends up above the userdata, then gets attached to it */
+       luaL_newmetatable(L, "rnnmetareg");
+       luaL_setfuncs(L, l_meta_rnn_reg, 0);
+       lua_setmetatable(L, -2);
+
+       return 1;
+}
+
+/*
+ * Top-level rnn object ('r = rnn.init(...)'):
+ */
+
+/* __index metamethod of the rnn object: 'r.REGNAME'.  Looks the element up
+ * by name and pushes its Lua representation; pushes nothing when the key
+ * is not a string/number or the name is unknown.
+ */
+static int l_rnn_meta_index(lua_State *L)
+{
+       struct rnn *rnn = lua_touserdata(L, 1);
+       const char *name = lua_tostring(L, 2);
+       struct rnndelem *elem;
+
+       /* lua_tostring() returns NULL for keys that are neither strings nor
+        * numbers; the name lookup compares with strcmp()
+        */
+       if (!name)
+               return 0;
+
+       elem = rnn_regelem(rnn, name);
+       if (!elem)
+               return 0;
+
+       return l_rnn_etype(L, rnn, elem, elem->offset);
+}
+
+/* __gc metamethod of the rnn object.  Currently a stub: nothing created by
+ * _rnn_init() is released here.
+ */
+static int l_rnn_meta_gc(lua_State *L)
+{
+       // TODO
+       //struct rnn *rnn = lua_touserdata(L, 1);
+       //rnn_deinit(rnn);
+       return 0;
+}
+
+/* metamethods installed on the "rnnmeta" metatable by l_rnn_init() */
+static const struct luaL_Reg l_meta_rnn[] = {
+       {"__index", l_rnn_meta_index},
+       {"__gc", l_rnn_meta_gc},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* rnn.init(gpuname): create an rnn object for the named gpu and return it.
+ * Raises a Lua error when the argument is not a string (or number).
+ */
+static int l_rnn_init(lua_State *L)
+{
+       const char *gpuname = luaL_checkstring(L, 1);
+       struct rnndec *rnndec = lua_newuserdata(L, sizeof(*rnndec));
+
+       /* userdata memory is not zeroed, and rnn_load() fills in nothing at
+        * all for a gpu name it does not recognise; start from a known state
+        * so dom[] and dwords never hold garbage pointers
+        */
+       memset(rnndec, 0, sizeof(*rnndec));
+
+       _rnn_init(&rnndec->base, 0);
+       rnn_load(&rnndec->base, gpuname);
+       /* sizedwords == 0: decode live register values, not a packet */
+       rnndec->sizedwords = 0;
+
+       luaL_newmetatable(L, "rnnmeta");
+       luaL_setfuncs(L, l_meta_rnn, 0);
+       lua_pop(L, 1);
+
+       luaL_setmetatable(L, "rnnmeta");
+
+       return 1;
+}
+
+/* rnn.enumname(r, enum, val): push the name rnn_enumname() finds for 'val'
+ * in the named enum.
+ */
+static int l_rnn_enumname(lua_State *L)
+{
+       struct rnn *rnn = lua_touserdata(L, 1);
+       const char *enum_name = lua_tostring(L, 2);
+       uint32_t val = (uint32_t)lua_tonumber(L, 3);
+       const char *label = rnn_enumname(rnn, enum_name, val);
+
+       lua_pushstring(L, label);
+       return 1;
+}
+
+/* rnn.regname(r, regbase): push the name rnn_regname() returns for the
+ * register, using the colored context.
+ */
+static int l_rnn_regname(lua_State *L)
+{
+       struct rnn *rnn = lua_touserdata(L, 1);
+       uint32_t regbase = (uint32_t)lua_tonumber(L, 2);
+       const char *label = rnn_regname(rnn, regbase, 1);
+
+       lua_pushstring(L, label);
+       return 1;
+}
+
+/* rnn.regval(r, regbase, regval): push 'regval' decoded as the register at
+ * 'regbase' when type information is available, otherwise as 8 hex digits.
+ * If formatting fails, nil is pushed instead.
+ */
+static int l_rnn_regval(lua_State *L)
+{
+       struct rnn *rnn = lua_touserdata(L, 1);
+       uint32_t regbase = (uint32_t)lua_tonumber(L, 2);
+       uint32_t regval = (uint32_t)lua_tonumber(L, 3);
+       struct rnndecaddrinfo *info = rnn_reginfo(rnn, regbase);
+       char *decoded = NULL;
+       if (info && info->typeinfo) {
+               decoded = rnndec_decodeval(rnn->vc, info->typeinfo, regval);
+       } else if (asprintf(&decoded, "%08x", regval) < 0) {
+               /* asprintf() leaves the pointer undefined on failure */
+               decoded = NULL;
+       }
+       /* lua_pushstring() pushes nil for a NULL string */
+       lua_pushstring(L, decoded);
+       free(decoded);
+       if (info) {
+               free(info->name);
+               free(info);
+       }
+       return 1;
+}
+
+/* functions of the "rnn" library table registered by script_load() */
+static const struct luaL_Reg l_rnn[] = {
+       {"init", l_rnn_init},
+       {"enumname", l_rnn_enumname},
+       {"regname", l_rnn_regname},
+       {"regval", l_rnn_regval},
+       {NULL, NULL}  /* sentinel */
+};
+
+
+
+/* Expose the register state to the script environment as a "regs" library:
+ */
+
+/* regs.written(regbase): push the result of reg_written() as a number. */
+static int l_reg_written(lua_State *L)
+{
+       lua_Number arg = lua_tonumber(L, 1);
+
+       lua_pushnumber(L, reg_written((uint32_t)arg));
+       return 1;
+}
+
+/* regs.lastval(regbase): push the result of reg_lastval() as a number. */
+static int l_reg_lastval(lua_State *L)
+{
+       lua_Number arg = lua_tonumber(L, 1);
+
+       lua_pushnumber(L, reg_lastval((uint32_t)arg));
+       return 1;
+}
+
+/* regs.val(regbase): push the result of reg_val() as a number. */
+static int l_reg_val(lua_State *L)
+{
+       lua_Number arg = lua_tonumber(L, 1);
+
+       lua_pushnumber(L, reg_val((uint32_t)arg));
+       return 1;
+}
+
+/* functions of the "regs" library table registered by script_load() */
+static const struct luaL_Reg l_regs[] = {
+       {"written", l_reg_written},
+       {"lastval", l_reg_lastval},
+       {"val",     l_reg_val},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* Expose API to lookup snapshot buffers:
+ */
+
+uint64_t gpubaseaddr(uint64_t gpuaddr);
+unsigned hostlen(uint64_t gpuaddr);
+
+/* bos.base(addr): given an address, push the base address of the buffer
+ * containing it (as reported by gpubaseaddr()).
+ */
+static int l_bo_base(lua_State *L)
+{
+       lua_Number arg = lua_tonumber(L, 1);
+       uint64_t base = gpubaseaddr((uint64_t)arg);
+
+       lua_pushnumber(L, base);
+       return 1;
+}
+
+/* bos.size(addr): given an address, push the remaining size of the buffer
+ * from that address on (as reported by hostlen()).
+ */
+static int l_bo_size(lua_State *L)
+{
+       lua_Number arg = lua_tonumber(L, 1);
+       unsigned remaining = hostlen((uint64_t)arg);
+
+       lua_pushnumber(L, remaining);
+       return 1;
+}
+
+/* functions of the "bos" library table registered by script_load() */
+static const struct luaL_Reg l_bos[] = {
+       {"base", l_bo_base},
+       {"size", l_bo_size},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* Create a table holding the functions listed in 'reg' and publish it as
+ * the global variable 'lib' of the script state.
+ */
+static void openlib(const char *lib, const luaL_Reg *reg)
+{
+       lua_createtable(L, 0, 0);
+       luaL_setfuncs(L, reg, 0);
+       lua_setglobal(L, lib);
+}
+
+/* called at start to load the script:
+ *
+ * Creates the Lua state, registers the "bos", "regs" and "rnn" libraries and
+ * runs the script's top-level chunk.  Returns 0 on success and -1 if the
+ * Lua state cannot be created; a script that fails to load or run
+ * terminates the process via error().
+ */
+int script_load(const char *file)
+{
+       int ret;
+
+       assert(!L);
+
+       L = luaL_newstate();
+       if (!L) {
+               /* luaL_newstate() returns NULL on allocation failure */
+               fprintf(stderr, "could not create lua state\n");
+               return -1;
+       }
+       luaL_openlibs(L);
+       openlib("bos", l_bos);
+       openlib("regs", l_regs);
+       openlib("rnn", l_rnn);
+
+       ret = luaL_loadfile(L, file);
+       if (ret)
+               error("%s\n");
+
+       ret = lua_pcall(L, 0, LUA_MULTRET, 0);
+       if (ret)
+               error("%s\n");
+
+       return 0;
+}
+
+
+/* called at start of each cmdstream file:
+ *
+ * Invokes the script's global start_cmdstream(name) if it is a function;
+ * does nothing when no script is loaded or no such handler exists.
+ */
+void script_start_cmdstream(const char *name)
+{
+       if (!L)
+               return;
+
+       lua_getglobal(L, "start_cmdstream");
+
+       if (lua_isfunction(L, -1)) {
+               lua_pushstring(L, name);
+
+               /* 1 argument, no results */
+               if (lua_pcall(L, 1, 0, 0) != 0)
+                       error("error running function `f': %s\n");
+       } else {
+               /* no handler: drop whatever the lookup produced */
+               lua_pop(L, 1);
+       }
+}
+
+/* called at each DRAW_INDX; invokes the script's global
+ * draw(primtype, nindx) function, if there is one, to process the
+ * current state
+ */
+void script_draw(const char *primtype, uint32_t nindx)
+{
+       if (!L)
+               return;
+
+       lua_getglobal(L, "draw");
+
+       if (lua_isfunction(L, -1)) {
+               lua_pushstring(L, primtype);
+               lua_pushnumber(L, nindx);
+
+               /* 2 arguments, no results */
+               if (lua_pcall(L, 2, 0, 0) != 0)
+                       error("error running function `f': %s\n");
+       } else {
+               /* no handler: drop whatever the lookup produced */
+               lua_pop(L, 1);
+       }
+}
+
+
+/* __index metamethod of a packet object: 'pkt[n]'.  The key is read as a
+ * number and used as the offset of the element to look up; pushes nothing
+ * when no element exists at that offset.
+ */
+static int l_rnn_meta_dom_index(lua_State *L)
+{
+       struct rnn *rnn = lua_touserdata(L, 1);
+       uint32_t offset = (uint32_t)lua_tonumber(L, 2);
+       struct rnndelem *elem;
+
+       /* TODO might be nicer if the arg isn't a number, to search the domain
+        * for matching bitfields.. so that the script could do something like
+        * 'pkt.WIDTH' instead of 'pkt[1].WIDTH', ie. not have to remember the
+        * offset of the dword containing the bitfield..
+        */
+
+       elem = rnn_regoff(rnn, offset);
+       if (!elem)
+               return 0;
+
+       return l_rnn_etype(L, rnn, elem, elem->offset);
+}
+
+/*
+ * A wrapper object for rnndomain based decoding of an array of dwords
+ * (ie. for pm4 packet decoding).  Mostly re-uses the register-value
+ * decoding for the individual dwords and bitfields.
+ */
+
+/* __gc metamethod of the packet object.  Currently a stub that releases
+ * nothing.
+ */
+static int l_rnn_meta_dom_gc(lua_State *L)
+{
+       // TODO
+       //struct rnn *rnn = lua_touserdata(L, 1);
+       //rnn_deinit(rnn);
+       return 0;
+}
+
+/* metamethods installed on the "rnnmetadom" metatable by script_packet() */
+static const struct luaL_Reg l_meta_rnn_dom[] = {
+       {"__index", l_rnn_meta_dom_index},
+       {"__gc", l_rnn_meta_dom_gc},
+       {NULL, NULL}  /* sentinel */
+};
+
+/* called for general pm4 packet decoding, such as texture/sampler state
+ *
+ * If the script defines a global function named after the domain, it is
+ * called as handler(pkt, sizedwords), where pkt is a packet object that
+ * decodes 'dwords' using 'dom' only.
+ */
+void script_packet(uint32_t *dwords, uint32_t sizedwords,
+               struct rnn *rnn, struct rnndomain *dom)
+{
+       if (!L)
+               return;
+
+       lua_getglobal(L, dom->name);
+
+       /* if no handler for the packet, just ignore it: */
+       if (!lua_isfunction(L, -1)) {
+               lua_pop(L, 1);
+               return;
+       }
+
+       /* first argument: copy of the caller's rnn state, restricted to the
+        * packet's domain and pointing at the packet dwords
+        */
+       struct rnndec *pkt = lua_newuserdata(L, sizeof(*pkt));
+
+       pkt->base = *rnn;
+       pkt->base.dom[0] = dom;
+       pkt->base.dom[1] = NULL;
+       pkt->dwords = dwords;
+       pkt->sizedwords = sizedwords;
+
+       /* metatable ends up above the userdata, then gets attached to it */
+       luaL_newmetatable(L, "rnnmetadom");
+       luaL_setfuncs(L, l_meta_rnn_dom, 0);
+       lua_setmetatable(L, -2);
+
+       /* second argument: packet size in dwords */
+       lua_pushnumber(L, sizedwords);
+
+       if (lua_pcall(L, 2, 0, 0) != 0)
+               error("error running function `f': %s\n");
+}
+
+/* helper to call a script fxn that takes and returns void; does nothing
+ * when no script is loaded or the global 'name' is not a function
+ */
+static void simple_call(const char *name)
+{
+       if (!L)
+               return;
+
+       lua_getglobal(L, name);
+
+       if (lua_isfunction(L, -1)) {
+               /* no arguments, no results */
+               if (lua_pcall(L, 0, 0, 0) != 0)
+                       error("error running function `f': %s\n");
+       } else {
+               /* no handler: drop whatever the lookup produced */
+               lua_pop(L, 1);
+       }
+}
+
+/* called at end of each cmdstream file:
+ * (runs the script's global end_cmdstream(), if defined)
+ */
+void script_end_cmdstream(void)
+{
+       simple_call("end_cmdstream");
+}
+
+/* called at start of submit/issueibcmds:
+ * (runs the script's global start_submit(), if defined)
+ */
+void script_start_submit(void)
+{
+       simple_call("start_submit");
+}
+
+/* called at end of submit/issueibcmds:
+ * (runs the script's global end_submit(), if defined)
+ */
+void script_end_submit(void)
+{
+       simple_call("end_submit");
+}
+
+/* called after last cmdstream file:
+ *
+ * Runs the script's global finish(), if defined, then closes the Lua state
+ * and clears L so later script_*() calls become no-ops.
+ */
+void script_finish(void)
+{
+       if (L) {
+               simple_call("finish");
+
+               lua_close(L);
+               L = NULL;
+       }
+}
diff --git a/src/freedreno/decode/script.h b/src/freedreno/decode/script.h
new file mode 100644 (file)
index 0000000..d14b69a
--- /dev/null
@@ -0,0 +1,76 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef SCRIPT_H_
+#define SCRIPT_H_
+
+#include <stdint.h>
+
+
+// XXX make script support optional
+// (ENABLE_SCRIPTING is unconditionally defined for now, so the #else
+// branch below is never compiled)
+#define ENABLE_SCRIPTING 1
+
+#ifdef ENABLE_SCRIPTING
+
+/* called at start to load the script: */
+int script_load(const char *file);
+
+/* called at start of each cmdstream file: */
+void script_start_cmdstream(const char *name);
+
+/* called at each DRAW_INDX, calls script drawidx fxn to process
+ * the current state
+ *
+ * NOTE(review): declared weak, presumably so that callers linked without
+ * script.c can test the symbol before calling it -- confirm.
+ */
+__attribute__((weak))
+void script_draw(const char *primtype, uint32_t nindx);
+
+struct rnn;
+struct rnndomain;
+/* called per packet: if the script defines a function named after the
+ * packet's domain (dom->name), that function is called with a decoder
+ * object for the packet's dwords plus the packet size in dwords.
+ * Declared weak like script_draw().
+ */
+__attribute__((weak))
+void script_packet(uint32_t *dwords, uint32_t sizedwords,
+               struct rnn *rnn, struct rnndomain *dom);
+
+/* maybe at some point it is interesting to add additional script
+ * hooks for CP_EVENT_WRITE, etc?
+ */
+
+/* called at end of each cmdstream file: */
+void script_end_cmdstream(void);
+
+/* called at start and at end of each submit/issueibcmds: */
+void script_start_submit(void);
+void script_end_submit(void);
+
+/* called after last cmdstream file: */
+void script_finish(void);
+
+#else
+// TODO no-op stubs..
+#endif
+
+
+#endif /* SCRIPT_H_ */
diff --git a/src/freedreno/decode/scripts/analyze.lua b/src/freedreno/decode/scripts/analyze.lua
new file mode 100644 (file)
index 0000000..27e97ec
--- /dev/null
@@ -0,0 +1,178 @@
+-- A script that compares a set of equivalent cmdstream captures from
+-- various generations, looking for equivalencies between registers.
+--
+-- This would be run across a group of similar tests for various
+-- generations, for example:
+--
+--   cffdump --script scripts/analyze.lua a320/quad-flat-*.rd a420/quad-flat-*.rd
+--
+-- This is done by comparing unique register values.  Ie. for each
+-- generation, find the set of registers that have different values
+-- between equivalent draw calls.
+
+local posix = require "posix"
+
+io.write("Analyzing Data...\n")
+
+-- results - table structure:
+-- * [gpuname] - gpu
+--   * tests
+--     * [testname] - current test
+--       * draws
+--         * [0..n-1] - the draws (zero-based, see draw())
+--           * primtype - the primitive type
+--           * regs - table of values for draw
+--             * [regbase] - regval
+--   * regvals - table of unique values across all draws
+--     * [regbase]
+--       * [regval] - list of test names
+--         * [1..n] - testname "." didx
+local results = {}
+
+-- state of the cmdstream file currently being parsed; set up by
+-- start_cmdstream() and cleared again by end_cmdstream():
+local test = nil      -- results[gpuname].tests[testname]
+local gpuname = nil   -- basename of the directory containing the file
+local testname = nil  -- basename of the file
+
+
+-- Number of key/value pairs in tbl.  Unlike the '#' operator this also
+-- counts non-sequence keys, so it works for sparse and zero-based tables.
+function tblsz(tbl)
+  local count = 0
+  for _ in pairs(tbl) do
+    count = count + 1
+  end
+  return count
+end
+
+
+-- Hook called with the name of each cmdstream file: the file's basename
+-- becomes the test name and the basename of its directory the gpu name.
+-- Creates the gpu entry in results on first use and registers a fresh,
+-- empty test record for this file.
+function start_cmdstream(name)
+  testname = posix.basename(name)
+  gpuname = posix.basename(posix.dirname(name))
+
+  -- first file seen for this gpu?
+  if results[gpuname] == nil then
+    results[gpuname] = {tests = {}, regvals = {}}
+  end
+
+  test = {draws = {}}
+  results[gpuname].tests[testname] = test
+end
+
+-- Hook called for each draw: snapshot every register that has been
+-- written (regs.written(reg) ~= 0) into a new draw record of the current
+-- test, and note for each (register, value) pair which draw it was seen
+-- in.  Draw indices are zero-based.
+function draw(primtype, nindx)
+  -- RECTLIST is only used internally.. we want to ignore it for
+  -- now, although it could potentially be interesting to track
+  -- these separately (separating clear/restore/resolve) just to
+  -- figure out which registers are used for which..
+  if primtype == "DI_PT_RECTLIST" then
+    return
+  end
+
+  local snapshot = {}
+  local didx = tblsz(test["draws"])
+  test["draws"][didx] = {primtype = primtype, regs = snapshot}
+
+  -- populate current regs.  For now just consider ones that have
+  -- been written.. maybe we need to make that configurable in
+  -- case it filters out too many registers.
+  for regbase = 0, 0xffff do
+    if regs.written(regbase) ~= 0 then
+      local regval = regs.val(regbase)
+
+      -- per-draw view:
+      snapshot[regbase] = regval
+
+      -- per-gpu view, regbase -> regval -> list of "test.draw" names:
+      local gpu_regvals = results[gpuname]["regvals"]
+      local by_value = gpu_regvals[regbase]
+      if by_value == nil then
+        by_value = {}
+        gpu_regvals[regbase] = by_value
+      end
+      local seen_in = by_value[regval]
+      if seen_in == nil then
+        seen_in = {}
+        by_value[regval] = seen_in
+      end
+      seen_in[#seen_in + 1] = testname .. "." .. didx
+    end
+  end
+
+  -- TODO maybe we want to whitelist a few well known regs, for the
+  -- convenience of the code that runs at the end to analyze the data?
+  -- TODO also would be useful to somehow capture CP_SET_BIN..
+
+end
+
+-- Hook called at the end of each cmdstream file: drop the per-file
+-- state again (the collected data stays in results).
+function end_cmdstream()
+  test, gpuname, testname = nil, nil, nil
+end
+
+-- Debug helper: print, for one gpu, every test together with the number
+-- of draws recorded for it and the primitive type of each draw.
+function print_draws(gpuname, gpu)
+  io.write("  " .. gpuname .. "\n")
+  for testname,test in pairs(gpu["tests"]) do
+    -- draws are stored zero-based (see draw()), so '#' would not count
+    -- index 0; count the entries explicitly instead:
+    local ndraws = tblsz(test["draws"])
+    io.write("    " .. testname .. ", draws=" .. ndraws .. "\n")
+    -- walk the indices in order, pairs() gives no ordering guarantee:
+    for didx = 0,ndraws-1 do
+      local draw = test["draws"][didx]
+      io.write("      " .. didx .. ": " .. draw["primtype"] .. "\n")
+    end
+  end
+end
+
+-- sort and concat a list of draw names to form a key which can be
+-- compared to other drawlists to check for equality.  The names are
+-- joined with ":"; an empty list yields nil.  The caller's list is left
+-- untouched (a copy is sorted).
+-- TODO maybe we instead want a scheme that allows for some fuzzyness
+-- in the matching??
+function drawlistname(drawlist)
+  local names = {}
+  for _,draw in pairs(drawlist) do
+    names[#names + 1] = draw
+  end
+  if #names == 0 then
+    return nil
+  end
+  -- sorting makes the key independent of the order in which the draws
+  -- were recorded:
+  table.sort(names)
+  return table.concat(names, ":")
+end
+
+-- cache of rnn handles, indexed by gpu name, so that rnn.init() is
+-- called at most once per gpu:
+local rnntbl = {}
+
+-- For every gpu, print each register/value pair whose draw-list key (see
+-- drawlistname()) equals `name`, ie. the values that were seen in
+-- exactly that set of draws.
+function dumpmatches(name)
+  for gpuname,gpu in pairs(results) do
+    local r = rnntbl[gpuname]
+    if r == nil then
+      io.write("loading rnn database: " .. gpuname .. "\n")
+      r = rnn.init(gpuname)
+      rnntbl[gpuname] = r
+    end
+    for regbase,regvals in pairs(gpu["regvals"]) do
+      for regval,drawlist in pairs(regvals) do
+        local name2 = drawlistname(drawlist)
+        if name == name2 then
+          io.write(string.format("  %s:%s:\t%08x  %s\n",
+                                 gpuname, rnn.regname(r, regbase),
+                                 regval, rnn.regval(r, regbase, regval)))
+        end
+      end
+    end
+  end
+end
+
+-- Hook called after the last cmdstream file: for every distinct
+-- draw-list key found in the collected data, print the key once,
+-- followed by all matching registers of all gpus (see dumpmatches()).
+function finish()
+  -- draw-list keys that have been printed already:
+  local done = {}
+
+  for _,gpu in pairs(results) do
+    for _,regvals in pairs(gpu["regvals"]) do
+      for _,drawlist in pairs(regvals) do
+        local key = drawlistname(drawlist)
+        if done[key] == nil then
+          io.write("\n" .. key .. ":\n")
+          dumpmatches(key)
+          done[key] = 1
+        end
+      end
+    end
+  end
+end
+
diff --git a/src/freedreno/decode/scripts/parse-submits.lua b/src/freedreno/decode/scripts/parse-submits.lua
new file mode 100644 (file)
index 0000000..1d21716
--- /dev/null
@@ -0,0 +1,413 @@
+-- Parse cmdstream dump and analyse blits and batches
+
+--local posix = require "posix"
+
+-- C-style printf: format with string.format() and write the result to
+-- the default output; returns whatever io.write() returns.
+function printf(fmt, ...)
+       local text = string.format(fmt, ...)
+       return io.write(text)
+end
+
+-- Debug trace; a no-op unless the printf() call below is un-commented.
+-- Note that the arguments at the call sites are evaluated either way.
+function dbg(fmt, ...)
+       --printf(fmt, ...)
+end
+
+printf("Analyzing Data...\n")
+
+-- handle returned by rnn.init() for "a630"; the hooks below read register
+-- state through it by name (eg. r.RB_BLIT_BASE_GMEM):
+local r = rnn.init("a630")
+
+-- Each submit, all draws will target the same N MRTs:
+local mrts = {}
+local allmrts = {}  -- includes historical render targets
+
+-- Record a render target, keyed by its base address, both in the
+-- current set (mrts) and in the historical set (allmrts).  An earlier
+-- entry with the same base is replaced.
+function push_mrt(fmt, w, h, samples, base, flag, gmem)
+       dbg("MRT: %s %ux%u 0x%x\n", fmt, w, h, base)
+
+       local target = {
+               format = fmt,
+               w = w,
+               h = h,
+               samples = samples,
+               base = base,
+               flag = flag,
+               gmem = gmem,
+       }
+
+       mrts[base] = target
+       allmrts[base] = target
+end
+
+-- And each draw will read from M sources/textures:
+local sources = {}
+
+-- Record a source/texture, keyed by its base address; an earlier entry
+-- with the same base is replaced.
+function push_source(fmt, w, h, samples, base, flag)
+       dbg("SRC: %s %ux%u 0x%x\n", fmt, w, h, base)
+
+       sources[base] = {
+               format = fmt,
+               w = w,
+               h = h,
+               samples = samples,
+               base = base,
+               flag = flag,
+       }
+end
+
+-- State of the batch currently being accumulated; (re)initialized by
+-- reset() and reported by finish():
+local binw          -- bin width, taken from VSC_BIN_SIZE in draw()
+local binh          -- bin height, taken from VSC_BIN_SIZE in draw()
+local nbins         -- VSC_BIN_COUNT.NX * VSC_BIN_COUNT.NY
+local blits = 0     -- number of blits counted by handle_blit()
+local draws = 0     -- number of draws counted by draw()
+local drawmode      -- mode string of the current pass, or "BLIT"
+local cleared       -- set of gmem/sysmem addresses that were cleared
+local restored      -- set of gmem addresses that were restored
+local resolved      -- set of gmem addresses that were resolved
+local nullbatch     -- true until the submit has seen a draw or blit
+local depthtest
+local depthwrite
+local stenciltest
+local stencilwrite
+
+-- Hook called with the name of each cmdstream file; only announces it.
+function start_cmdstream(name)
+       printf("Parsing %s\n", name)
+end
+
+-- Forget everything accumulated for the current batch: render targets,
+-- sources, draw/blit counters, clear/restore/resolve sets, depth/stencil
+-- flags and the draw mode.  allmrts (the historical render targets) and
+-- nullbatch are deliberately left alone.
+function reset()
+       dbg("reset\n")
+       mrts = {}
+       sources = {}
+       draws = 0
+       blits = 0
+       cleared = {}
+       restored = {}
+       resolved = {}
+       depthtest = false
+       depthwrite = false
+       stenciltest = false
+       stencilwrite = false
+       -- 'nil' literal rather than reading an (undefined) global 'Nil':
+       drawmode = nil
+end
+
+-- Hook called at the start of each submit: start from a clean batch
+-- state and assume a null batch until a draw or blit shows up.
+function start_submit()
+       dbg("start_submit\n")
+       reset()
+       nullbatch = true
+end
+
+-- Print a summary of the batch (or blit) accumulated since the last
+-- reset() and then reset the batch state.  Called from end_submit(),
+-- from draw()/handle_blit() when a new pass starts, and -- under the
+-- name "finish" -- as the script's end-of-run hook.
+function finish()
+       dbg("finish\n")
+
+       printf("\n")
+
+       -- TODO we get false-positives for 'NULL BATCH!' because we don't have
+       -- a really good way to differentiate between submits and cmds.  Ie.
+       -- with growable cmdstream, and a large # of tiles, IB1 can get split
+       -- across multiple buffers.  Since we ignore GMEM draws for window-
+       -- offset != 0,0, the later cmds will appear as null batches
+       if draws == 0 and blits == 0 then
+               if nullbatch then
+                       printf("NULL BATCH!\n");
+               end
+               return
+       end
+
+       if draws > 0 then
+               printf("Batch:\n")
+               printf("-------\n")
+               printf("  # of draws: %u\n", draws)
+               printf("  mode: %s\n", drawmode)
+               if drawmode == "RM6_GMEM" then
+                       printf("  bin size: %ux%u (%u bins)\n", binw, binh, nbins)
+               end
+               if depthtest or depthwrite then
+                       printf("  ")
+                       if depthtest then
+                               printf("DEPTHTEST ")
+                       end
+                       if depthwrite then
+                               printf("DEPTHWRITE")
+                       end
+                       printf("\n")
+               end
+               if stenciltest or stencilwrite then
+                       printf("  ")
+                       if stenciltest then
+                               printf("STENCILTEST ")
+                       end
+                       if stencilwrite then
+                               printf("STENCILWRITE")
+                       end
+                       printf("\n")
+               end
+       else
+               printf("Blit:\n")
+               printf("-----\n")
+       end
+
+       for base,mrt in pairs(mrts) do
+               printf("  MRT[0x%x:0x%x]:\t%ux%u\t\t%s (%s)", base, mrt.flag, mrt.w, mrt.h, mrt.format, mrt.samples)
+               if drawmode == "RM6_GMEM" then
+                       -- in GMEM mode the sets are keyed by gmem address:
+                       if cleared[mrt.gmem] then
+                               printf("\tCLEARED")
+                       end
+                       if restored[mrt.gmem] then
+                               printf("\tRESTORED")
+                       end
+                       if resolved[mrt.gmem] then
+                               printf("\tRESOLVED")
+                       end
+               else
+                       if cleared[mrt.base] then
+                               printf("\tCLEARED")
+                       end
+               end
+               printf("\n")
+       end
+
+       -- local, so that calling finish() does not (re)define a global:
+       local function print_source(source)
+               printf("  SRC[0x%x:0x%x]:\t%ux%u\t\t%s (%s)\n", source.base, source.flag, source.w, source.h, source.format, source.samples)
+       end
+
+       for base,source in pairs(sources) do
+               -- only show sources that have been previously rendered to, other
+               -- textures are less interesting.  Possibly this should be an
+               -- option somehow.  Small batches (fewer than 10 draws, which
+               -- includes pure blits) show all of their sources.
+               if draws < 10 or allmrts[base] or
+                               (source.flag and allmrts[source.flag]) then
+                       print_source(source)
+               end
+       end
+       reset()
+end
+
+-- Hook called at the end of each submit: report whatever batch is still
+-- pending.
+function end_submit()
+       dbg("end_submit\n")
+       finish()
+end
+
+-- Track the current mode:
+-- (holds the MARKER field of the most recent CP_SET_MARKER packet; users
+-- convert it with tostring() before comparing against mode names)
+local mode = ""
+function CP_SET_MARKER(pkt, size)
+       mode = pkt[0].MARKER
+       dbg("mode: %s\n", mode)
+end
+
+-- Packet hook for CP_EVENT_WRITE; only BLIT events are of interest.
+-- In RM6_RESOLVE mode the blit's gmem base is marked resolved.  In
+-- RM6_GMEM mode it is marked restored (empty clear mask) or cleared, and
+-- the blit target is pushed as a render target.  Any other mode is
+-- reported as confusing.
+function CP_EVENT_WRITE(pkt, size)
+       if tostring(pkt[0].EVENT) ~= "BLIT" then
+               return
+       end
+       nullbatch = false
+
+       local curmode = tostring(mode)
+       if curmode == "RM6_RESOLVE" then
+               resolved[r.RB_BLIT_BASE_GMEM] = 1
+               return
+       end
+       if curmode ~= "RM6_GMEM" then
+               printf("I am confused!!!\n")
+               return
+       end
+
+       -- either clear or restore:
+       if r.RB_BLIT_INFO.CLEAR_MASK == 0 then
+               restored[r.RB_BLIT_BASE_GMEM] = 1
+       else
+               cleared[r.RB_BLIT_BASE_GMEM] = 1
+       end
+
+       -- push_mrt() because we could have GMEM
+       -- passes with only a clear and no draws:
+       local flag_addr = 0
+       local sysmem_addr = 0
+
+       -- try to match up the GMEM addr with the MRT/DEPTH state,
+       -- to avoid relying on RB_BLIT_DST also getting written
+       -- (NOTE falling back to RB_BLIT_DST_LO/HI and RB_BLIT_FLAG_DST_LO/HI
+       -- when nothing matches can get confused by previous blits, so that
+       -- is not done):
+       for n = 0,r.RB_FS_OUTPUT_CNTL1.MRT-1 do
+               if r.RB_MRT[n].BASE_GMEM == r.RB_BLIT_BASE_GMEM then
+                       sysmem_addr = r.RB_MRT[n].BASE_LO | (r.RB_MRT[n].BASE_HI << 32)
+                       flag_addr = r.RB_MRT_FLAG_BUFFER[n].ADDR_LO | (r.RB_MRT_FLAG_BUFFER[n].ADDR_HI << 32)
+                       break
+               end
+       end
+       if sysmem_addr == 0 and r.RB_BLIT_BASE_GMEM == r.RB_DEPTH_BUFFER_BASE_GMEM then
+               sysmem_addr = r.RB_DEPTH_BUFFER_BASE_LO | (r.RB_DEPTH_BUFFER_BASE_HI << 32)
+               flag_addr = r.RB_DEPTH_FLAG_BUFFER_BASE_LO | (r.RB_DEPTH_FLAG_BUFFER_BASE_HI << 32)
+       end
+
+       -- without the FLAGS bit any flag address found above is ignored:
+       if not r.RB_BLIT_DST_INFO.FLAGS then
+               flag_addr = 0
+       end
+
+       -- TODO maybe just emit RB_BLIT_DST_LO/HI for clears.. otherwise
+       -- we get confused by stale values in registers.. not sure
+       -- if this is a problem w/ blob
+       push_mrt(r.RB_BLIT_DST_INFO.COLOR_FORMAT,
+               r.RB_BLIT_SCISSOR_BR.X + 1,
+               r.RB_BLIT_SCISSOR_BR.Y + 1,
+               r.RB_BLIT_DST_INFO.SAMPLES,
+               sysmem_addr,
+               flag_addr,
+               r.RB_BLIT_BASE_GMEM)
+end
+
+-- Packet hook for texture state: record the texture as a source, using
+-- the 64b base and flag addresses assembled from their LO/HI fields.
+function A6XX_TEX_CONST(pkt, size)
+       local fmt = pkt[0].FMT
+       local width = pkt[1].WIDTH
+       local height = pkt[1].HEIGHT
+       local samples = pkt[0].SAMPLES
+       local base = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+       local flag = pkt[7].FLAG_LO | (pkt[8].FLAG_HI << 32)
+
+       push_source(fmt, width, height, samples, base, flag)
+end
+
+-- Handle a 2D blit (called from draw() for BLIT_OP_SCALE).  Blits that
+-- look like resolves are only noted in `resolved`.  Any other blit is
+-- reported as a batch of its own: a pending batch with draws is
+-- finished first, then the blit's destination (and, unless it is a
+-- solid-color clear, its source) is recorded and reported via finish().
+function handle_blit()
+       -- blob sometimes uses CP_BLIT for resolves, so filter those out:
+       -- TODO it would be nice to not hard-code GMEM addr:
+       -- TODO I guess the src can be an offset from GMEM addr..
+       if r.SP_PS_2D_SRC_LO == 0x100000 and not r.RB_2D_BLIT_CNTL.SOLID_COLOR then
+               resolved[0] = 1
+               return
+       end
+
+       if draws > 0 then
+               finish()
+       end
+       reset()
+       drawmode = "BLIT"
+
+       local dst = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+       local dst_flags = r.RB_2D_DST_FLAGS_LO | (r.RB_2D_DST_FLAGS_HI << 32)
+
+       -- This kinda assumes that we are doing full img blits, which is maybe
+       -- Not completely legit.  We could perhaps instead just track pitch and
+       -- size/pitch??  Or maybe the size doesn't matter much
+       push_mrt(r.RB_2D_DST_INFO.COLOR_FORMAT,
+               r.GRAS_2D_DST_BR.X + 1,
+               r.GRAS_2D_DST_BR.Y + 1,
+               "MSAA_ONE",
+               dst,
+               dst_flags,
+               -1)
+
+       if r.RB_2D_BLIT_CNTL.SOLID_COLOR then
+               -- a solid-color blit is a clear of the destination:
+               dbg("CLEAR=%x\n", dst)
+               cleared[dst] = 1
+       else
+               push_source(r.SP_2D_SRC_FORMAT.COLOR_FORMAT,
+                       r.GRAS_2D_SRC_BR_X.X + 1,
+                       r.GRAS_2D_SRC_BR_Y.Y + 1,
+                       "MSAA_ONE",
+                       r.SP_PS_2D_SRC_LO | (r.SP_PS_2D_SRC_HI << 32),
+                       r.SP_PS_2D_SRC_FLAGS_LO | (r.SP_PS_2D_SRC_FLAGS_HI << 32))
+       end
+
+       blits = blits + 1
+       finish()
+end
+
+-- True when going from curmode to newmode stays within the same render
+-- pass, ie. for BINNING->GMEM and GMEM->RESOLVE; false otherwise.
+function valid_transition(curmode, newmode)
+       local binning_to_gmem = curmode == "RM6_BINNING" and newmode == "RM6_GMEM"
+       local gmem_to_resolve = curmode == "RM6_GMEM" and newmode == "RM6_RESOLVE"
+       return binning_to_gmem or gmem_to_resolve
+end
+
+-- Draw hook.  BLIT_OP_SCALE is handed to handle_blit() and EVENT:BLIT is
+-- ignored (CP_EVENT_WRITE() deals with those).  For real draws: detect
+-- the start of a new render pass from the mode change, then record the
+-- render targets, depth buffer and depth/stencil usage of the draw and
+-- count it.  In GMEM mode only draws at window offset 0,0 are counted.
+function draw(primtype, nindx)
+       dbg("draw: %s (%s)\n", primtype, mode)
+       nullbatch = false
+       if primtype == "BLIT_OP_SCALE" then
+               handle_blit()
+               return
+       elseif primtype == "EVENT:BLIT" then
+               return
+       end
+
+       local m = tostring(mode)
+
+       -- detect changes in drawmode which indicate a different
+       -- pass..  BINNING->GMEM means same pass, but other
+       -- transitions mean different pass:
+       if drawmode and m ~= drawmode then
+               dbg("%s -> %s transition\n", drawmode, m)
+               if not valid_transition(drawmode, m) then
+                       dbg("invalid transition, new render pass!\n")
+                       finish()
+                       reset()
+               end
+       end
+
+       if m ~= "RM6_GMEM" and m ~= "RM6_BYPASS" then
+               if m == "RM6_BINNING" then
+                       drawmode = m
+                       return
+               end
+               -- (EVENT:BLIT has already been filtered out above, so no
+               -- special case is needed for RM6_RESOLVE here)
+               printf("unknown MODE %s for primtype %s\n", m, primtype)
+               return
+       end
+
+       -- Only count the first tile for GMEM mode to avoid counting
+       -- each draw for each tile
+       if m == "RM6_GMEM" then
+               if r.RB_WINDOW_OFFSET.X ~= 0 or r.RB_WINDOW_OFFSET.Y ~= 0 then
+                       return
+               end
+       end
+
+       drawmode = m
+
+       -- RB_RENDER_COMPONENTS.RT0 .. RT7, indexed by render target:
+       local render_components = {}
+       for n = 0,7 do
+               render_components[n] = r.RB_RENDER_COMPONENTS["RT" .. n]
+       end
+       for n = 0,r.RB_FS_OUTPUT_CNTL1.MRT-1 do
+               if render_components[n] ~= 0 then
+                       push_mrt(r.RB_MRT[n].BUF_INFO.COLOR_FORMAT,
+                               r.GRAS_SC_SCREEN_SCISSOR[0].BR.X + 1,
+                               r.GRAS_SC_SCREEN_SCISSOR[0].BR.Y + 1,
+                               r.RB_MSAA_CNTL.SAMPLES,
+                               r.RB_MRT[n].BASE_LO | (r.RB_MRT[n].BASE_HI << 32),
+                               r.RB_MRT_FLAG_BUFFER[n].ADDR_LO | (r.RB_MRT_FLAG_BUFFER[n].ADDR_HI << 32),
+                               r.RB_MRT[n].BASE_GMEM)
+               end
+       end
+
+       local depthbase = r.RB_DEPTH_BUFFER_BASE_LO |
+                       (r.RB_DEPTH_BUFFER_BASE_HI << 32)
+
+       -- the depth buffer, if there is one, is tracked like an MRT:
+       if depthbase ~= 0 then
+               push_mrt(r.RB_DEPTH_BUFFER_INFO.DEPTH_FORMAT,
+                       r.GRAS_SC_SCREEN_SCISSOR[0].BR.X + 1,
+                       r.GRAS_SC_SCREEN_SCISSOR[0].BR.Y + 1,
+                       r.RB_MSAA_CNTL.SAMPLES,
+                       depthbase,
+                       r.RB_DEPTH_FLAG_BUFFER_BASE_LO | (r.RB_DEPTH_FLAG_BUFFER_BASE_HI << 32),
+                       r.RB_DEPTH_BUFFER_BASE_GMEM)
+       end
+
+       if r.RB_DEPTH_CNTL.Z_WRITE_ENABLE then
+               depthwrite = true
+       end
+
+       if r.RB_DEPTH_CNTL.Z_ENABLE then
+               depthtest = true
+       end
+
+       -- clearly 0 != false.. :-/
+       if r.RB_STENCILWRMASK.WRMASK ~= 0 then
+               stencilwrite = true
+       end
+
+       if r.RB_STENCIL_CONTROL.STENCIL_ENABLE then
+               stenciltest = true
+       end
+
+       -- TODO should also check for stencil buffer for z32+s8 case
+
+       if m == "RM6_GMEM" then
+               binw = r.VSC_BIN_SIZE.WIDTH
+               binh = r.VSC_BIN_SIZE.HEIGHT
+               nbins = r.VSC_BIN_COUNT.NX * r.VSC_BIN_COUNT.NY
+       end
+
+       draws = draws + 1
+end
+
diff --git a/src/freedreno/decode/scripts/sanity-a6xx.lua b/src/freedreno/decode/scripts/sanity-a6xx.lua
new file mode 100644 (file)
index 0000000..68e4c73
--- /dev/null
@@ -0,0 +1,76 @@
+-- Parse cmdstream dump and check for common errors
+--  1) Check for overflowing HLSQ_xS_CNTL.CONSTLEN
+--  2) Check for constant uploads that overwrite each other.  The
+--     range checking is reset on each draw, since it is a valid
+--     use-case to do partial constant upload.  But if we see two
+--     CP_LOAD_STATE* that overwrite the same range of constants
+--     within the same draw, that is almost certainly unintentional.
+--
+-- TODO add more checks
+-- TODO maybe some parts could be shared across
+--      different generations
+
+--local posix = require "posix"
+
+-- C-style printf: format with string.format() and write the result to
+-- the default output; returns whatever io.write() returns.
+function printf(fmt, ...)
+       local text = string.format(fmt, ...)
+       return io.write(text)
+end
+
+-- Debug trace; a no-op unless the printf() call below is un-commented.
+-- Note that the arguments at the call sites are evaluated either way.
+function dbg(fmt, ...)
+       --printf(fmt, ...)
+end
+
+-- shader stage names, as compared against tostring(pkt[0].STATE_BLOCK)
+-- of a CP_LOAD_STATE6 packet:
+stages = {
+       "SB6_VS_SHADER",
+       "SB6_HS_SHADER",
+       "SB6_DS_SHADER",
+       "SB6_GS_SHADER",
+       "SB6_FS_SHADER",
+       "SB6_CS_SHADER",
+}
+
+-- maps shader stage to HLSQ_xS_CNTL register name:
+cntl_regs = {
+       ["SB6_VS_SHADER"] = "HLSQ_VS_CNTL",
+       ["SB6_HS_SHADER"] = "HLSQ_HS_CNTL",
+       ["SB6_DS_SHADER"] = "HLSQ_DS_CNTL",
+       ["SB6_GS_SHADER"] = "HLSQ_GS_CNTL",
+       ["SB6_FS_SHADER"] = "HLSQ_FS_CNTL",
+       ["SB6_CS_SHADER"] = "HLSQ_CS_CNTL",
+}
+
+-- constant ranges uploaded so far:
+--   constranges[stagename] -> table of offsets that have been uploaded
+constranges = {}
+
+-- give every known stage a fresh, empty table of uploaded offsets
+function reset_constranges()
+       for idx = 1,#stages do
+               constranges[stages[idx]] = {}
+       end
+end
+
+-- start out with empty ranges:
+reset_constranges()
+
+printf("Checking cmdstream...\n")
+
+-- handle returned by rnn.init() for "a630"; CP_LOAD_STATE6() reads the
+-- HLSQ_xS_CNTL registers through it by name:
+local r = rnn.init("a630")
+
+-- Draw hook: announce the draw and start over with empty constant
+-- ranges for the next one.
+function draw(primtype, nindx)
+       printf("draw!\n")
+       -- reset ranges of uploaded consts on each draw:
+       reset_constranges()
+end
+
+-- Packet hook for CP_LOAD_STATE6: for constant uploads (ST6_CONSTANTS),
+-- check that the end of the uploaded range (DST_OFF + NUM_UNIT) does not
+-- exceed CONSTLEN of the stage's HLSQ_xS_CNTL register, and print an
+-- error if it does.  Uploads for state blocks without an entry in
+-- cntl_regs are skipped.
+function CP_LOAD_STATE6(pkt, size)
+       if tostring(pkt[0].STATE_TYPE) ~= "ST6_CONSTANTS" then
+               return
+       end
+       dbg("got CP_LOAD_STATE6\n")
+
+       -- locals, so that nothing leaks into the script's global namespace:
+       local stage = tostring(pkt[0].STATE_BLOCK)
+       local cntl_reg = cntl_regs[stage]
+       if cntl_reg == nil then
+               -- not one of the known shader stages; there is no CONSTLEN
+               -- to compare against, and r[nil] must not be indexed
+               dbg("no HLSQ_xS_CNTL register known for %s\n", stage)
+               return
+       end
+
+       local max = pkt[0].DST_OFF + pkt[0].NUM_UNIT
+       local constlen = r[cntl_reg].CONSTLEN
+       dbg("looking for %s.. max=%d vs %d\n", cntl_reg, max, constlen)
+       if max > constlen then
+               printf("ERROR: invalid max constant offset for stage %s: %d vs %d\n", stage, max, constlen)
+       end
+
+end
diff --git a/src/freedreno/decode/scripts/test.lua b/src/freedreno/decode/scripts/test.lua
new file mode 100644 (file)
index 0000000..e9d8db2
--- /dev/null
@@ -0,0 +1,31 @@
+-- runs once, when the script is loaded:
+io.write("HELLO WORLD\n")
+
+-- handle returned by rnn.init() for "a630"; draw() reads registers
+-- through it by name:
+r = rnn.init("a630")
+
+-- Hook called with the name of each cmdstream file; just prints it.
+function start_cmdstream(name)
+  io.write("START: " .. name .. "\n")
+end
+
+-- Draw hook: prints the draw's arguments, then exercises named register
+-- access through `r` and the regs.written()/lastval()/val() accessors
+-- for register offset 0x2280.
+function draw(primtype, nindx)
+  io.write("DRAW: " .. primtype .. ", " .. nindx .. "\n")
+  -- io.write("GRAS_CL_VPORT_XOFFSET: " .. r.GRAS_CL_VPORT_XOFFSET .. "\n")
+  io.write("RB_MRT[0].CONTROL.ROP_CODE: " .. r.RB_MRT[0].CONTROL.ROP_CODE .. "\n")
+  io.write("SP_VS_OUT[0].A_COMPMASK: " .. r.SP_VS_OUT[0].A_COMPMASK .. "\n")
+  --io.write("RB_DEPTH_CONTROL.Z_ENABLE: " .. tostring(r.RB_DEPTH_CONTROL.Z_ENABLE) .. "\n")
+  io.write("0x2280: written=" .. regs.written(0x2280) .. ", lastval=" .. regs.lastval(0x2280) .. ", val=" .. regs.val(0x2280) .. "\n")
+end
+
+-- Packet hook for texture state: prints the size argument and the
+-- WIDTH/HEIGHT fields of dword 1.
+function A6XX_TEX_CONST(pkt, size)
+  io.write("\n-------- " .. size .. "\n")
+  io.write("-------- w=" .. pkt[1].WIDTH .. ", h=" .. pkt[1].HEIGHT .. "\n")
+  io.write("\n");
+end
+
+-- Hook called at the end of each cmdstream file.
+function end_cmdstream()
+  io.write("END\n")
+end
+
+-- Hook called once, after the last cmdstream file.
+function finish()
+  io.write("FINISH\n")
+end
+
diff --git a/src/freedreno/decode/scripts/tex3d-layout.lua b/src/freedreno/decode/scripts/tex3d-layout.lua
new file mode 100644 (file)
index 0000000..2d5069f
--- /dev/null
@@ -0,0 +1,137 @@
+-- Parse logs from test-quad-textured-3d.c to extract layer/level
+-- offsets
+--
+-- We figure out the offsets from blits, but there may be some
+-- unrelated blits.  So just save all of them until we find the
+-- texture state for the 3d texture.  This gives us the base
+-- address, and the miplevel #0 width/height/depth.  Then work
+-- backwards from there finding the blits to the same dst buffer
+-- and deducing the miplevel from the minified dimensions
+
+local posix = require "posix"
+
+io.write("Analyzing Data...\n")
+
+-- every blit seen in the current cmdstream file, at indices
+-- 0..nallblits-1 (filled in by draw(), emptied by start_cmdstream()):
+local allblits = {}
+local nallblits = 0
+-- handle returned by rnn.init() for "a630", used to read registers by name:
+local r = rnn.init("a630")
+
+-- Size of a dimension `lvls` mipmap levels down: val shifted right by
+-- lvls, but never less than 1.
+function minify(val, lvls)
+  local shifted = val >> lvls
+  if shifted >= 1 then
+    return shifted
+  end
+  return 1
+end
+
+-- C-style printf: format with string.format() and write the result to
+-- the default output; returns whatever io.write() returns.
+function printf(fmt, ...)
+  local text = string.format(fmt, ...)
+  return io.write(text)
+end
+
+-- Hook called with the name of each cmdstream file: announce it and
+-- forget the blits collected from the previous file.
+function start_cmdstream(name)
+  io.write("Parsing " .. name .. "\n")
+  allblits = {}
+  nallblits = 0
+end
+
+-- Draw hook: remember every BLIT_OP_SCALE blit whose destination
+-- rectangle starts at 0,0, recording its size, pitch, destination
+-- address and the base of the buffer containing that address.
+function draw(primtype, nindx)
+  if primtype ~= "BLIT_OP_SCALE" then
+    return
+  end
+
+  -- Just in case, filter out anything that isn't starting
+  -- at 0,0
+  if not (r.GRAS_2D_DST_TL.X == 0 and r.GRAS_2D_DST_TL.Y == 0) then
+    return
+  end
+
+  local width  = r.GRAS_2D_DST_BR.X + 1
+  local height = r.GRAS_2D_DST_BR.Y + 1
+  local pitch  = r.RB_2D_DST_SIZE.PITCH
+  local addr   = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+
+  -- append at the next free zero-based slot:
+  allblits[nallblits] = {
+    width   = width,
+    height  = height,
+    pitch   = pitch,
+    addr    = addr,
+    base    = bos.base(addr),
+    endaddr = 0,  -- filled in later
+  }
+  nallblits = nallblits + 1
+end
+
+-- Packet hook for texture state.  For a texture with DEPTH > 1, pick the
+-- recorded blits that target the texture's buffer at or above its base
+-- address and print, per blit, the mip level / layer it appears to be,
+-- its size, pitch and offset from the texture base, plus (where the next
+-- blit's address is known) the layer size and resulting aligned height.
+function A6XX_TEX_CONST(pkt, size)
+  -- ignore any texture state w/ DEPTH=1, these aren't the 3d tex state we
+  -- are looking for
+  if pkt[5].DEPTH <= 1 then
+    return
+  end
+
+  local base = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+  local width0  = pkt[1].WIDTH
+  local height0 = pkt[1].HEIGHT
+  local depth0  = pkt[5].DEPTH
+
+  printf("Found texture state: %ux%ux%u (MIN_LAYERSZ=0x%x)\n",
+         width0, height0, depth0, pkt[3].MIN_LAYERSZ)
+
+  -- Note that in some case the texture has some extra page or so
+  -- at the beginning:
+  local basebase = bos.base(base)
+  printf("base: 0x%x (0x%x)\n", base, basebase)
+
+  -- see if we can find the associated blits..  The blob always seems to
+  -- start from the lower (larger) mipmap levels and layers, so we don't
+  -- need to sort by dst address.  Also, while we are at it, fill in the
+  -- end-addr (at least for everything but the last blit)
+  local blits = {}
+  local nblits = 0
+  local lastblit = nil
+  for n = 0,nallblits-1 do
+    local blit = allblits[n]
+    --printf("blit addr: 0x%x (0x%x)\n", blit.addr, blit.base)
+    if blit.base == basebase and blit.addr >= base then
+      blits[nblits] = blit
+      nblits = nblits + 1
+      -- the previous matching blit ends where this one starts; note this
+      -- writes into the record shared with allblits
+      if lastblit then
+        lastblit.endaddr = blit.addr
+      end
+      lastblit = blit
+    end
+  end
+
+  -- now go thru the relevant blits and print out interesting details
+  local level = 0
+  local layer = 0
+  local w = width0   -- track current width/height to detect changing
+  local h = height0  -- mipmap level
+  for n = 0,nblits-1 do
+    local blit = blits[n]
+    --printf("%u: %ux%u, addr=%x\n", n, blit.width, blit.height, blit.addr)
+    -- a change in size means we moved on to the next mipmap level:
+    if w ~= blit.width or h ~= blit.height then
+      level = level + 1
+      layer = 0
+
+      -- the new size is expected to be the previous one minified by one
+      -- level; complain (but carry on) if it is not
+      if blit.width ~= minify(w, 1) or blit.height ~= minify(h, 1) then
+        printf("I am confused! %ux%u vs %ux%u\n", blit.width, blit.height, minify(w, 1), minify(h, 1))
+       printf("addr=%x\n", blit.addr)
+        --return
+      end
+
+      w = blit.width
+      h = blit.height
+    end
+
+    printf("level=%u, layer=%u, sz=%ux%u, pitch=%u, offset=0x%x, addr=%x",
+           level, layer, w, h, blit.pitch, blit.addr - base, blit.addr)
+    -- endaddr is still 0 for the last matching blit (nothing follows it)
+    if blit.endaddr ~= 0 then
+      local layersz = blit.endaddr - blit.addr
+      local alignedheight = layersz / blit.pitch
+      printf(", layersz=0x%x, alignedheight=%f", layersz, alignedheight)
+    end
+    printf("\n")
+
+    layer = layer + 1
+  end
+  printf("\n\n")
+end
+
diff --git a/src/freedreno/decode/scripts/texturator-to-unit-test-5xx.lua b/src/freedreno/decode/scripts/texturator-to-unit-test-5xx.lua
new file mode 100644 (file)
index 0000000..b0ac8cb
--- /dev/null
@@ -0,0 +1,200 @@
+-- Parse logs from https://github.com/freedreno/freedreno/
+-- test-texturator.c to generate a src/freedreno/fdl/fd5_layout_test.c
+-- block.  We figure out the offsets from blits, but there may be some
+-- unrelated blits.  So just save all of them until we find the
+-- texture state.  This gives us the base address, and the miplevel #0
+-- width/height/depth.  Then work backwards from there finding the
+-- blits to the same dst buffer and deducing the miplevel from the
+-- minified dimensions
+
+local posix = require "posix"
+
+io.write("Analyzing Data...\n")
+
+local r = rnn.init("a530")
+local found_tex = 0
+
+local allblits = {}
+local nallblits = 0
+
+-- Find the recorded blit with the lowest dst address whose .base equals
+-- `base` and whose dimensions are exactly width x height.  Returns the
+-- blit table, or nil when no recorded blit matches.
+function get_first_blit(base, width, height)
+  local best = nil
+
+  for idx = 0, nallblits - 1 do
+    local cand = allblits[idx]
+    local same_target = cand.base == base
+    local same_size = cand.width == width and cand.height == height
+    if same_target and same_size and (best == nil or cand.addr < best.addr) then
+      best = cand
+    end
+  end
+
+  return best
+end
+
+-- Shift `val` right by `lvls` bits (one halving per mip level), never
+-- returning anything smaller than 1.
+function minify(val, lvls)
+  local shrunk = val >> lvls
+  if shrunk >= 1 then
+    return shrunk
+  end
+  return 1
+end
+
+-- C-style printf: format the arguments with string.format and write the
+-- result to the default output, returning whatever io.write returns.
+function printf(fmt, ...)
+  local text = string.format(fmt, ...)
+  return io.write(text)
+end
+
+-- Announce the cmdstream being parsed and forget every blit recorded for
+-- the previous one.
+function start_cmdstream(name)
+  local banner = "Parsing " .. name .. "\n"
+  io.write(banner)
+  nallblits = 0
+  allblits = {}
+end
+
+-- CP_EVENT_WRITE handler: when the event is a BLIT, snapshot the resolve
+-- destination registers into a blit record and append it to allblits.
+-- Any other event is ignored.
+function CP_EVENT_WRITE(pkt, size)
+  local event = tostring(pkt[0].EVENT)
+  if event == "BLIT" then
+    local rec = {}
+
+    rec.width   = r.RB_RESOLVE_CNTL_2.X + 1
+    rec.height  = r.RB_RESOLVE_CNTL_2.Y + 1
+    rec.pitch   = r.RB_BLIT_DST_PITCH
+    rec.addr    = r.RB_BLIT_DST_LO | (r.RB_BLIT_DST_HI << 32)
+    rec.base    = bos.base(rec.addr)
+    rec.ubwc_addr = r.RB_BLIT_FLAG_DST_LO | (r.RB_BLIT_FLAG_DST_HI << 32)
+    rec.ubwc_base = bos.base(rec.ubwc_addr)
+    rec.ubwc_pitch = r.RB_BLIT_FLAG_DST_PITCH
+    rec.endaddr = 0  -- filled in later
+    printf("Found event blit: 0x%x (0x%x) %dx%d UBWC 0x%x (0x%x) tiled %s\n",
+           rec.addr, rec.base, rec.width, rec.height,
+           rec.ubwc_addr, rec.ubwc_base, r.RB_RESOLVE_CNTL_3.TILED)
+
+    -- allblits is a 0-based list; nallblits is the next free slot
+    allblits[nallblits] = rec
+    nallblits = nallblits + 1
+  end
+end
+
+-- CP_BLIT handler: record the 2D destination state of a blit whose source
+-- rectangle starts at 0,0 (anything else is filtered out, just in case).
+function CP_BLIT(pkt, size)
+  if pkt[1].SRC_X1 == 0 and pkt[1].SRC_Y1 == 0 then
+    local rec = {}
+
+    rec.width   = pkt[2].SRC_X2 + 1
+    rec.height  = pkt[2].SRC_Y2 + 1
+    rec.pitch   = r.RB_2D_DST_SIZE.PITCH
+    rec.addr    = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+    rec.base    = bos.base(rec.addr)
+    rec.ubwc_addr = r.RB_2D_DST_FLAGS_LO | (r.RB_2D_DST_FLAGS_HI << 32)
+    rec.ubwc_base = bos.base(rec.ubwc_addr)
+    rec.ubwc_pitch = r.RB_2D_DST_FLAGS_PITCH
+    rec.endaddr = 0  -- filled in later
+    printf("Found cp blit: 0x%x (0x%x) %dx%d UBWC 0x%x (0x%x) %s\n",
+           rec.addr, rec.base, rec.width, rec.height,
+           rec.ubwc_addr, rec.ubwc_base, r.RB_2D_DST_INFO.TILE_MODE)
+
+    -- allblits is a 0-based list; nallblits is the next free slot
+    allblits[nallblits] = rec
+    nallblits = nallblits + 1
+  end
+end
+
+-- A5XX_TEX_CONST handler: for the first texture state seen, print the
+-- texture parameters and then a C initializer block (format, layout,
+-- per-miplevel slices) built from the blits recorded so far.  Every
+-- later texture state is ignored because found_tex latches.
+function A5XX_TEX_CONST(pkt, size)
+  -- only the first texture state is converted; bail out before decoding
+  if (found_tex ~= 0) then
+    return
+  end
+
+  local base = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+  -- UBWC base on a5xx seems to be at the start of each miplevel, followed by pixels
+  -- somewhere past that.
+  local ubwc_base = base
+  local width0  = pkt[1].WIDTH
+  local height0 = pkt[1].HEIGHT
+  local depth0  = pkt[5].DEPTH
+
+  found_tex = 1
+
+  printf("Found texture state:\n  %ux%ux%u (%s, %s, UBWC=%s)\n",
+         width0, height0, depth0, pkt[0].FMT, pkt[0].TILE_MODE, tostring(pkt[3].FLAG))
+
+  -- Note that in some case the texture has some extra page or so
+  -- at the beginning:
+  local basebase = bos.base(base)
+  printf("base: 0x%x (0x%x)\n", base, basebase)
+  printf("ubwcbase: 0x%x (0x%x)\n", ubwc_base, bos.base(ubwc_base))
+
+  -- see if we can find the associated blits..  The blob always seems to
+  -- start from the lower (larger) mipmap levels and layers, so we don't
+  -- need to sort by dst address.  Also, while we are at it, fill in the
+  -- end-addr (at least for everything but the last blit)
+  --
+  -- NOTE: blits/nblits are not read again below; the visible effect of
+  -- this loop is the endaddr written into the shared blit records.
+  local blits = {}
+  local nblits = 0
+  local lastblit = nil
+  for n = 0,nallblits-1 do
+    local blit = allblits[n]
+    --printf("blit addr: 0x%x (0x%x)\n", blit.addr, blit.base)
+    if blit.base == basebase and blit.addr >= base then
+      blits[nblits] = blit
+      nblits = nblits + 1
+      if lastblit then
+        lastblit.endaddr = blit.addr
+      end
+      lastblit = blit
+    end
+  end
+
+  printf("     {\n")
+  printf("             .format = %s,\n", pkt[0].FMT)
+  if (tostring(pkt[2].TYPE) == "A5XX_TEX_3D") then
+    printf("           .is_3d = true,\n")
+  end
+
+  printf("             .layout = {\n")
+  printf("                     .tile_mode = %s,\n", pkt[0].TILE_MODE)
+  printf("                     .ubwc = %s,\n", tostring(pkt[3].FLAG))
+
+  if (tostring(pkt[2].TYPE) == "A5XX_TEX_3D") then
+    printf("                   .width0 = %d, .height0 = %d, .depth0 = %d,\n", width0, height0, depth0)
+  else
+    printf("                   .width0 = %d, .height0 = %d,\n", width0, height0)
+  end
+
+  -- one entry per miplevel that has a matching blit, walking down until
+  -- the minified size reaches 1x1 (w/h declared inside the repeat body
+  -- are still in scope for the until condition)
+  printf("                     .slices = {\n")
+  local level = 0
+  repeat
+    local w = minify(width0, level)
+    local h = minify(height0, level)
+    local blit = get_first_blit(basebase, w, h)
+    if blit then
+      printf("                         { .offset = %d, .pitch = %u },\n",
+          blit.addr - base,
+          blit.pitch);
+    end
+    level = level + 1
+  until w == 1 and h == 1
+  printf("                     },\n")
+
+  if pkt[3].FLAG then
+    printf("                   .ubwc_slices = {\n")
+    level = 0
+    repeat
+      local w = minify(width0, level)
+      local h = minify(height0, level)
+      local blit = get_first_blit(basebase, w, h)
+      if blit then
+        printf("                               { .offset = %d, .pitch = %u },\n",
+            blit.ubwc_addr - ubwc_base,
+            blit.ubwc_pitch);
+      end
+      level = level + 1
+    until w == 1 and h == 1
+    printf("                   },\n")
+  end
+
+  printf("             },\n")
+  printf("     },\n")
+  printf("\n\n")
+end
+
diff --git a/src/freedreno/decode/scripts/texturator-to-unit-test.lua b/src/freedreno/decode/scripts/texturator-to-unit-test.lua
new file mode 100644 (file)
index 0000000..8836d59
--- /dev/null
@@ -0,0 +1,179 @@
+-- Parse logs from https://github.com/freedreno/freedreno/
+-- test-texturator.c to generate a src/freedreno/fdl/fd6_layout_test.c
+-- block.  We figure out the offsets from blits, but there may be some
+-- unrelated blits.  So just save all of them until we find the
+-- texture state.  This gives us the base address, and the miplevel #0
+-- width/height/depth.  Then work backwards from there finding the
+-- blits to the same dst buffer and deducing the miplevel from the
+-- minified dimensions
+
+local posix = require "posix"
+
+io.write("Analyzing Data...\n")
+
+local r = rnn.init("a630")
+local found_tex = 0
+
+local allblits = {}
+local nallblits = 0
+
+-- Find the recorded blit with the lowest dst address whose .base equals
+-- `base` and whose dimensions are exactly width x height.  Returns the
+-- blit table, or nil when no recorded blit matches.
+function get_first_blit(base, width, height)
+  local best = nil
+
+  for idx = 0, nallblits - 1 do
+    local cand = allblits[idx]
+    local same_target = cand.base == base
+    local same_size = cand.width == width and cand.height == height
+    if same_target and same_size and (best == nil or cand.addr < best.addr) then
+      best = cand
+    end
+  end
+
+  return best
+end
+
+-- Shift `val` right by `lvls` bits (one halving per mip level), never
+-- returning anything smaller than 1.
+function minify(val, lvls)
+  local shrunk = val >> lvls
+  if shrunk >= 1 then
+    return shrunk
+  end
+  return 1
+end
+
+-- C-style printf: format the arguments with string.format and write the
+-- result to the default output, returning whatever io.write returns.
+function printf(fmt, ...)
+  local text = string.format(fmt, ...)
+  return io.write(text)
+end
+
+-- Announce the cmdstream being parsed and forget every blit recorded for
+-- the previous one.
+function start_cmdstream(name)
+  local banner = "Parsing " .. name .. "\n"
+  io.write(banner)
+  nallblits = 0
+  allblits = {}
+end
+
+-- draw handler: record the 2D destination state of every BLIT_OP_SCALE
+-- whose destination rectangle starts at 0,0, appending a blit record to
+-- allblits.  Other primitive types are ignored.
+function draw(primtype, nindx)
+  if primtype ~= "BLIT_OP_SCALE" then
+    return
+  end
+
+  -- Just in case, filter out anything that isn't starting
+  -- at 0,0
+  if r.GRAS_2D_DST_TL.X ~= 0 or r.GRAS_2D_DST_TL.Y ~= 0 then
+    return
+  end
+
+  local blit = {}
+
+  blit.width   = r.GRAS_2D_DST_BR.X + 1
+  blit.height  = r.GRAS_2D_DST_BR.Y + 1
+  blit.pitch   = r.RB_2D_DST_SIZE.PITCH
+  blit.addr    = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+  blit.base    = bos.base(blit.addr)
+  blit.ubwc_addr = r.RB_2D_DST_FLAGS_LO | (r.RB_2D_DST_FLAGS_HI << 32)
+  -- look up the buffer base of the flag address just computed (the field
+  -- name was previously misspelled, so nil was passed to bos.base)
+  blit.ubwc_base = bos.base(blit.ubwc_addr)
+  blit.ubwc_pitch = r.RB_2D_DST_FLAGS_PITCH.PITCH
+  blit.endaddr = 0  -- filled in later
+  printf("Found blit: 0x%x (0x%x) %dx%d UBWC 0x%x (0x%x)\n", blit.addr, blit.base, blit.width, blit.height, blit.ubwc_addr, blit.ubwc_base)
+
+  -- allblits is a 0-based list; nallblits is the next free slot
+  allblits[nallblits] = blit
+  nallblits = nallblits + 1
+end
+
+-- A6XX_TEX_CONST handler: for the first texture state seen, print the
+-- texture parameters and then a C initializer block (format, layout,
+-- per-miplevel slices) built from the blits recorded so far.  Every
+-- later texture state is ignored because found_tex latches.
+function A6XX_TEX_CONST(pkt, size)
+  -- only the first texture state is converted; bail out before decoding
+  if (found_tex ~= 0) then
+    return
+  end
+
+  local base = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+  local ubwc_base = pkt[7].FLAG_LO | (pkt[8].FLAG_HI << 32)
+  local width0  = pkt[1].WIDTH
+  local height0 = pkt[1].HEIGHT
+  local depth0  = pkt[5].DEPTH
+
+  found_tex = 1
+
+  printf("Found texture state:\n  %ux%ux%u (%s, %s, MIN_LAYERSZ=0x%x, TILE_ALL=%s, UBWC=%s FLAG_LOG2=%ux%u)\n",
+         width0, height0, depth0, pkt[0].FMT, pkt[0].TILE_MODE, pkt[3].MIN_LAYERSZ, tostring(pkt[3].TILE_ALL), tostring(pkt[3].FLAG), pkt[10].FLAG_BUFFER_LOGW, pkt[10].FLAG_BUFFER_LOGH)
+
+  -- Note that in some case the texture has some extra page or so
+  -- at the beginning:
+  local basebase = bos.base(base)
+  printf("base: 0x%x (0x%x)\n", base, basebase)
+  printf("ubwcbase: 0x%x (0x%x)\n", ubwc_base, bos.base(ubwc_base))
+
+  -- see if we can find the associated blits..  The blob always seems to
+  -- start from the lower (larger) mipmap levels and layers, so we don't
+  -- need to sort by dst address.  Also, while we are at it, fill in the
+  -- end-addr (at least for everything but the last blit)
+  --
+  -- NOTE: blits/nblits are not read again below; the visible effect of
+  -- this loop is the endaddr written into the shared blit records.
+  local blits = {}
+  local nblits = 0
+  local lastblit = nil
+  for n = 0,nallblits-1 do
+    local blit = allblits[n]
+    --printf("blit addr: 0x%x (0x%x)\n", blit.addr, blit.base)
+    if blit.base == basebase and blit.addr >= base then
+      blits[nblits] = blit
+      nblits = nblits + 1
+      if lastblit then
+        lastblit.endaddr = blit.addr
+      end
+      lastblit = blit
+    end
+  end
+
+  printf("     {\n")
+  printf("             .format = %s,\n", pkt[0].FMT)
+  if (tostring(pkt[2].TYPE) == "A6XX_TEX_3D") then
+    printf("           .is_3d = true,\n")
+  end
+
+  printf("             .layout = {\n")
+  printf("                     .tile_mode = %s,\n", pkt[0].TILE_MODE)
+  printf("                     .ubwc = %s,\n", tostring(pkt[3].FLAG))
+
+  if (tostring(pkt[2].TYPE) == "A6XX_TEX_3D") then
+    -- field is named depth0, in line with width0/height0
+    printf("                   .width0 = %d, .height0 = %d, .depth0 = %d,\n", width0, height0, depth0)
+  else
+    printf("                   .width0 = %d, .height0 = %d,\n", width0, height0)
+  end
+
+  -- one entry per miplevel that has a matching blit, walking down until
+  -- the minified size reaches 1x1 (w/h declared inside the repeat body
+  -- are still in scope for the until condition)
+  printf("                     .slices = {\n")
+  local level = 0
+  repeat
+    local w = minify(width0, level)
+    local h = minify(height0, level)
+    local blit = get_first_blit(basebase, w, h)
+    if blit then
+      printf("                         { .offset = %d, .pitch = %u },\n",
+          blit.addr - base,
+          blit.pitch);
+    end
+    level = level + 1
+  until w == 1 and h == 1
+  printf("                     },\n")
+
+  if pkt[3].FLAG then
+    printf("                   .ubwc_slices = {\n")
+    level = 0
+    repeat
+      local w = minify(width0, level)
+      local h = minify(height0, level)
+      local blit = get_first_blit(basebase, w, h)
+      if blit then
+        printf("                               { .offset = %d, .pitch = %u },\n",
+            blit.ubwc_addr - ubwc_base,
+            blit.ubwc_pitch);
+      end
+      level = level + 1
+    until w == 1 and h == 1
+    printf("                   },\n")
+  end
+
+  printf("             },\n")
+  printf("     },\n")
+  printf("\n\n")
+end
+
diff --git a/src/freedreno/decode/util.h b/src/freedreno/decode/util.h
new file mode 100644 (file)
index 0000000..1ec0202
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2012-2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __UTIL_H__
+#define __UTIL_H__
+
+#include <ctype.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* old-style program binary XOR'd ascii w/ 0xff */
+#ifndef ASCII_XOR
+#  define ASCII_XOR 0
+#endif
+
+/*
+ * Return a string of `lvl` tab characters, for indenting nested output.
+ * `lvl` is clamped to 0..8 (the length of the backing string) so that an
+ * unexpected nesting depth can never index outside of it.  The returned
+ * pointer refers to static storage and must not be freed or modified.
+ */
+static inline const char *tab(int lvl)
+{
+       static const char TABS[] = "\t\t\t\t\t\t\t\t";
+       /* sizeof counts the terminating NUL; no <string.h> needed */
+       const int max_lvl = (int)sizeof(TABS) - 1;
+
+       if (lvl < 0) {
+               lvl = 0;
+       } else if (lvl > max_lvl) {
+               lvl = max_lvl;
+       }
+
+       return &TABS[max_lvl - lvl];
+}
+
+/* reinterpret the bit pattern of a dword as a float (no numeric conversion) */
+static inline float d2f(uint32_t d)
+{
+       union {
+               uint32_t bits;
+               float value;
+       } pun;
+
+       pun.bits = d;
+       return pun.value;
+}
+
+/*
+ * Print `sz` bytes from `buf` to stdout as little-endian 32-bit hex words,
+ * eight words per line, each line starting with a tab.  When `sz` is not
+ * a multiple of 4 the final word is zero-padded instead of being read
+ * from beyond the end of the buffer.
+ */
+static inline void dump_hex(const void *buf, int sz)
+{
+       const uint8_t *ptr = (const uint8_t *)buf;
+       const uint8_t *end = ptr + sz;
+       int i = 0;
+
+       while (ptr < end) {
+               uint32_t d = 0;
+
+               printf((i % 8) ? " " : "\t");
+
+               /* widen each byte before shifting so the top byte never
+                * shifts into the sign bit of a promoted int, and stop at
+                * the end of the buffer on a partial trailing word
+                */
+               for (int shift = 0; shift < 32 && ptr < end; shift += 8) {
+                       d |= (uint32_t)*(ptr++) << shift;
+               }
+
+               printf("%08x", d);
+
+               if ((i % 8) == 7) {
+                       printf("\n");
+               }
+
+               i++;
+       }
+
+       /* terminate a partially filled last line */
+       if (i % 8) {
+               printf("\n");
+       }
+}
+
+/*
+ * Print the complete little-endian dwords in `buf` (sz / 4 of them; any
+ * trailing 1..3 bytes are ignored) to stdout as floats, eight values per
+ * line, each line starting with a tab.
+ */
+static inline void
+dump_float(const void *buf, int sz)
+{
+       const uint8_t *ptr = (const uint8_t *)buf;
+       /* count whole words up front rather than forming `ptr + sz - 3`,
+        * which points before the buffer when sz < 3
+        */
+       const int nwords = (sz > 0) ? (sz / 4) : 0;
+
+       for (int i = 0; i < nwords; i++) {
+               uint32_t d = 0;
+
+               printf((i % 8) ? " " : "\t");
+
+               /* widen before shifting to avoid shifting into int's sign bit */
+               d |= (uint32_t)ptr[0] <<  0;
+               d |= (uint32_t)ptr[1] <<  8;
+               d |= (uint32_t)ptr[2] << 16;
+               d |= (uint32_t)ptr[3] << 24;
+               ptr += 4;
+
+               printf("%8f", d2f(d));
+
+               if ((i % 8) == 7) {
+                       printf("\n");
+               }
+       }
+
+       /* terminate a partially filled last line */
+       if (nwords % 8) {
+               printf("\n");
+       }
+}
+
+/*
+ * True when `c` is an ASCII character that is either a tab or not a
+ * control character.  NOTE: `c` is evaluated more than once, so pass an
+ * expression without side effects.
+ */
+#define is_ok_ascii(c) \
+       (isascii(c) && (((c) == '\t') || !iscntrl(c)))
+
+/* XOR every one of the `sz` bytes in `buf` with ASCII_XOR, in place. */
+static inline void
+clean_ascii(char *buf, int sz)
+{
+       uint8_t *bytes = (uint8_t *)buf;
+
+       for (int idx = 0; idx < sz; idx++) {
+               bytes[idx] ^= ASCII_XOR;
+       }
+}
+
+/*
+ * Print `sz` bytes from `buf` to stdout as text after XOR'ing each byte
+ * with ASCII_XOR.  Output is tab-indented: a newline byte starts a new
+ * indented line, a NUL byte prints a dashed separator line, and any byte
+ * rejected by is_ok_ascii() is shown as '?'.
+ */
+static inline void
+dump_ascii(const void *buf, int sz)
+{
+       uint8_t *cur = (uint8_t *)buf;
+       uint8_t *stop = cur + sz;
+
+       printf("\t");
+       for (; cur < stop; cur++) {
+               uint8_t ch = *cur ^ ASCII_XOR;
+
+               switch (ch) {
+               case '\n':
+                       printf("\n\t");
+                       break;
+               case '\0':
+                       printf("\n\t-----------------------------------\n\t");
+                       break;
+               default:
+                       if (is_ok_ascii(ch)) {
+                               printf("%c", ch);
+                       } else {
+                               printf("?");
+                       }
+                       break;
+               }
+       }
+       printf("\n");
+}
+
+/*
+ * Print `sz` bytes from `buf` to stdout as a hex dump indented by `level`
+ * tabs: a separator line, a byte-count line, then rows of four
+ * little-endian dwords prefixed with the byte offset and followed by an
+ * ASCII column (bytes XOR'd with ASCII_XOR, non-printable shown as '.').
+ * Neither column reads past the end of the buffer; a partial trailing
+ * dword is zero-padded in the hex column.
+ */
+static inline void
+dump_hex_ascii(const void *buf, int sz, int level)
+{
+       const uint8_t *ptr = (const uint8_t *)buf;
+       const uint8_t *end = ptr + sz;
+       const uint8_t *ascii = ptr;
+       int i = 0;
+
+       printf("%s-----------------------------------------------\n", tab(level));
+       printf("%s%d (0x%x) bytes\n", tab(level), sz, sz);
+
+       while (ptr < end) {
+               uint32_t d = 0;
+
+               if (i % 4) {
+                       printf(" ");
+               } else {
+                       printf("%s%06x: ", tab(level), (uint32_t)(ptr - (const uint8_t *)buf));
+               }
+
+               /* widen each byte before shifting so the top byte never
+                * shifts into the sign bit of a promoted int, and stop at
+                * the end of the buffer on a partial trailing dword
+                */
+               for (int shift = 0; shift < 32 && ptr < end; shift += 8) {
+                       d |= (uint32_t)*(ptr++) << shift;
+               }
+
+               printf("%08x", d);
+
+               if ((i % 4) == 3) {
+                       /* row complete: emit its ASCII column, bounded by
+                        * `end` in case the last dword was partial
+                        */
+                       printf("\t|");
+                       for (int j = 0; j < 16 && ascii < end; j++) {
+                               uint8_t c = *(ascii++);
+                               c ^= ASCII_XOR;
+                               printf("%c", (isascii(c) && !iscntrl(c)) ? c : '.');
+                       }
+                       printf("|\n");
+               }
+
+               i++;
+       }
+
+       if (i % 4) {
+               /* pad the missing dword columns so the ASCII column lines up */
+               for (int j = 4 - (i % 4); j > 0; j--) {
+                       printf("         ");
+               }
+               printf("\t|");
+               while (ascii < end) {
+                       uint8_t c = *(ascii++);
+                       c ^= ASCII_XOR;
+                       printf("%c", (isascii(c) && !iscntrl(c)) ? c : '.');
+               }
+               printf("|\n");
+       }
+}
+
+#endif /* __UTIL_H__ */
index 7b6ab53..6405a7d 100644 (file)
@@ -19,6 +19,7 @@
 # SOFTWARE.
 
 inc_freedreno = include_directories(['.', './registers'])
+inc_freedreno_rnn = include_directories('rnn')
 
 subdir('common')
 subdir('registers')
@@ -33,6 +34,7 @@ dep_libxml2 = dependency('libxml-2.0', required: false)
 # Everything that depends on rnn requires (indirectly) libxml2:
 if dep_libxml2.found()
   subdir('rnn')
+  subdir('decode')
 endif
 
 if with_tools.contains('drm-shim')