radeonsi: write shader asm annotated with wave info into GPU hang reports
authorMarek Olšák <marek.olsak@amd.com>
Tue, 29 Nov 2016 14:35:11 +0000 (15:35 +0100)
committerMarek Olšák <marek.olsak@amd.com>
Fri, 10 Feb 2017 10:27:50 +0000 (11:27 +0100)
Note that the disassembly is written twice - first the unmodified compiler
output and then the wave-annotated output only if there are waves executing
the shader.

Sample output from a real GPU hang most likely caused by image_sample:

The number of active waves = 28

Pixel Shader - annotated disassembly:
    s_mov_b64 s[6:7], exec                                        ; BE86017E [PC=0x10f3e3800, off=0, size=4]
    s_wqm_b64 exec, exec                                          ; BEFE077E [PC=0x10f3e3804, off=4, size=4]
...
    image_sample v[7:9], v[0:1], s[12:19], s[20:23] dmask:0x7     ; F0800700 00A30700 [PC=0x10f3e3a94, off=660, size=8]
    s_buffer_load_dword s20, s[0:3], 0x50                         ; C0220500 00000050 [PC=0x10f3e3a9c, off=668, size=8]
    s_load_dwordx4 s[24:27], s[4:5], 0x170                        ; C00A0602 00000170 [PC=0x10f3e3aa4, off=676, size=8]
    s_load_dwordx8 s[12:19], s[4:5], 0x140                        ; C00E0302 00000140 [PC=0x10f3e3aac, off=684, size=8]
    s_buffer_load_dword s11, s[0:3], 0x5c                         ; C02202C0 0000005C [PC=0x10f3e3ab4, off=692, size=8]
    s_buffer_load_dword s21, s[0:3], 0x54                         ; C0220540 00000054 [PC=0x10f3e3abc, off=700, size=8]
    s_buffer_load_dword s22, s[0:3], 0x58                         ; C0220580 00000058 [PC=0x10f3e3ac4, off=708, size=8]
    s_waitcnt vmcnt(0)                                            ; BF8C0F70 [PC=0x10f3e3acc, off=716, size=4]
          ^ SE0 SH0 CU1 SIMD1 WAVE0  EXEC=aaaaaaa555aaaaaa  INST32=BF8C0F70
          ^ SE0 SH0 CU1 SIMD2 WAVE0  EXEC=aaaa85555555552a  INST32=BF8C0F70
          ^ SE0 SH0 CU1 SIMD3 WAVE0  EXEC=000000000000000a  INST32=BF8C0F70
          ^ SE0 SH0 CU6 SIMD1 WAVE0  EXEC=25a5a5aa82aaaaaa  INST32=BF8C0F70
          ^ SE0 SH0 CU6 SIMD3 WAVE0  EXEC=50aaaa8fffa55555  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD0 WAVE0  EXEC=5554aaaaaaa1a555  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD0 WAVE1  EXEC=aaaa5555ffffffff  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD1 WAVE0  EXEC=555557aaaaaaaaa5  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD3 WAVE0  EXEC=5555aaaaaaaaaa85  INST32=BF8C0F70
          ^ SE1 SH0 CU3 SIMD1 WAVE0  EXEC=aaaaaaaaaaaaaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD0 WAVE0  EXEC=aaaaaaaa5a5a5a5a  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD1 WAVE0  EXEC=aaaaaaa5a5a5a4a5  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD2 WAVE0  EXEC=5555555000000000  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD3 WAVE0  EXEC=aa555554155aaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD0 WAVE0  EXEC=55ffff55555555aa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD1 WAVE0  EXEC=555555555aaaaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD2 WAVE0  EXEC=a0aaaaaaa8555555  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD3 WAVE0  EXEC=8aaaaaaaaaaaa555  INST32=BF8C0F70
          ^ SE1 SH0 CU6 SIMD0 WAVE0  EXEC=000000002aaaaaaa  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD0 WAVE0  EXEC=5aaaa5400aaaa15a  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD1 WAVE0  EXEC=00aaaaaaaa5555aa  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD2 WAVE0  EXEC=aa00005555554555  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD3 WAVE0  EXEC=aaaaaaa000000000  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD0 WAVE0  EXEC=5555aaaaaaaaaaaa  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD2 WAVE0  EXEC=ffaaaaaaaaaa5555  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD3 WAVE0  EXEC=aaaa55555555aa00  INST32=BF8C0F70
          ^ SE3 SH0 CU5 SIMD0 WAVE0  EXEC=00aaaaaaaaaaaa5a  INST32=BF8C0F70
          ^ SE3 SH0 CU5 SIMD1 WAVE0  EXEC=5a555555005555ff  INST32=BF8C0F70
    v_mul_f32_e32 v7, s6, v7                                      ; 0A0E0E06 [PC=0x10f3e3ad0, off=720, size=4]
...

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_debug.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h

index e4a350f..1f154c2 100644 (file)
@@ -468,6 +468,254 @@ static void si_dump_descriptors(struct si_context *sctx,
                                        num_elements[i], f);
 }
 
+struct si_shader_inst {
+       char text[160];  /* one disasm line */
+       unsigned offset; /* instruction offset */
+       unsigned size;   /* instruction size = 4 or 8 */
+};
+
+/* Split a disassembly string into lines and add them to the array pointed
+ * to by "instructions". */
+static void si_add_split_disasm(const char *disasm,
+                               uint64_t start_addr,
+                               unsigned *num,
+                               struct si_shader_inst *instructions)
+{
+       struct si_shader_inst *last_inst = *num ? &instructions[*num - 1] : NULL;
+       char *next;
+
+       while ((next = strchr(disasm, '\n'))) {
+               struct si_shader_inst *inst = &instructions[*num];
+               unsigned len = next - disasm;
+
+               assert(len < ARRAY_SIZE(inst->text));
+               memcpy(inst->text, disasm, len);
+               inst->text[len] = 0;
+               inst->offset = last_inst ? last_inst->offset + last_inst->size : 0;
+
+               const char *semicolon = strchr(disasm, ';');
+               assert(semicolon);
+               /* More than 16 chars after ";" means the instruction is 8 bytes long. */
+               inst->size = next - semicolon > 16 ? 8 : 4;
+
+               snprintf(inst->text + len, ARRAY_SIZE(inst->text) - len,
+                       " [PC=0x%"PRIx64", off=%u, size=%u]",
+                       start_addr + inst->offset, inst->offset, inst->size);
+
+               last_inst = inst;
+               (*num)++;
+               disasm = next + 1;
+       }
+}
+
+#define MAX_WAVES_PER_CHIP (64 * 40)
+
+struct si_wave_info {
+       unsigned se; /* shader engine */
+       unsigned sh; /* shader array */
+       unsigned cu; /* compute unit */
+       unsigned simd;
+       unsigned wave;
+       uint32_t status;
+       uint64_t pc; /* program counter */
+       uint32_t inst_dw0;
+       uint32_t inst_dw1;
+       uint64_t exec;
+       bool matched; /* whether the wave is used by a currently-bound shader */
+};
+
+static int compare_wave(const void *p1, const void *p2)
+{
+       struct si_wave_info *w1 = (struct si_wave_info *)p1;
+       struct si_wave_info *w2 = (struct si_wave_info *)p2;
+
+       /* Sort waves according to PC and then SE, SH, CU, etc. */
+       if (w1->pc < w2->pc)
+               return -1;
+       if (w1->pc > w2->pc)
+               return 1;
+       if (w1->se < w2->se)
+               return -1;
+       if (w1->se > w2->se)
+               return 1;
+       if (w1->sh < w2->sh)
+               return -1;
+       if (w1->sh > w2->sh)
+               return 1;
+       if (w1->cu < w2->cu)
+               return -1;
+       if (w1->cu > w2->cu)
+               return 1;
+       if (w1->simd < w2->simd)
+               return -1;
+       if (w1->simd > w2->simd)
+               return 1;
+       if (w1->wave < w2->wave)
+               return -1;
+       if (w1->wave > w2->wave)
+               return 1;
+
+       return 0;
+}
+
+/* Return wave information. "waves" should be a large enough array. */
+static unsigned si_get_wave_info(struct si_wave_info waves[MAX_WAVES_PER_CHIP])
+{
+       char line[2000];
+       unsigned num_waves = 0;
+
+       FILE *p = popen("umr -wa", "r");
+       if (!p)
+               return 0;
+
+       if (!fgets(line, sizeof(line), p) ||
+           strncmp(line, "SE", 2) != 0) {
+               pclose(p);
+               return 0;
+       }
+
+       while (fgets(line, sizeof(line), p)) {
+               struct si_wave_info *w;
+               uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
+
+               assert(num_waves < MAX_WAVES_PER_CHIP);
+               w = &waves[num_waves];
+
+               if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x",
+                          &w->se, &w->sh, &w->cu, &w->simd, &w->wave,
+                          &w->status, &pc_hi, &pc_lo, &w->inst_dw0,
+                          &w->inst_dw1, &exec_hi, &exec_lo) == 12) {
+                       w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
+                       w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
+                       w->matched = false;
+                       num_waves++;
+               }
+       }
+
+       qsort(waves, num_waves, sizeof(struct si_wave_info), compare_wave);
+
+       pclose(p);
+       return num_waves;
+}
+
+/* If the shader is being executed, print its asm instructions, and annotate
+ * those that are being executed right now with information about waves that
+ * execute them. This is most useful during a GPU hang.
+ */
+static void si_print_annotated_shader(struct si_shader *shader,
+                                     struct si_wave_info *waves,
+                                     unsigned num_waves,
+                                     FILE *f)
+{
+       if (!shader || !shader->binary.disasm_string)
+               return;
+
+       uint64_t start_addr = shader->bo->gpu_address;
+       uint64_t end_addr = start_addr + shader->bo->b.b.width0;
+       unsigned i;
+
+       /* See if any wave executes the shader. */
+       for (i = 0; i < num_waves; i++) {
+               if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
+                       break;
+       }
+       if (i == num_waves)
+               return; /* the shader is not being executed */
+
+       /* Remember the first found wave. The waves are sorted according to PC. */
+       waves = &waves[i];
+       num_waves -= i;
+
+       /* Get the list of instructions.
+        * Buffer size / 4 is the upper bound of the instruction count.
+        */
+       unsigned num_inst = 0;
+       struct si_shader_inst *instructions =
+               calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
+
+       if (shader->prolog) {
+               si_add_split_disasm(shader->prolog->binary.disasm_string,
+                                   start_addr, &num_inst, instructions);
+       }
+       si_add_split_disasm(shader->binary.disasm_string,
+                           start_addr, &num_inst, instructions);
+       if (shader->epilog) {
+               si_add_split_disasm(shader->epilog->binary.disasm_string,
+                                   start_addr, &num_inst, instructions);
+       }
+
+       fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
+               si_get_shader_name(shader, shader->selector->type));
+
+       /* Print instructions with annotations. */
+       for (i = 0; i < num_inst; i++) {
+               struct si_shader_inst *inst = &instructions[i];
+
+               fprintf(f, "%s\n", inst->text);
+
+               /* Print which waves execute the instruction right now. */
+               while (num_waves && start_addr + inst->offset == waves->pc) {
+                       fprintf(f,
+                               "          " COLOR_GREEN "^ SE%u SH%u CU%u "
+                               "SIMD%u WAVE%u  EXEC=%016"PRIx64 "  ",
+                               waves->se, waves->sh, waves->cu, waves->simd,
+                               waves->wave, waves->exec);
+
+                       if (inst->size == 4) {
+                               fprintf(f, "INST32=%08X" COLOR_RESET "\n",
+                                       waves->inst_dw0);
+                       } else {
+                               fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n",
+                                       waves->inst_dw0, waves->inst_dw1);
+                       }
+
+                       waves->matched = true;
+                       waves = &waves[1];
+                       num_waves--;
+               }
+       }
+
+       fprintf(f, "\n\n");
+       free(instructions);
+}
+
+static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f)
+{
+       struct si_wave_info waves[MAX_WAVES_PER_CHIP];
+       unsigned num_waves = si_get_wave_info(waves);
+
+       fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
+               "\n\n", num_waves);
+
+       si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
+       si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
+       si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
+       si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
+       si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
+
+       /* Print waves executing shaders that are not currently bound. */
+       unsigned i;
+       bool found = false;
+       for (i = 0; i < num_waves; i++) {
+               if (waves[i].matched)
+                       continue;
+
+               if (!found) {
+                       fprintf(f, COLOR_CYAN
+                               "Waves not executing currently-bound shaders:"
+                               COLOR_RESET "\n");
+                       found = true;
+               }
+               fprintf(f, "    SE%u SH%u CU%u SIMD%u WAVE%u  EXEC=%016"PRIx64
+                       "  INST=%08X %08X  PC=%"PRIx64"\n",
+                       waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd,
+                       waves[i].wave, waves[i].exec, waves[i].inst_dw0,
+                       waves[i].inst_dw1, waves[i].pc);
+       }
+       if (found)
+               fprintf(f, "\n\n");
+}
+
 static void si_dump_command(const char *title, const char *command, FILE *f)
 {
        char line[2000];
@@ -502,7 +750,8 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
                si_dump_shader(sctx->screen, &sctx->ps_shader, f);
 
                if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
-                       si_dump_command("Active waves", "umr -wa | column -t", f);
+                       si_dump_annotated_shaders(sctx, f);
+                       si_dump_command("Active waves (raw data)", "umr -wa | column -t", f);
                        si_dump_command("Wave information", "umr -O bits -wa", f);
                }
 
index 89229ee..d898afd 100644 (file)
@@ -5918,8 +5918,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
                           conf->spilled_vgprs, conf->private_mem_vgprs);
 }
 
-static const char *si_get_shader_name(struct si_shader *shader,
-                                     unsigned processor)
+const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
 {
        switch (processor) {
        case PIPE_SHADER_VERTEX:
index cf4da85..5464d67 100644 (file)
@@ -576,5 +576,6 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
                                  unsigned symbol_offset);
 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
                                    bool writes_samplemask);
+const char *si_get_shader_name(struct si_shader *shader, unsigned processor);
 
 #endif