From: Marek Olšák Date: Tue, 29 Nov 2016 14:35:11 +0000 (+0100) Subject: radeonsi: write shader asm annotated with wave info into GPU hang reports X-Git-Tag: upstream/17.1.0~2391 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=28c06b3ceb7455538da301e03c9e704902cc1056;p=platform%2Fupstream%2Fmesa.git radeonsi: write shader asm annotated with wave info into GPU hang reports Note that the disassembly is written twice - first the unmodified compiler output and then the wave-annotated output only if there are waves executing the shader. Sample output from a real GPU hang most likely caused by image_sample: The number of active waves = 28 Pixel Shader - annotated disassembly: s_mov_b64 s[6:7], exec ; BE86017E [PC=0x10f3e3800, off=0, size=4] s_wqm_b64 exec, exec ; BEFE077E [PC=0x10f3e3804, off=4, size=4] ... image_sample v[7:9], v[0:1], s[12:19], s[20:23] dmask:0x7 ; F0800700 00A30700 [PC=0x10f3e3a94, off=660, size=8] s_buffer_load_dword s20, s[0:3], 0x50 ; C0220500 00000050 [PC=0x10f3e3a9c, off=668, size=8] s_load_dwordx4 s[24:27], s[4:5], 0x170 ; C00A0602 00000170 [PC=0x10f3e3aa4, off=676, size=8] s_load_dwordx8 s[12:19], s[4:5], 0x140 ; C00E0302 00000140 [PC=0x10f3e3aac, off=684, size=8] s_buffer_load_dword s11, s[0:3], 0x5c ; C02202C0 0000005C [PC=0x10f3e3ab4, off=692, size=8] s_buffer_load_dword s21, s[0:3], 0x54 ; C0220540 00000054 [PC=0x10f3e3abc, off=700, size=8] s_buffer_load_dword s22, s[0:3], 0x58 ; C0220580 00000058 [PC=0x10f3e3ac4, off=708, size=8] s_waitcnt vmcnt(0) ; BF8C0F70 [PC=0x10f3e3acc, off=716, size=4] ^ SE0 SH0 CU1 SIMD1 WAVE0 EXEC=aaaaaaa555aaaaaa INST32=BF8C0F70 ^ SE0 SH0 CU1 SIMD2 WAVE0 EXEC=aaaa85555555552a INST32=BF8C0F70 ^ SE0 SH0 CU1 SIMD3 WAVE0 EXEC=000000000000000a INST32=BF8C0F70 ^ SE0 SH0 CU6 SIMD1 WAVE0 EXEC=25a5a5aa82aaaaaa INST32=BF8C0F70 ^ SE0 SH0 CU6 SIMD3 WAVE0 EXEC=50aaaa8fffa55555 INST32=BF8C0F70 ^ SE0 SH0 CU7 SIMD0 WAVE0 EXEC=5554aaaaaaa1a555 INST32=BF8C0F70 ^ SE0 SH0 CU7 SIMD0 WAVE1 EXEC=aaaa5555ffffffff INST32=BF8C0F70 ^ SE0 SH0 CU7 SIMD1 WAVE0 EXEC=555557aaaaaaaaa5 INST32=BF8C0F70 ^ SE0 SH0 CU7 SIMD3 WAVE0 EXEC=5555aaaaaaaaaa85 INST32=BF8C0F70 ^ SE1 SH0 CU3 SIMD1 WAVE0 EXEC=aaaaaaaaaaaaaaaa INST32=BF8C0F70 ^ SE1 SH0 CU4 SIMD0 WAVE0 EXEC=aaaaaaaa5a5a5a5a INST32=BF8C0F70 ^ SE1 SH0 CU4 SIMD1 WAVE0 EXEC=aaaaaaa5a5a5a4a5 INST32=BF8C0F70 ^ SE1 SH0 CU4 SIMD2 WAVE0 EXEC=5555555000000000 INST32=BF8C0F70 ^ SE1 SH0 CU4 SIMD3 WAVE0 EXEC=aa555554155aaaaa INST32=BF8C0F70 ^ SE1 SH0 CU5 SIMD0 WAVE0 EXEC=55ffff55555555aa INST32=BF8C0F70 ^ SE1 SH0 CU5 SIMD1 WAVE0 EXEC=555555555aaaaaaa INST32=BF8C0F70 ^ SE1 SH0 CU5 SIMD2 WAVE0 EXEC=a0aaaaaaa8555555 INST32=BF8C0F70 ^ SE1 SH0 CU5 SIMD3 WAVE0 EXEC=8aaaaaaaaaaaa555 INST32=BF8C0F70 ^ SE1 SH0 CU6 SIMD0 WAVE0 EXEC=000000002aaaaaaa INST32=BF8C0F70 ^ SE2 SH0 CU1 SIMD0 WAVE0 EXEC=5aaaa5400aaaa15a INST32=BF8C0F70 ^ SE2 SH0 CU1 SIMD1 WAVE0 EXEC=00aaaaaaaa5555aa INST32=BF8C0F70 ^ SE2 SH0 CU1 SIMD2 WAVE0 EXEC=aa00005555554555 INST32=BF8C0F70 ^ SE2 SH0 CU1 SIMD3 WAVE0 EXEC=aaaaaaa000000000 INST32=BF8C0F70 ^ SE3 SH0 CU4 SIMD0 WAVE0 EXEC=5555aaaaaaaaaaaa INST32=BF8C0F70 ^ SE3 SH0 CU4 SIMD2 WAVE0 EXEC=ffaaaaaaaaaa5555 INST32=BF8C0F70 ^ SE3 SH0 CU4 SIMD3 WAVE0 EXEC=aaaa55555555aa00 INST32=BF8C0F70 ^ SE3 SH0 CU5 SIMD0 WAVE0 EXEC=00aaaaaaaaaaaa5a INST32=BF8C0F70 ^ SE3 SH0 CU5 SIMD1 WAVE0 EXEC=5a555555005555ff INST32=BF8C0F70 v_mul_f32_e32 v7, s6, v7 ; 0A0E0E06 [PC=0x10f3e3ad0, off=720, size=4] ... Reviewed-by: Nicolai Hähnle --- diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index e4a350f..1f154c2 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -468,6 +468,254 @@ static void si_dump_descriptors(struct si_context *sctx, num_elements[i], f); } +struct si_shader_inst { + char text[160]; /* one disasm line */ + unsigned offset; /* instruction offset */ + unsigned size; /* instruction size = 4 or 8 */ +}; + +/* Split a disassembly string into lines and add them to the array pointed + * to by "instructions". */ +static void si_add_split_disasm(const char *disasm, + uint64_t start_addr, + unsigned *num, + struct si_shader_inst *instructions) +{ + struct si_shader_inst *last_inst = *num ? &instructions[*num - 1] : NULL; + char *next; + + while ((next = strchr(disasm, '\n'))) { + struct si_shader_inst *inst = &instructions[*num]; + unsigned len = next - disasm; + + assert(len < ARRAY_SIZE(inst->text)); + memcpy(inst->text, disasm, len); + inst->text[len] = 0; + inst->offset = last_inst ? last_inst->offset + last_inst->size : 0; + + const char *semicolon = strchr(disasm, ';'); + assert(semicolon); + /* More than 16 chars after ";" means the instruction is 8 bytes long. */ + inst->size = next - semicolon > 16 ? 8 : 4; + + snprintf(inst->text + len, ARRAY_SIZE(inst->text) - len, + " [PC=0x%"PRIx64", off=%u, size=%u]", + start_addr + inst->offset, inst->offset, inst->size); + + last_inst = inst; + (*num)++; + disasm = next + 1; + } +} + +#define MAX_WAVES_PER_CHIP (64 * 40) + +struct si_wave_info { + unsigned se; /* shader engine */ + unsigned sh; /* shader array */ + unsigned cu; /* compute unit */ + unsigned simd; + unsigned wave; + uint32_t status; + uint64_t pc; /* program counter */ + uint32_t inst_dw0; + uint32_t inst_dw1; + uint64_t exec; + bool matched; /* whether the wave is used by a currently-bound shader */ +}; + +static int compare_wave(const void *p1, const void *p2) +{ + struct si_wave_info *w1 = (struct si_wave_info *)p1; + struct si_wave_info *w2 = (struct si_wave_info *)p2; + + /* Sort waves according to PC and then SE, SH, CU, etc. */ + if (w1->pc < w2->pc) + return -1; + if (w1->pc > w2->pc) + return 1; + if (w1->se < w2->se) + return -1; + if (w1->se > w2->se) + return 1; + if (w1->sh < w2->sh) + return -1; + if (w1->sh > w2->sh) + return 1; + if (w1->cu < w2->cu) + return -1; + if (w1->cu > w2->cu) + return 1; + if (w1->simd < w2->simd) + return -1; + if (w1->simd > w2->simd) + return 1; + if (w1->wave < w2->wave) + return -1; + if (w1->wave > w2->wave) + return 1; + + return 0; +} + +/* Return wave information. "waves" should be a large enough array. */ +static unsigned si_get_wave_info(struct si_wave_info waves[MAX_WAVES_PER_CHIP]) +{ + char line[2000]; + unsigned num_waves = 0; + + FILE *p = popen("umr -wa", "r"); + if (!p) + return 0; + + if (!fgets(line, sizeof(line), p) || + strncmp(line, "SE", 2) != 0) { + pclose(p); + return 0; + } + + while (fgets(line, sizeof(line), p)) { + struct si_wave_info *w; + uint32_t pc_hi, pc_lo, exec_hi, exec_lo; + + assert(num_waves < MAX_WAVES_PER_CHIP); + w = &waves[num_waves]; + + if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x", + &w->se, &w->sh, &w->cu, &w->simd, &w->wave, + &w->status, &pc_hi, &pc_lo, &w->inst_dw0, + &w->inst_dw1, &exec_hi, &exec_lo) == 12) { + w->pc = ((uint64_t)pc_hi << 32) | pc_lo; + w->exec = ((uint64_t)exec_hi << 32) | exec_lo; + w->matched = false; + num_waves++; + } + } + + qsort(waves, num_waves, sizeof(struct si_wave_info), compare_wave); + + pclose(p); + return num_waves; +} + +/* If the shader is being executed, print its asm instructions, and annotate + * those that are being executed right now with information about waves that + * execute them. This is most useful during a GPU hang. + */ +static void si_print_annotated_shader(struct si_shader *shader, + struct si_wave_info *waves, + unsigned num_waves, + FILE *f) +{ + if (!shader || !shader->binary.disasm_string) + return; + + uint64_t start_addr = shader->bo->gpu_address; + uint64_t end_addr = start_addr + shader->bo->b.b.width0; + unsigned i; + + /* See if any wave executes the shader. */ + for (i = 0; i < num_waves; i++) { + if (start_addr <= waves[i].pc && waves[i].pc <= end_addr) + break; + } + if (i == num_waves) + return; /* the shader is not being executed */ + + /* Remember the first found wave. The waves are sorted according to PC. */ + waves = &waves[i]; + num_waves -= i; + + /* Get the list of instructions. + * Buffer size / 4 is the upper bound of the instruction count. + */ + unsigned num_inst = 0; + struct si_shader_inst *instructions = + calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); + + if (shader->prolog) { + si_add_split_disasm(shader->prolog->binary.disasm_string, + start_addr, &num_inst, instructions); + } + si_add_split_disasm(shader->binary.disasm_string, + start_addr, &num_inst, instructions); + if (shader->epilog) { + si_add_split_disasm(shader->epilog->binary.disasm_string, + start_addr, &num_inst, instructions); + } + + fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", + si_get_shader_name(shader, shader->selector->type)); + + /* Print instructions with annotations. */ + for (i = 0; i < num_inst; i++) { + struct si_shader_inst *inst = &instructions[i]; + + fprintf(f, "%s\n", inst->text); + + /* Print which waves execute the instruction right now. */ + while (num_waves && start_addr + inst->offset == waves->pc) { + fprintf(f, + " " COLOR_GREEN "^ SE%u SH%u CU%u " + "SIMD%u WAVE%u EXEC=%016"PRIx64 " ", + waves->se, waves->sh, waves->cu, waves->simd, + waves->wave, waves->exec); + + if (inst->size == 4) { + fprintf(f, "INST32=%08X" COLOR_RESET "\n", + waves->inst_dw0); + } else { + fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", + waves->inst_dw0, waves->inst_dw1); + } + + waves->matched = true; + waves = &waves[1]; + num_waves--; + } + } + + fprintf(f, "\n\n"); + free(instructions); +} + +static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f) +{ + struct si_wave_info waves[MAX_WAVES_PER_CHIP]; + unsigned num_waves = si_get_wave_info(waves); + + fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET + "\n\n", num_waves); + + si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f); + + /* Print waves executing shaders that are not currently bound. */ + unsigned i; + bool found = false; + for (i = 0; i < num_waves; i++) { + if (waves[i].matched) + continue; + + if (!found) { + fprintf(f, COLOR_CYAN + "Waves not executing currently-bound shaders:" + COLOR_RESET "\n"); + found = true; + } + fprintf(f, " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016"PRIx64 + " INST=%08X %08X PC=%"PRIx64"\n", + waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, + waves[i].wave, waves[i].exec, waves[i].inst_dw0, + waves[i].inst_dw1, waves[i].pc); + } + if (found) + fprintf(f, "\n\n"); +} + static void si_dump_command(const char *title, const char *command, FILE *f) { char line[2000]; @@ -502,7 +750,8 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, si_dump_shader(sctx->screen, &sctx->ps_shader, f); if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) { - si_dump_command("Active waves", "umr -wa | column -t", f); + si_dump_annotated_shaders(sctx, f); + si_dump_command("Active waves (raw data)", "umr -wa | column -t", f); si_dump_command("Wave information", "umr -O bits -wa", f); } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 89229ee..d898afd 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5918,8 +5918,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen, conf->spilled_vgprs, conf->private_mem_vgprs); } -static const char *si_get_shader_name(struct si_shader *shader, - unsigned processor) +const char *si_get_shader_name(struct si_shader *shader, unsigned processor) { switch (processor) { case PIPE_SHADER_VERTEX: diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index cf4da85..5464d67 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -576,5 +576,6 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, unsigned symbol_offset); unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask); +const char *si_get_shader_name(struct si_shader *shader, unsigned processor); #endif