From de7ad2c88f4ec243c95eaed22c41d0e537912e01 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 7 Mar 2014 00:49:45 -0800 Subject: [PATCH] i965: Accurately bail on SIMD16 compiles. Ideally, we'd like to never even attempt the SIMD16 compile if we could know ahead of time that it won't succeed---it's purely a waste of time. This is especially important for state-based recompiles, which happen at draw time. The fragment shader compiler has a number of checks like: if (dispatch_width == 16) fail("...some reason..."); This patch introduces a new no16() function which replaces the above pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag. Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and issue a helpful performance warning if INTEL_DEBUG=perf is set. (In SIMD16 mode, no16() calls fail(), for safety's sake.) The great part is that this is not a heuristic---if the flag is set, we know with 100% certainty that the SIMD16 compile would fail. (It might fail anyway if we run out of registers, but it's always worth trying.) v2: Fix missing va_end in early-return case (caught by Ilia Mirkin). Signed-off-by: Kenneth Graunke Reviewed-by: Chris Forbes [v1] Reviewed-by: Ian Romanick [v1] Reviewed-by: Eric Anholt --- src/mesa/drivers/dri/i965/brw_fs.cpp | 68 +++++++++++++++++++++++----- src/mesa/drivers/dri/i965/brw_fs.h | 4 ++ src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 44 +++++++++--------- 3 files changed, 82 insertions(+), 34 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 1f85901..1b32d63 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -647,9 +647,8 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, } void -fs_visitor::fail(const char *format, ...) +fs_visitor::vfail(const char *format, va_list va) { - va_list va; char *msg; if (failed) @@ -657,9 +656,7 @@ fs_visitor::fail(const char *format, ...) failed = true; - va_start(va, format); msg = ralloc_vasprintf(mem_ctx, format, va); - va_end(va); msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); this->fail_msg = msg; @@ -669,6 +666,48 @@ fs_visitor::fail(const char *format, ...) } } +void +fs_visitor::fail(const char *format, ...) +{ + va_list va; + + va_start(va, format); + vfail(format, va); + va_end(va); +} + +/** + * Mark this program as impossible to compile in SIMD16 mode. + * + * During the SIMD8 compile (which happens first), we can detect and flag + * things that are unsupported in SIMD16 mode, so the compiler can skip + * the SIMD16 compile altogether. + * + * During a SIMD16 compile (if one happens anyway), this just calls fail(). + */ +void +fs_visitor::no16(const char *format, ...) +{ + va_list va; + + va_start(va, format); + + if (dispatch_width == 16) { + vfail(format, va); + } else { + simd16_unsupported = true; + + if (INTEL_DEBUG & DEBUG_PERF) { + if (no16_msg) + ralloc_vasprintf_append(&no16_msg, format, va); + else + no16_msg = ralloc_vasprintf(mem_ctx, format, va); + } + } + + va_end(va); +} + fs_inst * fs_visitor::emit(enum opcode opcode) { @@ -1356,8 +1395,8 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1) switch (opcode) { case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: - if (brw->gen >= 7 && dispatch_width == 16) - fail("SIMD16 INTDIV unsupported\n"); + if (brw->gen >= 7) + no16("SIMD16 INTDIV unsupported\n"); break; case SHADER_OPCODE_POW: break; @@ -3505,13 +3544,18 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, exec_list *simd16_instructions = NULL; fs_visitor v2(brw, c, prog, fp, 16); if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) { - /* Try a SIMD16 compile */ - v2.import_uniforms(&v); - if (!v2.run()) { - perf_debug("SIMD16 shader failed to compile, falling back to " - "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg); + if (!v.simd16_unsupported) { + /* Try a SIMD16 compile */ + v2.import_uniforms(&v); + if (!v2.run()) { + perf_debug("SIMD16 shader failed to compile, falling back to " + "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg); + } else { + simd16_instructions = &v2.instructions; + } } else { - simd16_instructions = &v2.instructions; + perf_debug("SIMD16 shader unsupported, falling back to " + "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 9de1f3a..0d064f6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -381,7 +381,9 @@ public: void insert_gen4_send_dependency_workarounds(); void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst); void insert_gen4_post_send_dependency_workarounds(fs_inst *inst); + void vfail(const char *msg, va_list args); void fail(const char *msg, ...); + void no16(const char *msg, ...); void lower_uniform_pull_constant_loads(); void push_force_uncompressed(); @@ -541,6 +543,8 @@ public: bool failed; char *fail_msg; + bool simd16_unsupported; + char *no16_msg; /* Result of last visit() method. */ fs_reg result; diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index cd90e23..a832268 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -462,8 +462,8 @@ fs_visitor::visit(ir_expression *ir) * FINISHME: Emit just the MUL if we know an operand is small * enough. */ - if (brw->gen >= 7 && dispatch_width == 16) - fail("SIMD16 explicit accumulator operands unsupported\n"); + if (brw->gen >= 7) + no16("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), this->result.type); @@ -475,8 +475,8 @@ fs_visitor::visit(ir_expression *ir) } break; case ir_binop_imul_high: { - if (brw->gen >= 7 && dispatch_width == 16) - fail("SIMD16 explicit accumulator operands unsupported\n"); + if (brw->gen >= 7) + no16("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), this->result.type); @@ -490,8 +490,8 @@ fs_visitor::visit(ir_expression *ir) emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]); break; case ir_binop_carry: { - if (brw->gen >= 7 && dispatch_width == 16) - fail("SIMD16 explicit accumulator operands unsupported\n"); + if (brw->gen >= 7) + no16("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); @@ -500,8 +500,8 @@ fs_visitor::visit(ir_expression *ir) break; } case ir_binop_borrow: { - if (brw->gen >= 7 && dispatch_width == 16) - fail("SIMD16 explicit accumulator operands unsupported\n"); + if (brw->gen >= 7) + no16("SIMD16 explicit accumulator operands unsupported\n"); struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); @@ -1290,8 +1290,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, next.reg_offset++; break; case ir_txd: { - if (dispatch_width == 16) - fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode."); + no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode."); /* Load dPdx and the coordinate together: * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z @@ -1364,8 +1363,8 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, break; case ir_tg4: if (has_nonconstant_offset) { - if (ir->shadow_comparitor && dispatch_width == 16) - fail("Gen7 does not support gather4_po_c in SIMD16 mode."); + if (ir->shadow_comparitor) + no16("Gen7 does not support gather4_po_c in SIMD16 mode."); /* More crazy intermixing */ ir->offset->accept(this); @@ -1464,8 +1463,8 @@ fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate, 0 }; + no16("rectangle scale uniform setup not supported on SIMD16\n"); if (dispatch_width == 16) { - fail("rectangle scale uniform setup not supported on SIMD16\n"); return coordinate; } @@ -2183,8 +2182,8 @@ fs_visitor::try_replace_with_sel() void fs_visitor::visit(ir_if *ir) { - if (brw->gen < 6 && dispatch_width == 16) { - fail("Can't support (non-uniform) control flow on SIMD16\n"); + if (brw->gen < 6) { + no16("Can't support (non-uniform) control flow on SIMD16\n"); } /* Don't point the annotation at the if statement, because then it plus @@ -2226,8 +2225,8 @@ fs_visitor::visit(ir_if *ir) void fs_visitor::visit(ir_loop *ir) { - if (brw->gen < 6 && dispatch_width == 16) { - fail("Can't support (non-uniform) control flow on SIMD16\n"); + if (brw->gen < 6) { + no16("Can't support (non-uniform) control flow on SIMD16\n"); } this->base_ir = NULL; @@ -2725,9 +2724,10 @@ fs_visitor::emit_fb_writes() bool do_dual_src = this->dual_src_output.file != BAD_FILE; bool src0_alpha_to_render_target = false; - if (dispatch_width == 16 && do_dual_src) { - fail("GL_ARB_blend_func_extended not yet supported in SIMD16."); - do_dual_src = false; + if (do_dual_src) { + no16("GL_ARB_blend_func_extended not yet supported in SIMD16."); + if (dispatch_width == 16) + do_dual_src = false; } /* From the Sandy Bridge PRM, volume 4, page 198: @@ -2778,13 +2778,13 @@ fs_visitor::emit_fb_writes() nr += reg_width; if (c->source_depth_to_render_target) { - if (brw->gen == 6 && dispatch_width == 16) { + if (brw->gen == 6) { /* For outputting oDepth on gen6, SIMD8 writes have to be * used. This would require SIMD8 moves of each half to * message regs, kind of like pre-gen5 SIMD16 FB writes. * Just bail on doing so for now. */ - fail("Missing support for simd16 depth writes on gen6\n"); + no16("Missing support for simd16 depth writes on gen6\n"); } if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { -- 2.7.4