From d5016495cc1b50b1673d0d3ab8e6af8249b071d5 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 6 Dec 2012 12:15:13 -0800 Subject: [PATCH] i965/fs: Rewrite discards to use a flag subreg to track discarded pixels. This makes much more sense on gen6+, and will also prove useful for early exit of shaders on discard. v2: fix up a stale comment from before converting gen4-5. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_defines.h | 1 - src/mesa/drivers/dri/i965/brw_fs.cpp | 8 +++ src/mesa/drivers/dri/i965/brw_fs.h | 1 - src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 76 +++++----------------- src/mesa/drivers/dri/i965/brw_fs_fp.cpp | 17 ++--- src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 3 - .../dri/i965/brw_fs_schedule_instructions.cpp | 1 - src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 12 +++- 8 files changed, 46 insertions(+), 73 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 5e00b40..2b77ae6 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -673,7 +673,6 @@ enum opcode { FS_OPCODE_PIXEL_Y, FS_OPCODE_CINTERP, FS_OPCODE_LINTERP, - FS_OPCODE_DISCARD, FS_OPCODE_SPILL, FS_OPCODE_UNSPILL, FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index dbf48f8..ac0bb56 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2434,6 +2434,14 @@ fs_visitor::run() else emit_interpolation_setup_gen6(); + /* We handle discards by keeping track of the still-live pixels in f0.1. + * Initialize it with the dispatched pixels. + */ + if (fp->UsesKill) { + fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); + discard_init->flag_subreg = 1; + } + /* Generate FS IR for main(). (the visitor only descends into * functions called "main"). */ diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index b60a37e..b00755f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -501,7 +501,6 @@ private: void generate_math_gen4(fs_inst *inst, struct brw_reg dst, struct brw_reg src); - void generate_discard(fs_inst *inst); void generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src); void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src, bool negate_value); diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index b3d7f81..f185eb5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -74,6 +74,17 @@ fs_generator::generate_fb_write(fs_inst *inst) brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_compression_control(p, BRW_COMPRESSION_NONE); + if (fp->UsesKill) { + struct brw_reg pixel_mask; + + if (intel->gen >= 6) + pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); + else + pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); + + brw_MOV(p, pixel_mask, brw_flag_reg(0, 1)); + } + if (inst->header_present) { if (intel->gen >= 6) { brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); @@ -514,58 +525,6 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src } void -fs_generator::generate_discard(fs_inst *inst) -{ - struct brw_reg f0 = brw_flag_reg(0, 0); - - if (intel->gen >= 6) { - struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); - struct brw_reg some_register; - - /* As of gen6, we no longer have the mask register to look at, - * so life gets a bit more complicated. - */ - - /* Load the flag register with all ones. */ - brw_push_insn_state(p); - brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, f0, brw_imm_uw(0xffff)); - brw_pop_insn_state(p); - - /* Do a comparison that should always fail, to produce 0s in the flag - * reg where we have active channels. - */ - some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); - brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), - BRW_CONDITIONAL_NZ, some_register, some_register); - - /* Undo CMP's whacking of predication*/ - brw_set_predicate_control(p, BRW_PREDICATE_NONE); - - brw_push_insn_state(p); - brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_AND(p, g1, f0, g1); - brw_pop_insn_state(p); - } else { - struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); - - brw_push_insn_state(p); - brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_set_compression_control(p, BRW_COMPRESSION_NONE); - - /* Unlike the 965, we have the mask reg, so we just need - * somewhere to invert that (containing channels to be disabled) - * so it can be ANDed with the mask of pixels still to be - * written. Use the flag reg for consistency with gen6+. - */ - brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */ - brw_AND(p, g0, f0, g0); - - brw_pop_insn_state(p); - } -} - -void fs_generator::generate_spill(fs_inst *inst, struct brw_reg src) { assert(inst->mlen != 0); @@ -745,12 +704,16 @@ void fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst) { struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg); - struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); + struct brw_reg dispatch_mask; + + if (intel->gen >= 6) + dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); + else + dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); - assert (intel->gen >= 6); brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, flags, g1); + brw_MOV(p, flags, dispatch_mask); brw_pop_insn_state(p); } @@ -1083,9 +1046,6 @@ fs_generator::generate_code(exec_list *instructions) case SHADER_OPCODE_TXS: generate_tex(inst, dst, src[0]); break; - case FS_OPCODE_DISCARD: - generate_discard(inst); - break; case FS_OPCODE_DDX: generate_ddx(inst, dst, src[0]); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp index 4be7779..bebf059 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp @@ -252,14 +252,15 @@ fs_visitor::emit_fragment_program_code() continue; } - emit(CMP(null, regoffset(src[0], i), fs_reg(0.0f), - BRW_CONDITIONAL_L)); - - if (intel->gen < 6 && dispatch_width == 16) - fail("Can't support (non-uniform) control flow on 16-wide"); - emit(IF(BRW_PREDICATE_NORMAL)); - emit(FS_OPCODE_DISCARD); - emit(BRW_OPCODE_ENDIF); + + /* Emit an instruction that's predicated on the current + * undiscarded pixels, and updates just those pixels to be + * turned off. + */ + fs_inst *cmp = emit(CMP(null, regoffset(src[0], i), fs_reg(0.0f), + BRW_CONDITIONAL_GE)); + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = 1; } break; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index a4fc032..d1147f5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -277,9 +277,6 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, payload_last_use_ip[0 / reg_width] = use_ip; payload_last_use_ip[1 / reg_width] = use_ip; break; - case FS_OPCODE_DISCARD: - payload_last_use_ip[1 / reg_width] = use_ip; - break; case FS_OPCODE_LINTERP: /* On gen6+ in 16-wide, there are 4 adjacent registers (so 2 nodes) diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp index 96d1131..e9c25b0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp @@ -299,7 +299,6 @@ instruction_scheduler::calculate_deps() } if (inst->predicate) { - assert(last_conditional_mod[inst->flag_subreg]); add_dep(last_conditional_mod[inst->flag_subreg], n); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index f1c6860..98cd064 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -1435,7 +1435,17 @@ fs_visitor::visit(ir_discard *ir) { assert(ir->condition == NULL); /* FINISHME */ - emit(FS_OPCODE_DISCARD); + /* We track our discarded pixels in f0.1. By predicating on it, we can + * update just the flag bits that aren't yet discarded. By emitting a + * CMP of g0 != g0, all our currently executing channels will get turned + * off. + */ + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW)); + fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg, + BRW_CONDITIONAL_NZ)); + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = 1; } void -- 2.7.4