From 8d3b86e34a7b0f77613c7f5669891e54d76f0cbf Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Sat, 4 Jan 2020 16:16:24 -0800 Subject: [PATCH] intel/fs/gen7+: Implement discard/demote for SIMD32 programs. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit At this point this simply involves fixing the initialization of the sample mask flag register to take the right dispatch mask from the thread payload, and fixing sample_mask_reg() to return f1.1 for the second half of a SIMD32 thread. This improves Manhattan 3.1 performance by 2.4%±0.31% (N>40) on my ICL with SIMD32 enabled relative to falling back to SIMD16 for the shaders that use discard. Reviewed-by: Kenneth Graunke --- src/intel/compiler/brw_fs.cpp | 18 +++++++++++------- src/intel/compiler/brw_fs_nir.cpp | 4 +++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 963d1c1..fd9217b 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -4285,8 +4285,8 @@ sample_mask_reg(const fs_builder &bld) if (v->stage != MESA_SHADER_FRAGMENT) { return brw_imm_ud(0xffffffff); } else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) { - assert(bld.group() < 16 && bld.dispatch_width() <= 16); - return brw_flag_subreg(sample_mask_flag_subreg(v)); + assert(bld.dispatch_width() <= 16); + return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16); } else { assert(v->devinfo->gen >= 6 && bld.dispatch_width() <= 16); return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7), @@ -8171,11 +8171,15 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) * Initialize it with the dispatched pixels. */ if (wm_prog_data->uses_kill) { - const fs_reg dispatch_mask = - devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0); - bld.exec_all().group(1, 0) - .MOV(sample_mask_reg(bld), - retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); + const unsigned lower_width = MIN2(dispatch_width, 16); + for (unsigned i = 0; i < dispatch_width / lower_width; i++) { + const fs_reg dispatch_mask = + devinfo->gen >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) : + brw_vec1_grf(0, 0); + bld.exec_all().group(1, 0) + .MOV(sample_mask_reg(bld.group(lower_width, i)), + retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); + } } emit_nir_code(); diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 5d66ead..3b34c40 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3562,7 +3562,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, emit_discard_jump(); } - limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); + if (devinfo->gen < 7) + limit_dispatch_width( + 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); break; } -- 2.7.4