From: Ben Widawsky Date: Tue, 20 Oct 2015 21:29:39 +0000 (-0700) Subject: i965: Implement ARB_shader_stencil_export (gen9+) X-Git-Tag: upstream/17.1.0~15144 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1db44252d01bf7539452ccc2b5210c74b8dcd573;p=platform%2Fupstream%2Fmesa.git i965: Implement ARB_shader_stencil_export (gen9+) v2: remove useless source_stencil_to_render_target (Ken) Squash in the actual packing function, which also got to v2: Move the definition of the OPCODE outside of FB_WRITE opcodes (Matt) Reorder the regioning to be in VWH order (Matt) Don't retype src in the backend, just assert instead (Matt) Rename the debug prints to something better (Matt) Signed-off-by: Ben Widawsky Reviewed-by: Kenneth Graunke --- diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 742bac4..68a93a6 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -335,6 +335,7 @@ struct brw_wm_prog_data { } binding_table; uint8_t computed_depth_mode; + bool computed_stencil; bool early_fragment_tests; bool no_8; diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 457f49c..6433cff 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -919,6 +919,7 @@ enum opcode { FS_OPCODE_BLORP_FB_WRITE, FS_OPCODE_REP_FB_WRITE, + FS_OPCODE_PACK_STENCIL_REF, SHADER_OPCODE_RCP, SHADER_OPCODE_RSQ, SHADER_OPCODE_SQRT, @@ -1330,6 +1331,7 @@ enum fb_write_logical_srcs { FB_WRITE_LOGICAL_SRC_SRC0_ALPHA, FB_WRITE_LOGICAL_SRC_SRC_DEPTH, /* gl_FragDepth */ FB_WRITE_LOGICAL_SRC_DST_DEPTH, /* GEN4-5: passthrough from thread */ + FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */ FB_WRITE_LOGICAL_SRC_OMASK, /* Sample Mask (gl_SampleMask) */ FB_WRITE_LOGICAL_SRC_COMPONENTS, /* REQUIRED */ }; diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 
f2f598c..1c96cf2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3357,6 +3357,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA]; const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH]; const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH]; + const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL]; fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK]; const unsigned components = inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud; @@ -3449,6 +3450,17 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, length++; } + if (src_stencil.file != BAD_FILE) { + assert(devinfo->gen >= 9); + assert(bld.dispatch_width() != 16); + + sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.exec_all().annotate("FB write OS") + .emit(FS_OPCODE_PACK_STENCIL_REF, sources[length], + retype(src_stencil, BRW_REGISTER_TYPE_UB)); + length++; + } + fs_inst *load; if (devinfo->gen >= 7) { /* Send from the GRF */ @@ -5223,6 +5235,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, prog_data->uses_omask = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 50e98be..d98769d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -337,6 +337,7 @@ public: int *push_constant_loc; fs_reg frag_depth; + fs_reg frag_stencil; fs_reg sample_mask; fs_reg outputs[VARYING_SLOT_MAX]; unsigned output_components[VARYING_SLOT_MAX]; @@ -427,6 +428,8 @@ private: void generate_urb_read(fs_inst *inst, 
struct brw_reg dst, struct brw_reg payload); void generate_urb_write(fs_inst *inst, struct brw_reg payload); void generate_cs_terminate(fs_inst *inst, struct brw_reg payload); + void generate_stencil_ref_packing(fs_inst *inst, struct brw_reg dst, + struct brw_reg src); void generate_barrier(fs_inst *inst, struct brw_reg src); void generate_blorp_fb_write(fs_inst *inst); void generate_linterp(fs_inst *inst, struct brw_reg dst, diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index bb7e792..b016b56 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -317,6 +317,14 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) brw_imm_ud(inst->target)); } + /* Set computed stencil to render target */ + if (prog_data->computed_stencil) { + brw_OR(p, + vec1(retype(payload, BRW_REGISTER_TYPE_UD)), + vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(0x1 << 14)); + } + implied_header = brw_null_reg(); } else { implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); @@ -437,6 +445,47 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) } void +fs_generator::generate_stencil_ref_packing(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src) +{ + assert(dispatch_width == 8); + assert(devinfo->gen >= 9); + + /* Stencil value updates are provided in 8 slots of 1 byte per slot. + * Presumably, in order to save memory bandwidth, the stencil reference + * values written from the FS need to be packed into 2 dwords (this makes + * sense because the stencil values are limited to 1 byte each and a SIMD8 + * send, so stencil slots 0-3 in dw0, and 4-7 in dw1.) + * + * The spec is confusing here because in the payload definition of MDP_RTW_S8 + * (Message Data Payload for Render Target Writes with Stencil 8b) the + * stencil value seems to be dw4.0-dw4.7. 
However, if you look at the type of + * dw4 it is type MDPR_STENCIL (Message Data Payload Register) which is the + * packed values specified above and diagrammed below: + * + * 31 0 + * -------------------------------- + * DW | | + * 2-7 | IGNORED | + * | | + * -------------------------------- + * DW1 | STC | STC | STC | STC | + * | slot7 | slot6 | slot5 | slot4| + * -------------------------------- + * DW0 | STC | STC | STC | STC | + * | slot3 | slot2 | slot1 | slot0| + * -------------------------------- + */ + + src.vstride = BRW_VERTICAL_STRIDE_4; + src.width = BRW_WIDTH_1; + src.hstride = BRW_HORIZONTAL_STRIDE_0; + assert(src.type == BRW_REGISTER_TYPE_UB); + brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UB), src); +} + +void fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src) { brw_barrier(p, src); @@ -2182,6 +2231,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) generate_barrier(inst, src[0]); break; + case FS_OPCODE_PACK_STENCIL_REF: + generate_stencil_ref_packing(inst, dst, src[0]); + break; + default: unreachable("Unsupported opcode"); diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index e1fb120..acb51c0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -114,6 +114,8 @@ fs_visitor::nir_setup_outputs() } } else if (var->data.location == FRAG_RESULT_DEPTH) { this->frag_depth = reg; + } else if (var->data.location == FRAG_RESULT_STENCIL) { + this->frag_stencil = reg; } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) { this->sample_mask = reg; } else { diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 9e2b221..5c57944 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -697,7 +697,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld, const fs_reg dst_depth = (payload.dest_depth_reg ? 
fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) : fs_reg()); - fs_reg src_depth; + fs_reg src_depth, src_stencil; if (source_depth_to_render_target) { if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) @@ -706,9 +706,12 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld, src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)); } + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) + src_stencil = frag_stencil; + const fs_reg sources[] = { - color0, color1, src0_alpha, src_depth, dst_depth, sample_mask, - fs_reg(components) + color0, color1, src0_alpha, src_depth, dst_depth, src_stencil, + sample_mask, fs_reg(components) }; assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS); fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(), @@ -741,6 +744,16 @@ fs_visitor::emit_fb_writes() no16("Missing support for simd16 depth writes on gen6\n"); } + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) { + /* From the 'Render Target Write message' section of the docs: + * "Output Stencil is not supported with SIMD16 Render Target Write + * Messages." 
+ * + * FINISHME: split 16 into 2 8s + */ + no16("FINISHME: support 2 simd8 writes for gl_FragStencilRefARB\n"); + } + if (do_dual_src) { const fs_builder abld = bld.annotate("FB dual-source write"); diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 0ac4f2f..6f6f77a 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -295,6 +295,8 @@ brw_instruction_name(enum opcode op) return "fb_write"; case FS_OPCODE_FB_WRITE_LOGICAL: return "fb_write_logical"; + case FS_OPCODE_PACK_STENCIL_REF: + return "pack_stencil_ref"; case FS_OPCODE_BLORP_FB_WRITE: return "blorp_fb_write"; case FS_OPCODE_REP_FB_WRITE: diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 8f05074..10e433b 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -95,6 +95,11 @@ gen8_upload_ps_extra(struct brw_context *brw, !brw_color_buffer_write_enabled(brw)) dw1 |= GEN8_PSX_SHADER_HAS_UAV; + if (prog_data->computed_stencil) { + assert(brw->gen >= 9); + dw1 |= GEN9_PSX_SHADER_COMPUTES_STENCIL; + } + BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2)); OUT_BATCH(dw1);