From b7793783b3df94880655234bc2a9054eddf01913 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 25 Nov 2015 17:54:22 -0800 Subject: [PATCH] i965: Relase input URB Handles on Gen7/7.5 when TCS threads finish. Pre-Broadwell hardware requires us to manually release the ICP Handles by issuing URB read messages with the "Complete" bit set. We can do this in pairs to use fewer URB read messages. Based heavily on work from Chris Forbes. Signed-off-by: Kenneth Graunke Reviewed-by: Edward O'Callaghan Reviewed-by: Jordan Justen --- src/mesa/drivers/dri/i965/brw_defines.h | 2 ++ src/mesa/drivers/dri/i965/brw_shader.cpp | 5 +++ src/mesa/drivers/dri/i965/brw_vec4.cpp | 1 + src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 46 ++++++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp | 40 ++++++++++++++++++++- 5 files changed, 93 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 61bcebd..d013748 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1313,6 +1313,8 @@ enum opcode { TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, TCS_OPCODE_GET_PRIMITIVE_ID, TCS_OPCODE_CREATE_BARRIER_HEADER, + TCS_OPCODE_SRC0_010_IS_ZERO, + TCS_OPCODE_RELEASE_INPUT, TES_OPCODE_GET_PRIMITIVE_ID, TES_OPCODE_CREATE_INPUT_READ_HEADER, diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 3a36678..f692bc2 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -568,6 +568,10 @@ brw_instruction_name(enum opcode op) return "tcs_get_primitive_id"; case TCS_OPCODE_CREATE_BARRIER_HEADER: return "tcs_create_barrier_header"; + case TCS_OPCODE_SRC0_010_IS_ZERO: + return "tcs_src0<0,1,0>_is_zero"; + case TCS_OPCODE_RELEASE_INPUT: + return "tcs_release_input"; case TES_OPCODE_CREATE_INPUT_READ_HEADER: return "tes_create_input_read_header"; case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: @@ -1009,6 +1013,7 @@ backend_instruction::has_side_effects() const case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: case FS_OPCODE_FB_WRITE: case SHADER_OPCODE_BARRIER: + case TCS_OPCODE_RELEASE_INPUT: return true; default: return false; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 116dd35..f1c3d37 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -157,6 +157,7 @@ vec4_instruction::is_send_from_grf() case SHADER_OPCODE_TYPED_SURFACE_WRITE: case VEC4_OPCODE_URB_READ: case TCS_OPCODE_URB_WRITE: + case TCS_OPCODE_RELEASE_INPUT: case SHADER_OPCODE_BARRIER: return true; default: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index cbf8b1d..cce2b4d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -933,6 +933,42 @@ generate_vec4_urb_read(struct brw_codegen *p, } static void +generate_tcs_release_input(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg vertex, + struct brw_reg is_unpaired) +{ + const struct brw_device_info *devinfo = p->devinfo; + + assert(vertex.file == BRW_IMMEDIATE_VALUE); + assert(vertex.type == BRW_REGISTER_TYPE_UD); + + /* m0.0-0.1: URB handles */ + struct brw_reg urb_handles = + retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7), + BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, header, brw_imm_ud(0)); + brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles); + brw_pop_insn_state(p); + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, brw_null_reg()); + brw_set_src0(p, send, header); + brw_set_message_descriptor(p, send, BRW_SFID_URB, + 1 /* mlen */, 0 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); + brw_inst_set_urb_complete(devinfo, send, 1); + brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ? + BRW_URB_SWIZZLE_NONE : + BRW_URB_SWIZZLE_INTERLEAVE); +} + +static void generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) { brw_push_insn_state(p); @@ -1846,6 +1882,16 @@ generate_code(struct brw_codegen *p, generate_tes_get_primitive_id(p, dst); break; + case TCS_OPCODE_SRC0_010_IS_ZERO: + /* If src_reg had stride like fs_reg, we wouldn't need this. */ + brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0)); + brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z); + break; + + case TCS_OPCODE_RELEASE_INPUT: + generate_tcs_release_input(p, dst, src[0], src[1]); + break; + case SHADER_OPCODE_BARRIER: brw_barrier(p, src[0]); brw_WAIT(p); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp index 507db749..7693f09 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp @@ -156,16 +156,54 @@ vec4_tcs_visitor::emit_prolog() void vec4_tcs_visitor::emit_thread_end() { + vec4_instruction *inst; current_annotation = "thread end"; if (nir->info.tcs.vertices_out % 2) { emit(BRW_OPCODE_ENDIF); } + if (devinfo->gen == 7) { + struct brw_tcs_prog_data *tcs_prog_data = + (struct brw_tcs_prog_data *) prog_data; + + current_annotation = "release input vertices"; + + /* Synchronize all threads, so we know that no one is still + * using the input URB handles. + */ + if (tcs_prog_data->instances > 1) { + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); + emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); + } + + /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles. + * We want to compare the bottom half of invocation_id with 0, but + * use that truth value for the top half as well. Unfortunately, + * we don't have stride in the vec4 world, nor UV immediates in + * align16, so we need an opcode to get invocation_id<0,4,0>. + */ + emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id); + emit(IF(BRW_PREDICATE_NORMAL)); + for (unsigned i = 0; i < key->input_vertices; i += 2) { + /* If we have an odd number of input vertices, the last will be + * unpaired. We don't want to use an interleaved URB write in + * that case. + */ + const bool is_unpaired = i == key->input_vertices - 1; + + dst_reg header(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i), + brw_imm_ud(is_unpaired)); + } + emit(BRW_OPCODE_ENDIF); + } + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) emit_shader_time_end(); - vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); + inst = emit(VS_OPCODE_URB_WRITE); inst->mlen = 1; /* just the header, no data. */ inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE; } -- 2.7.4