From 33f73e93ff6e14f72153d3df7e80763137fcb943 Mon Sep 17 00:00:00 2001 From: Neil Roberts Date: Tue, 24 Mar 2015 15:52:20 +0000 Subject: [PATCH] i965/skl: Add the header for constant loads outside of the generator Commit 5a06ee738 added a step to the generator to set up the message header when generating the VS_OPCODE_PULL_CONSTANT_LOAD_GEN7 instruction. That pseudo opcode is implemented in terms of multiple actual opcodes, one of which writes to one of the source registers in order to set up the message header. This causes problems because the scheduler isn't aware that the source register is written to and it can end up reorganising the instructions incorrectly such that the write to the source register overwrites a needed value from a previous instruction. This problem was presenting itself as a rendering error in the weapon in Enemy Territory: Quake Wars. Since commit 588859e1 there is an additional problem that the double register allocated to include the message header would end up being split into two. This wasn't happening previously because the code to split registers was explicitly avoided for instructions that are sending from the GRF. This patch fixes both problems by splitting the code to set up the message header into a new pseudo opcode so that it will be done outside of the generator. This new opcode has the header register as a destination so the scheduler can recognise that the register is written to. This has the additional benefit that the scheduler can optimise the message header slightly better by moving the mov instructions further away from the send instructions. On Skylake it appears to fix the following three Piglit tests without causing any regressions: gs-float-array-variable-index gs-mat3x4-row-major gs-mat4x3-row-major I think we actually may need to do something similar for the fs backend and possibly for message headers from regular texture sampling but I'm not entirely sure. v2: Make sure the exec-size is retained as 8 for the mov instruction to initialise the header from g0. This was accidentally lost during a rebase on top of 07c571a39fa1. Split the patch into two so that the helper function is a separate change. Fix emitting the MOV instruction on Gen7. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89058 Reviewed-by: Ben Widawsky --- src/mesa/drivers/dri/i965/brw_defines.h | 1 + src/mesa/drivers/dri/i965/brw_shader.cpp | 4 ++ src/mesa/drivers/dri/i965/brw_vec4.h | 2 + .../dri/i965/brw_vec4_dead_code_eliminate.cpp | 1 + src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 52 +++++++++++----------- src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 38 ++++++++++++---- 6 files changed, 63 insertions(+), 35 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index da6ed5b..a97a944 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -948,6 +948,7 @@ enum opcode { VS_OPCODE_URB_WRITE, VS_OPCODE_PULL_CONSTANT_LOAD, VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, + VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, VS_OPCODE_UNPACK_FLAGS_SIMD4X2, /** diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 335a800..0d6ac0c 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -568,6 +568,10 @@ brw_instruction_name(enum opcode op) return "pull_constant_load"; case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: return "pull_constant_load_gen7"; + + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + return "set_simd4x2_header_gen9"; + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: return "unpack_flags_simd4x2"; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 0363924..a0ee2cc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -500,6 +500,8 @@ private: struct brw_reg dst, struct brw_reg surf_index, struct brw_reg offset); + void generate_set_simd4x2_header_gen9(vec4_instruction *inst, + struct brw_reg dst); void generate_unpack_flags(struct brw_reg dst); void generate_untyped_atomic(vec4_instruction *inst, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp index 980e266..70d2af5 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp @@ -44,6 +44,7 @@ can_do_writemask(const struct brw_context *brw, case SHADER_OPCODE_GEN4_SCRATCH_READ: case VS_OPCODE_PULL_CONSTANT_LOAD: case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: return false; default: /* The MATH instruction on Gen6 only executes in align1 mode, which does diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index e4addf7..b22a555 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1039,38 +1039,18 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, { assert(surf_index.type == BRW_REGISTER_TYPE_UD); - struct brw_reg src = offset; - bool header_present = false; - int mlen = 1; - - if (brw->gen >= 9) { - /* Skylake requires a message header in order to use SIMD4x2 mode. */ - src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD); - mlen = 2; - header_present = true; - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - brw_set_default_access_mode(p, BRW_ALIGN_1); - - brw_MOV(p, get_element_ud(src, 2), - brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2)); - brw_pop_insn_state(p); - } - if (surf_index.file == BRW_IMMEDIATE_VALUE) { brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND); brw_set_dest(p, insn, dst); - brw_set_src0(p, insn, src); + brw_set_src0(p, insn, offset); brw_set_sampler_message(p, insn, surf_index.dw1.ud, 0, /* LD message ignores sampler unit */ GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1, /* rlen */ - mlen, - header_present, + inst->mlen, + inst->header_present, BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0); @@ -1095,14 +1075,14 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, /* dst = send(offset, a0.0 | ) */ brw_inst *insn = brw_send_indirect_message( - p, BRW_SFID_SAMPLER, dst, src, addr); + p, BRW_SFID_SAMPLER, dst, offset, addr); brw_set_sampler_message(p, insn, 0 /* surface */, 0 /* sampler */, GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1 /* rlen */, - mlen /* mlen */, - header_present /* header */, + inst->mlen, + inst->header_present, BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0); @@ -1113,6 +1093,22 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, } void +vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst, + struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, get_element_ud(dst, 2), + brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2)); + + brw_pop_insn_state(p); +} + +void vec4_generator::generate_untyped_atomic(vec4_instruction *inst, struct brw_reg dst, struct brw_reg atomic_op, @@ -1435,6 +1431,10 @@ vec4_generator::generate_code(const cfg_t *cfg) generate_pull_constant_load_gen7(inst, dst, src[0], src[1]); break; + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + generate_set_simd4x2_header_gen9(inst, dst); + break; + case GS_OPCODE_URB_WRITE: generate_gs_urb_write(inst); break; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index f7d542b..3d16caa 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1313,16 +1313,36 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, vec4_instruction *pull; - if (brw->gen >= 7) { - dst_reg grf_offset = dst_reg(this, glsl_type::int_type); + if (brw->gen >= 9) { + /* Gen9+ needs a message header in order to use SIMD4x2 mode */ + src_reg header(this, glsl_type::uvec4_type, 2); - /* We have to use a message header on Skylake to get SIMD4x2 mode. - * Reserve space for the register. - */ - if (brw->gen >= 9) { - grf_offset.reg_offset++; - alloc.sizes[grf_offset.reg] = 2; - } + pull = new(mem_ctx) + vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, + dst_reg(header)); + + if (before_inst) + emit_before(before_block, before_inst, pull); + else + emit(pull); + + dst_reg index_reg = retype(offset(dst_reg(header), 1), + offset_reg.type); + pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg); + + if (before_inst) + emit_before(before_block, before_inst, pull); + else + emit(pull); + + pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, + dst, + surf_index, + header); + pull->mlen = 2; + pull->header_present = true; + } else if (brw->gen >= 7) { + dst_reg grf_offset = dst_reg(this, glsl_type::int_type); grf_offset.type = offset_reg.type; -- 2.7.4