From bdc76680089c4d91ce0d42dffed00e273418516d Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Mon, 27 Jun 2022 15:34:01 -0700 Subject: [PATCH] intel/fs: Lower URB messages to SEND MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Before rebasing on top of Ken's split-SEND optimization (see !17018), this commit just caused some scheduling changes in various tessellation and geometry shaders. These changes were caused by the addition of real latency information for the URB messages. With the addition of the split-SEND optimization, the changes are... staggering. All of the shaders helped for spills and fills are vertex shaders from Batman Arkham Origins. What surprises me is that these shaders account for such a high percentage of the spills and fills in fossil-db. 85%?!? v2: Use FIXED_GRF instead of BRW_GENERAL_REGISTER_FILE in an assertion. Suggested by Ken. Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown) total instructions in shared programs: 20013625 -> 19954020 (-0.30%) instructions in affected programs: 4007157 -> 3947552 (-1.49%) helped: 31161 HURT: 0 helped stats (abs) min: 1 max: 400 x̄: 1.91 x̃: 2 helped stats (rel) min: 0.08% max: 59.70% x̄: 2.20% x̃: 1.83% 95% mean confidence interval for instructions value: -1.97 -1.86 95% mean confidence interval for instructions %-change: -2.22% -2.18% Instructions are helped. total cycles in shared programs: 859337569 -> 858636788 (-0.08%) cycles in affected programs: 74168298 -> 73467517 (-0.94%) helped: 13812 HURT: 16846 helped stats (abs) min: 1 max: 291078 x̄: 82.83 x̃: 4 helped stats (rel) min: <.01% max: 37.09% x̄: 3.47% x̃: 2.02% HURT stats (abs) min: 1 max: 1543 x̄: 26.31 x̃: 14 HURT stats (rel) min: <.01% max: 77.97% x̄: 4.11% x̃: 2.58% 95% mean confidence interval for cycles value: -55.10 9.39 95% mean confidence interval for cycles %-change: 0.62% 0.77% Inconclusive result (value mean confidence interval includes 0). 
Broadwell total cycles in shared programs: 904844939 -> 904832320 (<.01%) cycles in affected programs: 525360 -> 512741 (-2.40%) helped: 215 HURT: 4 helped stats (abs) min: 4 max: 1018 x̄: 60.16 x̃: 39 helped stats (rel) min: 0.14% max: 15.85% x̄: 2.16% x̃: 2.04% HURT stats (abs) min: 79 max: 79 x̄: 79.00 x̃: 79 HURT stats (rel) min: 1.31% max: 1.57% x̄: 1.43% x̃: 1.43% 95% mean confidence interval for cycles value: -75.02 -40.22 95% mean confidence interval for cycles %-change: -2.37% -1.81% Cycles are helped. No shader-db changes on any older Intel platforms. fossil-db results: Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown) Instructions in all programs: 142622800 -> 141461114 (-0.8%) Instructions helped: 197186 Cycles in all programs: 9101223846 -> 9099440025 (-0.0%) Cycles helped: 37963 Cycles hurt: 151233 Spills in all programs: 98829 -> 13695 (-86.1%) Spills helped: 2159 Fills in all programs: 128142 -> 18400 (-85.6%) Fills helped: 2159 Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_ir_performance.cpp | 4 ++ src/intel/compiler/brw_lower_logical_sends.cpp | 76 ++++++++++++++++++++---- src/intel/compiler/brw_schedule_instructions.cpp | 4 ++ 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index 43dd3ab..f7292df 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -1152,6 +1152,10 @@ namespace { return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, 10 /* XXX */, 0, 0, 0, 0, 0); + case BRW_SFID_URB: + return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */, + 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); + default: abort(); } diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 0354dac..e1845a4 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ 
-31,15 +31,71 @@ using namespace brw; static void -lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) +lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst, + bool per_slot_present) { - inst->opcode = op; + const intel_device_info *devinfo = bld.shader->devinfo; + + + assert(inst->size_written % REG_SIZE == 0); + assert(inst->src[0].type == BRW_REGISTER_TYPE_UD); + assert(inst->src[0].file == FIXED_GRF || inst->src[0].file == VGRF); + + inst->opcode = SHADER_OPCODE_SEND; + inst->header_size = 1; + + inst->sfid = BRW_SFID_URB; + inst->desc = brw_urb_desc(devinfo, + GFX8_URB_OPCODE_SIMD8_READ, + per_slot_present, + false, + inst->offset); + + inst->ex_desc = 0; + inst->ex_mlen = 0; + inst->send_is_volatile = true; + + fs_reg tmp = inst->src[0]; + + inst->resize_sources(4); + + inst->src[0] = brw_imm_ud(0); /* desc */ + inst->src[1] = brw_imm_ud(0); /* ex_desc */ + inst->src[2] = tmp; + inst->src[3] = brw_null_reg(); } static void -lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) +lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst, + bool per_slot_present, bool channel_mask_present) { - inst->opcode = op; + const intel_device_info *devinfo = bld.shader->devinfo; + + assert(inst->header_size == 0); + + inst->opcode = SHADER_OPCODE_SEND; + inst->header_size = 1; + inst->dst = brw_null_reg(); + + inst->sfid = BRW_SFID_URB; + inst->desc = brw_urb_desc(devinfo, + GFX8_URB_OPCODE_SIMD8_WRITE, + per_slot_present, + channel_mask_present, + inst->offset); + + inst->ex_desc = 0; + inst->ex_mlen = 0; + inst->send_has_side_effects = true; + + fs_reg tmp = inst->src[0]; + + inst->resize_sources(4); + + inst->src[0] = brw_imm_ud(0); /* desc */ + inst->src[1] = brw_imm_ud(0); /* ex_desc */ + inst->src[2] = tmp; + inst->src[3] = brw_null_reg(); } static void @@ -2642,23 +2698,23 @@ fs_visitor::lower_logical_sends() break; case SHADER_OPCODE_URB_READ_LOGICAL: - lower_urb_read_logical_send(ibld, inst, 
SHADER_OPCODE_URB_READ_SIMD8); + lower_urb_read_logical_send(ibld, inst, false); break; case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL: - lower_urb_read_logical_send(ibld, inst, SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT); + lower_urb_read_logical_send(ibld, inst, true); break; case SHADER_OPCODE_URB_WRITE_LOGICAL: - lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8); + lower_urb_write_logical_send(ibld, inst, false, false); break; case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL: - lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT); + lower_urb_write_logical_send(ibld, inst, true, false); break; case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL: - lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED); + lower_urb_write_logical_send(ibld, inst, false, true); break; case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL: - lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT); + lower_urb_write_logical_send(ibld, inst, true, true); break; default: diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 5ae9382..3286e3f 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -575,6 +575,10 @@ schedule_node::set_latency_gfx7(bool is_haswell) latency = 200; break; + case BRW_SFID_URB: + latency = 200; + break; + default: unreachable("Unknown SFID"); } -- 2.7.4