intel/fs: Lower URB messages to SEND
author: Ian Romanick <ian.d.romanick@intel.com>
Mon, 27 Jun 2022 22:34:01 +0000 (15:34 -0700)
committer: Marge Bot <emma+marge@anholt.net>
Fri, 8 Jul 2022 19:45:34 +0000 (19:45 +0000)
Before rebasing on top of Ken's split-SEND optimization (see !17018),
this commit just caused some scheduling changes in various tessellation
and geometry shaders.  These changes were caused by the addition of real
latency information for the URB messages.

With the addition of the split-SEND optimization, the changes
are... staggering.  All of the shaders helped for spills and fills are
vertex shaders from Batman Arkham Origins.  What surprises me is that
these shaders account for such a high percentage of the spills and fills
in fossil-db.  85%?!?

v2: Use FIXED_GRF instead of BRW_GENERAL_REGISTER_FILE in an assertion.
Suggested by Ken.

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20013625 -> 19954020 (-0.30%)
instructions in affected programs: 4007157 -> 3947552 (-1.49%)
helped: 31161
HURT: 0
helped stats (abs) min: 1 max: 400 x̄: 1.91 x̃: 2
helped stats (rel) min: 0.08% max: 59.70% x̄: 2.20% x̃: 1.83%
95% mean confidence interval for instructions value: -1.97 -1.86
95% mean confidence interval for instructions %-change: -2.22% -2.18%
Instructions are helped.

total cycles in shared programs: 859337569 -> 858636788 (-0.08%)
cycles in affected programs: 74168298 -> 73467517 (-0.94%)
helped: 13812
HURT: 16846
helped stats (abs) min: 1 max: 291078 x̄: 82.83 x̃: 4
helped stats (rel) min: <.01% max: 37.09% x̄: 3.47% x̃: 2.02%
HURT stats (abs)   min: 1 max: 1543 x̄: 26.31 x̃: 14
HURT stats (rel)   min: <.01% max: 77.97% x̄: 4.11% x̃: 2.58%
95% mean confidence interval for cycles value: -55.10 9.39
95% mean confidence interval for cycles %-change: 0.62% 0.77%
Inconclusive result (value mean confidence interval includes 0).

Broadwell
total cycles in shared programs: 904844939 -> 904832320 (<.01%)
cycles in affected programs: 525360 -> 512741 (-2.40%)
helped: 215
HURT: 4
helped stats (abs) min: 4 max: 1018 x̄: 60.16 x̃: 39
helped stats (rel) min: 0.14% max: 15.85% x̄: 2.16% x̃: 2.04%
HURT stats (abs)   min: 79 max: 79 x̄: 79.00 x̃: 79
HURT stats (rel)   min: 1.31% max: 1.57% x̄: 1.43% x̃: 1.43%
95% mean confidence interval for cycles value: -75.02 -40.22
95% mean confidence interval for cycles %-change: -2.37% -1.81%
Cycles are helped.

No shader-db changes on any older Intel platforms.

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 142622800 -> 141461114 (-0.8%)
Instructions helped: 197186

Cycles in all programs: 9101223846 -> 9099440025 (-0.0%)
Cycles helped: 37963
Cycles hurt: 151233

Spills in all programs: 98829 -> 13695 (-86.1%)
Spills helped: 2159

Fills in all programs: 128142 -> 18400 (-85.6%)
Fills helped: 2159

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17379>

src/intel/compiler/brw_ir_performance.cpp
src/intel/compiler/brw_lower_logical_sends.cpp
src/intel/compiler/brw_schedule_instructions.cpp

index 43dd3ab..f7292df 100644 (file)
@@ -1152,6 +1152,10 @@ namespace {
             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
                                   10 /* XXX */, 0, 0, 0, 0, 0);
 
+         case BRW_SFID_URB:
+            return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
+                                  32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
+
          default:
             abort();
          }
index 0354dac..e1845a4 100644 (file)
 using namespace brw;
 
 static void
-lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst,
+                            bool per_slot_present) /* rewrites inst in place into a SHADER_OPCODE_SEND targeting BRW_SFID_URB */
 {
-   inst->opcode = op;
+   const intel_device_info *devinfo = bld.shader->devinfo;
+
+
+   assert(inst->size_written % REG_SIZE == 0);
+   assert(inst->src[0].type == BRW_REGISTER_TYPE_UD);
+   assert(inst->src[0].file == FIXED_GRF || inst->src[0].file == VGRF); /* src[0] is moved to src[2] below */
+
+   inst->opcode = SHADER_OPCODE_SEND;
+   inst->header_size = 1; /* NOTE(review): the URB write lowering asserts header_size == 0 before this; no such check here - confirm intended */
+
+   inst->sfid = BRW_SFID_URB;
+   inst->desc = brw_urb_desc(devinfo,
+                             GFX8_URB_OPCODE_SIMD8_READ,
+                             per_slot_present, /* true only for the *_PER_SLOT_LOGICAL read opcode */
+                             false, /* argument the write lowering fills with channel_mask_present; always off for reads */
+                             inst->offset);
+
+   inst->ex_desc = 0;
+   inst->ex_mlen = 0;
+   inst->send_is_volatile = true; /* the write lowering sets send_has_side_effects instead */
+
+   fs_reg tmp = inst->src[0]; /* keep the original src[0] before the source list is rewritten */
+
+   inst->resize_sources(4); /* sources become: desc, ex_desc, original src[0], null */
+
+   inst->src[0] = brw_imm_ud(0); /* desc */
+   inst->src[1] = brw_imm_ud(0); /* ex_desc */
+   inst->src[2] = tmp; /* original src[0] */
+   inst->src[3] = brw_null_reg(); /* fourth source is left as the null register */
 }
 
 static void
-lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+                             bool per_slot_present, bool channel_mask_present) /* rewrites inst in place into a SHADER_OPCODE_SEND targeting BRW_SFID_URB */
 {
-   inst->opcode = op;
+   const intel_device_info *devinfo = bld.shader->devinfo;
+
+   assert(inst->header_size == 0); /* header_size is set to 1 just below, so the logical instruction must arrive with none */
+
+   inst->opcode = SHADER_OPCODE_SEND;
+   inst->header_size = 1;
+   inst->dst = brw_null_reg(); /* destination is replaced with the null register */
+
+   inst->sfid = BRW_SFID_URB;
+   inst->desc = brw_urb_desc(devinfo,
+                             GFX8_URB_OPCODE_SIMD8_WRITE,
+                             per_slot_present, /* true for the *_PER_SLOT_LOGICAL write opcodes */
+                             channel_mask_present, /* true for the *_MASKED_* write opcodes */
+                             inst->offset);
+
+   inst->ex_desc = 0;
+   inst->ex_mlen = 0;
+   inst->send_has_side_effects = true; /* the read lowering sets send_is_volatile instead */
+
+   fs_reg tmp = inst->src[0]; /* keep the original src[0] before the source list is rewritten */
+
+   inst->resize_sources(4); /* sources become: desc, ex_desc, original src[0], null */
+
+   inst->src[0] = brw_imm_ud(0); /* desc */
+   inst->src[1] = brw_imm_ud(0); /* ex_desc */
+   inst->src[2] = tmp; /* original src[0] */
+   inst->src[3] = brw_null_reg(); /* fourth source is left as the null register */
 }
 
 static void
@@ -2642,23 +2698,23 @@ fs_visitor::lower_logical_sends()
          break;
 
       case SHADER_OPCODE_URB_READ_LOGICAL:
-         lower_urb_read_logical_send(ibld, inst, SHADER_OPCODE_URB_READ_SIMD8);
+         lower_urb_read_logical_send(ibld, inst, false);
          break;
       case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL:
-         lower_urb_read_logical_send(ibld, inst, SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT);
+         lower_urb_read_logical_send(ibld, inst, true);
          break;
 
       case SHADER_OPCODE_URB_WRITE_LOGICAL:
-         lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8);
+         lower_urb_write_logical_send(ibld, inst, false, false);
          break;
       case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
-         lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT);
+         lower_urb_write_logical_send(ibld, inst, true, false);
          break;
       case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
-         lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED);
+         lower_urb_write_logical_send(ibld, inst, false, true);
          break;
       case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
-         lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT);
+         lower_urb_write_logical_send(ibld, inst, true, true);
          break;
 
       default:
index 5ae9382..3286e3f 100644 (file)
@@ -575,6 +575,10 @@ schedule_node::set_latency_gfx7(bool is_haswell)
          latency = 200;
          break;
 
+      case BRW_SFID_URB:
+         latency = 200;
+         break;
+
       default:
          unreachable("Unknown SFID");
       }