From 85b1c89e206763acc49822c5c53097bf894ec7d2 Mon Sep 17 00:00:00 2001
From: Marcin Ślusarz
Date: Fri, 2 Dec 2022 10:47:00 +0100
Subject: [PATCH] intel/compiler: split lower_cs_intrinsics_convert_block

No functional changes.

Reviewed-by: Caio Oliveira
Part-of:
---
 src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 266 ++++++++++++-----------
 1 file changed, 137 insertions(+), 129 deletions(-)

diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 962c977..c1a797a 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -31,6 +31,142 @@ struct lower_intrinsics_state {
    nir_builder builder;
 };
 
+static void
+compute_local_index_id(nir_builder *b,
+                       nir_shader *nir,
+                       nir_ssa_def **local_index,
+                       nir_ssa_def **local_id)
+{
+   nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
+
+   nir_ssa_def *thread_local_id =
+      nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
+   nir_ssa_def *channel = nir_load_subgroup_invocation(b);
+   nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
+
+   nir_ssa_def *size_x;
+   nir_ssa_def *size_y;
+   if (nir->info.workgroup_size_variable) {
+      nir_ssa_def *size_xyz = nir_load_workgroup_size(b);
+      size_x = nir_channel(b, size_xyz, 0);
+      size_y = nir_channel(b, size_xyz, 1);
+   } else {
+      size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
+      size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
+   }
+   nir_ssa_def *size_xy = nir_imul(b, size_x, size_y);
+
+   /* The local invocation index and ID must respect the following
+    *
+    *    gl_LocalInvocationID.x =
+    *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
+    *    gl_LocalInvocationID.y =
+    *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
+    *       gl_WorkGroupSize.y;
+    *    gl_LocalInvocationID.z =
+    *       (gl_LocalInvocationIndex /
+    *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
+    *       gl_WorkGroupSize.z;
+    *
+    * However, the final % gl_WorkGroupSize.z does nothing unless we
+    * accidentally end up with a gl_LocalInvocationIndex that is too
+    * large, so it can safely be omitted.
+    */
+
+   nir_ssa_def *id_x, *id_y, *id_z;
+   switch (nir->info.cs.derivative_group) {
+   case DERIVATIVE_GROUP_NONE:
+      if (nir->info.num_images == 0 &&
+          nir->info.num_textures == 0) {
+         /* X-major lid order. Optimal for linear accesses only,
+          * which are usually buffers. X,Y ordering will look like:
+          * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
+          */
+         id_x = nir_umod(b, linear, size_x);
+         id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+         *local_index = linear;
+      } else if (!nir->info.workgroup_size_variable &&
+                 nir->info.workgroup_size[1] % 4 == 0) {
+         /* 1x4 block X-major lid order. Same as X-major except increments in
+          * blocks of width=1 height=4. Always optimal for tileY and usually
+          * optimal for linear accesses.
+          *   x = (linear / 4) % size_x
+          *   y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
+          * X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
+          * (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
+          */
+         const unsigned height = 4;
+         nir_ssa_def *block = nir_udiv_imm(b, linear, height);
+         id_x = nir_umod(b, block, size_x);
+         id_y = nir_umod(b,
+                         nir_iadd(b,
+                                  nir_umod(b, linear, nir_imm_int(b, height)),
+                                  nir_imul_imm(b,
+                                               nir_udiv(b, block, size_x),
+                                               height)),
+                         size_y);
+      } else {
+         /* Y-major lid order. Optimal for tileY accesses only,
+          * which are usually images. X,Y ordering will look like:
+          * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
+          */
+         id_y = nir_umod(b, linear, size_y);
+         id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
+      }
+
+      id_z = nir_udiv(b, linear, size_xy);
+      *local_id = nir_vec3(b, id_x, id_y, id_z);
+      if (!*local_index) {
+         *local_index = nir_iadd(b, nir_iadd(b, id_x,
+                                                nir_imul(b, id_y, size_x)),
+                                    nir_imul(b, id_z, size_xy));
+      }
+      break;
+   case DERIVATIVE_GROUP_LINEAR:
+      /* For linear, just set the local invocation index linearly,
+       * and calculate local invocation ID from that.
+       */
+      id_x = nir_umod(b, linear, size_x);
+      id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+      id_z = nir_udiv(b, linear, size_xy);
+      *local_id = nir_vec3(b, id_x, id_y, id_z);
+      *local_index = linear;
+      break;
+   case DERIVATIVE_GROUP_QUADS: {
+      /* For quads, first we figure out the 2x2 grid the invocation
+       * belongs to -- treating extra Z layers as just more rows.
+       * Then map that into local invocation ID (trivial) and local
+       * invocation index. Skipping Z simplifies the index calculation.
+       */
+
+      nir_ssa_def *one = nir_imm_int(b, 1);
+      nir_ssa_def *double_size_x = nir_ishl(b, size_x, one);
+
+      /* ID within a pair of rows, where each group of 4 is a 2x2 quad. */
+      nir_ssa_def *row_pair_id = nir_umod(b, linear, double_size_x);
+      nir_ssa_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
+
+      nir_ssa_def *x =
+         nir_ior(b,
+                 nir_iand(b, row_pair_id, one),
+                 nir_iand(b, nir_ishr(b, row_pair_id, one),
+                          nir_imm_int(b, 0xfffffffe)));
+      nir_ssa_def *y =
+         nir_ior(b,
+                 nir_ishl(b, y_row_pairs, one),
+                 nir_iand(b, nir_ishr(b, row_pair_id, one), one));
+
+      *local_id = nir_vec3(b, x,
+                           nir_umod(b, y, size_y),
+                           nir_udiv(b, y, size_y));
+      *local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
+      break;
+   }
+   default:
+      unreachable("invalid derivative group");
+   }
+}
+
 static bool
 lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
                                   nir_block *block)
@@ -79,135 +215,7 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          /* First time we are using those, so let's calculate them. */
          if (!local_index) {
             assert(!local_id);
-
-            nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
-
-            nir_ssa_def *thread_local_id =
-               nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
-            nir_ssa_def *channel = nir_load_subgroup_invocation(b);
-            nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
-
-            nir_ssa_def *size_x;
-            nir_ssa_def *size_y;
-            if (state->nir->info.workgroup_size_variable) {
-               nir_ssa_def *size_xyz = nir_load_workgroup_size(b);
-               size_x = nir_channel(b, size_xyz, 0);
-               size_y = nir_channel(b, size_xyz, 1);
-            } else {
-               size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
-               size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
-            }
-            nir_ssa_def *size_xy = nir_imul(b, size_x, size_y);
-
-            /* The local invocation index and ID must respect the following
-             *
-             *    gl_LocalInvocationID.x =
-             *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
-             *    gl_LocalInvocationID.y =
-             *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
-             *       gl_WorkGroupSize.y;
-             *    gl_LocalInvocationID.z =
-             *       (gl_LocalInvocationIndex /
-             *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
-             *       gl_WorkGroupSize.z;
-             *
-             * However, the final % gl_WorkGroupSize.z does nothing unless we
-             * accidentally end up with a gl_LocalInvocationIndex that is too
-             * large so it can safely be omitted.
-             */
-
-            nir_ssa_def *id_x, *id_y, *id_z;
-            switch (state->nir->info.cs.derivative_group) {
-            case DERIVATIVE_GROUP_NONE:
-               if (nir->info.num_images == 0 &&
-                   nir->info.num_textures == 0) {
-                  /* X-major lid order. Optimal for linear accesses only,
-                   * which are usually buffers. X,Y ordering will look like:
-                   * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
-                   */
-                  id_x = nir_umod(b, linear, size_x);
-                  id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
-                  local_index = linear;
-               } else if (!nir->info.workgroup_size_variable &&
-                          nir->info.workgroup_size[1] % 4 == 0) {
-                  /* 1x4 block X-major lid order. Same as X-major except increments in
-                   * blocks of width=1 height=4. Always optimal for tileY and usually
-                   * optimal for linear accesses.
-                   *   x = (linear / 4) % size_x
-                   *   y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
-                   * X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
-                   * (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
-                   */
-                  const unsigned height = 4;
-                  nir_ssa_def *block = nir_udiv_imm(b, linear, height);
-                  id_x = nir_umod(b, block, size_x);
-                  id_y = nir_umod(b,
-                                  nir_iadd(b,
-                                           nir_umod(b, linear, nir_imm_int(b, height)),
-                                           nir_imul_imm(b,
-                                                        nir_udiv(b, block, size_x),
-                                                        height)),
-                                  size_y);
-               } else {
-                  /* Y-major lid order. Optimal for tileY accesses only,
-                   * which are usually images. X,Y ordering will look like:
-                   * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
-                   */
-                  id_y = nir_umod(b, linear, size_y);
-                  id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
-               }
-
-               id_z = nir_udiv(b, linear, size_xy);
-               local_id = nir_vec3(b, id_x, id_y, id_z);
-               if (!local_index) {
-                  local_index = nir_iadd(b, nir_iadd(b, id_x,
-                                                        nir_imul(b, id_y, size_x)),
-                                            nir_imul(b, id_z, size_xy));
-               }
-               break;
-            case DERIVATIVE_GROUP_LINEAR:
-               /* For linear, just set the local invocation index linearly,
-                * and calculate local invocation ID from that.
-                */
-               id_x = nir_umod(b, linear, size_x);
-               id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
-               id_z = nir_udiv(b, linear, size_xy);
-               local_id = nir_vec3(b, id_x, id_y, id_z);
-               local_index = linear;
-               break;
-            case DERIVATIVE_GROUP_QUADS: {
-               /* For quads, first we figure out the 2x2 grid the invocation
-                * belongs to -- treating extra Z layers as just more rows.
-                * Then map that into local invocation ID (trivial) and local
-                * invocation index. Skipping Z simplify index calculation.
-                */
-
-               nir_ssa_def *one = nir_imm_int(b, 1);
-               nir_ssa_def *double_size_x = nir_ishl(b, size_x, one);
-
-               /* ID within a pair of rows, where each group of 4 is 2x2 quad. */
-               nir_ssa_def *row_pair_id = nir_umod(b, linear, double_size_x);
-               nir_ssa_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
-
-               nir_ssa_def *x =
-                  nir_ior(b,
-                          nir_iand(b, row_pair_id, one),
-                          nir_iand(b, nir_ishr(b, row_pair_id, one),
-                                   nir_imm_int(b, 0xfffffffe)));
-               nir_ssa_def *y =
-                  nir_ior(b,
-                          nir_ishl(b, y_row_pairs, one),
-                          nir_iand(b, nir_ishr(b, row_pair_id, one), one));
-
-               local_id = nir_vec3(b, x,
-                                   nir_umod(b, y, size_y),
-                                   nir_udiv(b, y, size_y));
-               local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
-               break;
-            }
-            default:
-               unreachable("invalid derivative group");
-            }
+            compute_local_index_id(b, nir, &local_index, &local_id);
          }
 
          assert(local_id);
-- 
2.7.4
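
For reference, the lid orders above can be checked outside of NIR with plain
integer arithmetic. Below is a minimal standalone C sketch, not part of the
patch (the workgroup sizes and variable names are illustrative only), that
enumerates the 1x4 block X-major order and verifies the
gl_LocalInvocationID/gl_LocalInvocationIndex invariant quoted in the big
comment. Note that this path leaves the local index unset, so it is rebuilt
from the IDs afterwards; the invariant, rather than index == linear, is what
has to hold.

#include <assert.h>
#include <stdio.h>

int
main(void)
{
   /* Fixed sizes for the demo; this path requires size_y % 4 == 0. */
   const unsigned size_x = 4, size_y = 8, size_z = 2;
   const unsigned height = 4;

   for (unsigned linear = 0; linear < size_x * size_y * size_z; linear++) {
      /* Same arithmetic as the comment:
       *   x = (linear / 4) % size_x
       *   y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
       */
      unsigned block = linear / height;
      unsigned id_x = block % size_x;
      unsigned id_y = (linear % height + block / size_x * height) % size_y;
      unsigned id_z = linear / (size_x * size_y);

      /* Rebuild the index from the IDs, like the !*local_index branch. */
      unsigned index = id_x + id_y * size_x + id_z * size_x * size_y;

      /* The invariant from the big comment block. */
      assert(id_x == index % size_x);
      assert(id_y == (index / size_x) % size_y);
      assert(id_z == index / (size_x * size_y));

      if (linear < 8)
         printf("linear %u -> id (%u, %u, %u), index %u\n",
                linear, id_x, id_y, id_z, index);
   }
   return 0;
}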
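
The DERIVATIVE_GROUP_QUADS mapping is the trickiest part: x takes bit 0 of
the row-pair ID plus the remaining bits shifted down once, and y interleaves
the quad-internal row bit under the row-pair number. The following sketch,
again not part of the patch and with a hypothetical helper name quads_map,
runs the same bit-twiddling on plain uint32_t values and asserts that every
aligned group of four invocations lands on a 2x2 block:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void
quads_map(uint32_t linear, uint32_t size_x, uint32_t size_y,
          uint32_t *id_x, uint32_t *id_y, uint32_t *id_z, uint32_t *index)
{
   /* ID within a pair of rows, where each group of 4 is a 2x2 quad. */
   uint32_t row_pair_id = linear % (2 * size_x);
   uint32_t y_row_pairs = linear / (2 * size_x);

   /* Bit 0 of x is bit 0 of the row-pair ID; the higher bits advance by 2
    * per quad (the 0xfffffffe mask drops the bit that becomes the
    * quad-internal row instead).
    */
   uint32_t x = (row_pair_id & 1) | ((row_pair_id >> 1) & 0xfffffffe);
   uint32_t y = (y_row_pairs << 1) | ((row_pair_id >> 1) & 1);

   *id_x = x;
   *id_y = y % size_y;      /* extra Z layers were treated as more rows */
   *id_z = y / size_y;
   *index = x + y * size_x; /* skipping Z, as the comment explains */
}

int
main(void)
{
   const uint32_t size_x = 8, size_y = 4, size_z = 2;

   for (uint32_t linear = 0; linear < size_x * size_y * size_z; linear += 4) {
      uint32_t x[4], y[4], z[4], index[4];
      for (uint32_t i = 0; i < 4; i++)
         quads_map(linear + i, size_x, size_y, &x[i], &y[i], &z[i], &index[i]);

      /* Each aligned group of four invocations must form a 2x2 quad:
       * two consecutive x values on two consecutive rows.
       */
      assert(x[1] == x[0] + 1 && y[1] == y[0] && z[1] == z[0]);
      assert(x[2] == x[0] && y[2] == y[0] + 1 && z[2] == z[0]);
      assert(x[3] == x[1] && y[3] == y[2] && z[3] == z[0]);

      printf("quad %2u: (%u,%u) (%u,%u) (%u,%u) (%u,%u), index %u..%u\n",
             (unsigned)(linear / 4),
             (unsigned)x[0], (unsigned)y[0], (unsigned)x[1], (unsigned)y[1],
             (unsigned)x[2], (unsigned)y[2], (unsigned)x[3], (unsigned)y[3],
             (unsigned)index[0], (unsigned)index[3]);
   }
   return 0;
}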