nir_builder builder;
};
+static void
+compute_local_index_id(nir_builder *b,
+ nir_shader *nir,
+ nir_ssa_def **local_index,
+ nir_ssa_def **local_id)
+{
+ nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
+
+ nir_ssa_def *thread_local_id =
+ nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
+ nir_ssa_def *channel = nir_load_subgroup_invocation(b);
+ nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
+
+ nir_ssa_def *size_x;
+ nir_ssa_def *size_y;
+ if (nir->info.workgroup_size_variable) {
+ nir_ssa_def *size_xyz = nir_load_workgroup_size(b);
+ size_x = nir_channel(b, size_xyz, 0);
+ size_y = nir_channel(b, size_xyz, 1);
+ } else {
+ size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
+ size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
+ }
+ nir_ssa_def *size_xy = nir_imul(b, size_x, size_y);
+
+ /* The local invocation index and ID must respect the following
+ *
+ * gl_LocalInvocationID.x =
+ * gl_LocalInvocationIndex % gl_WorkGroupSize.x;
+ * gl_LocalInvocationID.y =
+ * (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
+ * gl_WorkGroupSize.y;
+ * gl_LocalInvocationID.z =
+ * (gl_LocalInvocationIndex /
+ * (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
+ * gl_WorkGroupSize.z;
+ *
+ * However, the final % gl_WorkGroupSize.z does nothing unless we
+ * accidentally end up with a gl_LocalInvocationIndex that is too
+ * large so it can safely be omitted.
+ */
+
+ nir_ssa_def *id_x, *id_y, *id_z;
+ switch (nir->info.cs.derivative_group) {
+ case DERIVATIVE_GROUP_NONE:
+ if (nir->info.num_images == 0 &&
+ nir->info.num_textures == 0) {
+ /* X-major lid order. Optimal for linear accesses only,
+ * which are usually buffers. X,Y ordering will look like:
+ * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
+ */
+ id_x = nir_umod(b, linear, size_x);
+ id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+ *local_index = linear;
+ } else if (!nir->info.workgroup_size_variable &&
+ nir->info.workgroup_size[1] % 4 == 0) {
+ /* 1x4 block X-major lid order. Same as X-major except increments in
+ * blocks of width=1 height=4. Always optimal for tileY and usually
+ * optimal for linear accesses.
+ * x = (linear / 4) % size_x
+ * y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
+ * X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
+ * (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
+ */
+ const unsigned height = 4;
+ nir_ssa_def *block = nir_udiv_imm(b, linear, height);
+ id_x = nir_umod(b, block, size_x);
+ id_y = nir_umod(b,
+ nir_iadd(b,
+ nir_umod(b, linear, nir_imm_int(b, height)),
+ nir_imul_imm(b,
+ nir_udiv(b, block, size_x),
+ height)),
+ size_y);
+ } else {
+ /* Y-major lid order. Optimal for tileY accesses only,
+ * which are usually images. X,Y ordering will look like:
+ * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
+ */
+ id_y = nir_umod(b, linear, size_y);
+ id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
+ }
+
+ id_z = nir_udiv(b, linear, size_xy);
+ *local_id = nir_vec3(b, id_x, id_y, id_z);
+ if (!*local_index) {
+ *local_index = nir_iadd(b, nir_iadd(b, id_x,
+ nir_imul(b, id_y, size_x)),
+ nir_imul(b, id_z, size_xy));
+ }
+ break;
+ case DERIVATIVE_GROUP_LINEAR:
+ /* For linear, just set the local invocation index linearly,
+ * and calculate local invocation ID from that.
+ */
+ id_x = nir_umod(b, linear, size_x);
+ id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+ id_z = nir_udiv(b, linear, size_xy);
+ *local_id = nir_vec3(b, id_x, id_y, id_z);
+ *local_index = linear;
+ break;
+ case DERIVATIVE_GROUP_QUADS: {
+ /* For quads, first we figure out the 2x2 grid the invocation
+ * belongs to -- treating extra Z layers as just more rows.
+ * Then map that into local invocation ID (trivial) and local
+ * invocation index. Skipping Z simplify index calculation.
+ */
+
+ nir_ssa_def *one = nir_imm_int(b, 1);
+ nir_ssa_def *double_size_x = nir_ishl(b, size_x, one);
+
+ /* ID within a pair of rows, where each group of 4 is 2x2 quad. */
+ nir_ssa_def *row_pair_id = nir_umod(b, linear, double_size_x);
+ nir_ssa_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
+
+ nir_ssa_def *x =
+ nir_ior(b,
+ nir_iand(b, row_pair_id, one),
+ nir_iand(b, nir_ishr(b, row_pair_id, one),
+ nir_imm_int(b, 0xfffffffe)));
+ nir_ssa_def *y =
+ nir_ior(b,
+ nir_ishl(b, y_row_pairs, one),
+ nir_iand(b, nir_ishr(b, row_pair_id, one), one));
+
+ *local_id = nir_vec3(b, x,
+ nir_umod(b, y, size_y),
+ nir_udiv(b, y, size_y));
+ *local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
+ break;
+ }
+ default:
+ unreachable("invalid derivative group");
+ }
+}
+
static bool
lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
nir_block *block)
/* First time we are using those, so let's calculate them. */
if (!local_index) {
assert(!local_id);
-
- nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
-
- nir_ssa_def *thread_local_id =
- nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
- nir_ssa_def *channel = nir_load_subgroup_invocation(b);
- nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
-
- nir_ssa_def *size_x;
- nir_ssa_def *size_y;
- if (state->nir->info.workgroup_size_variable) {
- nir_ssa_def *size_xyz = nir_load_workgroup_size(b);
- size_x = nir_channel(b, size_xyz, 0);
- size_y = nir_channel(b, size_xyz, 1);
- } else {
- size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
- size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
- }
- nir_ssa_def *size_xy = nir_imul(b, size_x, size_y);
-
- /* The local invocation index and ID must respect the following
- *
- * gl_LocalInvocationID.x =
- * gl_LocalInvocationIndex % gl_WorkGroupSize.x;
- * gl_LocalInvocationID.y =
- * (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
- * gl_WorkGroupSize.y;
- * gl_LocalInvocationID.z =
- * (gl_LocalInvocationIndex /
- * (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
- * gl_WorkGroupSize.z;
- *
- * However, the final % gl_WorkGroupSize.z does nothing unless we
- * accidentally end up with a gl_LocalInvocationIndex that is too
- * large so it can safely be omitted.
- */
-
- nir_ssa_def *id_x, *id_y, *id_z;
- switch (state->nir->info.cs.derivative_group) {
- case DERIVATIVE_GROUP_NONE:
- if (nir->info.num_images == 0 &&
- nir->info.num_textures == 0) {
- /* X-major lid order. Optimal for linear accesses only,
- * which are usually buffers. X,Y ordering will look like:
- * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
- */
- id_x = nir_umod(b, linear, size_x);
- id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
- local_index = linear;
- } else if (!nir->info.workgroup_size_variable &&
- nir->info.workgroup_size[1] % 4 == 0) {
- /* 1x4 block X-major lid order. Same as X-major except increments in
- * blocks of width=1 height=4. Always optimal for tileY and usually
- * optimal for linear accesses.
- * x = (linear / 4) % size_x
- * y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
- * X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
- * (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
- */
- const unsigned height = 4;
- nir_ssa_def *block = nir_udiv_imm(b, linear, height);
- id_x = nir_umod(b, block, size_x);
- id_y = nir_umod(b,
- nir_iadd(b,
- nir_umod(b, linear, nir_imm_int(b, height)),
- nir_imul_imm(b,
- nir_udiv(b, block, size_x),
- height)),
- size_y);
- } else {
- /* Y-major lid order. Optimal for tileY accesses only,
- * which are usually images. X,Y ordering will look like:
- * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
- */
- id_y = nir_umod(b, linear, size_y);
- id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
- }
-
- id_z = nir_udiv(b, linear, size_xy);
- local_id = nir_vec3(b, id_x, id_y, id_z);
- if (!local_index) {
- local_index = nir_iadd(b, nir_iadd(b, id_x,
- nir_imul(b, id_y, size_x)),
- nir_imul(b, id_z, size_xy));
- }
- break;
- case DERIVATIVE_GROUP_LINEAR:
- /* For linear, just set the local invocation index linearly,
- * and calculate local invocation ID from that.
- */
- id_x = nir_umod(b, linear, size_x);
- id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
- id_z = nir_udiv(b, linear, size_xy);
- local_id = nir_vec3(b, id_x, id_y, id_z);
- local_index = linear;
- break;
- case DERIVATIVE_GROUP_QUADS: {
- /* For quads, first we figure out the 2x2 grid the invocation
- * belongs to -- treating extra Z layers as just more rows.
- * Then map that into local invocation ID (trivial) and local
- * invocation index. Skipping Z simplify index calculation.
- */
-
- nir_ssa_def *one = nir_imm_int(b, 1);
- nir_ssa_def *double_size_x = nir_ishl(b, size_x, one);
-
- /* ID within a pair of rows, where each group of 4 is 2x2 quad. */
- nir_ssa_def *row_pair_id = nir_umod(b, linear, double_size_x);
- nir_ssa_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
-
- nir_ssa_def *x =
- nir_ior(b,
- nir_iand(b, row_pair_id, one),
- nir_iand(b, nir_ishr(b, row_pair_id, one),
- nir_imm_int(b, 0xfffffffe)));
- nir_ssa_def *y =
- nir_ior(b,
- nir_ishl(b, y_row_pairs, one),
- nir_iand(b, nir_ishr(b, row_pair_id, one), one));
-
- local_id = nir_vec3(b, x,
- nir_umod(b, y, size_y),
- nir_udiv(b, y, size_y));
- local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
- break;
- }
- default:
- unreachable("invalid derivative group");
- }
+ compute_local_index_id(b, nir, &local_index, &local_id);
}
assert(local_id);