From 85b1c89e206763acc49822c5c53097bf894ec7d2 Mon Sep 17 00:00:00 2001
From: Marcin Ślusarz
Date: Fri, 2 Dec 2022 10:47:00 +0100
Subject: [PATCH] intel/compiler: split lower_cs_intrinsics_convert_block

No functional changes.

Reviewed-by: Caio Oliveira
Part-of:
---
 src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 266 ++++++++++++-----------
 1 file changed, 137 insertions(+), 129 deletions(-)

diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 962c977..c1a797a 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -31,6 +31,142 @@ struct lower_intrinsics_state {
    nir_builder builder;
 };
 
+static void
+compute_local_index_id(nir_builder *b,
+                       nir_shader *nir,
+                       nir_ssa_def **local_index,
+                       nir_ssa_def **local_id)
+{
+   nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
+
+   nir_ssa_def *thread_local_id =
+      nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
+   nir_ssa_def *channel = nir_load_subgroup_invocation(b);
+   nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
+
+   nir_ssa_def *size_x;
+   nir_ssa_def *size_y;
+   if (nir->info.workgroup_size_variable) {
+      nir_ssa_def *size_xyz = nir_load_workgroup_size(b);
+      size_x = nir_channel(b, size_xyz, 0);
+      size_y = nir_channel(b, size_xyz, 1);
+   } else {
+      size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
+      size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
+   }
+   nir_ssa_def *size_xy = nir_imul(b, size_x, size_y);
+
+   /* The local invocation index and ID must respect the following
+    *
+    *    gl_LocalInvocationID.x =
+    *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
+    *    gl_LocalInvocationID.y =
+    *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
+    *       gl_WorkGroupSize.y;
+    *    gl_LocalInvocationID.z =
+    *       (gl_LocalInvocationIndex /
+    *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
+    *       gl_WorkGroupSize.z;
+    *
+    * However, the final % gl_WorkGroupSize.z does nothing unless we
+    * accidentally end up with a gl_LocalInvocationIndex that is too
+    * large, so it can safely be omitted.
+    */
+
+   nir_ssa_def *id_x, *id_y, *id_z;
+   switch (nir->info.cs.derivative_group) {
+   case DERIVATIVE_GROUP_NONE:
+      if (nir->info.num_images == 0 &&
+          nir->info.num_textures == 0) {
+         /* X-major lid order. Optimal for linear accesses only,
+          * which are usually buffers. X,Y ordering will look like:
+          * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
+          */
+         id_x = nir_umod(b, linear, size_x);
+         id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+         *local_index = linear;
+      } else if (!nir->info.workgroup_size_variable &&
+                 nir->info.workgroup_size[1] % 4 == 0) {
+         /* 1x4 block X-major lid order. Same as X-major except increments in
+          * blocks of width=1 height=4. Always optimal for tileY and usually
+          * optimal for linear accesses.
+          *   x = (linear / 4) % size_x
+          *   y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
+          * X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
+          * (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
+          */
+         const unsigned height = 4;
+         nir_ssa_def *block = nir_udiv_imm(b, linear, height);
+         id_x = nir_umod(b, block, size_x);
+         id_y = nir_umod(b,
+                         nir_iadd(b,
+                                  nir_umod(b, linear, nir_imm_int(b, height)),
+                                  nir_imul_imm(b,
+                                               nir_udiv(b, block, size_x),
+                                               height)),
+                         size_y);
+      } else {
+         /* Y-major lid order. Optimal for tileY accesses only,
+          * which are usually images. X,Y ordering will look like:
+          * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
+          */
+         id_y = nir_umod(b, linear, size_y);
+         id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
+      }
+
+      id_z = nir_udiv(b, linear, size_xy);
+      *local_id = nir_vec3(b, id_x, id_y, id_z);
+      if (!*local_index) {
+         *local_index = nir_iadd(b, nir_iadd(b, id_x,
+                                                nir_imul(b, id_y, size_x)),
+                                    nir_imul(b, id_z, size_xy));
+      }
+      break;
+   case DERIVATIVE_GROUP_LINEAR:
+      /* For linear, just set the local invocation index linearly,
+       * and calculate local invocation ID from that.
+       */
+      id_x = nir_umod(b, linear, size_x);
+      id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+      id_z = nir_udiv(b, linear, size_xy);
+      *local_id = nir_vec3(b, id_x, id_y, id_z);
+      *local_index = linear;
+      break;
+   case DERIVATIVE_GROUP_QUADS: {
+      /* For quads, first we figure out the 2x2 grid the invocation
+       * belongs to -- treating extra Z layers as just more rows.
+       * Then map that into local invocation ID (trivial) and local
+       * invocation index. Skipping Z simplifies the index calculation.
+       */
+
+      nir_ssa_def *one = nir_imm_int(b, 1);
+      nir_ssa_def *double_size_x = nir_ishl(b, size_x, one);
+
+      /* ID within a pair of rows, where each group of 4 is a 2x2 quad. */
+      nir_ssa_def *row_pair_id = nir_umod(b, linear, double_size_x);
+      nir_ssa_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
+
+      nir_ssa_def *x =
+         nir_ior(b,
+                 nir_iand(b, row_pair_id, one),
+                 nir_iand(b, nir_ishr(b, row_pair_id, one),
+                          nir_imm_int(b, 0xfffffffe)));
+      nir_ssa_def *y =
+         nir_ior(b,
+                 nir_ishl(b, y_row_pairs, one),
+                 nir_iand(b, nir_ishr(b, row_pair_id, one), one));
+
+      *local_id = nir_vec3(b, x,
+                           nir_umod(b, y, size_y),
+                           nir_udiv(b, y, size_y));
+      *local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
+      break;
+   }
+   default:
+      unreachable("invalid derivative group");
+   }
+}
+
 static bool
 lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
                                   nir_block *block)
@@ -79,135 +215,7 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          /* First time we are using those, so let's calculate them. */
          if (!local_index) {
             assert(!local_id);
-
-            nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
-
-            nir_ssa_def *thread_local_id =
-               nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
-            nir_ssa_def *channel = nir_load_subgroup_invocation(b);
-            nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
-
-            nir_ssa_def *size_x;
-            nir_ssa_def *size_y;
-            if (state->nir->info.workgroup_size_variable) {
-               nir_ssa_def *size_xyz = nir_load_workgroup_size(b);
-               size_x = nir_channel(b, size_xyz, 0);
-               size_y = nir_channel(b, size_xyz, 1);
-            } else {
-               size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
-               size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
-            }
-            nir_ssa_def *size_xy = nir_imul(b, size_x, size_y);
-
-            /* The local invocation index and ID must respect the following
-             *
-             *    gl_LocalInvocationID.x =
-             *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
-             *    gl_LocalInvocationID.y =
-             *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
-             *       gl_WorkGroupSize.y;
-             *    gl_LocalInvocationID.z =
-             *       (gl_LocalInvocationIndex /
-             *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
-             *       gl_WorkGroupSize.z;
-             *
-             * However, the final % gl_WorkGroupSize.z does nothing unless we
-             * accidentally end up with a gl_LocalInvocationIndex that is too
-             * large so it can safely be omitted.
-             */
-
-            nir_ssa_def *id_x, *id_y, *id_z;
-            switch (state->nir->info.cs.derivative_group) {
-            case DERIVATIVE_GROUP_NONE:
-               if (nir->info.num_images == 0 &&
-                   nir->info.num_textures == 0) {
-                  /* X-major lid order. Optimal for linear accesses only,
-                   * which are usually buffers. X,Y ordering will look like:
-                   * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
-                   */
-                  id_x = nir_umod(b, linear, size_x);
-                  id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
-                  local_index = linear;
-               } else if (!nir->info.workgroup_size_variable &&
-                          nir->info.workgroup_size[1] % 4 == 0) {
-                  /* 1x4 block X-major lid order. Same as X-major except increments in
-                   * blocks of width=1 height=4. Always optimal for tileY and usually
-                   * optimal for linear accesses.
-                   *   x = (linear / 4) % size_x
-                   *   y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
-                   * X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
-                   * (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
-                   */
-                  const unsigned height = 4;
-                  nir_ssa_def *block = nir_udiv_imm(b, linear, height);
-                  id_x = nir_umod(b, block, size_x);
-                  id_y = nir_umod(b,
-                                  nir_iadd(b,
-                                           nir_umod(b, linear, nir_imm_int(b, height)),
-                                           nir_imul_imm(b,
-                                                        nir_udiv(b, block, size_x),
-                                                        height)),
-                                  size_y);
-               } else {
-                  /* Y-major lid order. Optimal for tileY accesses only,
-                   * which are usually images. X,Y ordering will look like:
-                   * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
-                   */
-                  id_y = nir_umod(b, linear, size_y);
-                  id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
-               }
-
-               id_z = nir_udiv(b, linear, size_xy);
-               local_id = nir_vec3(b, id_x, id_y, id_z);
-               if (!local_index) {
-                  local_index = nir_iadd(b, nir_iadd(b, id_x,
-                                                        nir_imul(b, id_y, size_x)),
-                                            nir_imul(b, id_z, size_xy));
-               }
-               break;
-            case DERIVATIVE_GROUP_LINEAR:
-               /* For linear, just set the local invocation index linearly,
-                * and calculate local invocation ID from that.
-                */
-               id_x = nir_umod(b, linear, size_x);
-               id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
-               id_z = nir_udiv(b, linear, size_xy);
-               local_id = nir_vec3(b, id_x, id_y, id_z);
-               local_index = linear;
-               break;
-            case DERIVATIVE_GROUP_QUADS: {
-               /* For quads, first we figure out the 2x2 grid the invocation
-                * belongs to -- treating extra Z layers as just more rows.
-                * Then map that into local invocation ID (trivial) and local
-                * invocation index. Skipping Z simplify index calculation.
-                */
-
-               nir_ssa_def *one = nir_imm_int(b, 1);
-               nir_ssa_def *double_size_x = nir_ishl(b, size_x, one);
-
-               /* ID within a pair of rows, where each group of 4 is 2x2 quad. */
-               nir_ssa_def *row_pair_id = nir_umod(b, linear, double_size_x);
-               nir_ssa_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
-
-               nir_ssa_def *x =
-                  nir_ior(b,
-                          nir_iand(b, row_pair_id, one),
-                          nir_iand(b, nir_ishr(b, row_pair_id, one),
-                                   nir_imm_int(b, 0xfffffffe)));
-               nir_ssa_def *y =
-                  nir_ior(b,
-                          nir_ishl(b, y_row_pairs, one),
-                          nir_iand(b, nir_ishr(b, row_pair_id, one), one));
-
-               local_id = nir_vec3(b, x,
-                                   nir_umod(b, y, size_y),
-                                   nir_udiv(b, y, size_y));
-               local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
-               break;
-            }
-            default:
-               unreachable("invalid derivative group");
-            }
+            compute_local_index_id(b, nir, &local_index, &local_id);
          }
 
          assert(local_id);
-- 
2.7.4
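
For reference, the lid orders above can be checked outside of NIR with plain
integer arithmetic. Below is a minimal standalone C sketch, not part of the
patch (the workgroup sizes and variable names are illustrative only), that
enumerates the 1x4 block X-major order and verifies the
gl_LocalInvocationID/gl_LocalInvocationIndex invariant quoted in the big
comment. Note that this path leaves the local index unset, so it is rebuilt
from the IDs afterwards; the invariant, rather than index == linear, is what
has to hold.

#include <assert.h>
#include <stdio.h>

int
main(void)
{
   /* Fixed sizes for the demo; this path requires size_y % 4 == 0. */
   const unsigned size_x = 4, size_y = 8, size_z = 2;
   const unsigned height = 4;

   for (unsigned linear = 0; linear < size_x * size_y * size_z; linear++) {
      /* Same arithmetic as the comment:
       *   x = (linear / 4) % size_x
       *   y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
       */
      unsigned block = linear / height;
      unsigned id_x = block % size_x;
      unsigned id_y = (linear % height + block / size_x * height) % size_y;
      unsigned id_z = linear / (size_x * size_y);

      /* Rebuild the index from the IDs, like the !*local_index branch. */
      unsigned index = id_x + id_y * size_x + id_z * size_x * size_y;

      /* The invariant from the big comment block. */
      assert(id_x == index % size_x);
      assert(id_y == (index / size_x) % size_y);
      assert(id_z == index / (size_x * size_y));

      if (linear < 8)
         printf("linear %u -> id (%u, %u, %u), index %u\n",
                linear, id_x, id_y, id_z, index);
   }
   return 0;
}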
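
The DERIVATIVE_GROUP_QUADS mapping is the trickiest part: x takes bit 0 of
the row-pair ID plus the remaining bits shifted down once, and y interleaves
the quad-internal row bit under the row-pair number. The following sketch,
again not part of the patch and with a hypothetical helper name quads_map,
runs the same bit-twiddling on plain uint32_t values and asserts that every
aligned group of four invocations lands on a 2x2 block:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void
quads_map(uint32_t linear, uint32_t size_x, uint32_t size_y,
          uint32_t *id_x, uint32_t *id_y, uint32_t *id_z, uint32_t *index)
{
   /* ID within a pair of rows, where each group of 4 is a 2x2 quad. */
   uint32_t row_pair_id = linear % (2 * size_x);
   uint32_t y_row_pairs = linear / (2 * size_x);

   /* Bit 0 of x is bit 0 of the row-pair ID; the higher bits advance by 2
    * per quad (the 0xfffffffe mask drops the bit that becomes the
    * quad-internal row instead).
    */
   uint32_t x = (row_pair_id & 1) | ((row_pair_id >> 1) & 0xfffffffe);
   uint32_t y = (y_row_pairs << 1) | ((row_pair_id >> 1) & 1);

   *id_x = x;
   *id_y = y % size_y;      /* extra Z layers were treated as more rows */
   *id_z = y / size_y;
   *index = x + y * size_x; /* skipping Z, as the comment explains */
}

int
main(void)
{
   const uint32_t size_x = 8, size_y = 4, size_z = 2;

   for (uint32_t linear = 0; linear < size_x * size_y * size_z; linear += 4) {
      uint32_t x[4], y[4], z[4], index[4];
      for (uint32_t i = 0; i < 4; i++)
         quads_map(linear + i, size_x, size_y, &x[i], &y[i], &z[i], &index[i]);

      /* Each aligned group of four invocations must form a 2x2 quad:
       * two consecutive x values on two consecutive rows.
       */
      assert(x[1] == x[0] + 1 && y[1] == y[0] && z[1] == z[0]);
      assert(x[2] == x[0] && y[2] == y[0] + 1 && z[2] == z[0]);
      assert(x[3] == x[1] && y[3] == y[2] && z[3] == z[0]);

      printf("quad %2u: (%u,%u) (%u,%u) (%u,%u) (%u,%u), index %u..%u\n",
             (unsigned)(linear / 4),
             (unsigned)x[0], (unsigned)y[0], (unsigned)x[1], (unsigned)y[1],
             (unsigned)x[2], (unsigned)y[2], (unsigned)x[3], (unsigned)y[3],
             (unsigned)index[0], (unsigned)index[3]);
   }
   return 0;
}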