aco: Split opcodes for GFX6 and GFX10 emulated bpermute.
author Timur Kristóf <timur.kristof@gmail.com>
Tue, 13 Dec 2022 08:39:30 +0000 (09:39 +0100)
committer Marge Bot <emma+marge@anholt.net>
Wed, 14 Dec 2022 13:54:04 +0000 (13:54 +0000)
Different sequences are emitted for these, so it makes sense to
have different opcodes too.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20293>

src/amd/compiler/aco_instruction_selection.cpp
src/amd/compiler/aco_lower_to_hw_instr.cpp
src/amd/compiler/aco_opcodes.py
src/amd/compiler/aco_optimizer.cpp

index 6be4c9a..5258236 100644 (file)
@@ -199,8 +199,8 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
       index_op.setLateKill(true);
       input_data.setLateKill(true);
 
-      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
-                        index_op, input_data);
+      return bld.pseudo(aco_opcode::p_bpermute_gfx6, bld.def(v1), bld.def(bld.lm),
+                        bld.def(bld.lm, vcc), index_op, input_data);
    } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
 
       /* GFX10 wave64 mode: emulate full-wave bpermute */
@@ -223,7 +223,7 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
        * Note, that these have twice the allocation granularity of normal VGPRs */
       ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
 
-      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
+      return bld.pseudo(aco_opcode::p_bpermute_gfx10w64, bld.def(v1), bld.def(s2), bld.def(s1, scc),
                         index_x4, input_data, same_half);
    } else {
       /* GFX8-9 or GFX10 wave32: bpermute works normally */
index 562637b..3d7f1f3 100644 (file)
@@ -2193,13 +2193,12 @@ lower_to_hw_instr(Program* program)
                }
                break;
             }
-            case aco_opcode::p_bpermute: {
-               if (ctx.program->gfx_level <= GFX7)
-                  emit_gfx6_bpermute(program, instr, bld);
-               else if (ctx.program->gfx_level >= GFX10 && ctx.program->wave_size == 64)
-                  emit_gfx10_wave64_bpermute(program, instr, bld);
-               else
-                  unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
+            case aco_opcode::p_bpermute_gfx6: {
+               emit_gfx6_bpermute(program, instr, bld);
+               break;
+            }
+            case aco_opcode::p_bpermute_gfx10w64: {
+               emit_gfx10_wave64_bpermute(program, instr, bld);
                break;
             }
             case aco_opcode::p_constaddr: {
index 2c11cf2..3e3e67f 100644 (file)
@@ -315,8 +315,15 @@ opcode("p_demote_to_helper")
 opcode("p_is_helper")
 opcode("p_exit_early_if")
 
-# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
-opcode("p_bpermute")
+# simulates proper bpermute behavior on GFX6-7, which have no bpermute instruction
+# definitions: result VGPR, temp EXEC, clobbered VCC
+# operands: index, input data
+opcode("p_bpermute_gfx6")
+
+# simulates proper full-wave bpermute behavior on GFX10+ in wave64 mode
+# definitions: result VGPR, temp EXEC, clobbered SCC
+# operands: index * 4, input data, same half (bool)
+opcode("p_bpermute_gfx10w64")
 
 # creates a lane mask where only the first active lane is selected
 opcode("p_elect")
index 0952ace..30990ef 100644 (file)
@@ -673,7 +673,8 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
    case aco_opcode::v_readfirstlane_b32:
    case aco_opcode::p_extract:
    case aco_opcode::p_insert: return operand != 0;
-   case aco_opcode::p_bpermute:
+   case aco_opcode::p_bpermute_gfx6:
+   case aco_opcode::p_bpermute_gfx10w64:
    case aco_opcode::p_interp_gfx11:
    case aco_opcode::p_dual_src_export_gfx11: return false;
    default: return true;