[interp] Squash multiple call args moves into single opcode (#52242)
author Vlad Brezae <brezaevlad@gmail.com>
Fri, 14 May 2021 17:46:38 +0000 (20:46 +0300)
committer GitHub <noreply@github.com>
Fri, 14 May 2021 17:46:38 +0000 (20:46 +0300)
* [interp] Replace multiplication and division by 1 with simple mov

* [interp] Skip emitting redundant branch to next basic block

* [interp] Squash multiple call args moves into single opcode

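Some vars cannot be passed directly as an argument to a call (for example, global vars or vars flagged as unusable as call args). In this case, the var offset allocator creates a new intermediary var and inserts a move into it just before the call. For methods with a lot of parameters, we can end up with a long run of these moves, so up to four non-valuetype moves per call are now squashed into a single `mov.8.N` opcode.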

As an example, for the following method:
```
public static void MethodPartial (int a, int b, object c, object d)
{
MethodFull (a, b, c, d, 12523);
}
```

Before:
```
IR_0000: ldc.i8         [72 <- nil], 12523
IR_0006: mov.4          [40 <- 0],
IR_0009: mov.4          [48 <- 8],
IR_000c: mov.8          [56 <- 16],
IR_000f: mov.8          [64 <- 24],
IR_0012: call           [32 <- 40], 0
IR_0016: ret.void       [nil <- nil],
```

After:
```
IR_0000: ldc.i8         [72 <- nil], 12523
IR_0006: mov.8.4        [nil <- nil], 40 <- 0, 48 <- 8, 56 <- 16, 64 <- 24
IR_000f: call           [32 <- 40], 0
IR_0013: ret.void       [nil <- nil]
```

src/mono/mono/mini/interp/interp.c
src/mono/mono/mini/interp/mintops.def
src/mono/mono/mini/interp/mintops.h
src/mono/mono/mini/interp/transform.c

index 5fb08cc4f23d82e07a69de69c1ae8d9bd9010608..2b92e7f160e22e1255c8784436f9c727aa75ef4c 100644 (file)
@@ -6576,6 +6576,25 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK;
                        MINT_IN_BREAK;
                }
 
+               MINT_IN_CASE(MINT_MOV_8_2)
+                       LOCAL_VAR (ip [1], guint64) = LOCAL_VAR (ip [2], guint64);
+                       LOCAL_VAR (ip [3], guint64) = LOCAL_VAR (ip [4], guint64);
+                       ip += 5;
+                       MINT_IN_BREAK;
+               MINT_IN_CASE(MINT_MOV_8_3)
+                       LOCAL_VAR (ip [1], guint64) = LOCAL_VAR (ip [2], guint64);
+                       LOCAL_VAR (ip [3], guint64) = LOCAL_VAR (ip [4], guint64);
+                       LOCAL_VAR (ip [5], guint64) = LOCAL_VAR (ip [6], guint64);
+                       ip += 7;
+                       MINT_IN_BREAK;
+               MINT_IN_CASE(MINT_MOV_8_4)
+                       LOCAL_VAR (ip [1], guint64) = LOCAL_VAR (ip [2], guint64);
+                       LOCAL_VAR (ip [3], guint64) = LOCAL_VAR (ip [4], guint64);
+                       LOCAL_VAR (ip [5], guint64) = LOCAL_VAR (ip [6], guint64);
+                       LOCAL_VAR (ip [7], guint64) = LOCAL_VAR (ip [8], guint64);
+                       ip += 9;
+                       MINT_IN_BREAK;
+
                MINT_IN_CASE(MINT_LOCALLOC) {
                        int len = LOCAL_VAR (ip [2], gint32);
                        gpointer mem = frame_data_allocator_alloc (&context->data_stack, frame, ALIGN_TO (len, MINT_VT_ALIGNMENT));
index a82cdb8623ecd8104b4a122274868d066c8ed2d2..1f55c92ec5900050d194c59e380e73d92085b02d 100644 (file)
@@ -108,6 +108,12 @@ OPDEF(MINT_MOV_4, "mov.4", 3, 1, 1, MintOpNoArgs)
 OPDEF(MINT_MOV_8, "mov.8", 3, 1, 1, MintOpNoArgs)
 OPDEF(MINT_MOV_VT, "mov.vt", 4, 1, 1, MintOpShortInt)
 
+// These opcodes represent multiple moves stacked together. They have multiple src and dst
+// but they are not represented here. They are generated by the var offset allocator.
+OPDEF(MINT_MOV_8_2, "mov.8.2", 5, 0, 0, MintOpPair2)
+OPDEF(MINT_MOV_8_3, "mov.8.3", 7, 0, 0, MintOpPair3)
+OPDEF(MINT_MOV_8_4, "mov.8.4", 9, 0, 0, MintOpPair4)
+
 OPDEF(MINT_LDLOCA_S, "ldloca.s", 3, 1, 0, MintOpUShortInt)
 
 OPDEF(MINT_LDIND_I1, "ldind.i1", 3, 1, 1, MintOpNoArgs)
index ec5c95298c0467eea67e139014020a9190bf4303..82c78ac9243bfa256548714a5ffde7c6cb46ff3f 100644 (file)
@@ -24,7 +24,10 @@ typedef enum
        MintOpClassToken,
        MintOpTwoShorts,
        MintOpShortAndInt,
-       MintOpShortAndShortBranch
+       MintOpShortAndShortBranch,
+       MintOpPair2,
+       MintOpPair3,
+       MintOpPair4
 } MintOpArgType;
 
 #define OPDEF(a,b,c,d,e,f) a,
@@ -74,6 +77,8 @@ typedef enum {
 #define MINT_CALL_ARGS 2
 #define MINT_CALL_ARGS_SREG -2
 
+#define MINT_MOV_PAIRS_MAX 4
+
 extern unsigned char const mono_interp_oplen[];
 extern int const mono_interp_op_dregs [];
 extern int const mono_interp_op_sregs [];
index c640ad4990dac985e8791f19d6e57df06fde1829..6a1fb715a07b15e4395c55ccf117d4a92763968c 100644 (file)
@@ -1411,6 +1411,14 @@ dump_interp_ins_data (InterpInst *ins, gint32 ins_offset, const guint16 *data, g
                        target = ins_offset + *(gint16*)(data + 1);
                        g_string_append_printf (str, " %u, IR_%04x", *(guint16*)data, target);
                }
+       case MintOpPair2:
+               g_string_append_printf (str, " %u <- %u, %u <- %u", data [0], data [1], data [2], data [3]);
+               break;
+       case MintOpPair3:
+               g_string_append_printf (str, " %u <- %u, %u <- %u, %u <- %u", data [0], data [1], data [2], data [3], data [4], data [5]);
+               break;
+       case MintOpPair4:
+               g_string_append_printf (str, " %u <- %u, %u <- %u, %u <- %u, %u <- %u", data [0], data [1], data [2], data [3], data [4], data [5], data [6], data [7]);
                break;
        default:
                g_string_append_printf (str, "unknown arg type\n");
@@ -7549,6 +7557,9 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in
                if (ins->info.target_bb->native_offset >= 0) {
                        // Backwards branch. We can already patch it.
                        *ip++ = ins->info.target_bb->native_offset - br_offset;
+               } else if (opcode == MINT_BR_S && ins->info.target_bb == td->cbb->next_bb) {
+                       // Ignore branch to the next basic block. Revert the added MINT_BR_S.
+                       ip--;
                } else {
                        // We don't know the in_offset of the target, add a reloc
                        Reloc *reloc = (Reloc*)mono_mempool_alloc0 (td->mempool, sizeof (Reloc));
@@ -7647,6 +7658,12 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in
                for (int i = size - 1; i < (jit_call2_size - 1); i++)
                        *ip++ = MINT_NIY;
 #endif
+       } else if (opcode >= MINT_MOV_8_2 && opcode <= MINT_MOV_8_4) {
+               // This instruction is not marked as operating on any vars, all instruction slots are
+               // actually vars. Resolve their offset
+               int num_vars = mono_interp_oplen [opcode] - 1;
+               for (int i = 0; i < num_vars; i++)
+                       *ip++ = td->locals [ins->data [i]].offset;
        } else {
                if (mono_interp_op_dregs [opcode])
                        *ip++ = td->locals [ins->dreg].offset;
@@ -7696,6 +7713,7 @@ generate_compacted_code (TransformData *td)
        for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) {
                InterpInst *ins = bb->first_ins;
                bb->native_offset = ip - td->new_code;
+               td->cbb = bb;
                while (ins) {
                        ip = emit_compacted_instruction (td, ip, ins);
                        ins = ins->next;
@@ -7984,7 +8002,7 @@ interp_fold_unop_cond_br (TransformData *td, InterpBasicBlock *cbb, LocalValue *
 
 
 static InterpInst*
-interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins)
+interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins, gboolean *folded)
 {
        int *local_ref_count = td->local_ref_count;
        // ins should be a binop, therefore it should have a single dreg and two sregs
@@ -7995,6 +8013,8 @@ interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins)
        LocalValue *val2 = &local_defs [sreg2];
        LocalValue result;
 
+       *folded = FALSE;
+
        if (val1->type != LOCAL_VALUE_I4 && val1->type != LOCAL_VALUE_I8)
                return ins;
        if (val2->type != LOCAL_VALUE_I4 && val2->type != LOCAL_VALUE_I8)
@@ -8066,7 +8086,7 @@ interp_fold_binop (TransformData *td, LocalValue *local_defs, InterpInst *ins)
        // with a LDC of the constant. We leave alone the sregs of this instruction, for
        // deadce to kill the instructions initializing them.
        mono_interp_stats.constant_folds++;
-
+       *folded = TRUE;
        if (result.type == LOCAL_VALUE_I4)
                ins = interp_get_ldc_i4_from_const (td, ins, result.i, dreg);
        else if (result.type == LOCAL_VALUE_I8)
@@ -8341,7 +8361,42 @@ retry:
                        } else if (MINT_IS_UNOP_CONDITIONAL_BRANCH (opcode)) {
                                ins = interp_fold_unop_cond_br (td, bb, local_defs, ins);
                        } else if (MINT_IS_BINOP (opcode)) {
-                               ins = interp_fold_binop (td, local_defs, ins);
+                               gboolean folded;
+                               ins = interp_fold_binop (td, local_defs, ins, &folded);
+                               if (!folded) {
+                                       int sreg = -1;
+                                       int mov_op;
+                                       if ((opcode == MINT_MUL_I4 || opcode == MINT_DIV_I4) &&
+                                                       local_defs [ins->sregs [1]].type == LOCAL_VALUE_I4 &&
+                                                       local_defs [ins->sregs [1]].i == 1) {
+                                               sreg = ins->sregs [0];
+                                               mov_op = MINT_MOV_4;
+                                       } else if ((opcode == MINT_MUL_I8 || opcode == MINT_DIV_I8) &&
+                                                       local_defs [ins->sregs [1]].type == LOCAL_VALUE_I8 &&
+                                                       local_defs [ins->sregs [1]].l == 1) {
+                                               sreg = ins->sregs [0];
+                                               mov_op = MINT_MOV_8;
+                                       } else if (opcode == MINT_MUL_I4 &&
+                                                       local_defs [ins->sregs [0]].type == LOCAL_VALUE_I4 &&
+                                                       local_defs [ins->sregs [0]].i == 1) {
+                                               sreg = ins->sregs [1];
+                                               mov_op = MINT_MOV_4;
+                                       } else if (opcode == MINT_MUL_I8 &&
+                                                       local_defs [ins->sregs [0]].type == LOCAL_VALUE_I8 &&
+                                                       local_defs [ins->sregs [0]].l == 1) {
+                                               sreg = ins->sregs [1];
+                                               mov_op = MINT_MOV_8;
+                                       }
+                                       if (sreg != -1) {
+                                               ins->opcode = mov_op;
+                                               ins->sregs [0] = sreg;
+                                               if (td->verbose_level) {
+                                                       g_print ("Replace idempotent binop :\n\t");
+                                                       dump_interp_inst (ins);
+                                               }
+                                               needs_retry = TRUE;
+                                       }
+                               }
                        } else if (MINT_IS_BINOP_CONDITIONAL_BRANCH (opcode)) {
                                ins = interp_fold_binop_cond_br (td, bb, local_defs, ins);
                        } else if (MINT_IS_LDFLD (opcode) && ins->data [0] == 0) {
@@ -9105,7 +9160,11 @@ interp_alloc_offsets (TransformData *td)
                        if (ins->flags & INTERP_INST_FLAG_CALL) {
                                int *call_args = ins->info.call_args;
                                if (call_args) {
+                                       int pair_sregs [MINT_MOV_PAIRS_MAX];
+                                       int pair_dregs [MINT_MOV_PAIRS_MAX];
+                                       int num_pairs = 0;
                                        int var = *call_args;
+
                                        while (var != -1) {
                                                if (td->locals [var].flags & INTERP_LOCAL_FLAG_GLOBAL ||
                                                                td->locals [var].flags & INTERP_LOCAL_FLAG_NO_CALL_ARGS) {
@@ -9114,17 +9173,27 @@ interp_alloc_offsets (TransformData *td)
                                                        int new_var = create_interp_local (td, td->locals [var].type);
                                                        td->locals [new_var].call = ins;
                                                        td->locals [new_var].flags |= INTERP_LOCAL_FLAG_CALL_ARGS;
-                                                       int opcode = get_mov_for_type (mint_type (td->locals [var].type), FALSE);
-                                                       InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode);
-                                                       interp_ins_set_dreg (new_inst, new_var);
-                                                       interp_ins_set_sreg (new_inst, var);
-                                                       if (opcode == MINT_MOV_VT)
-                                                               new_inst->data [0] = td->locals [var].size;
-                                                       // The arg of the call is no longer global
-                                                       *call_args = new_var;
-                                                       // Also update liveness for this instruction
-                                                       foreach_local_var (td, new_inst, ins_index, set_var_live_range);
-                                                       ins_index++;
+
+                                                       int mt = mint_type (td->locals [var].type);
+                                                       if (mt != MINT_TYPE_VT && num_pairs < MINT_MOV_PAIRS_MAX) {
+                                                               pair_sregs [num_pairs] = var;
+                                                               pair_dregs [num_pairs] = new_var;
+                                                               num_pairs++;
+                                                               // The arg of the call is no longer global
+                                                               *call_args = new_var;
+                                                       } else {
+                                                               int opcode = get_mov_for_type (mt, FALSE);
+                                                               InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode);
+                                                               interp_ins_set_dreg (new_inst, new_var);
+                                                               interp_ins_set_sreg (new_inst, var);
+                                                               if (opcode == MINT_MOV_VT)
+                                                                       new_inst->data [0] = td->locals [var].size;
+                                                               // The arg of the call is no longer global
+                                                               *call_args = new_var;
+                                                               // Also update liveness for this instruction
+                                                               foreach_local_var (td, new_inst, ins_index, set_var_live_range);
+                                                               ins_index++;
+                                                       }
                                                } else {
                                                        // Flag this var as it has special storage on the call args stack
                                                        td->locals [var].call = ins;
@@ -9133,6 +9202,30 @@ interp_alloc_offsets (TransformData *td)
                                                call_args++;
                                                var = *call_args;
                                        }
+                                       if (num_pairs > 0) {
+                                               int i;
+                                               for (i = 0; i < num_pairs; i++) {
+                                                       set_var_live_range (td, pair_sregs [i], ins_index);
+                                                       set_var_live_range (td, pair_dregs [i], ins_index);
+                                               }
+                                               if (num_pairs == 1) {
+                                                       int mt = mint_type (td->locals [pair_sregs [0]].type);
+                                                       int opcode = get_mov_for_type (mt, FALSE);
+                                                       InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode);
+                                                       interp_ins_set_dreg (new_inst, pair_dregs [0]);
+                                                       interp_ins_set_sreg (new_inst, pair_sregs [0]);
+                                               } else {
+                                                       // Squash together multiple moves to the param area into a single opcode
+                                                       int opcode = MINT_MOV_8_2 + num_pairs - 2;
+                                                       InterpInst *new_inst = interp_insert_ins_bb (td, bb, ins->prev, opcode);
+                                                       int k = 0;
+                                                       for (i = 0; i < num_pairs; i++) {
+                                                               new_inst->data [k++] = pair_dregs [i];
+                                                               new_inst->data [k++] = pair_sregs [i];
+                                                       }
+                                               }
+                                               ins_index++;
+                                       }
                                }
                        }
                        // Set live_start and live_end for every referenced local that is not global