[mono][interp] Improve tiering performance (#70649)
author Vlad Brezae <brezaevlad@gmail.com>
Mon, 1 Aug 2022 08:18:14 +0000 (11:18 +0300)
committer GitHub <noreply@github.com>
Mon, 1 Aug 2022 08:18:14 +0000 (11:18 +0300)
* [mono][interp] Don't allocate some vars as execution stack

These vars are not really on the execution stack, and marking them as such serves no purpose here.

* [mono][interp] Print code before any optimizations take place

Fix bitrotten mono_interp_print_td_code. Don't print IL_SEQ_POINT opcodes since they are too noisy.

* [mono][interp] Use td->optimized directly in more places

* [mono][interp] Add dummy MINT_LDNULL instruction

We were pushing a local that wasn't defined by any instruction, potentially confusing the var offset allocator.

* [mono][interp] Add fast offset allocator, to be used by unoptimized code

This is the old offset allocation scheme that we were using originally, before the var offset allocator was added. Vars have the same offset as they would have in IL code, based on their position on the execution stack at the moment when they are pushed. Whenever we push/pop on the execution stack we keep track of the used stack size. Every var pushed on the execution stack will therefore have this stack_offset remembered. Once the entire IL code is traversed and we have all the global locals allocated, the real offset of the execution stack locals can be determined. It is computed as the originally determined stack_offset plus the offset of the start of the execution stack (the size of the global locals space).

With this offset allocator, calls no longer need to store the sregs of all the call args, and the return_offset is always the same as call_args_offset. This is because all vars are directly placed in the right position and no optimizations can move them around. The offset of the return value will therefore also be the offset where all the args are placed.

The limitation of this way of allocating offsets is that we run into the same problems with opcodes that don't have typical stack usage (use values, pop them, store result). This happens with newobj opcodes. The opcode receives the params, and then it needs to call a ctor with these same params and a newly allocated this object. Since we can't use a var offset allocation pass to compute ideal offsets, the newobj opcodes in the case of unoptimized code must move these params around on the stack, in order to make room for `this`.

* [mono][interp] Add dreg to all calls in unoptimized code

All calls need to have a dreg (a dummy one if it is a void call), in order for the unoptimized offset allocator to determine the offset of the call. In unoptimized code, the offset of the first argument is always the same as the offset of the return, if any.

* [mono][interp] Fix issue with passing of exvars

Unoptimized code can't use a global local directly (like the exvar), it must first be pushed to a new var on the execution stack. Add a mov instruction when we start executing the basic block for a handler.

src/mono/mono/mini/interp/interp.c
src/mono/mono/mini/interp/mintops.def
src/mono/mono/mini/interp/transform.c
src/mono/mono/mini/interp/transform.h

index f3fe3cf..f43100e 100644 (file)
@@ -5374,6 +5374,19 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK;
                        ip += 4;
                        goto call;
                }
+               MINT_IN_CASE(MINT_NEWOBJ_STRING_UNOPT) {
+                       // Same as MINT_NEWOBJ_STRING but copy params into right place on stack
+                       cmethod = (InterpMethod*)frame->imethod->data_items [ip [2]];
+                       return_offset = ip [1];
+                       call_args_offset = ip [1];
+
+                       int param_size = ip [3];
+                        if (param_size)
+                                memmove (locals + call_args_offset + MINT_STACK_SLOT_SIZE, locals + call_args_offset, param_size);
+                       LOCAL_VAR (call_args_offset, gpointer) = NULL;
+                       ip += 4;
+                       goto call;
+               }
                MINT_IN_CASE(MINT_NEWOBJ) {
                        MonoVTable *vtable = (MonoVTable*) frame->imethod->data_items [ip [4]];
                        INIT_VTABLE (vtable);
@@ -5474,6 +5487,49 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK;
                        ip += 4;
                        goto call;
                }
+               MINT_IN_CASE(MINT_NEWOBJ_SLOW_UNOPT) {
+                       call_args_offset = ip [1];
+                       guint16 param_size = ip [3];
+                       guint16 ret_size = ip [4];
+                       gpointer this_ptr;
+
+                       // Should only be called in unoptimized code. This opcode moves the params around
+                       // to compensate for the lack of use of a proper offset allocator in unoptimized code.
+                       gboolean is_vt = ret_size != 0;
+                       if (!is_vt)
+                               ret_size = MINT_STACK_SLOT_SIZE;
+
+                       cmethod = (InterpMethod*)frame->imethod->data_items [ip [2]];
+
+                       MonoClass *newobj_class = cmethod->method->klass;
+
+                       // We allocate space on the stack for return value and for this pointer, that is passed to ctor
+                       if (param_size)
+                               memmove (locals + call_args_offset + ret_size + MINT_STACK_SLOT_SIZE, locals + call_args_offset, param_size);
+
+                       if (is_vt) {
+                               this_ptr = locals + call_args_offset;
+                               memset (this_ptr, 0, ret_size);
+                               call_args_offset += ret_size;
+                       } else {
+                               // FIXME push/pop LMF
+                               MonoVTable *vtable = mono_class_vtable_checked (newobj_class, error);
+                               if (!is_ok (error) || !mono_runtime_class_init_full (vtable, error)) {
+                                       MonoException *exc = interp_error_convert_to_exception (frame, error, ip);
+                                       g_assert (exc);
+                                       THROW_EX (exc, ip);
+                               }
+                               error_init_reuse (error);
+                               this_ptr = mono_object_new_checked (newobj_class, error);
+                               mono_interp_error_cleanup (error); // FIXME: do not swallow the error
+                               LOCAL_VAR (call_args_offset, gpointer) = this_ptr; // return value
+                               call_args_offset += MINT_STACK_SLOT_SIZE;
+                       }
+                       LOCAL_VAR (call_args_offset, gpointer) = this_ptr;
+                       return_offset = call_args_offset; // unused, prevent warning
+                       ip += 5;
+                       goto call;
+               }
                MINT_IN_CASE(MINT_INTRINS_SPAN_CTOR) {
                        gpointer ptr = LOCAL_VAR (ip [2], gpointer);
                        int len = LOCAL_VAR (ip [3], gint32);
index 3f67b60..b237b7a 100644 (file)
@@ -338,6 +338,8 @@ OPDEF(MINT_JMP, "jmp", 2, 0, 0, MintOpMethodToken)
 
 OPDEF(MINT_ENDFILTER, "endfilter", 2, 0, 1, MintOpNoArgs)
 
+OPDEF(MINT_NEWOBJ_SLOW_UNOPT, "newobj_slow_unopt", 5, 1, 0, MintOpMethodToken)
+OPDEF(MINT_NEWOBJ_STRING_UNOPT, "newobj_string_unopt", 4, 1, 0, MintOpMethodToken)
 OPDEF(MINT_NEWOBJ_SLOW, "newobj_slow", 4, 1, 1, MintOpMethodToken)
 OPDEF(MINT_NEWOBJ_ARRAY, "newobj_array", 5, 1, 1, MintOpMethodToken)
 OPDEF(MINT_NEWOBJ_STRING, "newobj_string", 4, 1, 1, MintOpMethodToken)
index 1a0ff8c..a4f826e 100644 (file)
@@ -273,11 +273,17 @@ interp_prev_ins (InterpInst *ins)
                                stack_size, n, (td)->ip - (td)->il_code); \
        } while (0)
 
+#define ENSURE_STACK_SIZE(td, size) \
+       do { \
+               if ((size) > td->max_stack_size) \
+                       td->max_stack_size = size; \
+       } while (0)
+
 #define ENSURE_I4(td, sp_off) \
        do { \
-               if ((td)->sp [-sp_off].type == STACK_TYPE_I8) { \
+               if ((td)->sp [-(sp_off)].type == STACK_TYPE_I8) { \
                        /* Same representation in memory, nothing to do */ \
-                       (td)->sp [-sp_off].type = STACK_TYPE_I4; \
+                       (td)->sp [-(sp_off)].type = STACK_TYPE_I4; \
                } \
        } while (0)
 
@@ -407,12 +413,27 @@ create_interp_local_explicit (TransformData *td, MonoType *type, int size)
 }
 
 static int
-create_interp_stack_local (TransformData *td, int type, MonoClass *k, int type_size)
+get_tos_offset (TransformData *td)
 {
-       int local = create_interp_local_explicit (td, get_type_from_stack (type, k), type_size);
+       if (td->sp == td->stack)
+               return 0;
+       else
+               return td->sp [-1].offset + td->sp [-1].size;
+}
+
+// Create a local for sp
+static void
+create_interp_stack_local (TransformData *td, StackInfo *sp, int type_size)
+{
+       int local = create_interp_local_explicit (td, get_type_from_stack (sp->type, sp->klass), type_size);
 
        td->locals [local].flags |= INTERP_LOCAL_FLAG_EXECUTION_STACK;
-       return local;
+       if (!td->optimized) {
+               td->locals [local].stack_offset = sp->offset;
+               // Additional space that is allocated for the frame, when we don't run the var offset allocator
+               ENSURE_STACK_SIZE(td, sp->offset + sp->size);
+       }
+       sp->local = local;
 }
 
 static void
@@ -433,8 +454,9 @@ push_type_explicit (TransformData *td, int type, MonoClass *k, int type_size)
        td->sp->type = GINT_TO_UINT8 (type);
        td->sp->klass = k;
        td->sp->flags = 0;
-       td->sp->local = create_interp_stack_local (td, type, k, type_size);
+       td->sp->offset = get_tos_offset (td);
        td->sp->size = ALIGN_TO (type_size, MINT_STACK_SLOT_SIZE);
+       create_interp_stack_local (td, td->sp, type_size);
        td->sp++;
 }
 
@@ -475,7 +497,7 @@ static void
 set_type_and_local (TransformData *td, StackInfo *sp, MonoClass *klass, int type)
 {
        SET_TYPE (sp, type, klass);
-       sp->local = create_interp_stack_local (td, type, NULL, MINT_STACK_SLOT_SIZE);
+       create_interp_stack_local (td, sp, MINT_STACK_SLOT_SIZE);
 }
 
 static void
@@ -702,6 +724,10 @@ get_mov_for_type (int mt, gboolean needs_sext)
 static void
 fixup_newbb_stack_locals (TransformData *td, InterpBasicBlock *newbb)
 {
+       // If not optimized, it is enough for vars to have same offset on the stack. It is not
+       // mandatory for sregs and dregs to match.
+       if (!td->optimized)
+               return;
        if (newbb->stack_height <= 0)
                return;
 
@@ -762,9 +788,9 @@ handle_branch (TransformData *td, int long_op, int offset)
        if (offset < 0 && td->sp == td->stack && !td->inlined_method) {
                // Backwards branch inside unoptimized method where the IL stack is empty
                // This is candidate for a patchpoint
-               if (!td->rtm->optimized)
+               if (!td->optimized)
                        target_bb->emit_patchpoint = TRUE;
-               if (mono_interp_tiering_enabled () && !target_bb->patchpoint_data && td->rtm->optimized) {
+               if (mono_interp_tiering_enabled () && !target_bb->patchpoint_data && td->optimized) {
                        // The optimized imethod will store mapping from bb index to native offset so it
                        // can resume execution in the optimized method, once we tier up in patchpoint
                        td->patchpoint_data_n++;
@@ -1214,16 +1240,22 @@ interp_generate_mae_throw (TransformData *td, MonoMethod *method, MonoMethod *ta
        td->last_ins->data [0] = get_data_item_index (td, target_method);
 
        td->sp -= 2;
-       int *call_args = (int*)mono_mempool_alloc (td->mempool, 3 * sizeof (int));
-       call_args [0] = td->sp [0].local;
-       call_args [1] = td->sp [1].local;
-       call_args [2] = -1;
 
        interp_add_ins (td, MINT_ICALL_PP_V);
        interp_ins_set_sreg (td->last_ins, MINT_CALL_ARGS_SREG);
        td->last_ins->data [0] = get_data_item_index (td, (gpointer)info->func);
-       td->last_ins->info.call_args = call_args;
        td->last_ins->flags |= INTERP_INST_FLAG_CALL;
+       if (td->optimized) {
+               int *call_args = (int*)mono_mempool_alloc (td->mempool, 3 * sizeof (int));
+               call_args [0] = td->sp [0].local;
+               call_args [1] = td->sp [1].local;
+               call_args [2] = -1;
+               td->last_ins->info.call_args = call_args;
+       } else {
+               // Unoptimized code needs every call to have a dreg for offset allocation,
+               // even if call is void
+               td->last_ins->dreg = td->sp [0].local;
+       }
 }
 
 static void
@@ -1236,6 +1268,11 @@ interp_generate_void_throw (TransformData *td, MonoJitICallId icall_id)
        td->last_ins->data [0] = get_data_item_index (td, (gpointer)info->func);
        td->last_ins->info.call_args = NULL;
        td->last_ins->flags |= INTERP_INST_FLAG_CALL;
+       if (!td->optimized) {
+               push_simple_type (td, STACK_TYPE_I4);
+               td->sp--;
+               td->last_ins->dreg = td->sp [0].local;
+       }
 }
 
 static void
@@ -1251,15 +1288,21 @@ interp_generate_ipe_throw_with_msg (TransformData *td, MonoError *error_msg)
        td->last_ins->data [0] = get_data_item_index (td, msg);
 
        td->sp -= 1;
-       int *call_args = (int*)mono_mempool_alloc (td->mempool, 2 * sizeof (int));
-       call_args [0] = td->sp [0].local;
-       call_args [1] = -1;
 
        interp_add_ins (td, MINT_ICALL_P_V);
        interp_ins_set_sreg (td->last_ins, MINT_CALL_ARGS_SREG);
        td->last_ins->data [0] = get_data_item_index (td, (gpointer)info->func);
-       td->last_ins->info.call_args = call_args;
        td->last_ins->flags |= INTERP_INST_FLAG_CALL;
+       if (td->optimized) {
+               int *call_args = (int*)mono_mempool_alloc (td->mempool, 2 * sizeof (int));
+               call_args [0] = td->sp [0].local;
+               call_args [1] = -1;
+               td->last_ins->info.call_args = call_args;
+       } else {
+               // Unoptimized code needs every call to have a dreg for offset allocation,
+               // even if call is void
+               td->last_ins->dreg = td->sp [0].local;
+       }
 }
 
 static int
@@ -1490,12 +1533,15 @@ dump_interp_inst (InterpInst *ins)
        g_string_free (str, TRUE);
 }
 
-static G_GNUC_UNUSED void
+static void
 dump_interp_bb (InterpBasicBlock *bb)
 {
        g_print ("BB%d:\n", bb->index);
-       for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next)
-               dump_interp_inst (ins);
+       for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) {
+               // Avoid some noise
+               if (ins->opcode != MINT_NOP && ins->opcode != MINT_IL_SEQ_POINT)
+                       dump_interp_inst (ins);
+       }
 }
 
 
@@ -1521,15 +1567,9 @@ mono_interp_print_code (InterpMethod *imethod)
 void
 mono_interp_print_td_code (TransformData *td)
 {
-       InterpInst *ins = td->first_ins;
-
-       char *name = mono_method_full_name (td->method, TRUE);
-       g_print ("IR for \"%s\"\n", name);
-       g_free (name);
-       while (ins) {
-               dump_interp_inst (ins);
-               ins = ins->next;
-       }
+       g_print ("Unoptimized IR:\n");
+       for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb)
+               dump_interp_bb (bb);
 }
 
 
@@ -2783,7 +2823,7 @@ interp_inline_newobj (TransformData *td, MonoMethod *target_method, MonoMethodSi
                else
                        vtsize = MINT_STACK_SLOT_SIZE;
 
-               dreg = create_interp_stack_local (td, stack_type [ret_mt], klass, vtsize);
+               dreg = create_interp_local (td, get_type_from_stack (stack_type [ret_mt], klass));
 
                // For valuetypes, we need to control the lifetime of the valuetype.
                // MINT_NEWOBJ_VT_INLINED takes the address of this reg and we should keep
@@ -2791,7 +2831,7 @@ interp_inline_newobj (TransformData *td, MonoMethod *target_method, MonoMethodSi
                interp_add_ins (td, MINT_DEF);
                interp_ins_set_dreg (td->last_ins, dreg);
        } else {
-               dreg = create_interp_stack_local (td, stack_type [ret_mt], klass, MINT_STACK_SLOT_SIZE);
+               dreg = create_interp_local (td, get_type_from_stack (stack_type [ret_mt], klass));
        }
 
        // Allocate `this` pointer
@@ -2941,6 +2981,9 @@ get_virt_method_slot (MonoMethod *method)
 static int*
 create_call_args (TransformData *td, int num_args)
 {
+       // We don't need to know the sregs for calls in unoptimized code
+       if (!td->optimized)
+               return NULL;
        int *call_args = (int*) mono_mempool_alloc (td->mempool, (num_args + 1) * sizeof (int));
        for (int i = 0; i < num_args; i++)
                call_args [i] = td->sp [i].local;
@@ -2990,11 +3033,29 @@ interp_transform_call (TransformData *td, MonoMethod *method, MonoMethod *target
                                calli = FALSE;
                                native = FALSE;
                                // The function pointer is passed last, but the wrapper expects it as first argument
-                               // Switch the arguments
-                               StackInfo sp_fp = td->sp [-1];
-                               StackInfo *start = &td->sp [-csignature->param_count - 1];
-                               memmove (start + 1, start, csignature->param_count * sizeof (StackInfo));
-                               *start = sp_fp;
+                               // Switch the arguments.
+                               // When the var offset allocator is not used, in unoptimized code, we have to manually
+                               // push the values into the correct order. In optimized code, we just need to know what
+                               // local is the execution stack position during compilation, so we can just do a memmove
+                               // of the StackInfo
+                               if (td->optimized) {
+                                       StackInfo sp_fp = td->sp [-1];
+                                       StackInfo *start = &td->sp [-csignature->param_count - 1];
+                                       memmove (start + 1, start, csignature->param_count * sizeof (StackInfo));
+                                       *start = sp_fp;
+                               } else {
+                                       int *arg_locals = mono_mempool_alloc0 (td->mempool, sizeof (int) * csignature->param_count);
+                                       int fp_local = create_interp_local (td, m_class_get_byval_arg (mono_defaults.int_class));
+                                       // Pop everything into locals. Push after into correct order
+                                       store_local (td, fp_local);
+                                       for (int i = csignature->param_count - 1; i >= 0; i--) {
+                                               arg_locals [i] = create_interp_local (td, csignature->params [i]);
+                                               store_local (td, arg_locals [i]);
+                                       }
+                                       load_local (td, fp_local);
+                                       for (int i = 0; i < csignature->param_count; i++)
+                                               load_local (td, arg_locals [i]);
+                               }
 
                                // The method we are calling has a different signature
                                csignature = mono_method_signature_internal (target_method);
@@ -3135,8 +3196,6 @@ interp_transform_call (TransformData *td, MonoMethod *method, MonoMethod *target
                        td->sp -= num_args;
                        guint32 params_stack_size = get_stack_size (td->sp, num_args);
 
-                       int *call_args = create_call_args (td, num_args);
-
                        if (is_virtual) {
                                interp_add_ins (td, MINT_CKNULL);
                                interp_ins_set_sreg (td->last_ins, td->sp->local);
@@ -3152,7 +3211,16 @@ interp_transform_call (TransformData *td, MonoMethod *method, MonoMethod *target
                        td->last_ins->data [0] = get_data_item_index_imethod (td, mono_interp_get_imethod (target_method));
                        td->last_ins->data [1] = GUINT32_TO_UINT16 (params_stack_size);
                        td->last_ins->flags |= INTERP_INST_FLAG_CALL;
-                       td->last_ins->info.call_args = call_args;
+
+                       if (td->optimized) {
+                               int *call_args = create_call_args (td, num_args);
+                               td->last_ins->info.call_args = call_args;
+                       } else {
+                               // Dummy dreg
+                               push_simple_type (td, STACK_TYPE_I4);
+                               interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
+                               td->sp--;
+                       }
 
                        int in_offset = GPTRDIFF_TO_INT (td->ip - td->il_code);
                        if (interp_ip_in_cbb (td, in_offset + 5))
@@ -4307,7 +4375,7 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header,
                if (td->verbose_level) {
                        char *tmp = mono_disasm_code (NULL, method, td->ip, end);
                        char *name = mono_method_full_name (method, TRUE);
-                       g_print ("Method %s, optimized %d, original code:\n", name, rtm->optimized);
+                       g_print ("Method %s, optimized %d, original code:\n", name, td->optimized);
                        g_print ("%s\n", tmp);
                        g_free (tmp);
                        g_free (name);
@@ -4426,6 +4494,20 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header,
                                init_bb_stack_state (td, new_bb);
                        }
                        link_bblocks = TRUE;
+                       // Unoptimized code cannot access exception object directly from the exvar, we need
+                       // to push it explicitly on the execution stack
+                       if (!td->optimized) {
+                                int index = td->clause_indexes [in_offset];
+                                if (index != -1 && new_bb->stack_height == 1 && header->clauses [index].handler_offset == in_offset) {
+                                       int exvar = td->clause_vars [index];
+                                       g_assert (td->stack [0].local == exvar);
+                                       td->sp--;
+                                       push_simple_type (td, STACK_TYPE_O);
+                                       interp_add_ins (td, MINT_MOV_P);
+                                       interp_ins_set_sreg (td->last_ins, exvar);
+                                       interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
+                                }
+                        }
                }
                td->offset_to_bb [in_offset] = td->cbb;
                td->in_start = td->ip;
@@ -5533,24 +5615,36 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header,
                                td->last_ins->flags |= INTERP_INST_FLAG_CALL;
                                td->last_ins->info.call_args = call_args;
                        } else if (klass == mono_defaults.string_class) {
-                               int *call_args = (int*)mono_mempool_alloc (td->mempool, (csignature->param_count + 2) * sizeof (int));
-                               td->sp -= csignature->param_count;
+                               if (!td->optimized) {
+                                       int tos_offset = get_tos_offset (td);
+                                       td->sp -= csignature->param_count;
+                                       guint32 params_stack_size = tos_offset - get_tos_offset (td);
+
+                                       interp_add_ins (td, MINT_NEWOBJ_STRING_UNOPT);
+                                       td->last_ins->data [0] = get_data_item_index (td, mono_interp_get_imethod (m));
+                                       td->last_ins->data [1] = params_stack_size;
+                                       push_type (td, stack_type [ret_mt], klass);
+                                       interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
+                               } else {
+                                       int *call_args = (int*)mono_mempool_alloc (td->mempool, (csignature->param_count + 2) * sizeof (int));
+                                       td->sp -= csignature->param_count;
 
-                               // First arg is dummy var, it is null when passed to the ctor
-                               call_args [0] = create_interp_stack_local (td, stack_type [ret_mt], NULL, MINT_STACK_SLOT_SIZE);
-                               for (int i = 0; i < csignature->param_count; i++) {
-                                       call_args [i + 1] = td->sp [i].local;
-                               }
-                               call_args [csignature->param_count + 1] = -1;
+                                       // First arg is dummy var, it is null when passed to the ctor
+                                       call_args [0] = create_interp_local (td, get_type_from_stack (stack_type [ret_mt], NULL));
+                                       for (int i = 0; i < csignature->param_count; i++) {
+                                               call_args [i + 1] = td->sp [i].local;
+                                       }
+                                       call_args [csignature->param_count + 1] = -1;
 
-                               interp_add_ins (td, MINT_NEWOBJ_STRING);
-                               td->last_ins->data [0] = get_data_item_index_imethod (td, mono_interp_get_imethod (m));
-                               push_type (td, stack_type [ret_mt], klass);
+                                       interp_add_ins (td, MINT_NEWOBJ_STRING);
+                                       td->last_ins->data [0] = get_data_item_index_imethod (td, mono_interp_get_imethod (m));
+                                       push_type (td, stack_type [ret_mt], klass);
 
-                               interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
-                               interp_ins_set_sreg (td->last_ins, MINT_CALL_ARGS_SREG);
-                               td->last_ins->flags |= INTERP_INST_FLAG_CALL;
-                               td->last_ins->info.call_args = call_args;
+                                       interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
+                                       interp_ins_set_sreg (td->last_ins, MINT_CALL_ARGS_SREG);
+                                       td->last_ins->flags |= INTERP_INST_FLAG_CALL;
+                                       td->last_ins->info.call_args = call_args;
+                               }
                        } else if (m_class_get_image (klass) == mono_defaults.corlib &&
                                        (!strcmp (m_class_get_name (m->klass), "Span`1") ||
                                        !strcmp (m_class_get_name (m->klass), "ReadOnlySpan`1")) &&
@@ -5564,6 +5658,31 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header,
                                interp_ins_set_sregs2 (td->last_ins, td->sp [0].local, td->sp [1].local);
                                push_type_vt (td, klass, mono_class_value_size (klass, NULL));
                                interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
+                       } else if (!td->optimized) {
+                               int tos = get_tos_offset (td);
+                               td->sp -= csignature->param_count;
+                               int param_size = tos - get_tos_offset (td);
+
+                               interp_add_ins (td, MINT_NEWOBJ_SLOW_UNOPT);
+                               td->last_ins->data [0] = get_data_item_index_imethod (td, mono_interp_get_imethod (m));
+                               td->last_ins->data [1] = param_size;
+
+                               gboolean is_vt = m_class_is_valuetype (klass);
+                               if (is_vt) {
+                                       int vtsize = mono_class_value_size (klass, NULL);
+                                       vtsize = ALIGN_TO (vtsize, MINT_STACK_SLOT_SIZE);
+                                       td->last_ins->data [2] = vtsize;
+                                       ENSURE_STACK_SIZE(td, (int)(tos + vtsize + MINT_STACK_SLOT_SIZE));
+                                       if (ret_mt == MINT_TYPE_VT)
+                                               push_type_vt (td, klass, vtsize);
+                                       else
+                                               push_type (td, stack_type [ret_mt], klass);
+                               } else {
+                                       td->last_ins->data [2] = 0;
+                                       ENSURE_STACK_SIZE(td, (int)(tos + 2 * MINT_STACK_SLOT_SIZE));
+                                       push_type (td, stack_type [ret_mt], klass);
+                               }
+                               interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
                        } else {
                                td->sp -= csignature->param_count;
 
@@ -6811,10 +6930,18 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header,
                                                mt = mint_type (info->sig->ret);
                                                push_simple_type (td, stack_type [mt]);
                                                dreg = td->sp [-1].local;
+                                       } else if (!td->optimized) {
+                                               // Dummy dreg
+                                               push_simple_type (td, stack_type [STACK_TYPE_I4]);
+                                               dreg = td->sp [-1].local;
+                                               td->sp--;
                                        }
 
                                        if (jit_icall_id == MONO_JIT_ICALL_mono_threads_attach_coop) {
                                                rtm->needs_thread_attach = 1;
+                                               // Add dummy return value
+                                               interp_add_ins (td, MINT_LDNULL);
+                                               interp_ins_set_dreg (td->last_ins, dreg);
                                        } else if (jit_icall_id == MONO_JIT_ICALL_mono_threads_detach_coop) {
                                                g_assert (rtm->needs_thread_attach);
                                        } else {
@@ -6977,7 +7104,9 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header,
 
                                if (info->sig->ret->type != MONO_TYPE_VOID) {
                                        // Push a dummy coop gc var
+                                       interp_add_ins (td, MINT_LDNULL);
                                        push_simple_type (td, STACK_TYPE_I);
+                                       td->last_ins->dreg = td->sp [-1].local;
                                        interp_add_ins (td, MINT_MONO_ENABLE_GCTRANS);
                                } else {
                                        // Pop the unused gc var
@@ -7470,6 +7599,23 @@ handle_relocations (TransformData *td)
        }
 }
 
+static void
+alloc_unopt_global_local (TransformData *td, int local, gpointer data)
+{ // Gives `local` the next free offset at the end of the locals area; `data` is not read
+       // Execution stack locals are skipped here: they are resolved when we emit the instruction
+       // in the code stream, once all global locals have their offset resolved
+       if (td->locals [local].flags & INTERP_LOCAL_FLAG_EXECUTION_STACK)
+               return;
+       // Check if already resolved (an offset of -1 means not assigned yet)
+       if (td->locals [local].offset != -1)
+               return;
+
+       int offset = td->total_locals_size; // current end of the locals area
+       int size = td->locals [local].size;
+       td->locals [local].offset = offset;
+       td->total_locals_size = ALIGN_TO (offset + size, MINT_STACK_SLOT_SIZE); // presumably rounds up to a whole stack slot
+}
+
 static int
 get_inst_length (InterpInst *ins)
 {
@@ -7483,6 +7629,34 @@ get_inst_length (InterpInst *ins)
                return mono_interp_oplen [ins->opcode];
 }
 
+static void
+foreach_local_var (TransformData *td, InterpInst *ins, gpointer data, void (*callback)(TransformData*, int, gpointer))
+{ // Calls callback (td, var, data) for every var referenced by ins: all sregs first, then the dreg
+       int opcode = ins->opcode;
+       if (mono_interp_op_sregs [opcode]) {
+               for (int i = 0; i < mono_interp_op_sregs [opcode]; i++) {
+                       int sreg = ins->sregs [i];
+
+                       if (sreg == MINT_CALL_ARGS_SREG) { // marker sreg: the actual vars are listed in info.call_args
+                               int *call_args = ins->info.call_args;
+                               if (call_args) { // a NULL list means no call arg is visited
+                                       int var = *call_args;
+                                       while (var != -1) { // the list is terminated by -1
+                                               callback (td, var, data);
+                                               call_args++;
+                                               var = *call_args;
+                                       }
+                               }
+                       } else {
+                               callback (td, sreg, data);
+                       }
+               }
+       }
+
+       if (mono_interp_op_dregs [opcode]) // the destination var is visited last
+               callback (td, ins->dreg, data);
+}
+
 static int
 compute_native_offset_estimates (TransformData *td)
 {
@@ -7500,6 +7674,8 @@ compute_native_offset_estimates (TransformData *td)
                        if (MINT_IS_NOP (opcode))
                                continue;
                        noe += get_inst_length (ins);
+                       if (!td->optimized)
+                               foreach_local_var (td, ins, NULL, alloc_unopt_global_local);
                }
        }
        return noe;
@@ -7540,6 +7716,27 @@ get_short_brop (int opcode)
        return opcode;
 }
 
+static int
+get_local_offset (TransformData *td, int local)
+{ // Returns the offset of `local`, assigning it on first use for execution stack locals of unoptimized code
+       if (td->locals [local].offset != -1)
+               return td->locals [local].offset;
+
+       // FIXME Some vars might end up with uninitialized offset because they are not declared at all in the code.
+       // This can happen if the bblock declaring the var gets removed, while other unreachable bblocks, that access
+       // the var are also not removed. This limitation is due to bblock removal using IN count for removing a bblock,
+       // which doesn't account for cycles.
+       if (td->optimized)
+               return -1; // the unresolved offset is handed back to the caller as is
+
+       // If we use the optimized offset allocator, all locals should have had their offsets already allocated
+       g_assert (!td->optimized); // always true at this point because of the early return above
+       // The only remaining locals to allocate are the ones from the execution stack
+       g_assert (td->locals [local].flags & INTERP_LOCAL_FLAG_EXECUTION_STACK);
+
+       td->locals [local].offset = td->total_locals_size + td->locals [local].stack_offset; // stack_offset is added on top of the size of the locals area
+       return td->locals [local].offset;
+}
 
 static guint16*
 emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *ins)
@@ -7565,7 +7762,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in
        *ip++ = opcode;
        if (opcode == MINT_SWITCH) {
                int labels = READ32 (&ins->data [0]);
-               *ip++ = GINT_TO_UINT16 (td->locals [ins->sregs [0]].offset);
+               *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [0]));
                // Write number of switch labels
                *ip++ = ins->data [0];
                *ip++ = ins->data [1];
@@ -7583,7 +7780,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in
                const int br_offset = GPTRDIFF_TO_INT (start_ip - td->new_code);
                gboolean has_imm = opcode >= MINT_BEQ_I4_IMM_SP && opcode <= MINT_BLT_UN_I8_IMM_SP;
                for (int i = 0; i < mono_interp_op_sregs [opcode]; i++)
-                       *ip++ = GINT_TO_UINT16 (td->locals [ins->sregs [i]].offset);
+                       *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [i]));
                if (has_imm)
                        *ip++ = ins->data [0];
 
@@ -7651,8 +7848,8 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in
                guint16 mt = ins->data [1];
                guint16 fsize = ins->data [2];
 
-               int dest_off = td->locals [ins->dreg].offset;
-               int src_off = td->locals [ins->sregs [0]].offset + foff;
+               int dest_off = get_local_offset (td, ins->dreg);
+               int src_off = get_local_offset (td, ins->sregs [0]) + foff;
                if (mt == MINT_TYPE_VT || fsize)
                        opcode = MINT_MOV_VT;
                else
@@ -7692,21 +7889,29 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in
                // actually vars. Resolve their offset
                int num_vars = mono_interp_oplen [opcode] - 1;
                for (int i = 0; i < num_vars; i++)
-                       *ip++ = GINT_TO_UINT16 (td->locals [ins->data [i]].offset);
+                       *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->data [i]));
        } else {
                if (mono_interp_op_dregs [opcode])
-                       *ip++ = GINT_TO_UINT16 (td->locals [ins->dreg].offset);
+                       *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->dreg));
 
                if (mono_interp_op_sregs [opcode]) {
                        for (int i = 0; i < mono_interp_op_sregs [opcode]; i++) {
-                               if (ins->sregs [i] == MINT_CALL_ARGS_SREG)
-                                       *ip++ = GINT_TO_UINT16 (td->locals [ins->info.call_args [0]].offset);
-                               else
-                                       *ip++ = GINT_TO_UINT16 (td->locals [ins->sregs [i]].offset);
+                               if (ins->sregs [i] == MINT_CALL_ARGS_SREG) {
+                                       int offset;
+                                       // In the unoptimized case the return and the start of the param area are always at the
+                                       // same offset. Use the dreg offset so we don't need to rely on existing call_args.
+                                       if (td->optimized)
+                                               offset = get_local_offset (td, ins->info.call_args [0]);
+                                       else
+                                               offset = get_local_offset (td, ins->dreg);
+                                       *ip++ = GINT_TO_UINT16 (offset);
+                               } else {
+                                       *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [i]));
+                               }
                        }
                } else if (opcode == MINT_LDLOCA_S) {
                        // This opcode receives a local but it is not viewed as a sreg since we don't load the value
-                       *ip++ = GINT_TO_UINT16 (td->locals [ins->sregs [0]].offset);
+                       *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [0]));
                }
 
                int left = get_inst_length (ins) - GPTRDIFF_TO_INT(ip - start_ip);
@@ -7721,7 +7926,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in
 static int
 add_patchpoint_data (TransformData *td, int patchpoint_data_index, int native_offset, int key)
 {
-       if (td->rtm->optimized) {
+       if (td->optimized) {
                td->patchpoint_data [patchpoint_data_index++] = key;
                td->patchpoint_data [patchpoint_data_index++] = native_offset;
        } else {
@@ -8257,34 +8462,6 @@ cprop_sreg (TransformData *td, InterpInst *ins, int *psreg, LocalValue *local_de
 }
 
 static void
-foreach_local_var (TransformData *td, InterpInst *ins, gpointer data, void (*callback)(TransformData*, int, gpointer))
-{
-       int opcode = ins->opcode;
-       if (mono_interp_op_sregs [opcode]) {
-               for (int i = 0; i < mono_interp_op_sregs [opcode]; i++) {
-                       int sreg = ins->sregs [i];
-
-                       if (sreg == MINT_CALL_ARGS_SREG) {
-                               int *call_args = ins->info.call_args;
-                               if (call_args) {
-                                       int var = *call_args;
-                                       while (var != -1) {
-                                               callback (td, var, data);
-                                               call_args++;
-                                               var = *call_args;
-                                       }
-                               }
-                       } else {
-                               callback (td, sreg, data);
-                       }
-               }
-       }
-
-       if (mono_interp_op_dregs [opcode])
-               callback (td, ins->dreg, data);
-}
-
-static void
 clear_local_defs (TransformData *td, int var, void *data)
 {
        LocalValue *local_defs = (LocalValue*) data;
@@ -8333,7 +8510,7 @@ retry:
                        gint32 *sregs = &ins->sregs [0];
                        gint32 dreg = ins->dreg;
 
-                       if (td->verbose_level && ins->opcode != MINT_NOP)
+                       if (td->verbose_level && ins->opcode != MINT_NOP && ins->opcode != MINT_IL_SEQ_POINT)
                                dump_interp_inst (ins);
 
                        for (int i = 0; i < num_sregs; i++) {
@@ -9480,9 +9657,6 @@ retry:
        if (td->prof_coverage)
                td->coverage_info = mono_profiler_coverage_alloc (method, header->code_size);
 
-       interp_method_compute_offsets (td, rtm, mono_method_signature_internal (method), header, error);
-       goto_if_nok (error, exit);
-
        if (verbose_method_name) {
                const char *name = verbose_method_name;
 
@@ -9500,6 +9674,9 @@ retry:
                }
        }
 
+       interp_method_compute_offsets (td, rtm, mono_method_signature_internal (method), header, error);
+       goto_if_nok (error, exit);
+
        td->stack = (StackInfo*)g_malloc0 ((header->max_stack + 1) * sizeof (td->stack [0]));
        td->stack_capacity = header->max_stack + 1;
        td->sp = td->stack;
@@ -9515,10 +9692,13 @@ retry:
        if (td->has_localloc)
                interp_fix_localloc_ret (td);
 
-       if (td->optimized)
-               interp_optimize_code (td);
+       if (td->verbose_level)
+               mono_interp_print_td_code (td);
 
-       interp_alloc_offsets (td);
+       if (td->optimized) {
+               interp_optimize_code (td);
+               interp_alloc_offsets (td);
+       }
 
        generate_compacted_code (td);
 
@@ -9573,8 +9753,11 @@ retry:
                if (c->flags & MONO_EXCEPTION_CLAUSE_FILTER)
                        c->data.filter_offset = get_native_offset (td, c->data.filter_offset);
        }
-       rtm->alloca_size = td->total_locals_size;
-       rtm->locals_size = td->param_area_offset;
+       // When optimized (using the var offset allocator), total_locals_size also contains the param area.
+       // When unoptimized, the param area is stored in the same order, within the IL execution stack.
+       g_assert (!td->optimized || !td->max_stack_size);
+       rtm->alloca_size = td->total_locals_size + td->max_stack_size;
+       rtm->locals_size = td->optimized ? td->param_area_offset : td->total_locals_size;
        rtm->data_items = (gpointer*)mono_mem_manager_alloc0 (td->mem_manager, td->n_data_items * sizeof (td->data_items [0]));
        memcpy (rtm->data_items, td->data_items, td->n_data_items * sizeof (td->data_items [0]));
 
index 6e3599a..7b55267 100644 (file)
@@ -34,6 +34,8 @@ typedef struct
         * the stack a new local is created.
         */
        int local;
+       /* The offset from the execution stack start where this is stored. Used by the fast offset allocator */
+       int offset;
        /* Saves how much stack this is using. It is a multiple of MINT_VT_ALIGNMENT */
        int size;
 } StackInfo;
@@ -155,7 +157,13 @@ typedef struct {
        int indirects;
        int offset;
        int size;
-       int live_start, live_end;
+       union {
+               // live_start and live_end are used by the offset allocator for optimized code
+               int live_start;
+               // used only by the fast offset allocator, which only works for unoptimized code
+               int stack_offset;
+       };
+       int live_end;
        // index of first basic block where this var is used
        int bb_index;
        union {
@@ -177,7 +185,7 @@ typedef struct
        const unsigned char *il_code;
        const unsigned char *ip;
        const unsigned char *in_start;
-       InterpInst *last_ins, *first_ins;
+       InterpInst *last_ins;
        int code_size;
        int *in_offsets;
        int current_il_offset;
@@ -190,6 +198,7 @@ typedef struct
        unsigned int stack_capacity;
        gint32 param_area_offset;
        gint32 total_locals_size;
+       gint32 max_stack_size;
        InterpLocal *locals;
        int *local_ref_count;
        unsigned int il_locals_offset;