Revert "Switch reverse PInvoke to the NativeCallable plan (#34251)" (#34306)
author Santiago Fernandez Madero <safern@microsoft.com>
Tue, 31 Mar 2020 02:45:31 +0000 (19:45 -0700)
committer GitHub <noreply@github.com>
Tue, 31 Mar 2020 02:45:31 +0000 (19:45 -0700)
This reverts commit 4e30ff033fbd915c5a39df8901d43c601deeeeb4.
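
For context: reverse P/Invoke is the path where native code calls back into managed code, and methods marked with the runtime's NativeCallableAttribute are such entry points; from the native caller's point of view the managed method is just a C function pointer. The sketch below illustrates only that native side. register_callback, fire_callback and the int(int) signature are hypothetical stand-ins, not part of CoreCLR; in a real scenario the pointer arrives from managed code (for example through an ordinary P/Invoke registration call).

    // Hypothetical native consumer of a reverse-P/Invoke entry point. The callback pointer
    // would come from managed code; demo_callback stands in for the managed method here.
    #include <cstdio>

    typedef int (*managed_callback_t)(int);

    static managed_callback_t g_callback = nullptr;

    extern "C" void register_callback(managed_callback_t cb)
    {
        g_callback = cb;    // may later be invoked from a thread the runtime has never seen
    }

    extern "C" int fire_callback(int value)
    {
        // At this call the runtime's reverse-P/Invoke machinery (the UMThunkStub assembly and
        // the JIT_ReversePInvokeEnter helper touched in the diffs below) switches the thread
        // into cooperative GC mode before any managed code runs.
        return g_callback != nullptr ? g_callback(value) : -1;
    }

    static int demo_callback(int x) { return x + 1; }   // stand-in for the managed method

    int main()
    {
        register_callback(&demo_callback);
        std::printf("%d\n", fire_callback(41));         // prints 42
    }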

20 files changed:
src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.cs
src/coreclr/src/tools/crossgen2/ILCompiler.ReadyToRun/JitInterface/CorInfoImpl.ReadyToRun.cs
src/coreclr/src/vm/amd64/UMThunkStub.asm
src/coreclr/src/vm/amd64/asmconstants.h
src/coreclr/src/vm/amd64/umthunkstub.S
src/coreclr/src/vm/arm/asmconstants.h
src/coreclr/src/vm/arm/asmhelpers.S
src/coreclr/src/vm/arm/asmhelpers.asm
src/coreclr/src/vm/arm64/asmconstants.h
src/coreclr/src/vm/arm64/asmhelpers.S
src/coreclr/src/vm/arm64/asmhelpers.asm
src/coreclr/src/vm/dllimportcallback.cpp
src/coreclr/src/vm/dllimportcallback.h
src/coreclr/src/vm/i386/asmhelpers.S
src/coreclr/src/vm/ilstubcache.cpp
src/coreclr/src/vm/jithelpers.cpp
src/coreclr/src/vm/jitinterface.cpp
src/coreclr/src/vm/method.cpp
src/coreclr/src/vm/method.hpp
src/coreclr/src/zap/zapinfo.cpp

src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.cs
index 472b1050acb45a3f81e80d553ac50ce371615b21..ebf41f59cd5137c557a6e4ff74897095712b751a 100644
@@ -2921,7 +2921,8 @@ namespace Internal.JitInterface
             if (this.MethodBeingCompiled.IsNativeCallable)
             {
 #if READYTORUN
-                if (targetArchitecture == TargetArchitecture.X86)
+                if (targetArchitecture == TargetArchitecture.X86
+                    && _compilation.TypeSystemContext.Target.OperatingSystem == TargetOS.Windows)
                 {
                     throw new RequiresRuntimeJitException("ReadyToRun: Methods with NativeCallableAttribute not implemented");
                 }
src/coreclr/src/tools/crossgen2/ILCompiler.ReadyToRun/JitInterface/CorInfoImpl.ReadyToRun.cs
index 8aa4adc2799308bb371a820c0c6e9009e65f6fff..1f477b034bb586842b56b32fd89cfacfefa5e37b 100644
@@ -1661,7 +1661,9 @@ namespace Internal.JitInterface
             pResult->methodFlags = FilterNamedIntrinsicMethodAttribs(pResult->methodFlags, methodToCall);
 
             var targetDetails = _compilation.TypeSystemContext.Target;
-            if (targetDetails.Architecture == TargetArchitecture.X86 && targetMethod.IsNativeCallable)
+            if (targetDetails.Architecture == TargetArchitecture.X86
+                && targetDetails.OperatingSystem == TargetOS.Windows
+                && targetMethod.IsNativeCallable)
             {
                 throw new RequiresRuntimeJitException("ReadyToRun: References to methods with NativeCallableAttribute not implemented");
             }
src/coreclr/src/vm/amd64/UMThunkStub.asm
index cee9866329552a3466410679410359673913f525..58239125018a6b001f772036015a2707561dfbb7 100644
 include <AsmMacros.inc>
 include AsmConstants.inc
 
+extern CreateThreadBlockThrow:proc
 extern TheUMEntryPrestubWorker:proc
 extern UMEntryPrestubUnwindFrameChainHandler:proc
+extern UMThunkStubUnwindFrameChainHandler:proc
+extern g_TrapReturningThreads:dword
+extern UMThunkStubRareDisableWorker:proc
+extern ReversePInvokeBadTransition:proc
 
 ;
 ; METHODDESC_REGISTER: UMEntryThunk*
@@ -73,4 +78,240 @@ endif
 
 NESTED_END TheUMEntryPrestub, _TEXT
 
+
+;
+; METHODDESC_REGISTER: UMEntryThunk*
+;
+NESTED_ENTRY UMThunkStub, _TEXT, UMThunkStubUnwindFrameChainHandler
+
+UMThunkStubAMD64_STACK_FRAME_SIZE = 0
+
+; number of integer registers saved in prologue
+UMThunkStubAMD64_NUM_REG_PUSHES = 2
+UMThunkStubAMD64_STACK_FRAME_SIZE = UMThunkStubAMD64_STACK_FRAME_SIZE + (UMThunkStubAMD64_NUM_REG_PUSHES * 8)
+
+; rare path spill area
+UMThunkStubAMD64_RARE_PATH_SPILL_SIZE = 10h
+UMThunkStubAMD64_STACK_FRAME_SIZE = UMThunkStubAMD64_STACK_FRAME_SIZE + UMThunkStubAMD64_RARE_PATH_SPILL_SIZE
+UMThunkStubAMD64_RARE_PATH_SPILL_NEGOFFSET = UMThunkStubAMD64_STACK_FRAME_SIZE
+
+
+
+; HOST_NOTIFY_FLAG
+UMThunkStubAMD64_STACK_FRAME_SIZE = UMThunkStubAMD64_STACK_FRAME_SIZE + 8
+UMThunkStubAMD64_HOST_NOTIFY_FLAG_NEGOFFSET = UMThunkStubAMD64_STACK_FRAME_SIZE
+
+; XMM save area
+UMThunkStubAMD64_STACK_FRAME_SIZE = UMThunkStubAMD64_STACK_FRAME_SIZE + SIZEOF_MAX_FP_ARG_SPILL
+
+; Ensure that the offset of the XMM save area will be 16-byte aligned.
+if ((UMThunkStubAMD64_STACK_FRAME_SIZE + 8) MOD 16) ne 0        ; +8 for caller-pushed return address
+UMThunkStubAMD64_STACK_FRAME_SIZE = UMThunkStubAMD64_STACK_FRAME_SIZE + 8
+endif
+
+UMThunkStubAMD64_XMM_SAVE_NEGOFFSET = UMThunkStubAMD64_STACK_FRAME_SIZE
+
+; Add in the callee scratch area size.
+UMThunkStubAMD64_CALLEE_SCRATCH_SIZE = SIZEOF_MAX_OUTGOING_ARGUMENT_HOMES
+UMThunkStubAMD64_STACK_FRAME_SIZE = UMThunkStubAMD64_STACK_FRAME_SIZE + UMThunkStubAMD64_CALLEE_SCRATCH_SIZE
+
+; Now we have the full size of the stack frame.  The offsets have been computed relative to the
+; top, so negate them to make them relative to the post-prologue rsp.
+UMThunkStubAMD64_FRAME_OFFSET = UMThunkStubAMD64_CALLEE_SCRATCH_SIZE
+UMThunkStubAMD64_RARE_PATH_SPILL_OFFSET = UMThunkStubAMD64_STACK_FRAME_SIZE - UMThunkStubAMD64_FRAME_OFFSET - UMThunkStubAMD64_RARE_PATH_SPILL_NEGOFFSET
+UMThunkStubAMD64_HOST_NOTIFY_FLAG_OFFSET = UMThunkStubAMD64_STACK_FRAME_SIZE - UMThunkStubAMD64_FRAME_OFFSET - UMThunkStubAMD64_HOST_NOTIFY_FLAG_NEGOFFSET
+UMThunkStubAMD64_XMM_SAVE_OFFSET = UMThunkStubAMD64_STACK_FRAME_SIZE - UMThunkStubAMD64_FRAME_OFFSET - UMThunkStubAMD64_XMM_SAVE_NEGOFFSET
+UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET = UMThunkStubAMD64_STACK_FRAME_SIZE + 8 - UMThunkStubAMD64_FRAME_OFFSET    ; +8 for return address
+UMThunkStubAMD64_FIXED_STACK_ALLOC_SIZE = UMThunkStubAMD64_STACK_FRAME_SIZE - (UMThunkStubAMD64_NUM_REG_PUSHES * 8)
+
+.errnz UMTHUNKSTUB_HOST_NOTIFY_FLAG_RBPOFFSET - UMThunkStubAMD64_HOST_NOTIFY_FLAG_OFFSET, update UMTHUNKSTUB_HOST_NOTIFY_FLAG_RBPOFFSET
+
+
+;
+; [ callee scratch ]            <-- new RSP
+; [ callee scratch ]
+; [ callee scratch ]
+; [ callee scratch ]
+; {optional stack args passed to callee}
+; xmm0                          <-- RBP
+; xmm1
+; xmm2
+; xmm3
+; {optional padding to align xmm regs}
+; HOST_NOTIFY_FLAG (set when a ReverseLeaveRuntime call is needed)
+; [rare path spill area]
+; [rare path spill area]
+; rbp save
+; r12 save
+; return address                <-- entry RSP
+; [rcx home]
+; [rdx home]
+; [r8 home]
+; [r9 home]
+; stack arg 0
+; stack arg 1
+; ...
+
+        push_nonvol_reg r12
+        push_nonvol_reg rbp                                                                     ; stack_args
+        alloc_stack     UMThunkStubAMD64_FIXED_STACK_ALLOC_SIZE
+        set_frame       rbp, UMThunkStubAMD64_FRAME_OFFSET                                      ; stack_args
+        mov             byte ptr [rbp + UMThunkStubAMD64_HOST_NOTIFY_FLAG_OFFSET], 0            ; hosted
+        END_PROLOGUE
+
+        ;
+        ; Call GetThread()
+        ;
+        INLINE_GETTHREAD r12                    ; will not trash r10
+        test            r12, r12
+        jz              DoThreadSetup
+
+HaveThread:
+
+        ;FailFast if a native callable method is invoked via ldftn and calli.
+        cmp             dword ptr [r12 + OFFSETOF__Thread__m_fPreemptiveGCDisabled], 1
+        jz              InvalidTransition
+
+        ;
+        ; disable preemptive GC
+        ;
+        mov             dword ptr [r12 + OFFSETOF__Thread__m_fPreemptiveGCDisabled], 1
+
+        ;
+        ; catch returning thread here if a GC is in progress
+        ;
+        cmp             [g_TrapReturningThreads], 0
+        jnz             DoTrapReturningThreadsTHROW
+
+InCooperativeMode:
+
+        mov             r11, [METHODDESC_REGISTER + OFFSETOF__UMEntryThunk__m_pUMThunkMarshInfo]
+        mov             eax, [r11 + OFFSETOF__UMThunkMarshInfo__m_cbActualArgSize]                      ; stack_args
+        test            rax, rax                                                                        ; stack_args
+        jnz             CopyStackArgs                                                                   ; stack_args
+
+ArgumentsSetup:
+
+        mov             rax, [r11 + OFFSETOF__UMThunkMarshInfo__m_pILStub]                              ; rax <- Stub*
+        call            rax
+
+PostCall:
+        ;
+        ; enable preemptive GC
+        ;
+        mov             dword ptr [r12 + OFFSETOF__Thread__m_fPreemptiveGCDisabled], 0
+
+        ; epilog
+        lea             rsp, [rbp - UMThunkStubAMD64_FRAME_OFFSET + UMThunkStubAMD64_FIXED_STACK_ALLOC_SIZE]
+        pop             rbp                                                                             ; stack_args
+        pop             r12
+        ret
+
+
+DoThreadSetup:
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  0h], rcx
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  8h], rdx
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 10h], r8
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 18h], r9
+
+        ; @CONSIDER: mark UMEntryThunks that have FP params and only save/restore xmm regs on those calls
+        ;            initial measurements indicate that this could be worth about a 5% savings in reverse
+        ;            pinvoke overhead.
+        movdqa          xmmword ptr[rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET +  0h], xmm0
+        movdqa          xmmword ptr[rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 10h], xmm1
+        movdqa          xmmword ptr[rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 20h], xmm2
+        movdqa          xmmword ptr[rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 30h], xmm3
+
+        mov             [rbp + UMThunkStubAMD64_RARE_PATH_SPILL_OFFSET], METHODDESC_REGISTER
+        call            CreateThreadBlockThrow
+        mov             METHODDESC_REGISTER, [rbp + UMThunkStubAMD64_RARE_PATH_SPILL_OFFSET]
+
+        mov             rcx,  [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  0h]
+        mov             rdx,  [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  8h]
+        mov             r8,   [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 10h]
+        mov             r9,   [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 18h]
+
+        ; @CONSIDER: mark UMEntryThunks that have FP params and only save/restore xmm regs on those calls
+        movdqa          xmm0, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET +  0h]
+        movdqa          xmm1, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 10h]
+        movdqa          xmm2, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 20h]
+        movdqa          xmm3, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 30h]
+
+        mov             r12, rax
+
+        jmp             HaveThread
+
+InvalidTransition:
+        ; ReversePInvokeBadTransition will failfast
+        call            ReversePInvokeBadTransition
+
+DoTrapReturningThreadsTHROW:
+
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  0h], rcx
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  8h], rdx
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 10h], r8
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 18h], r9
+
+        ; @CONSIDER: mark UMEntryThunks that have FP params and only save/restore xmm regs on those calls
+        ;            initial measurements indicate that this could be worth about a 5% savings in reverse
+        ;            pinvoke overhead.
+        movdqa          xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET +  0h], xmm0
+        movdqa          xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 10h], xmm1
+        movdqa          xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 20h], xmm2
+        movdqa          xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 30h], xmm3
+
+        mov             [rbp + UMThunkStubAMD64_RARE_PATH_SPILL_OFFSET], METHODDESC_REGISTER
+        mov             rcx, r12                                                                  ; Thread* pThread
+        mov             rdx, METHODDESC_REGISTER                                                  ; UMEntryThunk* pUMEntry
+        call            UMThunkStubRareDisableWorker
+        mov             METHODDESC_REGISTER, [rbp + UMThunkStubAMD64_RARE_PATH_SPILL_OFFSET]
+
+        mov             rcx,  [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  0h]
+        mov             rdx,  [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  8h]
+        mov             r8,   [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 10h]
+        mov             r9,   [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 18h]
+
+        ; @CONSIDER: mark UMEntryThunks that have FP params and only save/restore xmm regs on those calls
+        movdqa          xmm0, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET +  0h]
+        movdqa          xmm1, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 10h]
+        movdqa          xmm2, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 20h]
+        movdqa          xmm3, xmmword ptr [rbp + UMThunkStubAMD64_XMM_SAVE_OFFSET + 30h]
+
+        jmp             InCooperativeMode
+
+CopyStackArgs:
+        ; rax = cbStackArgs (with 20h for register args subtracted out already)
+
+        sub             rsp, rax
+        and             rsp, -16
+
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  0h], rcx
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  8h], rdx
+        mov             [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 10h], r8
+
+        ; rax = number of bytes
+
+        lea             rcx, [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + SIZEOF_MAX_OUTGOING_ARGUMENT_HOMES]
+        lea             rdx, [rsp + UMThunkStubAMD64_CALLEE_SCRATCH_SIZE]
+
+CopyLoop:
+        ; rax = number of bytes
+        ; rcx = src
+        ; rdx = dest
+        ; r8 = scratch
+
+        add             rax, -8
+        mov             r8, [rcx + rax]
+        mov             [rdx + rax], r8
+        jnz             CopyLoop
+
+        mov             rcx, [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  0h]
+        mov             rdx, [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET +  8h]
+        mov             r8, [rbp + UMThunkStubAMD64_ARGUMENTS_STACK_HOME_OFFSET + 10h]
+
+        jmp             ArgumentsSetup
+
+NESTED_END UMThunkStub, _TEXT
+
         end
+
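
For readers who do not want to trace the MASM above, the restored stub boils down to the control flow sketched below. This is a simplified C++ rendering with stand-in types and helpers (the real Thread, UMEntryThunk and worker declarations live elsewhere in the VM), and it omits the register saving and stack-argument copying that most of the assembly deals with.

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Stand-in types and helpers; the real CoreCLR declarations differ.
    struct UMThunkMarshInfo { void (*m_pILStub)(); uint32_t m_cbActualArgSize; };
    struct UMEntryThunk     { UMThunkMarshInfo* m_pUMThunkMarshInfo; };
    struct Thread           { uint32_t m_fPreemptiveGCDisabled = 0; };

    static Thread  g_fakeThread;                    // stand-in for the per-thread Thread*
    static int32_t g_TrapReturningThreads = 0;

    static Thread* GetThread()                      { return &g_fakeThread; }
    static Thread* CreateThreadBlockThrow()         { return &g_fakeThread; }
    static void    UMThunkStubRareDisableWorker(Thread*, UMEntryThunk*) { /* cooperate with the GC */ }
    static void    ReversePInvokeBadTransition()    { std::abort(); }

    // Rough equivalent of the UMThunkStub assembly above (argument save/restore and
    // stack-argument copying omitted).
    static void UMThunkStubSketch(UMEntryThunk* entry /* METHODDESC_REGISTER */)
    {
        Thread* thread = GetThread();
        if (thread == nullptr)
            thread = CreateThreadBlockThrow();      // DoThreadSetup path

        if (thread->m_fPreemptiveGCDisabled)        // already in cooperative mode:
            ReversePInvokeBadTransition();          // ldftn/calli misuse, fail fast

        thread->m_fPreemptiveGCDisabled = 1;        // enter cooperative mode
        if (g_TrapReturningThreads != 0)            // a GC is pending: take the rare path
            UMThunkStubRareDisableWorker(thread, entry);

        entry->m_pUMThunkMarshInfo->m_pILStub();    // InCooperativeMode: call the IL marshaling stub

        thread->m_fPreemptiveGCDisabled = 0;        // PostCall: back to preemptive mode
    }

    int main()
    {
        UMThunkMarshInfo info{ [] { std::puts("managed side reached"); }, 0 };
        UMEntryThunk thunk{ &info };
        UMThunkStubSketch(&thunk);
    }
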
src/coreclr/src/vm/amd64/asmconstants.h
index 15c5663e79aec5562af5f249d78f2756124f5c5b..6c361f9bcf056fbb6211af48e5fb01262c5f2b47 100644
@@ -98,6 +98,21 @@ ASMCONSTANTS_C_ASSERT(SIZEOF__ComPrestubMethodFrame
 #define               SIZEOF__ComMethodFrame                        0x20
 ASMCONSTANTS_C_ASSERT(SIZEOF__ComMethodFrame
                     == sizeof(ComMethodFrame));
+#endif // FEATURE_COMINTEROP
+
+#define               OFFSETOF__UMEntryThunk__m_pUMThunkMarshInfo   0x18
+ASMCONSTANTS_C_ASSERT(OFFSETOF__UMEntryThunk__m_pUMThunkMarshInfo
+                    == offsetof(UMEntryThunk, m_pUMThunkMarshInfo));
+
+#define               OFFSETOF__UMThunkMarshInfo__m_pILStub         0x00
+ASMCONSTANTS_C_ASSERT(OFFSETOF__UMThunkMarshInfo__m_pILStub
+                    == offsetof(UMThunkMarshInfo, m_pILStub));
+
+#define               OFFSETOF__UMThunkMarshInfo__m_cbActualArgSize 0x08
+ASMCONSTANTS_C_ASSERT(OFFSETOF__UMThunkMarshInfo__m_cbActualArgSize
+                    == offsetof(UMThunkMarshInfo, m_cbActualArgSize));
+
+#ifdef FEATURE_COMINTEROP
 
 #define               OFFSETOF__ComPlusCallMethodDesc__m_pComPlusCallInfo        DBG_FRE(0x30, 0x08)
 ASMCONSTANTS_C_ASSERT(OFFSETOF__ComPlusCallMethodDesc__m_pComPlusCallInfo
@@ -482,6 +497,8 @@ ASMCONSTANTS_C_ASSERT(OFFSET__TEB__ThreadLocalStoragePointer == offsetof(TEB, Th
 
 #define THROWSTUB_ESTABLISHER_OFFSET_FaultingExceptionFrame 0x30
 
+#define UMTHUNKSTUB_HOST_NOTIFY_FLAG_RBPOFFSET (0x40)   // xmm save size
+
 #define Thread__ObjectRefFlush  ?ObjectRefFlush@Thread@@SAXPEAV1@@Z
 
 
src/coreclr/src/vm/amd64/umthunkstub.S
index a3bbb7f432a8e69fa9c8aface21918b875ba005b..4c2b0a32a2f003f2b5cbf665057e0f8ba7e8274e 100644
@@ -27,3 +27,157 @@ NESTED_ENTRY TheUMEntryPrestub, _TEXT, UnhandledExceptionHandlerUnix
     TAILJMP_RAX
 
 NESTED_END TheUMEntryPrestub, _TEXT
+
+//
+// METHODDESC_REGISTER: UMEntryThunk*
+//
+NESTED_ENTRY UMThunkStub, _TEXT, UnhandledExceptionHandlerUnix
+#define UMThunkStubAMD64_FIXED_STACK_ALLOC_SIZE (SIZEOF_MAX_INT_ARG_SPILL + SIZEOF_MAX_FP_ARG_SPILL + 0x8)
+#define UMThunkStubAMD64_XMM_SAVE_OFFSET 0x0
+#define UMThunkStubAMD64_INT_ARG_OFFSET (SIZEOF_MAX_FP_ARG_SPILL + 0x8)
+#define UMThunkStubAMD64_METHODDESC_OFFSET SIZEOF_MAX_FP_ARG_SPILL
+#define UMThunkStubAMD64_RBP_OFFSET (UMThunkStubAMD64_FIXED_STACK_ALLOC_SIZE + 8)
+
+// {optional stack args passed to callee}   <-- new RSP
+// xmm0                                     <-- RBP
+// xmm1
+// xmm2
+// xmm3
+// xmm4
+// xmm5
+// xmm6
+// xmm7
+// METHODDESC_REGISTER
+// rdi
+// rsi
+// rcx
+// rdx
+// r8
+// r9
+// r12
+// rbp
+// return address                           <-- entry RSP
+        push_nonvol_reg rbp
+        mov             rbp, rsp
+        push_nonvol_reg r12                                                                     // stack_args
+        alloc_stack     UMThunkStubAMD64_FIXED_STACK_ALLOC_SIZE
+        save_reg_postrsp rdi, (UMThunkStubAMD64_INT_ARG_OFFSET)
+        save_reg_postrsp rsi, (UMThunkStubAMD64_INT_ARG_OFFSET + 0x08)
+        save_reg_postrsp rdx, (UMThunkStubAMD64_INT_ARG_OFFSET + 0x10)
+        save_reg_postrsp rcx, (UMThunkStubAMD64_INT_ARG_OFFSET + 0x18)
+        save_reg_postrsp r8,  (UMThunkStubAMD64_INT_ARG_OFFSET + 0x20)
+        save_reg_postrsp r9,  (UMThunkStubAMD64_INT_ARG_OFFSET + 0x28)
+        save_reg_postrsp METHODDESC_REGISTER, UMThunkStubAMD64_METHODDESC_OFFSET
+        SAVE_FLOAT_ARGUMENT_REGISTERS UMThunkStubAMD64_XMM_SAVE_OFFSET
+        set_cfa_register rbp, (2*8)
+        END_PROLOGUE
+
+        //
+        // Call GetThread()
+        //
+        call            C_FUNC(GetThread)
+        test            rax, rax
+        jz              LOCAL_LABEL(DoThreadSetup)
+
+LOCAL_LABEL(HaveThread):
+
+        mov             r12, rax                // r12 <- Thread*
+
+        //FailFast if a native callable method is invoked via ldftn and calli.
+        cmp             dword ptr [r12 + OFFSETOF__Thread__m_fPreemptiveGCDisabled], 1
+        jz              LOCAL_LABEL(InvalidTransition)
+
+        //
+        // disable preemptive GC
+        //
+        mov             dword ptr [r12 + OFFSETOF__Thread__m_fPreemptiveGCDisabled], 1
+
+        //
+        // catch returning thread here if a GC is in progress
+        //
+        PREPARE_EXTERNAL_VAR g_TrapReturningThreads, rax
+        cmp                  dword ptr [rax], 0
+        jnz                  LOCAL_LABEL(DoTrapReturningThreadsTHROW)
+
+LOCAL_LABEL(InCooperativeMode):
+
+        mov             METHODDESC_REGISTER, [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_METHODDESC_OFFSET]
+
+        mov             r11, [METHODDESC_REGISTER + OFFSETOF__UMEntryThunk__m_pUMThunkMarshInfo]
+        mov             eax, [r11 + OFFSETOF__UMThunkMarshInfo__m_cbActualArgSize]                      // stack_args
+        test            rax, rax                                                                        // stack_args
+        jnz             LOCAL_LABEL(UMThunkStub_CopyStackArgs)                                          // stack_args
+
+LOCAL_LABEL(UMThunkStub_ArgumentsSetup):
+        mov    rdi, [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_INT_ARG_OFFSET]
+        mov    rsi, [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_INT_ARG_OFFSET + 0x08]
+        mov    rdx, [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_INT_ARG_OFFSET + 0x10]
+        mov    rcx, [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_INT_ARG_OFFSET + 0x18]
+        mov    r8,  [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_INT_ARG_OFFSET + 0x20]
+        mov    r9,  [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_INT_ARG_OFFSET + 0x28]
+        movdqa xmm0, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET]
+        movdqa xmm1, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET + 0x10]
+        movdqa xmm2, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET + 0x20]
+        movdqa xmm3, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET + 0x30]
+        movdqa xmm4, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET + 0x40]
+        movdqa xmm5, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET + 0x50]
+        movdqa xmm6, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET + 0x60]
+        movdqa xmm7, xmmword ptr [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_XMM_SAVE_OFFSET + 0x70]
+
+        mov             rax, [r11 + OFFSETOF__UMThunkMarshInfo__m_pILStub]                              // rax <- Stub*
+        call            rax
+
+LOCAL_LABEL(PostCall):
+        //
+        // enable preemptive GC
+        //
+        mov             dword ptr [r12 + OFFSETOF__Thread__m_fPreemptiveGCDisabled], 0
+
+        // epilog
+        lea             rsp, [rbp - 8]         // deallocate arguments
+        set_cfa_register rsp, (3*8)
+        pop_nonvol_reg  r12
+        pop_nonvol_reg  rbp
+        ret
+
+
+LOCAL_LABEL(DoThreadSetup):
+        call            C_FUNC(CreateThreadBlockThrow)
+        jmp             LOCAL_LABEL(HaveThread)
+
+LOCAL_LABEL(InvalidTransition):
+        // No arguments to set up, ReversePInvokeBadTransition will failfast
+        call            C_FUNC(ReversePInvokeBadTransition)
+
+LOCAL_LABEL(DoTrapReturningThreadsTHROW):
+        mov             rdi, r12                                                                        // Thread* pThread
+        mov             rsi, [rbp - UMThunkStubAMD64_RBP_OFFSET + UMThunkStubAMD64_METHODDESC_OFFSET]   // UMEntryThunk* pUMEntry
+        call            C_FUNC(UMThunkStubRareDisableWorker)
+
+        jmp             LOCAL_LABEL(InCooperativeMode)
+
+LOCAL_LABEL(UMThunkStub_CopyStackArgs):
+        // rax = cbStackArgs
+
+        sub             rsp, rax
+        and             rsp, -16
+
+        // rax = number of bytes
+
+        lea             rdi, [rbp + 0x10] // rbp + ra
+        lea             rsi, [rsp]
+
+LOCAL_LABEL(CopyLoop):
+        // rax = number of bytes
+        // rdi = src
+        // rsi = dest
+        // rdx = scratch
+
+        add             rax, -8
+        mov             rdx, [rdi + rax]
+        mov             [rsi + rax], rdx
+        jnz             LOCAL_LABEL(CopyLoop)
+
+        jmp             LOCAL_LABEL(UMThunkStub_ArgumentsSetup)
+
+NESTED_END UMThunkStub, _TEXT
src/coreclr/src/vm/arm/asmconstants.h
index 58bbb8807098c1463c203f2fb585151e40d3712e..f6d782d69811d984abc577f5596c316c668e3bd7 100644
@@ -124,6 +124,15 @@ ASMCONSTANTS_C_ASSERT(SIZEOF__FloatArgumentRegisters == sizeof(FloatArgumentRegi
 #define ASM_ENREGISTERED_RETURNTYPE_MAXSIZE 0x20
 ASMCONSTANTS_C_ASSERT(ASM_ENREGISTERED_RETURNTYPE_MAXSIZE == ENREGISTERED_RETURNTYPE_MAXSIZE)
 
+#define UMEntryThunk__m_pUMThunkMarshInfo 0x0C
+ASMCONSTANTS_C_ASSERT(UMEntryThunk__m_pUMThunkMarshInfo == offsetof(UMEntryThunk, m_pUMThunkMarshInfo))
+
+#define UMThunkMarshInfo__m_pILStub 0x00
+ASMCONSTANTS_C_ASSERT(UMThunkMarshInfo__m_pILStub == offsetof(UMThunkMarshInfo, m_pILStub))
+
+#define UMThunkMarshInfo__m_cbActualArgSize 0x04
+ASMCONSTANTS_C_ASSERT(UMThunkMarshInfo__m_cbActualArgSize == offsetof(UMThunkMarshInfo, m_cbActualArgSize))
+
 
 #define MethodDesc__m_wFlags DBG_FRE(0x1A, 0x06)
 ASMCONSTANTS_C_ASSERT(MethodDesc__m_wFlags == offsetof(MethodDesc, m_wFlags))
src/coreclr/src/vm/arm/asmhelpers.S
index 0bb2dbee981fc97e5f5b925e06d55e049a048072..3d7c96edc2754de1d6e9c88564a3392aa9f5ae1a 100644
@@ -306,6 +306,96 @@ LOCAL_LABEL(LNullThis):
 
         NESTED_END TheUMEntryPrestub,_TEXT
 
+//
+// r12 = UMEntryThunk*
+//
+        NESTED_ENTRY UMThunkStub,_TEXT,UnhandledExceptionHandlerUnix
+        PROLOG_PUSH         "{r4,r5,r7,r11,lr}"
+        PROLOG_STACK_SAVE_OFFSET   r7, #8
+
+        alloc_stack         4 * 5
+        stm                 sp, {r0-r3,r12}
+
+        //GBLA UMThunkStub_HiddenArgOffset // offset of saved UMEntryThunk *
+        //GBLA UMThunkStub_StackArgsOffset // offset of original stack args
+        //GBLA UMThunkStub_StackArgsSize   // total size of UMThunkStub frame
+UMThunkStub_HiddenArgOffset = (-3)*4
+UMThunkStub_StackArgsOffset = 3*4
+UMThunkStub_StackArgsSize = 10*4
+
+        CHECK_STACK_ALIGNMENT
+
+        bl                  C_FUNC(GetThread)
+        cbz                 r0, LOCAL_LABEL(UMThunkStub_DoThreadSetup)
+
+LOCAL_LABEL(UMThunkStub_HaveThread):
+        mov                 r5, r0                  // r5 = Thread *
+
+        ldr                 r2, =g_TrapReturningThreads
+
+        mov                 r4, 1
+        str                 r4, [r5, #Thread__m_fPreemptiveGCDisabled]
+
+        ldr                 r3, [r2]
+        cbnz                r3, LOCAL_LABEL(UMThunkStub_DoTrapReturningThreads)
+
+LOCAL_LABEL(UMThunkStub_InCooperativeMode):
+        ldr                 r12, [r7, #UMThunkStub_HiddenArgOffset]
+        ldr                 r3, [r12, #UMEntryThunk__m_pUMThunkMarshInfo]
+        ldr                 r2, [r3, #UMThunkMarshInfo__m_cbActualArgSize]
+        cbz                 r2, LOCAL_LABEL(UMThunkStub_ArgumentsSetup)
+
+        add                 r0, r7, #UMThunkStub_StackArgsOffset // Source pointer
+        add                 r0, r0, r2
+        lsr                 r1, r2, #2      // Count of stack slots to copy
+
+        and                 r2, r2, #4      // Align the stack
+        sub                 sp, sp, r2
+
+LOCAL_LABEL(UMThunkStub_StackLoop):
+        ldr                 r2, [r0,#-4]!
+        str                 r2, [sp,#-4]!
+        subs                r1, r1, #1
+        bne                 LOCAL_LABEL(UMThunkStub_StackLoop)
+
+LOCAL_LABEL(UMThunkStub_ArgumentsSetup):
+        ldr                 r4, [r3, #UMThunkMarshInfo__m_pILStub]
+
+        // reload argument registers
+        sub                 r0, r7, #28
+        ldm                 r0, {r0-r3}
+
+        CHECK_STACK_ALIGNMENT
+
+        blx                 r4
+
+LOCAL_LABEL(UMThunkStub_PostCall):
+        mov                 r4, 0
+        str                 r4, [r5, #Thread__m_fPreemptiveGCDisabled]
+
+        EPILOG_STACK_RESTORE_OFFSET    r7, #8
+        EPILOG_POP           "{r4,r5,r7,r11,pc}"
+
+LOCAL_LABEL(UMThunkStub_DoThreadSetup):
+        sub                 sp, #SIZEOF__FloatArgumentRegisters
+        vstm                sp, {d0-d7}
+        bl                  C_FUNC(CreateThreadBlockThrow)
+        vldm                sp, {d0-d7}
+        add                 sp, #SIZEOF__FloatArgumentRegisters
+        b                   LOCAL_LABEL(UMThunkStub_HaveThread)
+
+LOCAL_LABEL(UMThunkStub_DoTrapReturningThreads):
+        sub                 sp, #SIZEOF__FloatArgumentRegisters
+        vstm                sp, {d0-d7}
+        mov                 r0, r5              // Thread* pThread
+        ldr                 r1, [r7, #UMThunkStub_HiddenArgOffset]  // UMEntryThunk* pUMEntry
+        bl                  C_FUNC(UMThunkStubRareDisableWorker)
+        vldm                sp, {d0-d7}
+        add                 sp, #SIZEOF__FloatArgumentRegisters
+        b                   LOCAL_LABEL(UMThunkStub_InCooperativeMode)
+
+        NESTED_END UMThunkStub,_TEXT
+
 // ------------------------------------------------------------------
 
         NESTED_ENTRY ThePreStub, _TEXT, NoHandler
src/coreclr/src/vm/arm/asmhelpers.asm
index a76f1103c13ac5151b71801180b5efdbba59b985..aad5395f97694f2bc92bd7884a0af82921274e37 100644
@@ -16,6 +16,8 @@
     IMPORT JIT_InternalThrow
     IMPORT JIT_WriteBarrier
     IMPORT TheUMEntryPrestubWorker
+    IMPORT CreateThreadBlockThrow
+    IMPORT UMThunkStubRareDisableWorker
     IMPORT PreStubWorker
     IMPORT PreStubGetMethodDescForCompactEntryPoint
     IMPORT NDirectImportWorker
@@ -38,6 +40,7 @@
 #endif
     IMPORT CallDescrWorkerUnwindFrameChainHandler
     IMPORT UMEntryPrestubUnwindFrameChainHandler
+    IMPORT UMThunkStubUnwindFrameChainHandler
 #ifdef FEATURE_COMINTEROP
     IMPORT ReverseComUnwindFrameChainHandler
 #endif
@@ -364,6 +367,96 @@ LNullThis
 
         NESTED_END
 
+;
+; r12 = UMEntryThunk*
+;
+        NESTED_ENTRY UMThunkStub,,UMThunkStubUnwindFrameChainHandler
+        PROLOG_PUSH         {r4,r5,r7,r11,lr}
+        PROLOG_PUSH         {r0-r3,r12}
+        PROLOG_STACK_SAVE   r7
+
+        GBLA UMThunkStub_HiddenArg ; offset of saved UMEntryThunk *
+        GBLA UMThunkStub_StackArgs ; offset of original stack args (total size of UMThunkStub frame)
+UMThunkStub_HiddenArg SETA 4*4
+UMThunkStub_StackArgs SETA 10*4
+
+        CHECK_STACK_ALIGNMENT
+
+        ; r0 = GetThread(). Trashes r5
+        INLINE_GETTHREAD    r0, r5
+        cbz                 r0, UMThunkStub_DoThreadSetup
+
+UMThunkStub_HaveThread
+        mov                 r5, r0                  ; r5 = Thread *
+
+        ldr                 r2, =g_TrapReturningThreads
+
+        mov                 r4, 1
+        str                 r4, [r5, #Thread__m_fPreemptiveGCDisabled]
+
+        ldr                 r3, [r2]
+        cbnz                r3, UMThunkStub_DoTrapReturningThreads
+
+UMThunkStub_InCooperativeMode
+        ldr                 r12, [r7, #UMThunkStub_HiddenArg]
+
+        ldr                 r3, [r12, #UMEntryThunk__m_pUMThunkMarshInfo]
+        ldr                 r2, [r3, #UMThunkMarshInfo__m_cbActualArgSize]
+        cbz                 r2, UMThunkStub_ArgumentsSetup
+
+        add                 r0, r7, #UMThunkStub_StackArgs ; Source pointer
+        add                 r0, r0, r2
+        lsr                 r1, r2, #2      ; Count of stack slots to copy
+
+        and                 r2, r2, #4      ; Align the stack
+        sub                 sp, sp, r2
+
+UMThunkStub_StackLoop
+        ldr                 r2, [r0,#-4]!
+        str                 r2, [sp,#-4]!
+        subs                r1, r1, #1
+        bne                 UMThunkStub_StackLoop
+
+UMThunkStub_ArgumentsSetup
+        ldr                 r4, [r3, #UMThunkMarshInfo__m_pILStub]
+
+        ; reload argument registers
+        ldm                 r7, {r0-r3}
+
+        CHECK_STACK_ALIGNMENT
+
+        blx                 r4
+
+UMThunkStub_PostCall
+        mov                 r4, 0
+        str                 r4, [r5, #Thread__m_fPreemptiveGCDisabled]
+
+        EPILOG_STACK_RESTORE r7
+        EPILOG_STACK_FREE   4 * 5
+        EPILOG_POP          {r4,r5,r7,r11,pc}
+
+UMThunkStub_DoThreadSetup
+        sub                 sp, #SIZEOF__FloatArgumentRegisters
+        vstm                sp, {d0-d7}
+        bl                  CreateThreadBlockThrow
+        vldm                sp, {d0-d7}
+        add                 sp, #SIZEOF__FloatArgumentRegisters
+        b                   UMThunkStub_HaveThread
+
+UMThunkStub_DoTrapReturningThreads
+        sub                 sp, #SIZEOF__FloatArgumentRegisters
+        vstm                sp, {d0-d7}
+        mov                 r0, r5              ; Thread* pThread
+        ldr                 r1, [r7, #UMThunkStub_HiddenArg]  ; UMEntryThunk* pUMEntry
+        bl                  UMThunkStubRareDisableWorker
+        vldm                sp, {d0-d7}
+        add                 sp, #SIZEOF__FloatArgumentRegisters
+        b                   UMThunkStub_InCooperativeMode
+
+        NESTED_END
+
+        INLINE_GETTHREAD_CONSTANT_POOL
+
 ; ------------------------------------------------------------------
 
         NESTED_ENTRY ThePreStub
src/coreclr/src/vm/arm64/asmconstants.h
index bb65454a7fd2d4aceccd2ca798e731a4689d11f3..544d09cb5d2cd96404a9bdb418cf59106bd0571f 100644
@@ -157,6 +157,15 @@ ASMCONSTANTS_C_ASSERT(UnmanagedToManagedFrame__m_pvDatum == offsetof(UnmanagedTo
 #endif // FEATURE_COMINTEROP
 
 
+#define UMEntryThunk__m_pUMThunkMarshInfo 0x18
+ASMCONSTANTS_C_ASSERT(UMEntryThunk__m_pUMThunkMarshInfo == offsetof(UMEntryThunk, m_pUMThunkMarshInfo))
+
+#define UMThunkMarshInfo__m_pILStub 0x00
+ASMCONSTANTS_C_ASSERT(UMThunkMarshInfo__m_pILStub == offsetof(UMThunkMarshInfo, m_pILStub))
+
+#define UMThunkMarshInfo__m_cbActualArgSize 0x08
+ASMCONSTANTS_C_ASSERT(UMThunkMarshInfo__m_cbActualArgSize == offsetof(UMThunkMarshInfo, m_cbActualArgSize))
+
 #define REDIRECTSTUB_SP_OFFSET_CONTEXT 0
 
 #define CONTEXT_Pc 0x108
src/coreclr/src/vm/arm64/asmhelpers.S
index 48a82f1e9765c7dea73bb17583299b0290309527..4706319b30a622cc2a0ff64e71b05657580fefe3 100644
@@ -745,6 +745,114 @@ NESTED_ENTRY TheUMEntryPrestub, _TEXT, UnhandledExceptionHandlerUnix
 
 NESTED_END TheUMEntryPrestub, _TEXT
 
+//
+// x12 = UMEntryThunk*
+//
+NESTED_ENTRY UMThunkStub, _TEXT, UnhandledExceptionHandlerUnix
+
+    // Save arguments and return address
+    PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -112 // 72 for regArgs, 8 for x19 & 8 for x12
+    // save callee saved reg x19. x19 is used in the method to store thread*
+    PROLOG_SAVE_REG x19, 96
+
+    SAVE_ARGUMENT_REGISTERS sp, 16
+
+#define UMThunkStub_HiddenArg 88 // offset of saved UMEntryThunk *
+#define UMThunkStub_StackArgs 112 // offset of original stack args (total size of UMThunkStub frame)
+
+    // save UMEntryThunk*
+    str x12, [sp, #UMThunkStub_HiddenArg]
+
+    // assuming GetThread does not clobber FP Args
+    bl C_FUNC(GetThread)
+    cbz x0, LOCAL_LABEL(UMThunkStub_DoThreadSetup)
+
+LOCAL_LABEL(UMThunkStub_HaveThread):
+    mov x19, x0  // x19 = Thread *
+
+    mov x9, 1
+    // m_fPreemptiveGCDisabled is 4 byte field so using 32-bit variant
+    str w9, [x19, #Thread__m_fPreemptiveGCDisabled]
+
+    PREPARE_EXTERNAL_VAR g_TrapReturningThreads, x2
+    ldr x3, [x2]
+    // assuming x0 contains Thread* before jumping to UMThunkStub_DoTrapReturningThreads
+    cbnz x3, LOCAL_LABEL(UMThunkStub_DoTrapReturningThreads)
+
+LOCAL_LABEL(UMThunkStub_InCooperativeMode):
+    ldr x12, [fp, #UMThunkStub_HiddenArg] // x12 = UMEntryThunk*
+    ldr x3, [x12, #UMEntryThunk__m_pUMThunkMarshInfo] // x3 = m_pUMThunkMarshInfo
+
+    // m_cbActualArgSize is UINT32 and hence occupies 4 bytes
+    ldr w2, [x3, #UMThunkMarshInfo__m_cbActualArgSize] // w2 = Stack arg bytes
+    cbz w2, LOCAL_LABEL(UMThunkStub_RegArgumentsSetup)
+
+    // extend to 64-bits
+    uxtw x2, w2
+
+    // Source pointer
+    add x0, fp, #UMThunkStub_StackArgs
+
+    // move source pointer to end of Stack Args
+    add x0, x0, x2
+
+    // Count of stack slot pairs to copy (divide by 16)
+    lsr x1, x2, #4
+
+    // Is there an extra stack slot (can happen when stack arg bytes not multiple of 16)
+    and x2, x2, #8
+
+    // If yes then start source pointer from 16 byte aligned stack slot
+    add x0, x0, x2
+
+    // increment stack slot pair count by 1 if x2 is not zero
+    add x1, x1, x2, LSR #3
+
+LOCAL_LABEL(UMThunkStub_StackLoop):
+    ldp x4, x5, [x0, #-16]! // pre-Index
+    stp x4, x5, [sp, #-16]! // pre-Index
+    subs x1, x1, #1
+    bne LOCAL_LABEL(UMThunkStub_StackLoop)
+
+LOCAL_LABEL(UMThunkStub_RegArgumentsSetup):
+    ldr x16, [x3, #UMThunkMarshInfo__m_pILStub]
+
+    RESTORE_ARGUMENT_REGISTERS fp, 16
+
+    blr x16
+
+LOCAL_LABEL(UMThunkStub_PostCall):
+    mov x4, 0
+    // m_fPreemptiveGCDisabled is 4 byte field so using 32-bit variant
+    str w4, [x19, #Thread__m_fPreemptiveGCDisabled]
+
+    EPILOG_STACK_RESTORE
+    EPILOG_RESTORE_REG x19, 96
+    EPILOG_RESTORE_REG_PAIR_INDEXED   fp, lr, 112
+
+    EPILOG_RETURN
+
+LOCAL_LABEL(UMThunkStub_DoThreadSetup):
+    sub sp, sp, #SIZEOF__FloatArgumentRegisters
+    SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0
+    bl C_FUNC(CreateThreadBlockThrow)
+    RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 0
+    add sp, sp, #SIZEOF__FloatArgumentRegisters
+    b LOCAL_LABEL(UMThunkStub_HaveThread)
+
+LOCAL_LABEL(UMThunkStub_DoTrapReturningThreads):
+    sub sp, sp, #SIZEOF__FloatArgumentRegisters
+    SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0
+    // x0 already contains Thread* pThread
+    // UMEntryThunk* pUMEntry
+    ldr x1, [fp, #UMThunkStub_HiddenArg]
+    bl C_FUNC(UMThunkStubRareDisableWorker)
+    RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 0
+    add sp, sp, #SIZEOF__FloatArgumentRegisters
+    b LOCAL_LABEL(UMThunkStub_InCooperativeMode)
+
+NESTED_END UMThunkStub, _TEXT
+
 #ifdef FEATURE_HIJACK
 // ------------------------------------------------------------------
 // Hijack function for functions which return a scalar type or a struct (value type)
src/coreclr/src/vm/arm64/asmhelpers.asm
index 2e58e43fec73b468e736541139fc9ef518f53365..11e7d0fef0643b079d3b20013072744ef23a1e82 100644
     IMPORT COMToCLRWorker
     IMPORT CallDescrWorkerUnwindFrameChainHandler
     IMPORT UMEntryPrestubUnwindFrameChainHandler
+    IMPORT UMThunkStubUnwindFrameChainHandler
     IMPORT TheUMEntryPrestubWorker
+    IMPORT CreateThreadBlockThrow
+    IMPORT UMThunkStubRareDisableWorker
     IMPORT GetCurrentSavedRedirectContext
     IMPORT LinkFrameAndThrow
     IMPORT FixContextHandler
@@ -950,6 +953,118 @@ COMToCLRDispatchHelper_RegSetup
 
     NESTED_END
 
+;
+; x12 = UMEntryThunk*
+;
+    NESTED_ENTRY UMThunkStub,,UMThunkStubUnwindFrameChainHandler
+
+    ; Save arguments and return address
+    PROLOG_SAVE_REG_PAIR           fp, lr, #-112! ; 72 for regArgs, 8 for x19 & 8 for x12 & 8 for 16-byte align
+    ; save callee saved reg x19. x19 is used in the method to store thread*
+    PROLOG_SAVE_REG                x19, #96
+
+    SAVE_ARGUMENT_REGISTERS        sp, 16
+
+    GBLA UMThunkStub_HiddenArg ; offset of saved UMEntryThunk *
+    GBLA UMThunkStub_StackArgs ; offset of original stack args (total size of UMThunkStub frame)
+UMThunkStub_HiddenArg SETA 88
+UMThunkStub_StackArgs SETA 112
+
+    ; save UMEntryThunk*
+    str                 x12, [sp, #UMThunkStub_HiddenArg]
+
+    ; x0 = GetThread(). Trashes x19
+    INLINE_GETTHREAD    x0, x19
+    cbz                 x0, UMThunkStub_DoThreadSetup
+
+UMThunkStub_HaveThread
+    mov                 x19, x0                  ; x19 = Thread *
+
+    mov                 x9, 1
+    ; m_fPreemptiveGCDisabled is 4 byte field so using 32-bit variant
+    str                 w9, [x19, #Thread__m_fPreemptiveGCDisabled]
+
+    ldr                 x2, =g_TrapReturningThreads
+    ldr                 x3, [x2]
+    ; assuming x0 contains Thread* before jumping to UMThunkStub_DoTrapReturningThreads
+    cbnz                x3, UMThunkStub_DoTrapReturningThreads
+
+UMThunkStub_InCooperativeMode
+    ldr                 x12, [fp, #UMThunkStub_HiddenArg] ; x12 = UMEntryThunk*
+    ldr                 x3, [x12, #UMEntryThunk__m_pUMThunkMarshInfo] ; x3 = m_pUMThunkMarshInfo
+
+    ; m_cbActualArgSize is UINT32 and hence occupies 4 bytes
+    ldr                 w2, [x3, #UMThunkMarshInfo__m_cbActualArgSize] ; w2 = Stack arg bytes
+    cbz                 w2, UMThunkStub_RegArgumentsSetup
+
+    ; extend to 64-bits
+    uxtw                x2, w2
+
+    ; Source pointer
+    add                 x0, fp, #UMThunkStub_StackArgs
+
+    ; move source pointer to end of Stack Args
+    add                 x0, x0, x2
+
+    ; Count of stack slot pairs to copy (divide by 16)
+    lsr                 x1, x2, #4
+
+    ; Is there an extra stack slot (can happen when stack arg bytes not multiple of 16)
+    and                 x2, x2, #8
+
+    ; If yes then start source pointer from 16 byte aligned stack slot
+    add                 x0, x0, x2
+
+    ; increment stack slot pair count by 1 if x2 is not zero
+    add                 x1, x1, x2, LSR #3
+
+UMThunkStub_StackLoop
+    ldp                 x4, x5, [x0, #-16]! ; pre-Index
+    stp                 x4, x5, [sp, #-16]! ; pre-Index
+    subs                x1, x1, #1
+    bne                 UMThunkStub_StackLoop
+
+UMThunkStub_RegArgumentsSetup
+    ldr                 x16, [x3, #UMThunkMarshInfo__m_pILStub]
+
+    RESTORE_ARGUMENT_REGISTERS        fp, 16
+
+    blr                 x16
+
+UMThunkStub_PostCall
+    mov                 x4, 0
+    ; m_fPreemptiveGCDisabled is 4 byte field so using 32-bit variant
+    str                 w4, [x19, #Thread__m_fPreemptiveGCDisabled]
+
+    EPILOG_STACK_RESTORE
+    EPILOG_RESTORE_REG                x19, #96
+    EPILOG_RESTORE_REG_PAIR           fp, lr, #112!
+
+    EPILOG_RETURN
+
+UMThunkStub_DoThreadSetup
+    sub                 sp, sp, #SIZEOF__FloatArgumentRegisters
+    SAVE_FLOAT_ARGUMENT_REGISTERS  sp, 0
+    bl                  CreateThreadBlockThrow
+    RESTORE_FLOAT_ARGUMENT_REGISTERS  sp, 0
+    add                 sp, sp, #SIZEOF__FloatArgumentRegisters
+    b                   UMThunkStub_HaveThread
+
+UMThunkStub_DoTrapReturningThreads
+    sub                 sp, sp, #SIZEOF__FloatArgumentRegisters
+    SAVE_FLOAT_ARGUMENT_REGISTERS  sp, 0
+    ; x0 already contains Thread* pThread
+    ; UMEntryThunk* pUMEntry
+    ldr                 x1, [fp, #UMThunkStub_HiddenArg]
+    bl                  UMThunkStubRareDisableWorker
+    RESTORE_FLOAT_ARGUMENT_REGISTERS  sp, 0
+    add                 sp, sp, #SIZEOF__FloatArgumentRegisters
+    b                   UMThunkStub_InCooperativeMode
+
+    NESTED_END
+
+    INLINE_GETTHREAD_CONSTANT_POOL
+
 #ifdef FEATURE_HIJACK
 ; ------------------------------------------------------------------
 ; Hijack function for functions which return a scalar type or a struct (value type)
src/coreclr/src/vm/dllimportcallback.cpp
index 659a3d9f789c8d3be5452aac6f6bdb12da3a6632..4ea1de13b8a1b092453c1d3008170eefa8b55cc6 100644
@@ -105,20 +105,7 @@ private:
 
 static UMEntryThunkFreeList s_thunkFreeList(DEFAULT_THUNK_FREE_LIST_THRESHOLD);
 
-#ifdef TARGET_X86
-
-#ifdef FEATURE_STUBS_AS_IL
-
-EXTERN_C void UMThunkStub(void);
-
-PCODE UMThunkMarshInfo::GetExecStubEntryPoint()
-{
-    LIMITED_METHOD_CONTRACT;
-
-    return GetEEFuncEntryPoint(UMThunkStub);
-}
-
-#else // FEATURE_STUBS_AS_IL
+#if defined(TARGET_X86) && !defined(FEATURE_STUBS_AS_IL)
 
 EXTERN_C VOID __cdecl UMThunkStubRareDisable();
 EXTERN_C Thread* __stdcall CreateThreadBlockThrow();
@@ -767,18 +754,16 @@ Stub *UMThunkMarshInfo::CompileNExportThunk(LoaderHeap *pLoaderHeap, PInvokeStat
     return pcpusl->Link(pLoaderHeap);
 }
 
-#endif // FEATURE_STUBS_AS_IL
-
-#else // TARGET_X86
+#else // TARGET_X86 && !FEATURE_STUBS_AS_IL
 
 PCODE UMThunkMarshInfo::GetExecStubEntryPoint()
 {
     LIMITED_METHOD_CONTRACT;
 
-    return m_pILStub;
+    return GetEEFuncEntryPoint(UMThunkStub);
 }
 
-#endif // TARGET_X86
+#endif // TARGET_X86 && !FEATURE_STUBS_AS_IL
 
 UMEntryThunkCache::UMEntryThunkCache(AppDomain *pDomain) :
     m_crst(CrstUMEntryThunkCache),
@@ -1209,20 +1194,33 @@ VOID UMThunkMarshInfo::RunTimeInit()
 
     if (pFinalILStub == NULL)
     {
-        PInvokeStaticSigInfo sigInfo;
-
-        if (pMD != NULL)
-            new (&sigInfo) PInvokeStaticSigInfo(pMD);
+        if (pMD != NULL && !pMD->IsEEImpl() &&
+            !NDirect::MarshalingRequired(pMD, GetSignature().GetRawSig(), GetModule()))
+        {
+            // Call the method directly in no-delegate case if possible. This is important to avoid JITing
+            // for stubs created via code:ICLRRuntimeHost2::CreateDelegate during coreclr startup.
+            pFinalILStub = pMD->GetMultiCallableAddrOfCode();
+        }
         else
-           new (&sigInfo) PInvokeStaticSigInfo(GetSignature(), GetModule());
+        {
+            // For perf, it is important to avoid expensive initialization of
+            // PInvokeStaticSigInfo if we have NGened stub.
+            PInvokeStaticSigInfo sigInfo;
 
-        DWORD dwStubFlags = 0;
+            if (pMD != NULL)
+                new (&sigInfo) PInvokeStaticSigInfo(pMD);
+            else
+                new (&sigInfo) PInvokeStaticSigInfo(GetSignature(), GetModule());
+
+            DWORD dwStubFlags = 0;
+
+            if (sigInfo.IsDelegateInterop())
+                dwStubFlags |= NDIRECTSTUB_FL_DELEGATE;
 
-        if (sigInfo.IsDelegateInterop())
-            dwStubFlags |= NDIRECTSTUB_FL_DELEGATE;
+            pStubMD = GetILStubMethodDesc(pMD, &sigInfo, dwStubFlags);
+            pFinalILStub = JitILStub(pStubMD);
 
-        pStubMD = GetILStubMethodDesc(pMD, &sigInfo, dwStubFlags);
-        pFinalILStub = JitILStub(pStubMD);
+        }
     }
 
 #if defined(TARGET_X86)
@@ -1279,6 +1277,13 @@ VOID UMThunkMarshInfo::RunTimeInit()
         // For all the other calling convention except cdecl, callee pops the stack arguments
         m_cbRetPop = cbRetPop + static_cast<UINT16>(m_cbActualArgSize);
     }
+#else // TARGET_X86
+    //
+    // m_cbActualArgSize gets the number of arg bytes for the NATIVE signature
+    //
+    m_cbActualArgSize =
+        (pStubMD != NULL) ? pStubMD->AsDynamicMethodDesc()->GetNativeStackArgSize() : pMD->SizeOfArgStack();
+
 #endif // TARGET_X86
 
 #endif // TARGET_X86 && !FEATURE_STUBS_AS_IL
src/coreclr/src/vm/dllimportcallback.h
index 12bc89a167fdc856811bcf50054cde2b1b6ab6b0..0b3414ffc1696088b63e12cc0a511efdb71a9871 100644
@@ -170,6 +170,20 @@ public:
     PCODE GetExecStubEntryPoint();
 #endif
 
+    UINT32 GetCbActualArgSize()
+    {
+        CONTRACTL
+        {
+            NOTHROW;
+            GC_NOTRIGGER;
+            MODE_ANY;
+            PRECONDITION(IsCompletelyInited());
+        }
+        CONTRACTL_END;
+
+        return m_cbActualArgSize;
+    }
+
     BOOL IsCompletelyInited()
     {
         LIMITED_METHOD_CONTRACT;
@@ -184,9 +198,13 @@ public:
         return (UINT32)offsetof(UMThunkMarshInfo, m_pILStub);
     }
 
-#ifdef TARGET_X86
+#if defined(TARGET_X86) && !defined(FEATURE_STUBS_AS_IL)
+    // Compiles an unmanaged to managed thunk for the given signature. The thunk
+    // will call the stub or, if fNoStub == TRUE, directly the managed target.
+    Stub *CompileNExportThunk(LoaderHeap *pLoaderHeap, PInvokeStaticSigInfo* pSigInfo, MetaSig *pMetaSig, BOOL fNoStub);
+#endif // TARGET_X86 && !FEATURE_STUBS_AS_IL
 
-#ifdef FEATURE_STUBS_AS_IL
+#if defined(TARGET_X86) && defined(FEATURE_STUBS_AS_IL)
     struct ArgumentRegisters
     {
         UINT32 Ecx;
@@ -194,23 +212,17 @@ public:
     };
 
     VOID SetupArguments(char *pSrc, ArgumentRegisters *pArgRegs, char *pDst);
-#else
-    // Compiles an unmanaged to managed thunk for the given signature. The thunk
-    // will call the stub or, if fNoStub == TRUE, directly the managed target.
-    Stub *CompileNExportThunk(LoaderHeap *pLoaderHeap, PInvokeStaticSigInfo* pSigInfo, MetaSig *pMetaSig, BOOL fNoStub);
-#endif // FEATURE_STUBS_AS_IL
-
-#endif // TARGET_X86
+#endif // TARGET_X86 && FEATURE_STUBS_AS_IL
 
 private:
     PCODE             m_pILStub;            // IL stub for marshaling
                                             // On x86, NULL for no-marshal signatures
                                             // On non-x86, the managed entrypoint for no-delegate no-marshal signatures
-#ifdef TARGET_X86
     UINT32            m_cbActualArgSize;    // caches m_pSig.SizeOfFrameArgumentArray()
                                             // On x86/Linux we have to augment with numRegistersUsed * STACK_ELEM_SIZE
+#if defined(TARGET_X86)
     UINT16            m_cbRetPop;           // stack bytes popped by callee (for UpdateRegDisplay)
-#ifdef FEATURE_STUBS_AS_IL
+#if defined(FEATURE_STUBS_AS_IL)
     UINT32            m_cbStackArgSize;     // stack bytes pushed for managed code
 #else
     Stub*             m_pExecStub;          // UMEntryThunk jumps directly here
@@ -531,15 +543,21 @@ private:
 // One-time creation of special prestub to initialize UMEntryThunks.
 //-------------------------------------------------------------------------
 Stub *GenerateUMThunkPrestub();
+#endif // TARGET_X86 && !FEATURE_STUBS_AS_IL
 
+//-------------------------------------------------------------------------
+// NExport stub
+//-------------------------------------------------------------------------
+#if  !defined(HOST_64BIT) && !defined(DACCESS_COMPILE) && !defined(CROSS_COMPILE)
 EXCEPTION_HANDLER_DECL(FastNExportExceptHandler);
 EXCEPTION_HANDLER_DECL(UMThunkPrestubHandler);
-
-#endif // TARGET_X86 && !FEATURE_STUBS_AS_IL
+#endif // HOST_64BIT
 
 extern "C" void TheUMEntryPrestub(void);
 extern "C" PCODE TheUMEntryPrestubWorker(UMEntryThunk * pUMEntryThunk);
 
+EXTERN_C void UMThunkStub(void);
+
 #ifdef _DEBUG
 void STDCALL LogUMTransition(UMEntryThunk* thunk);
 #endif
src/coreclr/src/vm/i386/asmhelpers.S
index c318dc390309612e5f272c4ed8594f8ea6bbc6c7..dcc210c98bb37f9e5bd155a312ab2dfd8aaaf9ce 100644
@@ -555,6 +555,22 @@ LEAF_ENTRY PrecodeFixupThunk, _TEXT
     jmp     C_FUNC(ThePreStub)
 LEAF_END PrecodeFixupThunk, _TEXT
 
+NESTED_ENTRY UMThunkStubRareDisable, _TEXT, NoHandler
+    push    eax
+    push    ecx
+
+    sub     esp, 12
+    push    eax          // Push the UMEntryThunk
+    push    ecx          // Push thread
+    CHECK_STACK_ALIGNMENT
+    call    C_FUNC(UMThunkStubRareDisableWorker)
+    add     esp, 12
+
+    pop     ecx
+    pop     eax
+    ret
+NESTED_END UMThunkStubRareDisable, _TEXT
+
 //
 // Used to get the current instruction pointer value
 //
src/coreclr/src/vm/ilstubcache.cpp
index 86299dfeeef0715b45fc6fce39a49b981545faaa..9e17fe18ed31225443f2db0ab8dbf52fe500c877 100644
@@ -279,9 +279,6 @@ MethodDesc* ILStubCache::CreateNewMethodDesc(LoaderHeap* pCreationHeap, MethodTa
         if (SF_IsReverseStub(dwStubFlags))
         {
             pMD->m_dwExtendedFlags |= DynamicMethodDesc::nomdReverseStub;
-#if !defined(TARGET_X86)
-            pMD->m_dwExtendedFlags |= DynamicMethodDesc::nomdNativeCallableStub;
-#endif
             pMD->GetILStubResolver()->SetStubType(ILStubResolver::NativeToCLRInteropStub);
         }
         else
src/coreclr/src/vm/jithelpers.cpp
index 658e4e6cdd07ed018fad696310dd72923be480e3..8d64b2629fffa931d10bd10851a81005eb351d9d 100644
@@ -5383,14 +5383,8 @@ NOINLINE static void JIT_ReversePInvokeEnterRare(ReversePInvokeFrame* frame)
     if (thread->PreemptiveGCDisabled())
         ReversePInvokeBadTransition();
 
-    frame->currentThread = thread;
-
     thread->DisablePreemptiveGC();
-}
-
-NOINLINE static void JIT_ReversePInvokeEnterRare2(ReversePInvokeFrame* frame)
-{
-    frame->currentThread->RareDisablePreemptiveGC();
+    frame->currentThread = thread;
 }
 
 EXTERN_C void JIT_ReversePInvokeEnter(ReversePInvokeFrame* frame)
@@ -5403,17 +5397,13 @@ EXTERN_C void JIT_ReversePInvokeEnter(ReversePInvokeFrame* frame)
     if (thread != NULL
         && !thread->PreemptiveGCDisabled())
     {
-        frame->currentThread = thread;
-
         // Manually inline the fast path in Thread::DisablePreemptiveGC().
         thread->m_fPreemptiveGCDisabled.StoreWithoutBarrier(1);
         if (g_TrapReturningThreads.LoadWithoutBarrier() == 0)
         {
+            frame->currentThread = thread;
             return;
         }
-
-        JIT_ReversePInvokeEnterRare2(frame);
-        return;
     }
 
     JIT_ReversePInvokeEnterRare(frame);
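
The hunk above is easier to read with the underlying pattern in mind: JIT_ReversePInvokeEnter attempts a manually inlined switch into cooperative GC mode and falls back to the out-of-line rare helper only when no Thread exists yet or a GC is trapping returning threads; the revert moves the frame->currentThread assignment back into the fast path's success branch and folds the Rare2 split back into a single rare helper. The sketch below shows only that fast-path/rare-path shape; the names and types are simplified stand-ins, not the CoreCLR declarations.

    #include <atomic>
    #include <cassert>

    // Simplified stand-ins for the runtime types used by the helper.
    struct Thread { std::atomic<int> preemptiveGCDisabled{0}; };
    struct ReversePInvokeFrame { Thread* currentThread = nullptr; };

    static std::atomic<int> g_trapReturningThreads{0};
    static thread_local Thread* t_thread = nullptr;     // stand-in for the runtime's TLS Thread*

    // Out-of-line rare path: attach a Thread if needed and do the full transition
    // into cooperative mode (the real code also cooperates with a pending GC here).
    static void ReversePInvokeEnterRareSketch(ReversePInvokeFrame* frame)
    {
        if (t_thread == nullptr)
            t_thread = new Thread();                    // stand-in for thread setup; leaked for brevity

        t_thread->preemptiveGCDisabled.store(1);
        frame->currentThread = t_thread;
    }

    // Fast path: manually inlined mode switch; bail out to the rare path when there is
    // no Thread yet or a GC is trapping returning threads.
    static void ReversePInvokeEnterSketch(ReversePInvokeFrame* frame)
    {
        Thread* thread = t_thread;
        if (thread != nullptr && thread->preemptiveGCDisabled.load() == 0)
        {
            thread->preemptiveGCDisabled.store(1);
            if (g_trapReturningThreads.load() == 0)
            {
                frame->currentThread = thread;          // the revert moves this assignment here
                return;
            }
        }
        ReversePInvokeEnterRareSketch(frame);
    }

    int main()
    {
        ReversePInvokeFrame frame;
        ReversePInvokeEnterSketch(&frame);
        assert(frame.currentThread != nullptr);
    }
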
src/coreclr/src/vm/jitinterface.cpp
index 1f71c90307aab85a4e4f23f1586bab2faa10a01b..356ee2b4352e23b081a19f78bfc9092f5823a2b4 100644
@@ -9209,10 +9209,10 @@ void CEEInfo::getFunctionFixedEntryPoint(CORINFO_METHOD_HANDLE   ftn,
 
     pResult->accessType = IAT_VALUE;
 
-#if defined(TARGET_X86) && !defined(CROSSGEN_COMPILE)
+// Also see GetBaseCompileFlags() below for an additional check.
+#if defined(TARGET_X86) && defined(TARGET_WINDOWS) && !defined(CROSSGEN_COMPILE)
     // Deferring X86 support until a need is observed or
     // time permits investigation into all the potential issues.
-    // https://github.com/dotnet/runtime/issues/33582
     if (pMD->HasNativeCallableAttribute())
     {
         pResult->addr = (void*)COMDelegate::ConvertToCallback(pMD);
@@ -9221,9 +9221,12 @@ void CEEInfo::getFunctionFixedEntryPoint(CORINFO_METHOD_HANDLE   ftn,
     {
         pResult->addr = (void*)pMD->GetMultiCallableAddrOfCode();
     }
+
 #else
+
     pResult->addr = (void*)pMD->GetMultiCallableAddrOfCode();
-#endif
+
+#endif // !(TARGET_X86 && TARGET_WINDOWS) || CROSSGEN_COMPILE
 
     EE_TO_JIT_TRANSITION();
 }
@@ -12438,10 +12441,10 @@ CorJitResult CallCompileMethodWithSEHWrapper(EEJitManager *jitMgr,
          }
     }
 
-#if !defined(TARGET_X86)
+#if !defined(TARGET_X86) || !defined(TARGET_WINDOWS)
     if (ftn->HasNativeCallableAttribute())
         flags.Set(CORJIT_FLAGS::CORJIT_FLAG_REVERSE_PINVOKE);
-#endif // !TARGET_X86
+#endif // !TARGET_X86 || !TARGET_WINDOWS
 
     return flags;
 }
src/coreclr/src/vm/method.cpp
index a458fa9d65cd62ae9dd644830baa188189e37217..ee5b6b5876f68c8f9d5669d8d2a3d6bf45a24be0 100644
@@ -5415,11 +5415,6 @@ BOOL MethodDesc::HasNativeCallableAttribute()
     }
     CONTRACTL_END;
 
-    if (IsILStub())
-    {
-        return AsDynamicMethodDesc()->IsNativeCallableStub();
-    }
-
     HRESULT hr = GetCustomAttribute(
         WellKnownAttribute::NativeCallable,
         nullptr,
src/coreclr/src/vm/method.hpp
index 5973cd6d6053fc8a93bd24773c98acf8fc906a6d..9a558d293084bf5521cc43a7d1caa5ca72cfae93 100644
@@ -2618,7 +2618,6 @@ protected:
         nomdMulticastStub         = 0x1000,
         nomdUnboxingILStub        = 0x2000,
         nomdWrapperDelegateStub   = 0x4000,
-        nomdNativeCallableStub    = 0x8000,
 
         nomdILStub          = 0x00010000,
         nomdLCGMethod       = 0x00020000,
@@ -2711,7 +2710,6 @@ public:
     }
 
     bool IsReverseStub()     { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(IsILStub()); return (0 != (m_dwExtendedFlags & nomdReverseStub));  }
-    bool IsNativeCallableStub() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(IsILStub()); return (0 != (m_dwExtendedFlags & nomdNativeCallableStub)); }
     bool IsCALLIStub()       { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(IsILStub()); return (0 != (m_dwExtendedFlags & nomdCALLIStub));    }
     bool IsDelegateStub()    { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(IsILStub()); return (0 != (m_dwExtendedFlags & nomdDelegateStub)); }
     bool IsCLRToCOMStub()    { LIMITED_METHOD_CONTRACT; _ASSERTE(IsILStub()); return ((0 == (m_dwExtendedFlags & mdStatic)) && !IsReverseStub() && !IsDelegateStub() && !IsStructMarshalStub()); }
src/coreclr/src/zap/zapinfo.cpp
index f65ec7c8aaf1ff73875b4c461cf6fa85fadad1bb..d09204bddb806701823506993637d0aebb941588 100644
@@ -482,14 +482,14 @@ void ZapInfo::CompileMethod()
     }
 #endif
 
-#ifdef TARGET_X86
+#if defined(TARGET_X86) && defined(TARGET_WINDOWS)
     if (GetCompileInfo()->IsNativeCallableMethod(m_currentMethodHandle))
     {
         if (m_zapper->m_pOpt->m_verbose)
             m_zapper->Warning(W("ReadyToRun:  Methods with NativeCallableAttribute not implemented\n"));
         ThrowHR(E_NOTIMPL);
     }
-#endif // TARGET_X86
+#endif // TARGET_X86 && TARGET_WINDOWS
 
     if (m_pImage->m_stats)
     {
@@ -2285,14 +2285,14 @@ void ZapInfo::getCallInfo(CORINFO_RESOLVED_TOKEN * pResolvedToken,
     }
 #endif
 
-#ifdef TARGET_X86
+#if defined(TARGET_X86) && defined(TARGET_WINDOWS)
     if (GetCompileInfo()->IsNativeCallableMethod(pResult->hMethod))
     {
         if (m_zapper->m_pOpt->m_verbose)
             m_zapper->Warning(W("ReadyToRun: References to methods with NativeCallableAttribute not implemented\n"));
         ThrowHR(E_NOTIMPL);
     }
-#endif // TARGET_X86
+#endif // TARGET_X86 && TARGET_WINDOWS
 
     if (flags & CORINFO_CALLINFO_KINDONLY)
         return;