Optimize vtable calls (#20696)
author Fadi Hanna <fadim@microsoft.com>
Tue, 13 Nov 2018 20:44:49 +0000 (12:44 -0800)
committer GitHub <noreply@github.com>
Tue, 13 Nov 2018 20:44:49 +0000 (12:44 -0800)
* Implementation of R2R vtable call thunks. These thunks fetch the target code pointer from the vtable of the input thisPtr and jump to that address.
This is especially helpful with generics, since it avoids the cost of a generic dictionary lookup for what is otherwise a simple vtable call.
Overall, these thunks cause fewer branch mispredictions and give vtable calls a small performance boost.

These stubs are managed by VirtualCallStubManager so that the managed debugger can step through them.
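
In rough terms, each thunk does a double indirection and a tail jump, with both vtable offsets baked in per slot. A minimal C++ sketch of the equivalent logic (illustrative only; the function and parameter names below are invented, and the real stubs are a handful of hand-encoded instructions per architecture, shown in the per-architecture diffs below):

    #include <cstddef>
    #include <cstdint>

    typedef void (*TargetFn)(void* thisPtr);

    // Model of what a VTableCallStub does for one precomputed slot. CoreCLR vtables are
    // chunked: the MethodTable holds pointers to chunks, and the code pointer lives at a
    // fixed index within one chunk, so both offsets are known when the stub is emitted.
    TargetFn ResolveVTableSlot(void* thisPtr, size_t offsetOfIndirection, size_t offsetAfterIndirection)
    {
        // 1. Load the MethodTable pointer from the object        (e.g. mov rax,[rcx])
        uint8_t* pMT = *reinterpret_cast<uint8_t**>(thisPtr);
        // 2. Load the vtable chunk pointer at a fixed offset     (mov rax,[rax+offsetOfIndirection])
        uint8_t* pChunk = *reinterpret_cast<uint8_t**>(pMT + offsetOfIndirection);
        // 3. Tail-jump to the code pointer stored in the chunk   (jmp qword ptr [rax+offsetAfterIndirection])
        return *reinterpret_cast<TargetFn*>(pChunk + offsetAfterIndirection);
    }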

src/vm/amd64/virtualcallstubcpu.hpp
src/vm/arm/stubs.cpp
src/vm/arm/virtualcallstubcpu.hpp
src/vm/arm64/virtualcallstubcpu.hpp
src/vm/i386/virtualcallstubcpu.hpp
src/vm/jitinterface.cpp
src/vm/loaderallocator.cpp
src/vm/prestub.cpp
src/vm/virtualcallstub.cpp
src/vm/virtualcallstub.h
src/zap/zapinfo.cpp

diff --git a/src/vm/amd64/virtualcallstubcpu.hpp b/src/vm/amd64/virtualcallstubcpu.hpp
index 1bfe858..7547559 100644
@@ -63,7 +63,7 @@ struct LookupStub
     inline PCODE entryPoint()           { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; }
 
     inline size_t  token()              { LIMITED_METHOD_CONTRACT; return _token; }
-    inline size_t       size()          { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); }
+    inline size_t  size()               { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); }
 
 private:
     friend struct LookupHolder;
@@ -430,6 +430,65 @@ struct ResolveHolder
 private:
     ResolveStub _stub;
 };
+
+/*VTableCallStub**************************************************************************************
+These are jump stubs that perform a vtable-based virtual call. These stubs assume that an object is placed
+in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the 
+vtable pointer, and finally jumps to the target method at a given slot in the vtable.
+*/
+struct VTableCallStub
+{
+    friend struct VTableCallHolder;
+
+    inline size_t size()
+    {
+        LIMITED_METHOD_CONTRACT;
+
+        BYTE* pStubCode = (BYTE *)this;
+
+        size_t cbSize = 3;                                      // First mov instruction
+        cbSize += (pStubCode[cbSize + 2] == 0x80 ? 7 : 4);      // Either 48 8B 80 or 48 8B 40: mov rax,[rax+offset]
+        cbSize += (pStubCode[cbSize + 1] == 0xa0 ? 6 : 3);      // Either FF A0 or FF 60: jmp qword ptr [rax+slot]
+        cbSize += 4;                                            // Slot value (data storage, not a real instruction)
+
+        return cbSize;
+    }
+
+    inline PCODE        entryPoint()        const { LIMITED_METHOD_CONTRACT;  return (PCODE)&_entryPoint[0]; }
+
+    inline size_t token() 
+    { 
+        LIMITED_METHOD_CONTRACT;
+        DWORD slot = *(DWORD*)(reinterpret_cast<BYTE*>(this) + size() - 4);
+        return DispatchToken::CreateDispatchToken(slot).To_SIZE_T();
+    }
+
+private:
+    BYTE    _entryPoint[0];         // Dynamically sized stub. See Initialize() for more details.
+};
+
+/* VTableCallHolders are the containers for VTableCallStubs; they provide for any alignment of
+stubs as necessary.  */
+struct VTableCallHolder
+{
+    void  Initialize(unsigned slot);
+
+    VTableCallStub* stub() { LIMITED_METHOD_CONTRACT;  return reinterpret_cast<VTableCallStub *>(this); }
+
+    static size_t GetHolderSize(unsigned slot)
+    {
+        STATIC_CONTRACT_WRAPPER;
+        unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+        unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+        return 3 + (offsetOfIndirection >= 0x80 ? 7 : 4) + (offsetAfterIndirection >= 0x80 ? 6 : 3) + 4;
+    }
+
+    static VTableCallHolder* VTableCallHolder::FromVTableCallEntry(PCODE entry) { LIMITED_METHOD_CONTRACT; return (VTableCallHolder*)entry; }
+
+private:
+    // VTableCallStub follows here. It is dynamically sized on allocation because it could 
+    // use short/long instruction sizes for mov/jmp, depending on the slot value.
+};
 #pragma pack(pop)
 
 #ifdef DECLARE_DATA
@@ -732,6 +791,54 @@ ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry)
     return resolveHolder;
 }
 
+void VTableCallHolder::Initialize(unsigned slot)
+{
+    unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    _ASSERTE(MethodTable::VTableIndir_t::isRelative == false /* TODO: NYI */);
+
+    VTableCallStub* pStub = stub();
+    BYTE* p = (BYTE*)pStub->entryPoint();
+
+#ifdef UNIX_AMD64_ABI
+    // mov rax,[rdi] : rax = MethodTable pointer
+    *(UINT32 *)p = 0x078b48; p += 3;
+#else
+    // mov rax,[rcx] : rax = MethodTable pointer
+    *(UINT32 *)p = 0x018b48; p += 3;
+#endif
+
+    // mov rax,[rax+vtable offset] : rax = vtable pointer
+    if (offsetOfIndirection >= 0x80)
+    {
+        *(UINT32*)p = 0x00808b48; p += 3;
+        *(UINT32*)p = offsetOfIndirection; p += 4;
+    }
+    else
+    {
+        *(UINT32*)p = 0x00408b48; p += 3;
+        *p++ = (BYTE)offsetOfIndirection;
+    }
+
+    // jmp qword ptr [rax+slot]
+    if (offsetAfterIndirection >= 0x80)
+    {
+        *(UINT32*)p = 0xa0ff; p += 2;
+        *(UINT32*)p = offsetAfterIndirection; p += 4;
+    }
+    else
+    {
+        *(UINT16*)p = 0x60ff; p += 2;
+        *p++ = (BYTE)offsetAfterIndirection;
+    }
+
+    // Store the slot value here for convenience. Not a real instruction (unreachable anyways)
+    *(UINT32*)p = slot; p += 4;
+
+    _ASSERT(p == (BYTE*)stub()->entryPoint() + VTableCallHolder::GetHolderSize(slot));
+    _ASSERT(stub()->size() == VTableCallHolder::GetHolderSize(slot));
+}
+
 VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE stubStartAddress)
 {
 #ifdef DACCESS_COMPILE
@@ -763,6 +870,10 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s
         {
             stubKind = SK_LOOKUP;
         }
+        else if (firstWord == 0x8B48)
+        {
+            stubKind = SK_VTABLECALL;
+        }
         else
         {
             BYTE firstByte  = ((BYTE*) stubStartAddress)[0];
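
The AMD64 stub writer above picks between the short (disp8) and long (disp32) x64 addressing forms, which is why size() and GetHolderSize() branch on 0x80. A sketch of the resulting size arithmetic, mirroring GetHolderSize (function name is illustrative; the byte sequences are the standard x64 encodings):

    #include <cstddef>

    // mov rax,[rcx]                      : 48 8B 01                            -> 3 bytes
    // mov rax,[rax+disp8] / [rax+disp32] : 48 8B 40 xx / 48 8B 80 xx xx xx xx  -> 4 or 7 bytes
    // jmp qword ptr [rax+disp8]/[disp32] : FF 60 xx    / FF A0 xx xx xx xx     -> 3 or 6 bytes
    // trailing slot value (data read back by token())                          -> 4 bytes
    size_t Amd64VTableStubSize(unsigned offsetOfIndirection, unsigned offsetAfterIndirection)
    {
        return 3
             + (offsetOfIndirection    >= 0x80 ? 7 : 4)
             + (offsetAfterIndirection >= 0x80 ? 6 : 3)
             + 4;
    }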
diff --git a/src/vm/arm/stubs.cpp b/src/vm/arm/stubs.cpp
index 01d8f31..d863900 100644
@@ -3387,6 +3387,16 @@ void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target)
 }
 #endif // FEATURE_COMINTEROP
 
+void MovRegImm(BYTE* p, int reg, TADDR imm)
+{
+    LIMITED_METHOD_CONTRACT;
+    *(WORD *)(p + 0) = 0xF240;
+    *(WORD *)(p + 2) = (UINT16)(reg << 8);
+    *(WORD *)(p + 4) = 0xF2C0;
+    *(WORD *)(p + 6) = (UINT16)(reg << 8);
+    PutThumb2Mov32((UINT16 *)p, imm);
+}
+
 #ifndef DACCESS_COMPILE
 
 #ifndef CROSSGEN_COMPILE
@@ -3411,16 +3421,6 @@ void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target)
     ClrFlushInstructionCache(pStart, cbAligned); \
     return (PCODE)((TADDR)pStart | THUMB_CODE)
 
-static void MovRegImm(BYTE* p, int reg, TADDR imm)
-{
-    LIMITED_METHOD_CONTRACT;
-    *(WORD *)(p + 0) = 0xF240;
-    *(WORD *)(p + 2) = (UINT16)(reg << 8);
-    *(WORD *)(p + 4) = 0xF2C0;
-    *(WORD *)(p + 6) = (UINT16)(reg << 8);
-    PutThumb2Mov32((UINT16 *)p, imm);
-}
-
 PCODE DynamicHelpers::CreateHelper(LoaderAllocator * pAllocator, TADDR arg, PCODE target)
 {
     STANDARD_VM_CONTRACT;
diff --git a/src/vm/arm/virtualcallstubcpu.hpp b/src/vm/arm/virtualcallstubcpu.hpp
index a1e15d3..6dc99e5 100644
@@ -55,9 +55,9 @@ get quickly changed to point to another kind of stub.
 */
 struct LookupStub
 {
-    inline PCODE entryPoint()           { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0] + THUMB_CODE; }
-    inline size_t  token() { LIMITED_METHOD_CONTRACT; return _token; }
-    inline size_t       size()          { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); }
+    inline PCODE entryPoint()       { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0] + THUMB_CODE; }
+    inline size_t token()           { LIMITED_METHOD_CONTRACT; return _token; }
+    inline size_t size()            { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); }
 
 private:
     friend struct LookupHolder;
@@ -259,6 +259,87 @@ struct ResolveHolder
 private:
     ResolveStub _stub;
 };
+
+/*VTableCallStub**************************************************************************************
+These are jump stubs that perform a vtable-based virtual call. These stubs assume that an object is placed
+in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the
+vtable pointer, and finally jumps to the target method at a given slot in the vtable.
+*/
+struct VTableCallStub
+{
+    friend struct VTableCallHolder;
+
+    inline size_t size()
+    {
+        LIMITED_METHOD_CONTRACT;
+
+        BYTE* pStubCode = (BYTE *)this;
+
+        size_t cbSize = 4;                                      // First ldr instruction
+
+        // If we never save r0 to the red zone, we have the short version of the stub
+        if (*(UINT32*)(&pStubCode[cbSize]) != 0x0c04f84d)
+        {
+            return 
+                4 +         // ldr r12,[r0]
+                4 +         // ldr r12,[r12+offset]
+                4 +         // ldr r12,[r12+offset]
+                2 +         // bx r12
+                4;          // Slot value (data storage, not a real instruction)
+        }
+
+        cbSize += 4;                                                    // Saving r0 into red zone
+        cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12);    // Loading of vtable into r12
+        cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12);    // Loading of target address into r12
+
+        return cbSize + 6 /* Restore r0, bx*/ + 4 /* Slot value */;
+    }
+
+    inline PCODE entryPoint() const { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0] + THUMB_CODE; }
+
+    inline size_t token()
+    {
+        LIMITED_METHOD_CONTRACT;
+        DWORD slot = *(DWORD*)(reinterpret_cast<BYTE*>(this) + size() - 4);
+        return DispatchToken::CreateDispatchToken(slot).To_SIZE_T();
+    }
+
+private:
+    BYTE    _entryPoint[0];         // Dynamically sized stub. See Initialize() for more details.
+};
+
+/* VTableCallHolders are the containers for VTableCallStubs; they provide for any alignment of
+stubs as necessary.  */
+struct VTableCallHolder
+{
+    void  Initialize(unsigned slot);
+
+    VTableCallStub* stub() { LIMITED_METHOD_CONTRACT;  return reinterpret_cast<VTableCallStub *>(this); }
+
+    static size_t GetHolderSize(unsigned slot)
+    {
+        STATIC_CONTRACT_WRAPPER;
+        unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+        unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+       
+        int indirectionsSize = (offsetOfIndirection > 0xFFF ? 12 : 4) + (offsetAfterIndirection > 0xFFF ? 12 : 4);
+        if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF)
+            indirectionsSize += 8;    // Save/restore r0 using red zone
+
+        return 6 + indirectionsSize + 4;
+    }
+
+    static VTableCallHolder* VTableCallHolder::FromVTableCallEntry(PCODE entry) 
+    {
+        LIMITED_METHOD_CONTRACT;
+        return (VTableCallHolder*)(entry & ~THUMB_CODE);
+    }
+
+private:
+    // VTableCallStub follows here. It is dynamically sized on allocation because it could 
+    // use short/long instruction sizes for the mov/jmp, depending on the slot value.
+};
+
 #include <poppack.h>
 
 
@@ -324,6 +405,69 @@ ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry)
     return resolveHolder;
 }
 
+void MovRegImm(BYTE* p, int reg, TADDR imm);
+
+void VTableCallHolder::Initialize(unsigned slot)
+{
+    unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    _ASSERTE(MethodTable::VTableIndir_t::isRelative == false /* TODO: NYI */);
+
+    VTableCallStub* pStub = stub();
+    BYTE* p = (BYTE*)(pStub->entryPoint() & ~THUMB_CODE);
+
+    // ldr r12,[r0] : r12 = MethodTable pointer
+    *(UINT32*)p = 0xc000f8d0; p += 4;
+
+    if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF)
+    {
+        // str r0, [sp, #-4]. Save r0 in the red zone
+        *(UINT32*)p = 0x0c04f84d; p += 4;
+    }
+
+    if (offsetOfIndirection > 0xFFF)
+    {
+        // mov r0, offsetOfIndirection
+        MovRegImm(p, 0, offsetOfIndirection); p += 8;
+        // ldr r12, [r12, r0]
+        *(UINT32*)p = 0xc000f85c; p += 4;
+    }
+    else
+    {
+        // ldr r12, [r12 + offset]
+        *(WORD *)p = 0xf8dc; p += 2;
+        *(WORD *)p = (WORD)(offsetOfIndirection | 0xc000); p += 2;
+    }
+
+    if (offsetAfterIndirection > 0xFFF)
+    {
+        // mov r0, offsetAfterIndirection
+        MovRegImm(p, 0, offsetAfterIndirection); p += 8;
+        // ldr r12, [r12, r0]
+        *(UINT32*)p = 0xc000f85c; p += 4;
+    }
+    else
+    {
+        // ldr r12, [r12 + offset]
+        *(WORD *)p = 0xf8dc; p += 2;
+        *(WORD *)p = (WORD)(offsetAfterIndirection | 0xc000); p += 2;
+    }
+
+    if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF)
+    {
+        // ldr r0, [sp, #-4]. Restore r0 from the red zone.
+        *(UINT32*)p = 0x0c04f85d; p += 4;
+    }
+
+    // bx r12
+    *(UINT16*)p = 0x4760; p += 2;
+
+    // Store the slot value here for convenience. Not a real instruction (unreachable anyways)
+    *(UINT32*)p = slot; p += 4;
+
+    _ASSERT(p == (BYTE*)(stub()->entryPoint() & ~THUMB_CODE) + VTableCallHolder::GetHolderSize(slot));
+    _ASSERT(stub()->size() == VTableCallHolder::GetHolderSize(slot));
+}
 
 #endif // DACCESS_COMPILE
 
@@ -347,23 +491,35 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s
 
         WORD firstWord = *((WORD*) pInstr);
 
-        //Assuming that RESOLVE_STUB_FIRST_WORD & DISPATCH_STUB_FIRST_WORD have same values
-        if (firstWord == DISPATCH_STUB_FIRST_WORD)
+        if (*((UINT32*)pInstr) == 0xc000f8d0)
         {
+            // Confirm the third word belongs to the vtable stub pattern
             WORD thirdWord = ((WORD*)pInstr)[2];
-            if(thirdWord == 0xf84d)
+            if (thirdWord == 0xf84d /* Part of str r0, [sp, #-4] */  || 
+                thirdWord == 0xf8dc /* Part of ldr r12, [r12 + offset] */)
+                stubKind = SK_VTABLECALL;
+        }
+
+        if (stubKind == SK_UNKNOWN)
+        {
+            //Assuming that RESOLVE_STUB_FIRST_WORD & DISPATCH_STUB_FIRST_WORD have same values
+            if (firstWord == DISPATCH_STUB_FIRST_WORD)
             {
-                stubKind = SK_DISPATCH;
+                WORD thirdWord = ((WORD*)pInstr)[2];
+                if (thirdWord == 0xf84d)
+                {
+                    stubKind = SK_DISPATCH;
+                }
+                else if (thirdWord == 0xb460)
+                {
+                    stubKind = SK_RESOLVE;
+                }
             }
-            else if(thirdWord == 0xb460)
+            else if (firstWord == 0xf8df)
             {
-                stubKind = SK_RESOLVE;
+                stubKind = SK_LOOKUP;
             }
         }
-        else if (firstWord == 0xf8df)
-        {
-            stubKind = SK_LOOKUP;
-        }
     }
     EX_CATCH
     {
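
On ARM (Thumb-2), a single "ldr r12,[r12,#imm]" only reaches a 12-bit offset, so the long path materializes the offset in r0 via movw/movt and spills r0 to the red zone around the sequence. A sketch of the size arithmetic, mirroring GetHolderSize above (function name is illustrative):

    #include <cstddef>

    // ldr r12,[r0]                    : 4 bytes
    // each indirection                : 4 bytes (imm12 form) or 12 bytes (movw/movt r0 + ldr r12,[r12,r0])
    // str/ldr r0 to/from the red zone : 8 bytes total, only when a long form is needed
    // bx r12                          : 2 bytes, followed by 4 bytes of trailing slot data
    size_t ArmVTableStubSize(unsigned offsetOfIndirection, unsigned offsetAfterIndirection)
    {
        int indirectionsSize = (offsetOfIndirection   > 0xFFF ? 12 : 4)
                             + (offsetAfterIndirection > 0xFFF ? 12 : 4);
        if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF)
            indirectionsSize += 8;          // save/restore r0 using the red zone
        return 6 + indirectionsSize + 4;    // 6 = ldr r12,[r0] (4) + bx r12 (2)
    }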
diff --git a/src/vm/arm64/virtualcallstubcpu.hpp b/src/vm/arm64/virtualcallstubcpu.hpp
index 3f22518..c7b3f75 100644
@@ -9,6 +9,7 @@
 
 #define DISPATCH_STUB_FIRST_DWORD 0xf940000d
 #define RESOLVE_STUB_FIRST_DWORD 0xF940000C
+#define VTABLECALL_STUB_FIRST_DWORD 0xF9400009
 
 struct ARM64EncodeHelpers
 {
@@ -386,6 +387,87 @@ private:
     ResolveStub _stub;
 };
 
+
+/*VTableCallStub**************************************************************************************
+These are jump stubs that perform a vtable-based virtual call. These stubs assume that an object is placed
+in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the
+vtable pointer, and finally jumps to the target method at a given slot in the vtable.
+*/
+struct VTableCallStub
+{
+    friend struct VTableCallHolder;
+
+    inline size_t size()
+    {
+        LIMITED_METHOD_CONTRACT;
+
+        BYTE* pStubCode = (BYTE *)this;
+
+        int numDataSlots = 0;
+
+        size_t cbSize = 4;              // First ldr instruction
+
+        for (int i = 0; i < 2; i++)
+        {
+            if (((*(DWORD*)(&pStubCode[cbSize])) & 0xFFC003FF) == 0xF9400129)
+            {
+                // ldr x9, [x9, #offsetOfIndirection]
+                cbSize += 4;
+            }
+            else
+            {
+                // These 2 instructions are used when the indirection offset is >= 0x8000
+                // ldr w10, [PC, #dataOffset]
+                // ldr x9, [x9, x10]
+                numDataSlots++;
+                cbSize += 8;
+            }
+        }
+        return cbSize +
+                4 +                     // Last 'br x9' instruction
+                (numDataSlots * 4) +    // Data slots containing indirection offset values
+                4;                      // Slot value (data storage, not a real instruction)
+    }
+
+    inline PCODE        entryPoint()        const { LIMITED_METHOD_CONTRACT;  return (PCODE)&_entryPoint[0]; }
+
+    inline size_t token()
+    {
+        LIMITED_METHOD_CONTRACT;
+        DWORD slot = *(DWORD*)(reinterpret_cast<BYTE*>(this) + size() - 4);
+        return DispatchToken::CreateDispatchToken(slot).To_SIZE_T();
+    }
+
+private:
+    BYTE    _entryPoint[0];         // Dynamically sized stub. See Initialize() for more details.
+};
+
+/* VTableCallHolders are the containers for VTableCallStubs; they provide for any alignment of
+stubs as necessary.  */
+struct VTableCallHolder
+{
+    void  Initialize(unsigned slot);
+
+    VTableCallStub* stub() { LIMITED_METHOD_CONTRACT;  return reinterpret_cast<VTableCallStub *>(this); }
+
+    static size_t GetHolderSize(unsigned slot)
+    {
+        STATIC_CONTRACT_WRAPPER;
+        unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+        unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+        int indirectionsCodeSize = (offsetOfIndirection >= 0x8000 ? 8 : 4) + (offsetAfterIndirection >= 0x8000 ? 8 : 4);
+        int indirectionsDataSize = (offsetOfIndirection >= 0x8000 ? 4 : 0) + (offsetAfterIndirection >= 0x8000 ? 4 : 0);
+        return 8 + indirectionsCodeSize + indirectionsDataSize + 4;
+    }
+
+    static VTableCallHolder* VTableCallHolder::FromVTableCallEntry(PCODE entry) { LIMITED_METHOD_CONTRACT; return (VTableCallHolder*)entry; }
+
+private:
+    // VTableCallStub follows here. It is dynamically sized on allocation because it could 
+    // use short/long instruction sizes for LDR, depending on the slot value.
+};
+
+
 #ifdef DECLARE_DATA
 
 #ifndef DACCESS_COMPILE
@@ -403,6 +485,78 @@ ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry)
     return resolveHolder;
 }
 
+void VTableCallHolder::Initialize(unsigned slot)
+{
+    unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    _ASSERTE(MethodTable::VTableIndir_t::isRelative == false /* TODO: NYI */);
+
+    int indirectionsCodeSize = (offsetOfIndirection >= 0x8000 ? 8 : 4) + (offsetAfterIndirection >= 0x8000 ? 8 : 4);
+    int indirectionsDataSize = (offsetOfIndirection >= 0x8000 ? 4 : 0) + (offsetAfterIndirection >= 0x8000 ? 4 : 0);
+    int codeSize = 8 + indirectionsCodeSize + indirectionsDataSize;
+
+    VTableCallStub* pStub = stub();
+    BYTE* p = (BYTE*)pStub->entryPoint();
+
+    // ldr x9,[x0] : x9 = MethodTable pointer
+    *(UINT32*)p = 0xF9400009; p += 4;
+
+    // Offset of the data slot relative to PC. Currently points to the first indirection offset data.
+    uint dataOffset = codeSize - indirectionsDataSize - 4;
+
+    if (offsetOfIndirection >= 0x8000)
+    {
+        // ldr w10, [PC, #dataOffset]
+        *(DWORD*)p = 0x1800000a | ((dataOffset >> 2) << 5); p += 4;
+        // ldr x9, [x9, x10]
+        *(DWORD*)p = 0xf86a6929; p += 4;
+
+        // move to next indirection offset data
+        dataOffset = dataOffset - 8 + 4; // subtract 8 as we have moved PC by 8 and add 4 as next data is at 4 bytes from previous data
+    }
+    else
+    {
+        // ldr x9, [x9, #offsetOfIndirection]
+        *(DWORD*)p = 0xf9400129 | (((UINT32)offsetOfIndirection >> 3) << 10);
+        p += 4;
+    }
+
+    if (offsetAfterIndirection >= 0x8000)
+    {
+        // ldr w10, [PC, #dataOffset]
+        *(DWORD*)p = 0x1800000a | ((dataOffset >> 2) << 5); p += 4;
+        // ldr x9, [x9, x10]
+        *(DWORD*)p = 0xf86a6929; p += 4;
+    }
+    else
+    {
+        // ldr x9, [x9, #offsetAfterIndirection]
+        *(DWORD*)p = 0xf9400129 | (((UINT32)offsetAfterIndirection >> 3) << 10);
+        p += 4;
+    }
+
+    // br x9
+    *(UINT32*)p = 0xd61f0120; p += 4;
+
+    // data labels:
+    if (offsetOfIndirection >= 0x8000)
+    {
+        *(UINT32*)p = (UINT32)offsetOfIndirection;
+        p += 4;
+    }
+    if (offsetAfterIndirection >= 0x8000)
+    {
+        *(UINT32*)p = (UINT32)offsetAfterIndirection; 
+        p += 4;
+    }
+
+    // Store the slot value here for convenience. Not a real instruction (unreachable anyways)
+    // NOTE: Not counted in codeSize above.
+    *(UINT32*)p = slot; p += 4;
+
+    _ASSERT(p == (BYTE*)stub()->entryPoint() + VTableCallHolder::GetHolderSize(slot));
+    _ASSERT(stub()->size() == VTableCallHolder::GetHolderSize(slot));
+}
 
 #endif // DACCESS_COMPILE
 
@@ -435,6 +589,10 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s
         {
             stubKind = SK_RESOLVE;
         }
+        else if (firstDword == VTABLECALL_STUB_FIRST_DWORD) // assembly of first instruction of VTableCallStub : ldr x9, [x0]
+        {
+            stubKind = SK_VTABLECALL;
+        }
         else if (firstDword == 0x10000089) // assembly of first instruction of LookupStub : adr x9, _resolveWorkerTarget
         {
             stubKind = SK_LOOKUP;
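
On ARM64, "ldr x9,[x9,#imm]" encodes a 12-bit unsigned immediate scaled by 8, so offsets up to 0x7FF8 fit in one instruction; anything at or above 0x8000 goes through a PC-relative load from a trailing data slot, which is what the >= 0x8000 branches above implement. A sketch of the two encodings used by Initialize (helper names are illustrative; the opcodes come straight from the diff):

    #include <cstdint>

    // ldr x9,[x9,#byteOffset] -- assumes byteOffset < 0x8000 and 8-byte aligned
    uint32_t EncodeLdrX9FromX9(uint32_t byteOffset)
    {
        return 0xf9400129u | ((byteOffset >> 3) << 10);
    }

    // ldr w10,[pc,#dataOffset] -- dataOffset is the distance in bytes to the data slot
    uint32_t EncodeLdrW10PcRel(uint32_t dataOffset)
    {
        return 0x1800000au | ((dataOffset >> 2) << 5);
    }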
diff --git a/src/vm/i386/virtualcallstubcpu.hpp b/src/vm/i386/virtualcallstubcpu.hpp
index 67737a2..3bdae8c 100644
@@ -57,9 +57,9 @@ get quickly changed to point to another kind of stub.
 */
 struct LookupStub
 {
-    inline PCODE entryPoint()           { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; }
-    inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; }
-    inline size_t       size()          { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); }
+    inline PCODE entryPoint()       { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; }
+    inline size_t token()           { LIMITED_METHOD_CONTRACT; return _token; }
+    inline size_t size()            { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); }
 
 private:
     friend struct LookupHolder;
@@ -357,6 +357,66 @@ private:
     BYTE pad[(sizeof(void*)-((sizeof(ResolveStub))%sizeof(void*))+offsetof(ResolveStub,_token))%sizeof(void*)];        //fill out DWORD
 //#endif
 };
+
+/*VTableCallStub**************************************************************************************
+These are jump stubs that perform a vtable-based virtual call. These stubs assume that an object is placed
+in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the
+vtable pointer, and finally jumps to the target method at a given slot in the vtable.
+*/
+struct VTableCallStub
+{
+    friend struct VTableCallHolder;
+
+    inline size_t size()
+    {
+        LIMITED_METHOD_CONTRACT;
+
+        BYTE* pStubCode = (BYTE *)this;
+
+        size_t cbSize = 2;                                      // First mov instruction
+        cbSize += (pStubCode[cbSize + 1] == 0x80 ? 6 : 3);      // Either 8B 80 or 8B 40: mov eax,[eax+offset]
+        cbSize += (pStubCode[cbSize + 1] == 0xa0 ? 6 : 3);      // Either FF A0 or FF 60: jmp dword ptr [eax+slot]
+        cbSize += 4;                                            // Slot value (data storage, not a real instruction)
+
+        return cbSize;
+    }
+
+    inline PCODE        entryPoint()        const { LIMITED_METHOD_CONTRACT;  return (PCODE)&_entryPoint[0]; }
+
+    inline size_t token()
+    {
+        LIMITED_METHOD_CONTRACT;
+        DWORD slot = *(DWORD*)(reinterpret_cast<BYTE*>(this) + size() - 4);
+        return DispatchToken::CreateDispatchToken(slot).To_SIZE_T();
+    }
+
+private:
+    BYTE    _entryPoint[0];         // Dynamically sized stub. See Initialize() for more details.
+};
+
+/* VTableCallHolders are the containers for VTableCallStubs; they provide for any alignment of
+stubs as necessary.  */
+struct VTableCallHolder
+{
+    void  Initialize(unsigned slot);
+
+    VTableCallStub* stub() { LIMITED_METHOD_CONTRACT;  return reinterpret_cast<VTableCallStub *>(this); }
+
+    static size_t GetHolderSize(unsigned slot)
+    {
+        STATIC_CONTRACT_WRAPPER;
+        unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+        unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+        return 2 + (offsetOfIndirection >= 0x80 ? 6 : 3) + (offsetAfterIndirection >= 0x80 ? 6 : 3) + 4;
+    }
+
+    static VTableCallHolder* VTableCallHolder::FromVTableCallEntry(PCODE entry) { LIMITED_METHOD_CONTRACT; return (VTableCallHolder*)entry; }
+
+private:
+    // VTableCallStub follows here. It is dynamically sized on allocation because it could 
+    // use short/long instruction sizes for the mov/jmp, depending on the slot value.
+};
+
 #include <poppack.h>
 
 
@@ -895,6 +955,49 @@ ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry)
     return resolveHolder;
 }
 
+void VTableCallHolder::Initialize(unsigned slot)
+{
+    unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE;
+    _ASSERTE(MethodTable::VTableIndir_t::isRelative == false /* TODO: NYI */);
+
+    VTableCallStub* pStub = stub();
+    BYTE* p = (BYTE*)pStub->entryPoint();
+
+    // mov eax,[ecx] : eax = MethodTable pointer
+    *(UINT16*)p = 0x018b; p += 2;
+
+    // mov eax,[eax+vtable offset] : eax = vtable pointer
+    if (offsetOfIndirection >= 0x80)
+    {
+        *(UINT16*)p = 0x808b; p += 2;
+        *(UINT32*)p = offsetOfIndirection; p += 4;
+    }
+    else
+    {
+        *(UINT16*)p = 0x408b; p += 2;
+        *p++ = (BYTE)offsetOfIndirection;
+    }
+
+    // jmp dword ptr [eax+slot]
+    if (offsetAfterIndirection >= 0x80)
+    {
+        *(UINT16*)p = 0xa0ff; p += 2;
+        *(UINT32*)p = offsetAfterIndirection; p += 4;
+    }
+    else
+    {
+        *(UINT16*)p = 0x60ff; p += 2;
+        *p++ = (BYTE)offsetAfterIndirection;
+    }
+
+    // Store the slot value here for convenience. Not a real instruction (unreachable anyways)
+    *(UINT32*)p = slot; p += 4;
+
+    _ASSERT(p == (BYTE*)stub()->entryPoint() + VTableCallHolder::GetHolderSize(slot));
+    _ASSERT(stub()->size() == VTableCallHolder::GetHolderSize(slot));
+}
+
 #endif // DACCESS_COMPILE
 
 VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE stubStartAddress)
@@ -932,6 +1035,10 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s
         {
             stubKind = SK_RESOLVE;
         }
+        else if (firstWord == 0x018b)
+        {
+            stubKind = SK_VTABLECALL;
+        }
         else
         {
             BYTE firstByte  = ((BYTE*) stubStartAddress)[0];
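
The predictStubKind changes in these files are only heuristics: they key off the stub's first instruction bytes and are later confirmed against the vtable_rangeList, so a false positive never misclassifies a stub. For example, a sketch of the x86 predictor added above (illustrative helper; 8B 01 is "mov eax,[ecx]", read as the little-endian word 0x018b):

    // Cheap first-instruction check; isVTableCallStub() gives the authoritative answer.
    bool LooksLikeX86VTableCallStub(const unsigned char* pStub)
    {
        return pStub[0] == 0x8B && pStub[1] == 0x01;   // mov eax,[ecx]
    }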
diff --git a/src/vm/jitinterface.cpp b/src/vm/jitinterface.cpp
index efaa340..4423c98 100644
@@ -5155,6 +5155,8 @@ void CEEInfo::getCallInfo(
 
     INDEBUG(memset(pResult, 0xCC, sizeof(*pResult)));
 
+    pResult->stubLookup.lookupKind.needsRuntimeLookup = false;
+
     MethodDesc* pMD = (MethodDesc *)pResolvedToken->hMethod;
     TypeHandle th(pResolvedToken->hClass);
 
@@ -5460,13 +5462,18 @@ void CEEInfo::getCallInfo(
         pResult->nullInstanceCheck = TRUE;
     }
     // Non-interface dispatches go through the vtable.
-    // We'll special virtual calls to target methods in the corelib assembly when compiling in R2R mode and generate fragile-NI-like callsites for improved performance. We
-    // can do that because today we'll always service the corelib assembly and the runtime in one bundle. Any caller in the corelib version bubble can benefit from this
-    // performance optimization.
-    else if (!pTargetMD->IsInterface() && (!IsReadyToRunCompilation() || CallerAndCalleeInSystemVersionBubble((MethodDesc*)callerHandle, pTargetMD)))
+    else if (!pTargetMD->IsInterface())
     {
         pResult->kind = CORINFO_VIRTUALCALL_VTABLE;
         pResult->nullInstanceCheck = TRUE;
+
+        // We'll special-case virtual calls to target methods in the corelib assembly when compiling in R2R mode, and generate fragile-NI-like callsites for improved performance. We
+        // can do that because today we'll always service the corelib assembly and the runtime in one bundle. Any caller in the corelib version bubble can benefit from this
+        // performance optimization.
+        if (IsReadyToRunCompilation() && !CallerAndCalleeInSystemVersionBubble((MethodDesc*)callerHandle, pTargetMD))
+        {
+            pResult->kind = CORINFO_VIRTUALCALL_STUB;
+        }
     }
     else
     {
@@ -5504,8 +5511,6 @@ void CEEInfo::getCallInfo(
         }
         else
         {
-            pResult->stubLookup.lookupKind.needsRuntimeLookup = false;
-
             BYTE * indcell = NULL;
 
             if (!(flags & CORINFO_CALLINFO_KINDONLY) && !isVerifyOnly())
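
The net effect of the getCallInfo change above: non-interface virtual calls now default to vtable dispatch, and only R2R callsites that cross the CoreLib version bubble fall back to a stub-dispatch cell. A hedged sketch of the decision (the enum and booleans are stand-ins for the checks in the diff, not the actual JIT-EE types):

    enum VirtualCallKind { VIRTUALCALL_STUB, VIRTUALCALL_VTABLE };

    VirtualCallKind ChooseVirtualCallKind(bool targetIsInterface, bool isReadyToRun, bool sameVersionBubble)
    {
        if (targetIsInterface)
            return VIRTUALCALL_STUB;            // interface calls keep using virtual stub dispatch
        if (isReadyToRun && !sameVersionBubble)
            return VIRTUALCALL_STUB;            // cross-bubble R2R call: use a stub dispatch cell
        return VIRTUALCALL_VTABLE;              // everything else goes straight through the vtable
    }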
diff --git a/src/vm/loaderallocator.cpp b/src/vm/loaderallocator.cpp
index 7033abf..acb9bca 100644
@@ -1027,7 +1027,8 @@ void LoaderAllocator::ActivateManagedTracking()
 #endif // !CROSSGEN_COMPILE
 
 
-// We don't actually allocate a low frequency heap for collectible types
+// We don't actually allocate a low frequency heap for collectible types.
+// The collectible heap sizes below are carefully tuned to sum up to 16 pages to reduce waste.
 #define COLLECTIBLE_LOW_FREQUENCY_HEAP_SIZE        (0 * GetOsPageSize())
 #define COLLECTIBLE_HIGH_FREQUENCY_HEAP_SIZE       (3 * GetOsPageSize())
 #define COLLECTIBLE_STUB_HEAP_SIZE                 GetOsPageSize()
diff --git a/src/vm/prestub.cpp b/src/vm/prestub.cpp
index b614e34..78e9f22 100644
@@ -2286,20 +2286,24 @@ EXTERN_C PCODE STDCALL ExternalMethodFixupWorker(TransitionBlock * pTransitionBl
             // Get the stub manager for this module
             VirtualCallStubManager *pMgr = pModule->GetLoaderAllocator()->GetVirtualCallStubManager();
 
-            DispatchToken token;
-            if (pMT->IsInterface())
-                token = pMT->GetLoaderAllocator()->GetDispatchToken(pMT->GetTypeID(), slot);
-            else
-                token = DispatchToken::CreateDispatchToken(slot);
-
             OBJECTREF *protectedObj = pEMFrame->GetThisPtr();
             _ASSERTE(protectedObj != NULL);
             if (*protectedObj == NULL) {
                 COMPlusThrow(kNullReferenceException);
             }
-            
-            StubCallSite callSite(pIndirection, pEMFrame->GetReturnAddress());
-            pCode = pMgr->ResolveWorker(&callSite, protectedObj, token, VirtualCallStubManager::SK_LOOKUP);
+
+            DispatchToken token;
+            if (pMT->IsInterface() || MethodTable::VTableIndir_t::isRelative)
+            {
+                token = pMT->GetLoaderAllocator()->GetDispatchToken(pMT->GetTypeID(), slot);
+                StubCallSite callSite(pIndirection, pEMFrame->GetReturnAddress());
+                pCode = pMgr->ResolveWorker(&callSite, protectedObj, token, VirtualCallStubManager::SK_LOOKUP);
+            }
+            else
+            {
+                pCode = pMgr->GetVTableCallStub(slot);
+                *EnsureWritableExecutablePages((TADDR *)pIndirection) = pCode;
+            }
             _ASSERTE(pCode != NULL);
         }
         else
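
At an R2R external-method fixup, the indirection cell can now be backpatched directly with the per-slot thunk whenever the call is a plain (non-interface, non-relative-vtable) virtual. A minimal sketch of that patch path under the names used in the diff (interface path and error handling elided; not the actual helper):

    PCODE PatchCellWithVTableThunk(VirtualCallStubManager* pMgr, DWORD slot, TADDR* pIndirection)
    {
        PCODE pCode = pMgr->GetVTableCallStub(slot);            // per-slot stub, cached by the manager
        *EnsureWritableExecutablePages(pIndirection) = pCode;   // backpatch the R2R indirection cell
        return pCode;
    }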
diff --git a/src/vm/virtualcallstub.cpp b/src/vm/virtualcallstub.cpp
index 657200b..78a3748 100644
@@ -36,6 +36,7 @@ UINT32 g_site_write_mono = 0;           //# of call site backpatch writes to poi
 UINT32 g_stub_lookup_counter = 0;       //# of lookup stubs
 UINT32 g_stub_mono_counter = 0;         //# of dispatch stubs
 UINT32 g_stub_poly_counter = 0;         //# of resolve stubs
+UINT32 g_stub_vtable_counter = 0;       //# of vtable call stubs
 UINT32 g_stub_space = 0;                //# of bytes of stubs
 
 UINT32 g_reclaim_counter = 0;           //# of times a ReclaimAll was performed
@@ -239,6 +240,8 @@ void VirtualCallStubManager::LoggingDump()
         WriteFile (g_hStubLogFile, szPrintStr, (DWORD) strlen(szPrintStr), &dwWriteByte, NULL);
         sprintf_s(szPrintStr, COUNTOF(szPrintStr), OUTPUT_FORMAT_INT, "stub_poly_counter", g_stub_poly_counter);
         WriteFile (g_hStubLogFile, szPrintStr, (DWORD) strlen(szPrintStr), &dwWriteByte, NULL);
+        sprintf_s(szPrintStr, COUNTOF(szPrintStr), OUTPUT_FORMAT_INT, "stub_vtable_counter", g_stub_vtable_counter);
+        WriteFile(g_hStubLogFile, szPrintStr, (DWORD)strlen(szPrintStr), &dwWriteByte, NULL);
         sprintf_s(szPrintStr, COUNTOF(szPrintStr), OUTPUT_FORMAT_INT, "stub_space", g_stub_space);
         WriteFile (g_hStubLogFile, szPrintStr, (DWORD) strlen(szPrintStr), &dwWriteByte, NULL);
 
@@ -501,6 +504,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
     NewHolder<BucketTable> resolvers_holder(new BucketTable(CALL_STUB_MIN_BUCKETS));
     NewHolder<BucketTable> dispatchers_holder(new BucketTable(CALL_STUB_MIN_BUCKETS*2));
     NewHolder<BucketTable> lookups_holder(new BucketTable(CALL_STUB_MIN_BUCKETS));
+    NewHolder<BucketTable> vtableCallers_holder(new BucketTable(CALL_STUB_MIN_BUCKETS));
     NewHolder<BucketTable> cache_entries_holder(new BucketTable(CALL_STUB_MIN_BUCKETS));
 
     //
@@ -521,6 +525,8 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
     DWORD dispatch_heap_commit_size;
     DWORD resolve_heap_reserve_size;
     DWORD resolve_heap_commit_size;
+    DWORD vtable_heap_reserve_size;
+    DWORD vtable_heap_commit_size;
 
     //
     // Setup an expected number of items to commit and reserve
@@ -538,6 +544,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
         lookup_heap_commit_size      = 24;        lookup_heap_reserve_size       =  250;
         dispatch_heap_commit_size    = 24;        dispatch_heap_reserve_size     =  600;
         resolve_heap_commit_size     = 24;        resolve_heap_reserve_size      =  300;
+        vtable_heap_commit_size      = 24;        vtable_heap_reserve_size       =  600;
     }
     else if (parentDomain->IsSharedDomain())
     {
@@ -550,6 +557,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
         lookup_heap_commit_size      = 24;        lookup_heap_reserve_size       =  200;
         dispatch_heap_commit_size    = 24;        dispatch_heap_reserve_size     =  450;
         resolve_heap_commit_size     = 24;        resolve_heap_reserve_size      =  200;
+        vtable_heap_commit_size      = 24;        vtable_heap_reserve_size       =  450;
     }
     else
     {
@@ -559,6 +567,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
         lookup_heap_commit_size      = 8;         lookup_heap_reserve_size       = 8;
         dispatch_heap_commit_size    = 8;         dispatch_heap_reserve_size     = 8;
         resolve_heap_commit_size     = 8;         resolve_heap_reserve_size      = 8;
+        vtable_heap_commit_size      = 8;         vtable_heap_reserve_size       = 8;
     }
 
 #ifdef _WIN64
@@ -571,7 +580,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
 #endif
 
     //
-    // Convert the number of items into a size in bytes to commit abd reserve
+    // Convert the number of items into a size in bytes to commit and reserve
     //
     indcell_heap_reserve_size       *= sizeof(void *);
     indcell_heap_commit_size        *= sizeof(void *);
@@ -593,6 +602,9 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
     resolve_heap_reserve_size       *= sizeof(ResolveHolder);
     resolve_heap_commit_size        *= sizeof(ResolveHolder);
 
+    vtable_heap_reserve_size       *= static_cast<DWORD>(VTableCallHolder::GetHolderSize(0));
+    vtable_heap_commit_size        *= static_cast<DWORD>(VTableCallHolder::GetHolderSize(0));
+
     //
     // Align up all of the commit and reserve sizes
     //
@@ -611,6 +623,9 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
     resolve_heap_reserve_size        = (DWORD) ALIGN_UP(resolve_heap_reserve_size,     GetOsPageSize());
     resolve_heap_commit_size         = (DWORD) ALIGN_UP(resolve_heap_commit_size,      GetOsPageSize());
 
+    vtable_heap_reserve_size         = (DWORD) ALIGN_UP(vtable_heap_reserve_size,      GetOsPageSize());
+    vtable_heap_commit_size          = (DWORD) ALIGN_UP(vtable_heap_commit_size,       GetOsPageSize());
+
     BYTE * initReservedMem = NULL;
 
     if (!isCollectible)
@@ -619,7 +634,8 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
                                            cache_entry_heap_reserve_size +
                                            lookup_heap_reserve_size      +
                                            dispatch_heap_reserve_size    +
-                                           resolve_heap_reserve_size;
+                                           resolve_heap_reserve_size     +
+                                           vtable_heap_reserve_size;
 
         DWORD dwTotalReserveMemSize = (DWORD) ALIGN_UP(dwTotalReserveMemSizeCalc, VIRTUAL_ALLOC_RESERVE_GRANULARITY);
 
@@ -629,13 +645,14 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
             if (dwWastedReserveMemSize != 0)
             {
                 DWORD cWastedPages = dwWastedReserveMemSize / GetOsPageSize();
-                DWORD cPagesPerHeap = cWastedPages / 5;
-                DWORD cPagesRemainder = cWastedPages % 5; // We'll throw this at the resolve heap
+                DWORD cPagesPerHeap = cWastedPages / 6;
+                DWORD cPagesRemainder = cWastedPages % 6; // We'll throw this at the resolve heap
 
                 indcell_heap_reserve_size += cPagesPerHeap * GetOsPageSize();
                 cache_entry_heap_reserve_size += cPagesPerHeap * GetOsPageSize();
                 lookup_heap_reserve_size += cPagesPerHeap * GetOsPageSize();
                 dispatch_heap_reserve_size += cPagesPerHeap * GetOsPageSize();
+                vtable_heap_reserve_size += cPagesPerHeap * GetOsPageSize();
                 resolve_heap_reserve_size += cPagesPerHeap * GetOsPageSize();
                 resolve_heap_reserve_size += cPagesRemainder * GetOsPageSize();
             }
@@ -644,7 +661,8 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
                                cache_entry_heap_reserve_size +
                                lookup_heap_reserve_size      +
                                dispatch_heap_reserve_size    +
-                               resolve_heap_reserve_size)    == 
+                               resolve_heap_reserve_size     +
+                               vtable_heap_reserve_size)    == 
                               dwTotalReserveMemSize);
         }
 
@@ -672,12 +690,20 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
         resolve_heap_reserve_size        = GetOsPageSize();
         resolve_heap_commit_size         = GetOsPageSize();
 
+        // The heap sizes for the collectible case are carefully tuned to sum up to 16 pages. Today, we only use the 
+        // vtable jump stubs in the R2R scenario, which is unlikely to be loaded in the collectible context,
+        // so we'll keep the heap numbers at zero for now. If we ever use vtable stubs in the collectible
+        // scenario, we'll just allocate the memory on demand.
+        vtable_heap_reserve_size         = 0;
+        vtable_heap_commit_size          = 0;
+
 #ifdef _DEBUG
         DWORD dwTotalReserveMemSizeCalc  = indcell_heap_reserve_size     +
                                            cache_entry_heap_reserve_size +
                                            lookup_heap_reserve_size      +
                                            dispatch_heap_reserve_size    +
-                                           resolve_heap_reserve_size;
+                                           resolve_heap_reserve_size     +
+                                           vtable_heap_reserve_size;
 #endif
 
         DWORD dwActualVSDSize = 0;
@@ -756,6 +782,19 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
 
     initReservedMem += resolve_heap_reserve_size;
 
+    // Hot  memory, Writable, Execute, write exactly once
+    NewHolder<LoaderHeap> vtable_heap_holder(
+                               new LoaderHeap(vtable_heap_reserve_size, vtable_heap_commit_size,
+                                              initReservedMem, vtable_heap_reserve_size,
+#ifdef ENABLE_PERF_COUNTERS
+                                              &(GetPerfCounters().m_Loading.cbLoaderHeapSize),
+#else
+                                              NULL,
+#endif                                              
+                                              &vtable_rangeList, TRUE));
+
+    initReservedMem += vtable_heap_reserve_size;
+
     // Allocate the initial counter block
     NewHolder<counter_block> m_counters_holder(new counter_block);
 
@@ -767,12 +806,13 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA
     lookup_heap      = lookup_heap_holder;      lookup_heap_holder.SuppressRelease();
     dispatch_heap    = dispatch_heap_holder;    dispatch_heap_holder.SuppressRelease();
     resolve_heap     = resolve_heap_holder;     resolve_heap_holder.SuppressRelease();
+    vtable_heap      = vtable_heap_holder;      vtable_heap_holder.SuppressRelease();
     cache_entry_heap = cache_entry_heap_holder; cache_entry_heap_holder.SuppressRelease();
 
     resolvers        = resolvers_holder;        resolvers_holder.SuppressRelease();
     dispatchers      = dispatchers_holder;      dispatchers_holder.SuppressRelease();
     lookups          = lookups_holder;          lookups_holder.SuppressRelease();
-
+    vtableCallers    = vtableCallers_holder;    vtableCallers_holder.SuppressRelease();
     cache_entries    = cache_entries_holder;    cache_entries_holder.SuppressRelease();
 
     m_counters       = m_counters_holder;       m_counters_holder.SuppressRelease();
@@ -832,11 +872,13 @@ VirtualCallStubManager::~VirtualCallStubManager()
     if (lookup_heap)      { delete lookup_heap;      lookup_heap      = NULL;}
     if (dispatch_heap)    { delete dispatch_heap;    dispatch_heap    = NULL;}
     if (resolve_heap)     { delete resolve_heap;     resolve_heap     = NULL;}
+    if (vtable_heap)      { delete vtable_heap;      vtable_heap      = NULL;}
     if (cache_entry_heap) { delete cache_entry_heap; cache_entry_heap = NULL;}
 
     if (resolvers)        { delete resolvers;        resolvers        = NULL;}
     if (dispatchers)      { delete dispatchers;      dispatchers      = NULL;}
     if (lookups)          { delete lookups;          lookups          = NULL;}
+    if (vtableCallers)    { delete vtableCallers;    vtableCallers    = NULL;}
     if (cache_entries)    { delete cache_entries;    cache_entries    = NULL;}
 
     // Now get rid of the memory taken by the counter_blocks
@@ -1075,6 +1117,8 @@ BOOL VirtualCallStubManager::DoTraceStub(PCODE stubStartAddress, TraceDestinatio
 {
     LIMITED_METHOD_CONTRACT;
 
+    LOG((LF_CORDB, LL_EVERYTHING, "VirtualCallStubManager::DoTraceStub called\n"));
+
     _ASSERTE(CheckIsStub_Internal(stubStartAddress));
 
 #ifdef FEATURE_PREJIT
@@ -1191,6 +1235,68 @@ PCODE VirtualCallStubManager::GetCallStub(TypeHandle ownerType, DWORD slot)
     RETURN (stub);
 }
 
+PCODE VirtualCallStubManager::GetVTableCallStub(DWORD slot)
+{
+    CONTRACT(PCODE) {
+        THROWS;
+        GC_TRIGGERS;
+        MODE_ANY;
+        INJECT_FAULT(COMPlusThrowOM(););
+        PRECONDITION(!MethodTable::VTableIndir_t::isRelative /* Not yet supported */);
+        POSTCONDITION(RETVAL != NULL);
+    } CONTRACT_END;
+
+    GCX_COOP(); // This is necessary for BucketTable synchronization
+
+    PCODE stub = CALL_STUB_EMPTY_ENTRY;
+
+    VTableCallEntry entry;
+    Prober probe(&entry);
+    if (vtableCallers->SetUpProber(DispatchToken::CreateDispatchToken(slot).To_SIZE_T(), 0, &probe))
+    {
+        if ((stub = (PCODE)(vtableCallers->Find(&probe))) == CALL_STUB_EMPTY_ENTRY)
+        {
+            VTableCallHolder *pHolder = GenerateVTableCallStub(slot);
+            stub = (PCODE)(vtableCallers->Add((size_t)(pHolder->stub()->entryPoint()), &probe));
+        }
+    }
+
+    _ASSERTE(stub != CALL_STUB_EMPTY_ENTRY);
+    RETURN(stub);
+}
+
+VTableCallHolder* VirtualCallStubManager::GenerateVTableCallStub(DWORD slot)
+{
+    CONTRACT(VTableCallHolder*) {
+        THROWS;
+        GC_TRIGGERS;
+        MODE_ANY;
+        INJECT_FAULT(COMPlusThrowOM(););
+        PRECONDITION(!MethodTable::VTableIndir_t::isRelative /* Not yet supported */);
+        POSTCONDITION(RETVAL != NULL);
+    } CONTRACT_END;
+
+    //allocate from the requisite heap and initialize the stub in place.
+    VTableCallHolder * pHolder = (VTableCallHolder*)(void*)vtable_heap->AllocAlignedMem(VTableCallHolder::GetHolderSize(slot), CODE_SIZE_ALIGN);
+
+    pHolder->Initialize(slot);
+    ClrFlushInstructionCache(pHolder->stub(), pHolder->stub()->size());
+
+    AddToCollectibleVSDRangeList(pHolder);
+
+    //incr our counters
+    stats.stub_vtable_counter++;
+    stats.stub_space += (UINT32)pHolder->stub()->size();
+    LOG((LF_STUBS, LL_INFO10000, "GenerateVTableCallStub for slot " FMT_ADDR "at" FMT_ADDR "\n",
+        DBG_ADDR(slot), DBG_ADDR(pHolder->stub())));
+
+#ifdef FEATURE_PERFMAP
+    PerfMap::LogStubs(__FUNCTION__, "GenerateVTableCallStub", (PCODE)pHolder->stub(), pHolder->stub()->size());
+#endif
+
+    RETURN(pHolder);
+}
+
 #ifdef FEATURE_PREJIT
 extern "C" PCODE STDCALL StubDispatchFixupWorker(TransitionBlock * pTransitionBlock, 
                                                  TADDR siteAddrForRegisterIndirect,
@@ -1457,6 +1563,12 @@ size_t VirtualCallStubManager::GetTokenFromStubQuick(VirtualCallStubManager * pM
         LookupHolder  * lookupHolder  = LookupHolder::FromLookupEntry(stub);
         return lookupHolder->stub()->token();
     }
+    else if (kind == SK_VTABLECALL)
+    {
+        _ASSERTE(pMgr->isVTableCallStub(stub));
+        VTableCallStub * vtableStub = (VTableCallStub *)PCODEToPINSTR(stub);
+        return vtableStub->token();
+    }
 
     _ASSERTE(!"Should not get here.");
 
@@ -3023,12 +3135,14 @@ void VirtualCallStubManager::LogStats()
     resolvers->LogStats();
     dispatchers->LogStats();
     lookups->LogStats();
+    vtableCallers->LogStats();
     cache_entries->LogStats();
 
     g_site_counter += stats.site_counter;
     g_stub_lookup_counter += stats.stub_lookup_counter;
     g_stub_poly_counter += stats.stub_poly_counter;
     g_stub_mono_counter += stats.stub_mono_counter;
+    g_stub_vtable_counter += stats.stub_vtable_counter;
     g_site_write += stats.site_write;
     g_site_write_poly += stats.site_write_poly;
     g_site_write_mono += stats.site_write_mono;
@@ -3043,6 +3157,7 @@ void VirtualCallStubManager::LogStats()
     stats.stub_lookup_counter = 0;
     stats.stub_poly_counter = 0;
     stats.stub_mono_counter = 0;
+    stats.stub_vtable_counter = 0;
     stats.site_write = 0;
     stats.site_write_poly = 0;
     stats.site_write_mono = 0;
@@ -3369,6 +3484,7 @@ void BucketTable::Reclaim()
 // dispatchers     token     the expected MT
 // resolver        token     the stub calling convention
 // cache_entries   token     the expected method table
+// vtableCallers   token     unused (zero)
 //
 BOOL BucketTable::SetUpProber(size_t keyA, size_t keyB, Prober *prober)
 {
diff --git a/src/vm/virtualcallstub.h b/src/vm/virtualcallstub.h
index b8984ea..1d22e46 100644
@@ -38,6 +38,7 @@ class VirtualCallStubManagerManager;
 struct LookupHolder;
 struct DispatchHolder;
 struct ResolveHolder;
+struct VTableCallHolder;
 
 /////////////////////////////////////////////////////////////////////////////////////
 // Forward function declarations
@@ -238,6 +239,9 @@ public:
     PCODE GetCallStub(TypeHandle ownerType, MethodDesc *pMD);
     PCODE GetCallStub(TypeHandle ownerType, DWORD slot);
 
+    // Stubs for vtable-based virtual calls with no lookups
+    PCODE GetVTableCallStub(DWORD slot);
+
     // Generate an fresh indirection cell.
     BYTE* GenerateStubIndirection(PCODE stub, BOOL fUseRecycledCell = FALSE);
 
@@ -272,6 +276,7 @@ public:
           resolve_rangeList(),
           dispatch_rangeList(),
           cache_entry_rangeList(),
+          vtable_rangeList(),
           parentDomain(NULL),
           isCollectible(false),
           m_initialReservedMemForHeaps(NULL),
@@ -308,6 +313,7 @@ public:
         SK_LOOKUP,      // Lookup Stubs are SLOW stubs that simply call into the runtime to do all work.
         SK_DISPATCH,    // Dispatch Stubs have a fast check for one type otherwise jumps to runtime.  Works for monomorphic sites
         SK_RESOLVE,     // Resolve Stubs do a hash lookup before fallling back to the runtime.  Works for polymorphic sites.
+        SK_VTABLECALL,  // Stub that jumps to a target method using vtable-based indirections. Works for non-interface calls.
         SK_BREAKPOINT 
     };
 
@@ -346,6 +352,11 @@ public:
             if (isResolvingStub(stubStartAddress))
                 return SK_RESOLVE;
         }
+        else if (predictedKind == SK_VTABLECALL)
+        {
+            if (isVTableCallStub(stubStartAddress))
+                return SK_VTABLECALL;
+        }
 
         // This is the slow case. If the predict returned SK_UNKNOWN, SK_BREAKPOINT,
         // or the predict was found to be incorrect when checked against the RangeLists
@@ -356,6 +367,8 @@ public:
             return SK_LOOKUP;
         else if (isResolvingStub(stubStartAddress))
             return SK_RESOLVE;
+        else if (isVTableCallStub(stubStartAddress))
+            return SK_VTABLECALL;
 
         return SK_UNKNOWN;
     }
@@ -392,6 +405,14 @@ public:
         return GetLookupRangeList()->IsInRange(stubStartAddress);
     }
 
+    BOOL isVTableCallStub(PCODE stubStartAddress)
+    {
+        WRAPPER_NO_CONTRACT;
+        SUPPORTS_DAC;
+
+        return GetVTableCallRangeList()->IsInRange(stubStartAddress);
+    }
+
     static BOOL isDispatchingStubStatic(PCODE addr)
     {
         WRAPPER_NO_CONTRACT;
@@ -416,11 +437,20 @@ public:
         return stubKind == SK_LOOKUP;
     }
 
+    static BOOL isVtableCallStubStatic(PCODE addr)
+    {
+        WRAPPER_NO_CONTRACT;
+        StubKind stubKind;
+        FindStubManager(addr, &stubKind);
+        return stubKind == SK_VTABLECALL;
+    }
+
     //use range lists to track the chunks of memory that are part of each heap
     LockedRangeList lookup_rangeList;
     LockedRangeList resolve_rangeList;
     LockedRangeList dispatch_rangeList;
     LockedRangeList cache_entry_rangeList;
+    LockedRangeList vtable_rangeList;
 
     // Get dac-ized pointers to rangelist.
     RangeList* GetLookupRangeList() 
@@ -450,6 +480,12 @@ public:
         TADDR addr = PTR_HOST_MEMBER_TADDR(VirtualCallStubManager, this, cache_entry_rangeList);
         return PTR_RangeList(addr);
     }
+    RangeList* GetVTableCallRangeList()
+    {
+        SUPPORTS_DAC;
+        TADDR addr = PTR_HOST_MEMBER_TADDR(VirtualCallStubManager, this, vtable_rangeList);
+        return PTR_RangeList(addr);
+    }
 
 private:
 
@@ -475,6 +511,8 @@ private:
     LookupHolder *GenerateLookupStub(PCODE addrOfResolver,
                                      size_t dispatchToken);
 
+    VTableCallHolder* GenerateVTableCallStub(DWORD slot);
+
     template <typename STUB_HOLDER>
     void AddToCollectibleVSDRangeList(STUB_HOLDER *holder)
     {
@@ -687,6 +725,7 @@ private:
     PTR_LoaderHeap  lookup_heap;        // lookup stubs go here
     PTR_LoaderHeap  dispatch_heap;      // dispatch stubs go here
     PTR_LoaderHeap  resolve_heap;       // resolve stubs go here
+    PTR_LoaderHeap  vtable_heap;        // vtable-based jump stubs go here
 
 #ifdef _TARGET_AMD64_
     // When we layout the stub heaps, we put them close together in a sequential order
@@ -707,6 +746,7 @@ private:
     BucketTable *   cache_entries;      // hash table of dispatch token/target structs for dispatch cache
     BucketTable *   dispatchers;        // hash table of dispatching stubs keyed by tokens/actualtype
     BucketTable *   resolvers;          // hash table of resolvers keyed by tokens/resolverstub
+    BucketTable *   vtableCallers;      // hash table of vtable call stubs keyed by slot values
 
     // This structure is used to keep track of the fail counters.
     // We only need one fail counter per ResolveStub,
@@ -758,6 +798,7 @@ public:
         UINT32 stub_lookup_counter;     //# of lookup stubs
         UINT32 stub_poly_counter;       //# of resolve stubs
         UINT32 stub_mono_counter;       //# of dispatch stubs
+        UINT32 stub_vtable_counter;     //# of vtable call stubs
         UINT32 site_write;              //# of call site backpatch writes
         UINT32 site_write_poly;         //# of call site backpatch writes to point to resolve stubs
         UINT32 site_write_mono;         //# of call site backpatch writes to point to dispatch stubs
@@ -1061,6 +1102,44 @@ private:
 };
 #endif // USES_LOOKUP_STUBS
 
+class VTableCallEntry : public Entry
+{
+public:
+    //Creates an entry that wraps a vtable call stub
+    VTableCallEntry(size_t s)
+    {
+        LIMITED_METHOD_CONTRACT;
+        _ASSERTE(VirtualCallStubManager::isVtableCallStubStatic((PCODE)s));
+        stub = (VTableCallStub*)s;
+    }
+
+    //default constructor to allow stack and inline allocation of vtable call entries
+    VTableCallEntry() { LIMITED_METHOD_CONTRACT; stub = NULL; }
+
+    //implementations of abstract class Entry
+    BOOL Equals(size_t keyA, size_t keyB)
+    {
+        WRAPPER_NO_CONTRACT; return stub && (keyA == KeyA()) && (keyB == KeyB());
+    }
+
+    size_t KeyA() { WRAPPER_NO_CONTRACT; return Token(); }
+    size_t KeyB() { WRAPPER_NO_CONTRACT; return (size_t)0; }
+
+    void SetContents(size_t contents)
+    {
+        LIMITED_METHOD_CONTRACT;
+        _ASSERTE(VirtualCallStubManager::isVtableCallStubStatic((PCODE)contents));
+        stub = VTableCallHolder::FromVTableCallEntry((PCODE)contents)->stub();
+    }
+
+    //extract the token of the underlying vtable call stub
+
+    inline size_t Token() { LIMITED_METHOD_CONTRACT; return stub ? stub->token() : 0; }
+
+private:
+    VTableCallStub* stub;   //the stub the entry is wrapping
+};
+
 /**********************************************************************************************
 ResolveCacheEntry wraps a ResolveCacheElem and provides lookup functionality for entries that
 were created that may be added to the ResolveCache
diff --git a/src/zap/zapinfo.cpp b/src/zap/zapinfo.cpp
index 8efeedd..e0acd81 100644
@@ -2145,29 +2145,28 @@ void ZapInfo::getCallInfo(CORINFO_RESOLVED_TOKEN * pResolvedToken,
                 return;
             }
 
-#ifdef FEATURE_READYTORUN_COMPILER
             if (IsReadyToRunCompilation())
             {
                 ZapImport * pImport = m_pImage->GetImportTable()->GetStubDispatchCell(pResolvedToken);
 
                 pResult->stubLookup.constLookup.accessType   = IAT_PVALUE;
                 pResult->stubLookup.constLookup.addr         = pImport;
-                break;
             }
-#endif
-
-            CORINFO_CLASS_HANDLE calleeOwner = pResolvedToken->hClass;
-            CORINFO_METHOD_HANDLE callee = pResolvedToken->hMethod;
-            _ASSERTE(callee == pResult->hMethod);
+            else
+            {
 
-            //
-            // Create the indirection cell
-            //
-            pTarget = m_pImage->GetImportTable()->GetStubDispatchCell(calleeOwner, callee);
+                CORINFO_CLASS_HANDLE calleeOwner = pResolvedToken->hClass;
+                CORINFO_METHOD_HANDLE callee = pResolvedToken->hMethod;
+                _ASSERTE(callee == pResult->hMethod);
 
-            pResult->stubLookup.constLookup.accessType = IAT_PVALUE;
+                //
+                // Create the indirection cell
+                //
+                pTarget = m_pImage->GetImportTable()->GetStubDispatchCell(calleeOwner, callee);
 
-            pResult->stubLookup.constLookup.addr = pTarget;
+                pResult->stubLookup.constLookup.accessType = IAT_PVALUE;
+                pResult->stubLookup.constLookup.addr = pTarget;
+            }
         }
         break;
 
@@ -2183,7 +2182,6 @@ void ZapInfo::getCallInfo(CORINFO_RESOLVED_TOKEN * pResolvedToken,
         return;
 
     case CORINFO_CALL:
-#ifdef FEATURE_READYTORUN_COMPILER
         if (IsReadyToRunCompilation())
         {
             // Constrained token is not interesting with this transforms
@@ -2207,12 +2205,11 @@ void ZapInfo::getCallInfo(CORINFO_RESOLVED_TOKEN * pResolvedToken,
             pResult->codePointerLookup.constLookup.accessType   = IAT_PVALUE;
             pResult->codePointerLookup.constLookup.addr         = pImport;
         }
-#endif
         break;
 
     case CORINFO_VIRTUALCALL_VTABLE:
-        // READYTORUN: FUTURE: support for vtable-based calls (currently, only calls within the CoreLib version bubble is supported, and the codegen we generate
-        // is the same as the fragile NI (because CoreLib and the runtime will always be updated together anyways - this is a special case)
+        // Only calls within the CoreLib version bubble support fragile-NI codegen with vtable-based calls, for better performance (because 
+        // CoreLib and the runtime will always be updated together anyways - this is a special case)
         break;
 
     case CORINFO_VIRTUALCALL_LDVIRTFTN:
@@ -2240,7 +2237,6 @@ void ZapInfo::getCallInfo(CORINFO_RESOLVED_TOKEN * pResolvedToken,
         break;
     }
 
-#ifdef FEATURE_READYTORUN_COMPILER
     if (IsReadyToRunCompilation() && pResult->sig.hasTypeArg())
     {
         if (pResult->exactContextNeedsRuntimeLookup)
@@ -2272,8 +2268,8 @@ void ZapInfo::getCallInfo(CORINFO_RESOLVED_TOKEN * pResolvedToken,
             AppendConditionalImport(pImport);
         }
     }
-#endif
 }
+
 BOOL ZapInfo::canAccessFamily(CORINFO_METHOD_HANDLE hCaller,
                               CORINFO_CLASS_HANDLE hInstanceType)
 {
@@ -2285,7 +2281,6 @@ BOOL ZapInfo::isRIDClassDomainID (CORINFO_CLASS_HANDLE cls)
     return m_pEEJitInfo->isRIDClassDomainID(cls);
 }
 
-
 unsigned ZapInfo::getClassDomainID (CORINFO_CLASS_HANDLE cls, void **ppIndirection)
 {
     _ASSERTE(ppIndirection != NULL);