1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
6 // Various helper routines for generating AMD64 assembly code.
8 // DO NOT INCLUDE THIS FILE DIRECTLY - ALWAYS USE CGENSYS.H INSTEAD
13 #ifndef _TARGET_AMD64_
14 #error Should only include "AMD64\cgencpu.h" for AMD64 builds
20 #include "xmmintrin.h"
22 // Given a return address retrieved during stackwalk,
23 // this is the offset by which it should be decremented to land somewhere in a call instruction.
24 #define STACKWALK_CONTROLPC_ADJUST_OFFSET 1
26 // preferred alignment for data
27 #define DATA_ALIGNMENT 8
30 class FramedMethodFrame;
33 class ComCallMethodDesc;
36 // functions implemented in AMD64 assembly
38 EXTERN_C void InstantiatingMethodStubWorker(void);
39 EXTERN_C void SinglecastDelegateInvokeStub();
40 EXTERN_C void FastCallFinalizeWorker(Object *obj, PCODE funcPtr);
42 #define COMMETHOD_PREPAD 16 // # extra bytes to allocate in addition to sizeof(ComCallMethodDesc)
43 #define COMMETHOD_CALL_PRESTUB_SIZE 6 // 32-bit indirect relative call
44 #define COMMETHOD_CALL_PRESTUB_ADDRESS_OFFSET -10 // the offset of the call target address inside the prestub
46 #define STACK_ALIGN_SIZE 16
48 #define JUMP_ALLOCATE_SIZE 12 // # bytes to allocate for a 64-bit jump instruction
49 #define BACK_TO_BACK_JUMP_ALLOCATE_SIZE 12 // # bytes to allocate for a back to back 64-bit jump instruction
50 #define SIZEOF_LOAD_AND_JUMP_THUNK 22 // # bytes to mov r10, X; jmp Z
51 #define SIZEOF_LOAD2_AND_JUMP_THUNK 32 // # bytes to mov r10, X; mov r11, Y; jmp Z
53 // Also in Zapper.h, CorCompile.h, FnTableAccess.h
54 #define USE_INDIRECT_CODEHEADER // use CodeHeader, RealCodeHeader construct
56 #define HAS_NDIRECT_IMPORT_PRECODE 1
57 #define HAS_FIXUP_PRECODE 1
58 #define HAS_FIXUP_PRECODE_CHUNKS 1
59 #define FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS 1
61 // A ThisPtrRetBufPrecode is necessary for closed delegates over static methods with a return buffer
62 #define HAS_THISPTR_RETBUF_PRECODE 1
64 #define CODE_SIZE_ALIGN 16 // must alloc code blocks on 8-byte boundaries; for perf reasons we use 16 byte boundaries
65 #define CACHE_LINE_SIZE 64 // Current AMD64 processors have 64-byte cache lines as per AMD64 optimization manual
66 #define LOG2SLOT LOG2_PTRSIZE
70 #define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 16 // bytes
71 #define ENREGISTERED_PARAMTYPE_MAXSIZE 16 // bytes
72 #define ENREGISTERED_RETURNTYPE_MAXSIZE 16 // bytes
73 #define CALLDESCR_ARGREGS 1 // CallDescrWorker has ArgumentRegister parameter
74 #define CALLDESCR_FPARGREGS 1 // CallDescrWorker has FloatArgumentRegisters parameter
76 #define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 8 // bytes
77 #define ENREGISTERED_PARAMTYPE_MAXSIZE 8 // bytes
78 #define ENREGISTERED_RETURNTYPE_MAXSIZE 8 // bytes
79 #define COM_STUBS_SEPARATE_FP_LOCATIONS
80 #define CALLDESCR_REGTYPEMAP 1
83 #define INSTRFMT_K64SMALL
87 #define USE_REDIRECT_FOR_GCSTRESS
93 #define REX_PREFIX_BASE 0x40 // 0100xxxx
94 #define REX_OPERAND_SIZE_64BIT 0x08 // xxxx1xxx
95 #define REX_MODRM_REG_EXT 0x04 // xxxxx1xx // use for 'middle' 3 bit field of mod/r/m
96 #define REX_SIB_INDEX_EXT 0x02 // xxxxxx10
97 #define REX_MODRM_RM_EXT 0x01 // XXXXXXX1 // use for low 3 bit field of mod/r/m
98 #define REX_SIB_BASE_EXT 0x01 // XXXXXXX1
99 #define REX_OPCODE_REG_EXT 0x01 // XXXXXXX1
101 #define X86_REGISTER_MASK 0x7
103 #define X86RegFromAMD64Reg(extended_reg) \
104 ((X86Reg)(((int)extended_reg) & X86_REGISTER_MASK))
107 //=======================================================================
108 // IMPORTANT: This value is used to figure out how much to allocate
109 // for a fixed array of FieldMarshaler's. That means it must be at least
110 // as large as the largest FieldMarshaler subclass. This requirement
111 // is guarded by an assert.
112 //=======================================================================
113 #define MAXFIELDMARSHALERSIZE 40
116 // Why is the return value ARG_SLOT? On 64-bit systems, that is 64-bits
117 // and much bigger than necessary for R4, requiring explicit downcasts.
// Reads an R4 (float) value back out of an FP spill slot.
// Only the low 32 bits of the slot are read; the bit pattern comes back
// zero-extended in a 64-bit ARG_SLOT, so callers must explicitly downcast
// (see the comment above about why the return type is ARG_SLOT).
119 ARG_SLOT FPSpillToR4(void* pSpillSlot)
121 LIMITED_METHOD_CONTRACT;
122 return *(DWORD*)pSpillSlot;
// Reads an R8 (double) value back out of an FP spill slot.
// The full 64-bit bit pattern is returned in the ARG_SLOT unchanged.
126 ARG_SLOT FPSpillToR8(void* pSpillSlot)
128 LIMITED_METHOD_CONTRACT;
129 return *(SIZE_T*)pSpillSlot;
// Stores an R4 (float) bit pattern into an FP spill slot.
// The 32-bit value is zero-extended into the first 8 bytes, and the second
// 8 bytes are cleared — the slot is written as two SIZE_T entries
// (presumably an XMM-sized, 16-byte save area; confirm against the callers).
133 void R4ToFPSpill(void* pSpillSlot, DWORD srcFloatAsDWORD)
135 LIMITED_METHOD_CONTRACT;
136 *(SIZE_T*)pSpillSlot = (SIZE_T)srcFloatAsDWORD;
137 *((SIZE_T*)pSpillSlot + 1) = 0;
// Stores an R8 (double) bit pattern into an FP spill slot.
// The 64-bit value fills the first 8 bytes and the second 8 bytes are
// cleared, mirroring the two-SIZE_T layout used by R4ToFPSpill above.
141 void R8ToFPSpill(void* pSpillSlot, SIZE_T srcDoubleAsSIZE_T)
143 LIMITED_METHOD_CONTRACT;
144 *(SIZE_T*)pSpillSlot = srcDoubleAsSIZE_T;
145 *((SIZE_T*)pSpillSlot + 1) = 0;
149 #ifdef CROSSGEN_COMPILE
150 #define GetEEFuncEntryPoint(pfn) 0x1001
152 #define GetEEFuncEntryPoint(pfn) GFN_TADDR(pfn)
156 //**********************************************************************
158 //**********************************************************************
160 typedef INT64 StackElemType;
161 #define STACK_ELEM_SIZE sizeof(StackElemType)
163 // !! This expression assumes STACK_ELEM_SIZE is a power of 2.
164 #define StackElemSize(parmSize) (((parmSize) + STACK_ELEM_SIZE - 1) & ~((ULONG)(STACK_ELEM_SIZE - 1)))
166 //**********************************************************************
168 //**********************************************************************
169 //--------------------------------------------------------------------
170 // This represents some of the TransitionFrame fields that are
171 // stored at negative offsets.
172 //--------------------------------------------------------------------
175 //--------------------------------------------------------------------
176 // This represents the arguments that are stored in volatile registers.
177 // This should not overlap the CalleeSavedRegisters since those are already
178 // saved separately and it would be wasteful to save the same register twice.
179 // If we do use a non-volatile register as an argument, then the ArgIterator
180 // will probably have to communicate this back to the PromoteCallerStack
181 // routine to avoid a double promotion.
182 //--------------------------------------------------------------------
183 #ifdef UNIX_AMD64_ABI
185 #define ENUM_ARGUMENT_REGISTERS() \
186 ARGUMENT_REGISTER(RDI) \
187 ARGUMENT_REGISTER(RSI) \
188 ARGUMENT_REGISTER(RDX) \
189 ARGUMENT_REGISTER(RCX) \
190 ARGUMENT_REGISTER(R8) \
191 ARGUMENT_REGISTER(R9)
193 #define NUM_ARGUMENT_REGISTERS 6
195 // The order of registers in this macro is hardcoded in assembly code
196 // in a number of places
197 #define ENUM_CALLEE_SAVED_REGISTERS() \
198 CALLEE_SAVED_REGISTER(R12) \
199 CALLEE_SAVED_REGISTER(R13) \
200 CALLEE_SAVED_REGISTER(R14) \
201 CALLEE_SAVED_REGISTER(R15) \
202 CALLEE_SAVED_REGISTER(Rbx) \
203 CALLEE_SAVED_REGISTER(Rbp)
205 #define NUM_CALLEE_SAVED_REGISTERS 6
207 #else // UNIX_AMD64_ABI
209 #define ENUM_ARGUMENT_REGISTERS() \
210 ARGUMENT_REGISTER(RCX) \
211 ARGUMENT_REGISTER(RDX) \
212 ARGUMENT_REGISTER(R8) \
213 ARGUMENT_REGISTER(R9)
215 #define NUM_ARGUMENT_REGISTERS 4
217 // The order of registers in this macro is hardcoded in assembly code
218 // in a number of places
219 #define ENUM_CALLEE_SAVED_REGISTERS() \
220 CALLEE_SAVED_REGISTER(Rdi) \
221 CALLEE_SAVED_REGISTER(Rsi) \
222 CALLEE_SAVED_REGISTER(Rbx) \
223 CALLEE_SAVED_REGISTER(Rbp) \
224 CALLEE_SAVED_REGISTER(R12) \
225 CALLEE_SAVED_REGISTER(R13) \
226 CALLEE_SAVED_REGISTER(R14) \
227 CALLEE_SAVED_REGISTER(R15)
229 #define NUM_CALLEE_SAVED_REGISTERS 8
231 #endif // UNIX_AMD64_ABI
// Saved integer argument registers. The X-macro expansion produces one
// INT_PTR field per register, in exactly the order listed by
// ENUM_ARGUMENT_REGISTERS() above (the order differs between the Windows
// and UNIX_AMD64_ABI variants and is relied upon by assembly code).
233 typedef DPTR(struct ArgumentRegisters) PTR_ArgumentRegisters;
234 struct ArgumentRegisters {
235 #define ARGUMENT_REGISTER(regname) INT_PTR regname;
236 ENUM_ARGUMENT_REGISTERS();
237 #undef ARGUMENT_REGISTER
// Saved callee-saved (non-volatile) registers. One INT_PTR field per
// register, in the order given by ENUM_CALLEE_SAVED_REGISTERS() above;
// per the comment on that macro, the order is hardcoded in assembly code.
240 typedef DPTR(struct CalleeSavedRegisters) PTR_CalleeSavedRegisters;
241 struct CalleeSavedRegisters {
242 #define CALLEE_SAVED_REGISTER(regname) INT_PTR regname;
243 ENUM_CALLEE_SAVED_REGISTERS();
244 #undef CALLEE_SAVED_REGISTER
// Parallel structure of pointers (PTR_TADDR pRbx, pRbp, ...) — one pointer
// per callee-saved register, each named p<RegName>. NOTE(review): presumably
// these point at the saved register locations in a context/frame (e.g. for
// the stackwalker's register display) — confirm against the users.
247 struct CalleeSavedRegistersPointers {
248 #define CALLEE_SAVED_REGISTER(regname) PTR_TADDR p##regname;
249 ENUM_CALLEE_SAVED_REGISTERS();
250 #undef CALLEE_SAVED_REGISTER
253 #define SCRATCH_REGISTER_X86REG kRAX
255 #ifdef UNIX_AMD64_ABI
257 #define THIS_kREG kRDI
259 #define ARGUMENT_kREG1 kRDI
260 #define ARGUMENT_kREG2 kRSI
263 #define THIS_kREG kRCX
265 #define ARGUMENT_kREG1 kRCX
266 #define ARGUMENT_kREG2 kRDX
269 #ifdef UNIX_AMD64_ABI
271 #define NUM_FLOAT_ARGUMENT_REGISTERS 8
// UNIX_AMD64_ABI only: full 128-bit saves of the eight floating-point
// argument registers (xmm0-xmm7), stored as M128A entries.
273 typedef DPTR(struct FloatArgumentRegisters) PTR_FloatArgumentRegisters;
274 struct FloatArgumentRegisters {
275 M128A d[NUM_FLOAT_ARGUMENT_REGISTERS]; // xmm0-xmm7
281 void UpdateRegDisplayFromCalleeSavedRegisters(REGDISPLAY * pRD, CalleeSavedRegisters * pRegs);
284 // Sufficient context for Try/Catch restoration.
289 #define ARGUMENTREGISTERS_SIZE sizeof(ArgumentRegisters)
292 #include "stublinkeramd64.h"
296 //**********************************************************************
297 // Exception handling
298 //**********************************************************************
// Returns the instruction pointer (Rip) from a CONTEXT as a PCODE.
300 inline PCODE GetIP(const CONTEXT * context)
308 PRECONDITION(CheckPointer(context));
312 return PCODE(context->Rip);
// Sets the instruction pointer (Rip) in a CONTEXT to the given code address.
315 inline void SetIP(CONTEXT* context, PCODE rip)
323 PRECONDITION(CheckPointer(context));
327 context->Rip = (DWORD64) rip;
// Returns the stack pointer (Rsp) from a CONTEXT as a TADDR.
330 inline TADDR GetSP(const CONTEXT * context)
338 PRECONDITION(CheckPointer(context));
342 return (TADDR)context->Rsp;
// Sets the stack pointer (Rsp) in a CONTEXT to the given address.
344 inline void SetSP(CONTEXT *context, TADDR rsp)
352 PRECONDITION(CheckPointer(context));
// SetFP is deliberately a no-op (empty macro) on AMD64.
359 #define SetFP(context, ebp)
// Returns Rbp from a CONTEXT. Note: on AMD64 Rbp is not necessarily a
// frame pointer for every function; callers get the raw register value.
360 inline TADDR GetFP(const CONTEXT * context)
362 LIMITED_METHOD_CONTRACT;
364 return (TADDR)(context->Rbp);
367 extern "C" TADDR GetCurrentSP();
373 void EncodeLoadAndJumpThunk (LPBYTE pBuffer, LPVOID pv, LPVOID pTarget);
376 // Get Rel32 destination, emit jumpStub if necessary
377 INT32 rel32UsingJumpStub(INT32 UNALIGNED * pRel32, PCODE target, MethodDesc *pMethod,
378 LoaderAllocator *pLoaderAllocator = NULL, bool throwOnOutOfMemoryWithinRange = true);
380 // Get Rel32 destination, emit jumpStub if necessary into a preallocated location
381 INT32 rel32UsingPreallocatedJumpStub(INT32 UNALIGNED * pRel32, PCODE target, PCODE jumpStubAddr, bool emitJump);
383 void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target);
385 void emitJump(LPBYTE pBuffer, LPVOID target);
387 BOOL isJumpRel32(PCODE pCode);
388 PCODE decodeJump32(PCODE pCode);
390 BOOL isJumpRel64(PCODE pCode);
391 PCODE decodeJump64(PCODE pCode);
394 // On IA64 back to back jumps should be separated by a nop bundle to get
395 // the best performance from the hardware's branch prediction logic.
396 // For all other platforms back to back jumps don't require anything special
397 // That is why we have these two wrapper functions that call emitJump and decodeJump
// On AMD64 a back-to-back jump needs no special separation (see the IA64
// note above), so this simply forwards to emitJump.
399 inline void emitBackToBackJump(LPBYTE pBuffer, LPVOID target)
403 emitJump(pBuffer, target);
// A back-to-back jump is recognized as either a rel32 or a rel64 jump.
406 inline BOOL isBackToBackJump(PCODE pCode)
410 return isJumpRel32(pCode) || isJumpRel64(pCode);
// Decodes the target of a back-to-back jump, trying the rel32 encoding
// first and then the rel64 encoding.
413 inline PCODE decodeBackToBackJump(PCODE pCode)
417 if (isJumpRel32(pCode))
418 return decodeJump32(pCode);
420 if (isJumpRel64(pCode))
421 return decodeJump64(pCode);
426 extern "C" void setFPReturn(int fpSize, INT64 retVal);
427 extern "C" void getFPReturn(int fpSize, INT64 *retval);
430 struct ComToManagedExRecord; // defined in cgencpu.cpp
// A value type whose size exceeds ENREGISTERED_RETURNTYPE_MAXSIZE cannot
// be returned in registers and must be returned via a hidden buffer.
432 inline BOOL IsUnmanagedValueTypeReturnedByRef(UINT sizeofvaluetype)
434 LIMITED_METHOD_CONTRACT;
436 if (sizeofvaluetype > ENREGISTERED_RETURNTYPE_MAXSIZE)
446 #include <pshpack1.h>
// Code-as-data layout of an unmanaged-to-managed entry thunk. The fields
// below are the literal instruction bytes and immediates of the sequence
// sketched in the comments (mov r10, <thunk>; mov rax, <dest>; jmp rax);
// DECLSPEC_ALIGN(8) plus the field layout keeps the imm64 operands
// qword-aligned as the comments require.
447 struct DECLSPEC_ALIGN(8) UMEntryThunkCode
449 // padding // CC CC CC CC
450 // mov r10, pUMEntryThunk // 49 ba xx xx xx xx xx xx xx xx // METHODDESC_REGISTER
451 // mov rax, pJmpDest // 48 b8 xx xx xx xx xx xx xx xx // need to ensure this imm64 is qword aligned
452 // TAILJMP_RAX // 48 FF E0
455 BYTE m_movR10[2]; // MOV R10,
456 LPVOID m_uet; // pointer to start of this structure
457 BYTE m_movRAX[2]; // MOV RAX,
459 const BYTE* m_execstub; // pointer to destination code // ensure this is qword aligned
460 BYTE m_jmpRAX[3]; // JMP RAX
// Fills in the instruction bytes and immediates of this thunk.
463 void Encode(BYTE* pTargetCode, void* pvSecretParam);
// Entry point is the first instruction byte, i.e. the MOV R10 opcode.
466 LPCBYTE GetEntryPoint() const
468 LIMITED_METHOD_CONTRACT;
470 return (LPCBYTE)&m_movR10;
// Byte offset of the entry point within the structure.
473 static int GetEntryPointOffset()
475 LIMITED_METHOD_CONTRACT;
477 return offsetof(UMEntryThunkCode, m_movR10);
484 #ifndef FEATURE_MULTIREG_RETURN
488 ULONG64 ReturnValue[1];
490 #else // FEATURE_MULTIREG_RETURN
498 ULONG64 ReturnValue[2];
500 #endif // PLATFORM_UNIX
501 CalleeSavedRegisters Regs;
505 size_t ReturnAddress;
509 #ifndef DACCESS_COMPILE
511 DWORD GetOffsetAtEndOfFunction(ULONGLONG uImageBase,
512 PT_RUNTIME_FUNCTION pFunctionEntry,
515 #endif // DACCESS_COMPILE
517 // ClrFlushInstructionCache is used when we want to call FlushInstructionCache
518 // for a specific architecture in the common code, but not for other architectures.
519 // We call ClrFlushInstructionCache whenever we create or modify code in the heap.
520 // Currently ClrFlushInstructionCache has no effect on AMD64
// Intentionally a no-op on AMD64 (see the comment above): the real
// FlushInstructionCache call is left commented out.
523 inline BOOL ClrFlushInstructionCache(LPCVOID pCodeAddr, size_t sizeOfCode)
525 // FlushInstructionCache(GetCurrentProcess(), pCodeAddr, sizeOfCode);
531 // JIT HELPER ALIASING FOR PORTABILITY.
533 // Create alias for optimized implementations of helpers provided on this platform
535 #define JIT_GetSharedGCStaticBase JIT_GetSharedGCStaticBase_SingleAppDomain
536 #define JIT_GetSharedNonGCStaticBase JIT_GetSharedNonGCStaticBase_SingleAppDomain
537 #define JIT_GetSharedGCStaticBaseNoCtor JIT_GetSharedGCStaticBaseNoCtor_SingleAppDomain
538 #define JIT_GetSharedNonGCStaticBaseNoCtor JIT_GetSharedNonGCStaticBaseNoCtor_SingleAppDomain
541 #define JIT_ChkCastClass JIT_ChkCastClass
542 #define JIT_ChkCastClassSpecial JIT_ChkCastClassSpecial
543 #define JIT_IsInstanceOfClass JIT_IsInstanceOfClass
544 #define JIT_ChkCastInterface JIT_ChkCastInterface
545 #define JIT_IsInstanceOfInterface JIT_IsInstanceOfInterface
546 #endif // FEATURE_PAL
548 #define JIT_Stelem_Ref JIT_Stelem_Ref
550 #endif // __cgencpu_h__