From 25a734e3b64a6d956d73adf3b43d3444fdbd1f1f Mon Sep 17 00:00:00 2001 From: John Chen Date: Tue, 29 Nov 2016 19:50:09 -0800 Subject: [PATCH] Capture YMM registers on OSX (dotnet/coreclr#8365) Also fixed checking of context flags on OSX (issue dotnet/coreclr#8116) Commit migrated from https://github.com/dotnet/coreclr/commit/85c0a5641d60aabe028a9e4b4ef07a7b58a2181b --- src/coreclr/src/pal/src/CMakeLists.txt | 1 + src/coreclr/src/pal/src/arch/amd64/processor.cpp | 20 +++++ .../src/pal/src/exception/machexception.cpp | 7 ++ src/coreclr/src/pal/src/thread/context.cpp | 89 +++++++++++++++++----- 4 files changed, 100 insertions(+), 17 deletions(-) diff --git a/src/coreclr/src/pal/src/CMakeLists.txt b/src/coreclr/src/pal/src/CMakeLists.txt index 89eac35..298763e 100644 --- a/src/coreclr/src/pal/src/CMakeLists.txt +++ b/src/coreclr/src/pal/src/CMakeLists.txt @@ -41,6 +41,7 @@ endif() if(CMAKE_SYSTEM_NAME STREQUAL Darwin) add_definitions(-D_TARGET_MAC64) + add_definitions(-DXSTATE_SUPPORTED) set(PLATFORM_SOURCES arch/amd64/activationhandlerwrapper.S arch/amd64/context.S diff --git a/src/coreclr/src/pal/src/arch/amd64/processor.cpp b/src/coreclr/src/pal/src/arch/amd64/processor.cpp index 4fd3a4a..ac3d448 100644 --- a/src/coreclr/src/pal/src/arch/amd64/processor.cpp +++ b/src/coreclr/src/pal/src/arch/amd64/processor.cpp @@ -42,3 +42,23 @@ YieldProcessor( ); } +/*++ +Function: +XmmYmmStateSupport + +Check if OS has enabled both XMM and YMM state support + +Return value: +1 if XMM and YMM are enabled, 0 otherwise +--*/ +extern "C" unsigned int XmmYmmStateSupport() +{ + unsigned int eax; + __asm(" xgetbv\n" \ + : "=a"(eax) /*output in eax*/\ + : "c"(0) /*inputs - 0 in ecx*/\ + : "eax", "edx" /* registers that are clobbered*/ + ); + // Check OS has enabled both XMM and YMM state support + return ((eax & 0x06) == 0x06) ? 1 : 0; +} diff --git a/src/coreclr/src/pal/src/exception/machexception.cpp b/src/coreclr/src/pal/src/exception/machexception.cpp index 8b0d7f2..5328e1f 100644 --- a/src/coreclr/src/pal/src/exception/machexception.cpp +++ b/src/coreclr/src/pal/src/exception/machexception.cpp @@ -1518,6 +1518,7 @@ ActivationHandler(CONTEXT* context) extern "C" void ActivationHandlerWrapper(); extern "C" int ActivationHandlerReturnOffset; +extern "C" unsigned int XmmYmmStateSupport(); /*++ Function : @@ -1581,6 +1582,12 @@ InjectActivationInternal(CPalThread* pThread) // after the activation function returns. CONTEXT *pContext = (CONTEXT *)contextAddress; pContext->ContextFlags = CONTEXT_FULL | CONTEXT_SEGMENTS; +#ifdef XSTATE_SUPPORTED + if (XmmYmmStateSupport() == 1) + { + pContext->ContextFlags |= CONTEXT_XSTATE; + } +#endif MachRet = CONTEXT_GetThreadContextFromPort(threadPort, pContext); _ASSERT_MSG(MachRet == KERN_SUCCESS, "CONTEXT_GetThreadContextFromPort\n"); diff --git a/src/coreclr/src/pal/src/thread/context.cpp b/src/coreclr/src/pal/src/thread/context.cpp index 9e1e060..d051051 100644 --- a/src/coreclr/src/pal/src/thread/context.cpp +++ b/src/coreclr/src/pal/src/thread/context.cpp @@ -36,6 +36,7 @@ extern PGET_GCMARKER_EXCEPTION_CODE g_getGcMarkerExceptionCode; // in context2.S extern void CONTEXT_CaptureContext(LPCONTEXT lpContext); +#define CONTEXT_AREA_MASK 0xffff #ifdef _X86_ #define CONTEXT_ALL_FLOATING (CONTEXT_FLOATING_POINT | CONTEXT_EXTENDED_REGISTERS) #elif defined(_AMD64_) @@ -288,7 +289,7 @@ CONTEXT_GetThreadContext( } if (lpContext->ContextFlags & - (CONTEXT_CONTROL | CONTEXT_INTEGER)) + (CONTEXT_CONTROL | CONTEXT_INTEGER) & CONTEXT_AREA_MASK) { if (CONTEXT_GetRegisters(dwProcessId, lpContext) == FALSE) { @@ -348,7 +349,7 @@ CONTEXT_SetThreadContext( } if (lpContext->ContextFlags & - (CONTEXT_CONTROL | CONTEXT_INTEGER)) + (CONTEXT_CONTROL | CONTEXT_INTEGER) & CONTEXT_AREA_MASK) { #if HAVE_PT_REGS if (ptrace((__ptrace_request)PT_GETREGS, dwProcessId, (caddr_t)&ptrace_registers, 0) == -1) @@ -371,11 +372,11 @@ CONTEXT_SetThreadContext( ASSERT("Don't know how to set the context of another process on this platform!"); return FALSE; #endif - if (lpContext->ContextFlags & CONTEXT_CONTROL) + if (lpContext->ContextFlags & CONTEXT_CONTROL & CONTEXT_AREA_MASK) { ASSIGN_CONTROL_REGS } - if (lpContext->ContextFlags & CONTEXT_INTEGER) + if (lpContext->ContextFlags & CONTEXT_INTEGER & CONTEXT_AREA_MASK) { ASSIGN_INTEGER_REGS } @@ -857,7 +858,7 @@ CONTEXT_GetThreadContextFromPort( mach_msg_type_number_t StateCount; thread_state_flavor_t StateFlavor; - if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS)) + if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS) & CONTEXT_AREA_MASK) { #ifdef _X86_ x86_thread_state32_t State; @@ -879,7 +880,7 @@ CONTEXT_GetThreadContextFromPort( CONTEXT_GetThreadContextFromThreadState(StateFlavor, (thread_state_t)&State, lpContext); } - if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING) { + if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING & CONTEXT_AREA_MASK) { #ifdef _X86_ x86_float_state32_t State; StateFlavor = x86_FLOAT_STATE32; @@ -900,6 +901,22 @@ CONTEXT_GetThreadContextFromPort( CONTEXT_GetThreadContextFromThreadState(StateFlavor, (thread_state_t)&State, lpContext); } +#if defined(_AMD64_) && defined(XSTATE_SUPPORTED) + if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) { + x86_avx_state64_t State; + StateFlavor = x86_AVX_STATE64; + StateCount = sizeof(State) / sizeof(natural_t); + MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount); + if (MachRet != KERN_SUCCESS) + { + ASSERT("thread_get_state(XSTATE) failed: %d\n", MachRet); + goto exit; + } + + CONTEXT_GetThreadContextFromThreadState(StateFlavor, (thread_state_t)&State, lpContext); + } +#endif + exit: return MachRet; } @@ -919,7 +936,7 @@ CONTEXT_GetThreadContextFromThreadState( { #ifdef _X86_ case x86_THREAD_STATE32: - if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS)) + if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS) & CONTEXT_AREA_MASK) { x86_thread_state32_t *pState = (x86_thread_state32_t *)threadState; @@ -946,7 +963,7 @@ CONTEXT_GetThreadContextFromThreadState( { x86_float_state32_t *pState = (x86_float_state32_t *)threadState; - if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT) + if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK) { lpContext->FloatSave.ControlWord = *(DWORD*)&pState->fpu_fcw; lpContext->FloatSave.StatusWord = *(DWORD*)&pState->fpu_fsw; @@ -965,7 +982,7 @@ CONTEXT_GetThreadContextFromThreadState( memcpy(&lpContext->FloatSave.RegisterArea[i * 10], (&pState->fpu_stmm0)[i].mmst_reg, 10); } - if (lpContext->ContextFlags & CONTEXT_EXTENDED_REGISTERS) + if (lpContext->ContextFlags & CONTEXT_EXTENDED_REGISTERS & CONTEXT_AREA_MASK) { // The only extended register information that Mach will tell us about are the xmm register values. // Both Windows and Mach store the registers in a packed layout (each of the 8 registers is 16 bytes) @@ -977,7 +994,7 @@ CONTEXT_GetThreadContextFromThreadState( #elif defined(_AMD64_) case x86_THREAD_STATE64: - if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS)) + if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS) & CONTEXT_AREA_MASK) { x86_thread_state64_t *pState = (x86_thread_state64_t *)threadState; @@ -1011,7 +1028,7 @@ CONTEXT_GetThreadContextFromThreadState( break; case x86_FLOAT_STATE64: - if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT) + if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK) { x86_float_state64_t *pState = (x86_float_state64_t *)threadState; @@ -1036,6 +1053,16 @@ CONTEXT_GetThreadContextFromThreadState( memcpy(&lpContext->Xmm0, &pState->__fpu_xmm0, 16 * 16); } break; + +#ifdef XSTATE_SUPPORTED + case x86_AVX_STATE64: + if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) + { + x86_avx_state64_t *pState = (x86_avx_state64_t *)threadState; + memcpy(&lpContext->VectorRegister, &pState->__fpu_ymmh0, 16 * 16); + } + break; +#endif #else #error Unexpected architecture. #endif @@ -1122,7 +1149,7 @@ CONTEXT_SetThreadContextOnPort( mach_msg_type_number_t StateCount; thread_state_flavor_t StateFlavor; - if (lpContext->ContextFlags & (CONTEXT_CONTROL|CONTEXT_INTEGER)) + if (lpContext->ContextFlags & (CONTEXT_CONTROL|CONTEXT_INTEGER) & CONTEXT_AREA_MASK) { #ifdef _X86_ x86_thread_state32_t State; @@ -1189,21 +1216,42 @@ CONTEXT_SetThreadContextOnPort( } } - if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING) + if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING & CONTEXT_AREA_MASK) { #ifdef _X86_ x86_float_state32_t State; StateFlavor = x86_FLOAT_STATE32; + StateCount = sizeof(State) / sizeof(natural_t); #elif defined(_AMD64_) +#ifdef XSTATE_SUPPORTED + // We're relying on the fact that the initial portion of + // x86_avx_state64_t is identical to x86_float_state64_t. + // Check a few fields to make sure the assumption is correct. + static_assert_no_msg(sizeof(x86_avx_state64_t) > sizeof(x86_float_state64_t)); + static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_fcw) == offsetof(x86_float_state64_t, __fpu_fcw)); + static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_xmm0) == offsetof(x86_float_state64_t, __fpu_xmm0)); + + x86_avx_state64_t State; + if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) + { + StateFlavor = x86_AVX_STATE64; + StateCount = sizeof(State) / sizeof(natural_t); + } + else + { + StateFlavor = x86_FLOAT_STATE64; + StateCount = sizeof(x86_float_state64_t) / sizeof(natural_t); + } +#else x86_float_state64_t State; StateFlavor = x86_FLOAT_STATE64; + StateCount = sizeof(State) / sizeof(natural_t); +#endif #else #error Unexpected architecture. #endif - StateCount = sizeof(State) / sizeof(natural_t); - // If we're setting only one of the floating point or extended registers (of which Mach supports only // the xmm values) then we don't have values for the other set. This is a problem since Mach only // supports setting both groups as a single unit. So in this case we'll need to fetch the current @@ -1224,7 +1272,7 @@ CONTEXT_SetThreadContextOnPort( _ASSERTE(StateCountGet == StateCount); } - if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT) + if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK) { #ifdef _X86_ *(DWORD*)&State.fpu_fcw = lpContext->FloatSave.ControlWord; @@ -1267,7 +1315,7 @@ CONTEXT_SetThreadContextOnPort( } #ifdef _X86_ - if (lpContext->ContextFlags & CONTEXT_EXTENDED_REGISTERS) + if (lpContext->ContextFlags & CONTEXT_EXTENDED_REGISTERS & CONTEXT_AREA_MASK) { // The only extended register information that Mach will tell us about are the xmm register // values. Both Windows and Mach store the registers in a packed layout (each of the 8 registers @@ -1276,6 +1324,13 @@ CONTEXT_SetThreadContextOnPort( } #endif // _X86_ +#if defined(_AMD64_) && defined(XSTATE_SUPPORTED) + if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) + { + memcpy(&State.__fpu_ymmh0, lpContext->VectorRegister, 16 * 16); + } +#endif + MachRet = thread_set_state(Port, StateFlavor, (thread_state_t)&State, -- 2.7.4