Fix OSX floating point state extraction (#25295)
authorJan Vorlicek <janvorli@microsoft.com>
Fri, 21 Jun 2019 08:48:08 +0000 (10:48 +0200)
committerGitHub <noreply@github.com>
Fri, 21 Jun 2019 08:48:08 +0000 (10:48 +0200)
There was a bug reported on a very recent Mac with Intel i9 processor. A
crash in the RtlRestoreContext was happening at the fxrstor instruction
due to the fact that the floating point state data were garbage.
The investigation has shown that sometimes, the x86_FLOAT_STATE64
cannot be obtained using the thread_get_state API. And it was also found
that at the same time, the x86_AVX_STATE64 can be obtained. The state
extracted by the AVX variant contains all the registers that the FLOAT
variant would extract.
However, in some cases, even the x86_AVX_STATE64 cannot be obtained and
there is a third flavor that we can get - x86_AVX512_STATE64.
Unfortunately, there are cases where none of those can be obtained.
It is not clear what causes these cases, it seems only kernel debugging
can give us an answer to that.

This change modifies the way we extract the floating point state. We
first try to get the AVX state, if we fail, we try the AVX512 and
finally we fall back to the FLOAT state. If we fail to get the floating
point state with any of these, we return context without the floating
point state flag set. Also, if only getting the FLOAT state succeeds,
we return context without the XSTATE flag set.

src/pal/src/exception/machexception.cpp
src/pal/src/thread/context.cpp

index 67d28d0..336e951 100644 (file)
@@ -1017,10 +1017,10 @@ Parameters:
     thread - mach thread port
 
 Return value :
-    None
+    KERN_SUCCESS if the suspend succeeded, other code in case of failure
 --*/
 static
-void
+kern_return_t
 SuspendMachThread(thread_act_t thread)
 {
     kern_return_t machret;
@@ -1028,7 +1028,10 @@ SuspendMachThread(thread_act_t thread)
     while (true)
     {
         machret = thread_suspend(thread);
-        CHECK_MACH("thread_suspend", machret);
+        if (machret != KERN_SUCCESS)
+        {
+            break;
+        }
 
         // Ensure that if the thread was running in the kernel, the kernel operation
         // is safely aborted so that it can be restarted later.
@@ -1041,8 +1044,13 @@ SuspendMachThread(thread_act_t thread)
         // The thread was running in the kernel executing a non-atomic operation
         // that cannot be restarted, so we need to resume the thread and retry
         machret = thread_resume(thread);
-        CHECK_MACH("thread_resume", machret);
+        if (machret != KERN_SUCCESS)
+        {
+            break;
+        }
     }
+
+    return machret;
 }
 
 /*++
@@ -1103,7 +1111,8 @@ SEHExceptionThread(void *args)
             thread = sMessage.GetThreadContext(&sContext);
 
             // Suspend the target thread
-            SuspendMachThread(thread);
+            machret = SuspendMachThread(thread);
+            CHECK_MACH("SuspendMachThread", machret);
             
             machret = CONTEXT_SetThreadContextOnPort(thread, &sContext);
             CHECK_MACH("CONTEXT_SetThreadContextOnPort", machret);
@@ -1220,7 +1229,8 @@ SEHExceptionThread(void *args)
             NONPAL_TRACE("ForwardExceptionRequest for thread %08x\n", thread);
 
             // Suspend the faulting thread. 
-            SuspendMachThread(thread);
+            machret = SuspendMachThread(thread);
+            CHECK_MACH("SuspendMachThread", machret);
 
             // Set the context back to the original faulting state.
             MachExceptionInfo *pExceptionInfo = sMessage.GetExceptionInfo();
@@ -1545,7 +1555,8 @@ InjectActivationInternal(CPalThread* pThread)
     PAL_ERROR palError;
 
     mach_port_t threadPort = pThread->GetMachPortSelf();
-    kern_return_t MachRet = thread_suspend(threadPort);
+
+    kern_return_t MachRet = SuspendMachThread(threadPort);
     palError = (MachRet == KERN_SUCCESS) ? NO_ERROR : ERROR_GEN_FAILURE;
 
     if (palError == NO_ERROR)
index b98b99d..10b2db4 100644 (file)
@@ -890,10 +890,8 @@ CONTEXT_GetThreadContextFromPort(
   
     if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS) & CONTEXT_AREA_MASK)
     {
-#ifdef _X86_  
-        x86_thread_state32_t State;
-        StateFlavor = x86_THREAD_STATE32;
-#elif defined(_AMD64_)
+
+#ifdef _AMD64_
         x86_thread_state64_t State;
         StateFlavor = x86_THREAD_STATE64;
 #else
@@ -911,41 +909,47 @@ CONTEXT_GetThreadContextFromPort(
     }
     
     if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING & CONTEXT_AREA_MASK) {
-#ifdef _X86_
-        x86_float_state32_t State;
-        StateFlavor = x86_FLOAT_STATE32;
-#elif defined(_AMD64_)
-        x86_float_state64_t State;
-        StateFlavor = x86_FLOAT_STATE64;
-#else
-#error Unexpected architecture.
-#endif
-        StateCount = sizeof(State) / sizeof(natural_t);
-        MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
-        if (MachRet != KERN_SUCCESS)
-        {
-            ASSERT("thread_get_state(FLOAT_STATE) failed: %d\n", MachRet);
-            goto exit;
-        }
-        
-        CONTEXT_GetThreadContextFromThreadState(StateFlavor, (thread_state_t)&State, lpContext);
-    }
+        // The thread_get_state for floating point state can fail for some flavors when the processor is not
+        // in the right mode at the time we are taking the state. So we will try to get the AVX state first and
+        // if it fails, get the FLOAT state and if that fails, take AVX512 state. Both AVX and AVX512 states
+        // are supersets of the FLOAT state.
+        // Check a few fields to make sure the assumption is correct.
+        static_assert_no_msg(sizeof(x86_avx_state64_t) > sizeof(x86_float_state64_t));
+        static_assert_no_msg(sizeof(x86_avx512_state64_t) > sizeof(x86_avx_state64_t));
+        static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_fcw) == offsetof(x86_float_state64_t, __fpu_fcw));
+        static_assert_no_msg(offsetof(x86_avx_state64_t, __fpu_xmm0) == offsetof(x86_float_state64_t, __fpu_xmm0));
+        static_assert_no_msg(offsetof(x86_avx512_state64_t, __fpu_fcw) == offsetof(x86_float_state64_t, __fpu_fcw));
+        static_assert_no_msg(offsetof(x86_avx512_state64_t, __fpu_xmm0) == offsetof(x86_float_state64_t, __fpu_xmm0));
+
+        x86_avx512_state64_t State;
 
-#if defined(_AMD64_) && defined(XSTATE_SUPPORTED)
-    if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK) {
-        x86_avx_state64_t State;
         StateFlavor = x86_AVX_STATE64;
-        StateCount = sizeof(State) / sizeof(natural_t);
+        StateCount = sizeof(x86_avx_state64_t) / sizeof(natural_t);
         MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
         if (MachRet != KERN_SUCCESS)
         {
-            ASSERT("thread_get_state(XSTATE) failed: %d\n", MachRet);
-            goto exit;
+            // The AVX state is not available, try to get the AVX512 state.
+            StateFlavor = x86_AVX512_STATE64;
+            StateCount = sizeof(x86_avx512_state64_t) / sizeof(natural_t);
+            MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
+            if (MachRet != KERN_SUCCESS)
+            {
+                // Neither the AVX nor the AVX512 state is not available, try to get at least the FLOAT state.
+                lpContext->ContextFlags &= ~(CONTEXT_XSTATE & CONTEXT_AREA_MASK);
+
+                StateFlavor = x86_FLOAT_STATE64;
+                StateCount = sizeof(x86_float_state64_t) / sizeof(natural_t);
+                MachRet = thread_get_state(Port, StateFlavor, (thread_state_t)&State, &StateCount);
+                if (MachRet != KERN_SUCCESS)
+                {
+                    // We were unable to get any floating point state. This case was observed on OSX with AVX512 capable processors.
+                    lpContext->ContextFlags &= ~((CONTEXT_XSTATE | CONTEXT_ALL_FLOATING) & CONTEXT_AREA_MASK);
+                }
+            }
         }
 
         CONTEXT_GetThreadContextFromThreadState(StateFlavor, (thread_state_t)&State, lpContext);
     }
-#endif
 
 exit:
     return MachRet;
@@ -964,65 +968,7 @@ CONTEXT_GetThreadContextFromThreadState(
 {
     switch (threadStateFlavor)
     {
-#ifdef _X86_
-        case x86_THREAD_STATE32:
-            if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS) & CONTEXT_AREA_MASK)
-            {
-                x86_thread_state32_t *pState = (x86_thread_state32_t *)threadState;
-
-                lpContext->Eax = pState->eax;
-                lpContext->Ebx = pState->ebx;
-                lpContext->Ecx = pState->ecx;
-                lpContext->Edx = pState->edx;
-                lpContext->Edi = pState->edi;
-                lpContext->Esi = pState->esi;
-                lpContext->Ebp = pState->ebp;
-                lpContext->Esp = pState->esp;
-                lpContext->SegSs = pState->ss;
-                lpContext->EFlags = pState->eflags;
-                lpContext->Eip = pState->eip;
-                lpContext->SegCs = pState->cs;
-                lpContext->SegDs_PAL_Undefined = pState->ds;
-                lpContext->SegEs_PAL_Undefined = pState->es;
-                lpContext->SegFs_PAL_Undefined = pState->fs;
-                lpContext->SegGs_PAL_Undefined = pState->gs;
-            }
-            break;
-
-        case x86_FLOAT_STATE32:
-        {
-            x86_float_state32_t *pState = (x86_float_state32_t *)threadState;
-
-            if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK)
-            {
-                lpContext->FloatSave.ControlWord = *(DWORD*)&pState->fpu_fcw;
-                lpContext->FloatSave.StatusWord = *(DWORD*)&pState->fpu_fsw;
-                lpContext->FloatSave.TagWord = pState->fpu_ftw;
-                lpContext->FloatSave.ErrorOffset = pState->fpu_ip;
-                lpContext->FloatSave.ErrorSelector = pState->fpu_cs;
-                lpContext->FloatSave.DataOffset = pState->fpu_dp;
-                lpContext->FloatSave.DataSelector = pState->fpu_ds;
-                lpContext->FloatSave.Cr0NpxState = pState->fpu_mxcsr;
-
-                // Windows stores the floating point registers in a packed layout (each 10-byte register end to end
-                // for a total of 80 bytes). But Mach returns each register in an 16-bit structure (presumably for
-                // alignment purposes). So we can't just memcpy the registers over in a single block, we need to copy
-                // them individually.
-                for (int i = 0; i < 8; i++)
-                    memcpy(&lpContext->FloatSave.RegisterArea[i * 10], (&pState->fpu_stmm0)[i].mmst_reg, 10);
-            }
-
-            if (lpContext->ContextFlags & CONTEXT_EXTENDED_REGISTERS & CONTEXT_AREA_MASK)
-            {
-                // The only extended register information that Mach will tell us about are the xmm register values.
-                // Both Windows and Mach store the registers in a packed layout (each of the 8 registers is 16 bytes)
-                // so we can simply memcpy them across.
-                memcpy(lpContext->ExtendedRegisters + CONTEXT_EXREG_XMM_OFFSET, &pState->fpu_xmm0, 8 * 16);
-            }
-        }
-        break;
-
-#elif defined(_AMD64_)
+#ifdef _AMD64_
         case x86_THREAD_STATE64:
             if (lpContext->ContextFlags & (CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_SEGMENTS) & CONTEXT_AREA_MASK)
             {
@@ -1057,6 +1003,16 @@ CONTEXT_GetThreadContextFromThreadState(
             }
             break;
 
+        case x86_AVX_STATE64:
+        case x86_AVX512_STATE64:
+            if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK)
+            {
+                x86_avx_state64_t *pState = (x86_avx_state64_t *)threadState;
+                memcpy(&lpContext->VectorRegister, &pState->__fpu_ymmh0, 16 * 16);
+            }
+
+            // Intentional fall-through, the AVX states are supersets of the FLOAT state
+
         case x86_FLOAT_STATE64:
             if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK)
             {
@@ -1083,16 +1039,6 @@ CONTEXT_GetThreadContextFromThreadState(
                 memcpy(&lpContext->Xmm0, &pState->__fpu_xmm0, 16 * 16);
             }
             break;
-
-#ifdef XSTATE_SUPPORTED
-        case x86_AVX_STATE64:
-            if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK)
-            {
-                x86_avx_state64_t *pState = (x86_avx_state64_t *)threadState;
-                memcpy(&lpContext->VectorRegister, &pState->__fpu_ymmh0, 16 * 16);
-            }
-            break;
-#endif
 #else
 #error Unexpected architecture.
 #endif
@@ -1181,27 +1127,7 @@ CONTEXT_SetThreadContextOnPort(
 
     if (lpContext->ContextFlags & (CONTEXT_CONTROL|CONTEXT_INTEGER) & CONTEXT_AREA_MASK)
     {
-#ifdef _X86_
-        x86_thread_state32_t State;
-        StateFlavor = x86_THREAD_STATE32;
-        
-        State.eax = lpContext->Eax;
-        State.ebx = lpContext->Ebx;
-        State.ecx = lpContext->Ecx;
-        State.edx = lpContext->Edx;
-        State.edi = lpContext->Edi;
-        State.esi = lpContext->Esi;
-        State.ebp = lpContext->Ebp;
-        State.esp = lpContext->Esp;
-        State.ss = lpContext->SegSs;
-        State.eflags = lpContext->EFlags;
-        State.eip = lpContext->Eip;
-        State.cs = lpContext->SegCs;
-        State.ds = lpContext->SegDs_PAL_Undefined;
-        State.es = lpContext->SegEs_PAL_Undefined;
-        State.fs = lpContext->SegFs_PAL_Undefined;
-        State.gs = lpContext->SegGs_PAL_Undefined;
-#elif defined(_AMD64_)
+#ifdef _AMD64_
         x86_thread_state64_t State;
         StateFlavor = x86_THREAD_STATE64;
 
@@ -1249,11 +1175,7 @@ CONTEXT_SetThreadContextOnPort(
     if (lpContext->ContextFlags & CONTEXT_ALL_FLOATING & CONTEXT_AREA_MASK)
     {
         
-#ifdef _X86_
-        x86_float_state32_t State;
-        StateFlavor = x86_FLOAT_STATE32;
-        StateCount = sizeof(State) / sizeof(natural_t);
-#elif defined(_AMD64_)
+#ifdef _AMD64_
 #ifdef XSTATE_SUPPORTED
         // We're relying on the fact that the initial portion of
         // x86_avx_state64_t is identical to x86_float_state64_t.
@@ -1304,23 +1226,7 @@ CONTEXT_SetThreadContextOnPort(
 
         if (lpContext->ContextFlags & CONTEXT_FLOATING_POINT & CONTEXT_AREA_MASK)
         {
-#ifdef _X86_
-            *(DWORD*)&State.fpu_fcw = lpContext->FloatSave.ControlWord;
-            *(DWORD*)&State.fpu_fsw = lpContext->FloatSave.StatusWord;
-            State.fpu_ftw = lpContext->FloatSave.TagWord;
-            State.fpu_ip = lpContext->FloatSave.ErrorOffset;
-            State.fpu_cs = lpContext->FloatSave.ErrorSelector;
-            State.fpu_dp = lpContext->FloatSave.DataOffset;
-            State.fpu_ds = lpContext->FloatSave.DataSelector;
-            State.fpu_mxcsr = lpContext->FloatSave.Cr0NpxState;
-
-            // Windows stores the floating point registers in a packed layout (each 10-byte register end to
-            // end for a total of 80 bytes). But Mach returns each register in an 16-bit structure (presumably
-            // for alignment purposes). So we can't just memcpy the registers over in a single block, we need
-            // to copy them individually.
-            for (int i = 0; i < 8; i++)
-                memcpy((&State.fpu_stmm0)[i].mmst_reg, &lpContext->FloatSave.RegisterArea[i * 10], 10);
-#elif defined(_AMD64_)
+#ifdef _AMD64_
             *(DWORD*)&State.__fpu_fcw = lpContext->FltSave.ControlWord;
             *(DWORD*)&State.__fpu_fsw = lpContext->FltSave.StatusWord;
             State.__fpu_ftw = lpContext->FltSave.TagWord;
@@ -1344,16 +1250,6 @@ CONTEXT_SetThreadContextOnPort(
 #endif
         }
 
-#ifdef _X86_
-        if (lpContext->ContextFlags & CONTEXT_EXTENDED_REGISTERS & CONTEXT_AREA_MASK)
-        {
-            // The only extended register information that Mach will tell us about are the xmm register
-            // values. Both Windows and Mach store the registers in a packed layout (each of the 8 registers
-            // is 16 bytes) so we can simply memcpy them across.
-            memcpy(&State.fpu_xmm0, lpContext->ExtendedRegisters + CONTEXT_EXREG_XMM_OFFSET, 8 * 16);
-        }
-#endif // _X86_
-
 #if defined(_AMD64_) && defined(XSTATE_SUPPORTED)
         if (lpContext->ContextFlags & CONTEXT_XSTATE & CONTEXT_AREA_MASK)
         {