Take crash dumps for timed-out tests.
author Pat Gavlin <pagavlin@microsoft.com>
Thu, 16 Mar 2017 00:28:42 +0000 (17:28 -0700)
committer Pat Gavlin <pagavlin@microsoft.com>
Thu, 16 Mar 2017 17:03:12 +0000 (10:03 -0700)
This change updates the test infrastructure to collect crash dumps for
tests that time out. We've been seeing a number of tests that only time
out in CI (e.g. dotnet/coreclr#10076); hopefully this will help us root-cause the
timeouts.

Commit migrated from https://github.com/dotnet/coreclr/commit/6f30002fc917cdacd4d0565f05ded920ab17f830

src/coreclr/netci.groovy
src/coreclr/tests/runtest.cmd
src/coreclr/tests/src/Common/Coreclr.TestWrapper/Coreclr.TestWrapper.csproj
src/coreclr/tests/src/Common/Coreclr.TestWrapper/CoreclrTestWrapperLib.cs
src/coreclr/tests/src/Common/Desktop.Coreclr.TestWrapper/Desktop.Coreclr.TestWrapper.csproj

index 1399a3a..983ca2f 100755 (executable)
@@ -1515,7 +1515,7 @@ def static calculateBuildCommands(def newJob, def scenario, def branch, def isPR
                             gcTestArguments = "${scenario} sequential"
                         }
 
-                        runtestArguments = "${lowerConfiguration} ${arch} ${gcstressStr} ${crossgenStr} ${runcrossgentestsStr} ${runjitstressStr} ${runjitstressregsStr} ${runjitmioptsStr} ${runjitforcerelocsStr} ${runjitdisasmStr} ${gcTestArguments}"
+                        runtestArguments = "${lowerConfiguration} ${arch} ${gcstressStr} ${crossgenStr} ${runcrossgentestsStr} ${runjitstressStr} ${runjitstressregsStr} ${runjitmioptsStr} ${runjitforcerelocsStr} ${runjitdisasmStr} ${gcTestArguments} collectdumps"
 
                         if (Constants.jitStressModeScenarios.containsKey(scenario)) {
                             def stepScriptLocation = "%WORKSPACE%\\SetStressModes.bat"
index 89e2571..d112455 100644 (file)
@@ -44,6 +44,7 @@ set __LongGCTests=
 set __GCSimulatorTests=
 set __AgainstPackages=
 set __JitDisasm=
+set __CollectDumps=
 
 :Arg_Loop
 if "%1" == "" goto ArgsDone
@@ -84,6 +85,7 @@ if /i "%1" == "link"                  (set DoLink=true&set ILLINK=%2&shift&shift
 
 REM change it to COMPlus_GCStress when we stop using xunit harness
 if /i "%1" == "gcstresslevel"         (set __GCSTRESSLEVEL=%2&set __TestTimeout=1800000&shift&shift&goto Arg_Loop)
+if /i "%1" == "collectdumps"          (set __CollectDumps=true&shift&goto Arg_Loop)
 
 if /i not "%1" == "msbuildargs" goto SkipMsbuildArgs
 :: All the rest of the args will be collected and passed directly to msbuild.
@@ -207,12 +209,33 @@ if not exist %XunitTestBinBase% (
     echo %__MsgPrefix%Run "buildtest.cmd %__BuildArch% %__BuildType%" to build the tests first.
     exit /b 1
 )
+
+if "%__CollectDumps%"=="true" (
+    REM Comments inside this parenthesized block use REM rather than a double colon.
+    REM A double colon is parsed as a label, which is fragile inside blocks.
+
+    REM Install dumpling
+    set "__DumplingHelperPath=%__ProjectDir%\..\Tools\DumplingHelper.py"
+    python "!__DumplingHelperPath!" install_dumpling
+
+    REM Create the crash dump folder if necessary
+    set "__CrashDumpFolder=%tmp%\CoreCLRTestCrashDumps"
+    if not exist "!__CrashDumpFolder!" (
+        mkdir "!__CrashDumpFolder!"
+    )
+
+    REM Grab the current time before execution begins. This will be used to determine which crash dumps
+    REM will be uploaded. The helper path is quoted so that a project directory containing spaces works.
+    for /f "delims=" %%a in ('python "!__DumplingHelperPath!" get_timestamp') do @set __StartTime=%%a
+)
+
 echo %__MsgPrefix%CORE_ROOT that will be used is: %CORE_ROOT%
 echo %__MsgPrefix%Starting the test run ...
 
 set __BuildLogRootName=TestRunResults
 call :msbuild "%__ProjectFilesDir%\runtest.proj" /p:Runtests=true /clp:showcommandline
 
+REM Save the exit code of the test run. Running the dump collection helper below overwrites
+REM errorlevel, and the failure check that follows must see the result of the test run.
+set __TestRunExitCode=%errorlevel%
+
+if "%__CollectDumps%"=="true" (
+    python "%__DumplingHelperPath%" collect_dump %__TestRunExitCode% "%__CrashDumpFolder%" %__StartTime% "CoreCLR_Tests"
+)
+
+REM Restore errorlevel to the saved exit code of the test run.
+cmd /c exit /b %__TestRunExitCode%
+
 if errorlevel 1 (
     echo Test Run failed. Refer to the following:
     echo     Html report: %__TestRunHtmlLog%
index eb9a414..b25a6d6 100644 (file)
@@ -17,6 +17,7 @@
     <NuGetPackageImportStamp>7a9bfb7d</NuGetPackageImportStamp>
     <GenerateRunScript>false</GenerateRunScript>
     <CLRTestKind>BuildOnly</CLRTestKind>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
   <!-- Default configurations to help VS understand the configurations -->
   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
index 5c04827..e431f3a 100644 (file)
@@ -7,12 +7,89 @@ using System;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;
+using System.Runtime.InteropServices;
 using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
+using Microsoft.Win32.SafeHandles;
 
 namespace CoreclrTestLib
 {
+    /// <summary>
+    /// P/Invoke declarations for the DbgHelp.dll functionality used to write crash dumps.
+    /// </summary>
+    static class DbgHelp
+    {
+        /// <summary>
+        /// Options that control what <see cref="MiniDumpWriteDump"/> includes in a dump.
+        /// The members are individual bits and may be combined with bitwise OR; the names
+        /// follow the native MINIDUMP_TYPE enumeration.
+        /// </summary>
+        [Flags]
+        public enum MiniDumpType : int
+        {
+            MiniDumpNormal                          = 0x00000000,
+            MiniDumpWithDataSegs                    = 0x00000001,
+            MiniDumpWithFullMemory                  = 0x00000002,
+            MiniDumpWithHandleData                  = 0x00000004,
+            MiniDumpFilterMemory                    = 0x00000008,
+            MiniDumpScanMemory                      = 0x00000010,
+            MiniDumpWithUnloadedModules             = 0x00000020,
+            MiniDumpWithIndirectlyReferencedMemory  = 0x00000040,
+            MiniDumpFilterModulePaths               = 0x00000080,
+            MiniDumpWithProcessThreadData           = 0x00000100,
+            MiniDumpWithPrivateReadWriteMemory      = 0x00000200,
+            MiniDumpWithoutOptionalData             = 0x00000400,
+            MiniDumpWithFullMemoryInfo              = 0x00000800,
+            MiniDumpWithThreadInfo                  = 0x00001000,
+            MiniDumpWithCodeSegs                    = 0x00002000,
+            MiniDumpWithoutAuxiliaryState           = 0x00004000,
+            MiniDumpWithFullAuxiliaryState          = 0x00008000,
+            MiniDumpWithPrivateWriteCopyMemory      = 0x00010000,
+            MiniDumpIgnoreInaccessibleMemory        = 0x00020000,
+            MiniDumpWithTokenInformation            = 0x00040000,
+            MiniDumpWithModuleHeaders               = 0x00080000,
+            MiniDumpFilterTriage                    = 0x00100000,
+            // Mask covering every flag bit defined above.
+            MiniDumpValidTypeFlags                  = 0x001fffff
+        }
+
+        /// <summary>
+        /// Writes a dump of the given process to an already-open file.
+        /// </summary>
+        /// <param name="handle">Handle to the process to dump.</param>
+        /// <param name="processId">Identifier of the process to dump.</param>
+        /// <param name="file">Handle to the file that receives the dump.</param>
+        /// <param name="dumpType">Combination of <see cref="MiniDumpType"/> flags.</param>
+        /// <param name="exceptionParam">Optional exception information; IntPtr.Zero for none.</param>
+        /// <param name="userStreamParam">Optional user-defined streams; IntPtr.Zero for none.</param>
+        /// <param name="callbackParam">Optional callback information; IntPtr.Zero for none.</param>
+        /// <returns>true on success; false on failure (the last Win32 error is recorded).</returns>
+        [DllImport("DbgHelp.dll", SetLastError = true)]
+        public static extern bool MiniDumpWriteDump(IntPtr handle, int processId, SafeFileHandle file, MiniDumpType dumpType, IntPtr exceptionParam, IntPtr userStreamParam, IntPtr callbackParam);
+    }
+
+    /// <summary>
+    /// P/Invoke declarations for the kernel32.dll Tool Help functions used to enumerate processes.
+    /// </summary>
+    static class Kernel32
+    {
+        public const int MAX_PATH = 260;
+        public const int ERROR_NO_MORE_FILES = 0x12;
+
+        /// <summary>
+        /// Selects what <see cref="CreateToolhelp32Snapshot"/> captures. The members are
+        /// individual bits and may be combined with bitwise OR.
+        /// </summary>
+        [Flags]
+        public enum Toolhelp32Flags : uint
+        {
+            TH32CS_INHERIT = 0x80000000,
+            TH32CS_SNAPHEAPLIST = 0x00000001,
+            TH32CS_SNAPMODULE = 0x00000008,
+            TH32CS_SNAPMODULE32 = 0x00000010,
+            TH32CS_SNAPPROCESS = 0x00000002,
+            TH32CS_SNAPTHREAD = 0x00000004
+        }
+
+        /// <summary>
+        /// One process record as filled in by <see cref="Process32First"/> and
+        /// <see cref="Process32Next"/>. Field order and sizes must match the native
+        /// PROCESSENTRY32 layout, so do not reorder the fields.
+        /// </summary>
+        public unsafe struct ProcessEntry32
+        {
+            // Size of this structure in bytes; set by the caller before the first enumeration call.
+            public int Size;
+            public int Usage;
+            public int ProcessID;
+            public IntPtr DefaultHeapID;
+            public int ModuleID;
+            public int Threads;
+            public int ParentProcessID;
+            public int PriClassBase;
+            public int Flags;
+            // Inline buffer holding the executable name.
+            public fixed char ExeFile[MAX_PATH];
+        }
+
+        /// <summary>Closes an open native handle.</summary>
+        [DllImport("kernel32.dll")]
+        public static extern bool CloseHandle(IntPtr handle);
+
+        /// <summary>
+        /// Takes a snapshot of the items selected by <paramref name="flags"/>. The returned
+        /// handle must be released with <see cref="CloseHandle"/>.
+        /// </summary>
+        [DllImport("kernel32.dll", SetLastError = true)]
+        public static extern IntPtr CreateToolhelp32Snapshot(Toolhelp32Flags flags, int processId);
+
+        /// <summary>Retrieves the first process record of a snapshot; returns false on failure.</summary>
+        [DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Unicode)]
+        public static extern bool Process32First(IntPtr snapshot, ref ProcessEntry32 entry);
+
+        /// <summary>Retrieves the next process record of a snapshot; returns false when there are no more.</summary>
+        [DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Unicode)]
+        public static extern bool Process32Next(IntPtr snapshot, ref ProcessEntry32 entry);
+    }
+
     public class CoreclrTestWrapperLib
     {
         public const int EXIT_SUCCESS_CODE = 0;
@@ -22,6 +99,63 @@ namespace CoreclrTestLib
         public const int DEFAULT_TIMEOUT = 1000 * 60*10;
         public const string GC_STRESS_LEVEL = "__GCSTRESSLEVEL";
 
+        public const string COLLECT_DUMPS_ENVIRONMENT_VAR = "__CollectDumps";
+        public const string CRASH_DUMP_FOLDER_ENVIRONMENT_VAR = "__CrashDumpFolder";
+
+        /// <summary>
+        /// Writes a full-memory dump of <paramref name="process"/> to <paramref name="path"/>.
+        /// </summary>
+        /// <param name="process">The process to dump.</param>
+        /// <param name="path">Path of the dump file to write. An existing file is overwritten.</param>
+        /// <returns>true if the dump was written; false if MiniDumpWriteDump reported failure.</returns>
+        static bool CollectCrashDump(Process process, string path)
+        {
+            // FileMode.Create truncates an existing file. Dump names are derived from process ids,
+            // which get recycled, so a stale and larger dump at the same path must not leave
+            // trailing bytes behind the newly written one.
+            using (var crashDump = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.None))
+            {
+                var flags = DbgHelp.MiniDumpType.MiniDumpWithFullMemory | DbgHelp.MiniDumpType.MiniDumpIgnoreInaccessibleMemory;
+                return DbgHelp.MiniDumpWriteDump(process.Handle, process.Id, crashDump.SafeFileHandle, flags, IntPtr.Zero, IntPtr.Zero, IntPtr.Zero);
+            }
+        }
+
+        /// <summary>
+        /// Looks through a snapshot of the running processes for a direct child of
+        /// <paramref name="process"/> whose process name equals <paramref name="childName"/>
+        /// (ordinal, case-insensitive comparison).
+        /// </summary>
+        /// <param name="process">The parent process.</param>
+        /// <param name="childName">Name to match against <see cref="Process.ProcessName"/>.</param>
+        /// <param name="child">
+        /// Receives the first matching child process, or null when none is found. The caller
+        /// owns the returned object.
+        /// </param>
+        /// <returns>
+        /// true if a matching child was found; false if there is none or the snapshot could
+        /// not be created.
+        /// </returns>
+        static unsafe bool TryFindChildProcessByName(Process process, string childName, out Process child)
+        {
+            child = null;
+
+            // CreateToolhelp32Snapshot reports failure with INVALID_HANDLE_VALUE (-1), not a null handle.
+            IntPtr invalidHandleValue = new IntPtr(-1);
+
+            IntPtr snapshot = Kernel32.CreateToolhelp32Snapshot(Kernel32.Toolhelp32Flags.TH32CS_SNAPPROCESS, 0);
+            if (snapshot == invalidHandleValue || snapshot == IntPtr.Zero)
+            {
+                return false;
+            }
+
+            try
+            {
+                int ppid = process.Id;
+
+                // The native API requires the structure size to be filled in before the first call.
+                var processEntry = new Kernel32.ProcessEntry32 { Size = sizeof(Kernel32.ProcessEntry32) };
+
+                bool success = Kernel32.Process32First(snapshot, ref processEntry);
+                while (success)
+                {
+                    if (processEntry.ParentProcessID == ppid)
+                    {
+                        Process candidate = null;
+                        bool matched = false;
+                        try
+                        {
+                            candidate = Process.GetProcessById(processEntry.ProcessID);
+                            matched = candidate.ProcessName.Equals(childName, StringComparison.OrdinalIgnoreCase);
+                        }
+                        catch (Exception)
+                        {
+                            // Best effort: the process may have exited between the snapshot and
+                            // this lookup, or may not be accessible. Skip it and keep searching.
+                        }
+
+                        if (matched)
+                        {
+                            child = candidate;
+                            return true;
+                        }
+
+                        // Not the process we are looking for (or the lookup failed part-way):
+                        // release the Process object instead of leaking it.
+                        if (candidate != null)
+                        {
+                            candidate.Dispose();
+                        }
+                    }
+
+                    success = Kernel32.Process32Next(snapshot, ref processEntry);
+                }
+
+                return false;
+            }
+            finally
+            {
+                Kernel32.CloseHandle(snapshot);
+            }
+        }
+
         public int RunTest(string executable, string outputFile, string errorFile)
         {
             Debug.Assert(outputFile != errorFile);
@@ -39,6 +173,9 @@ namespace CoreclrTestLib
             string operatingSystem = System.Environment.GetEnvironmentVariable("OS");
             bool runningInWindows = (operatingSystem != null && operatingSystem.StartsWith("Windows"));
 
+            // We can't yet take crash dumps on non-Windows OSs for timed-out tests
+            bool collectCrashDumps = runningInWindows && Environment.GetEnvironmentVariable(COLLECT_DUMPS_ENVIRONMENT_VAR) != null;
+
             var outputStream = new FileStream(outputFile, FileMode.Create);
             var errorStream = new FileStream(errorFile, FileMode.Create);
 
@@ -92,6 +229,23 @@ namespace CoreclrTestLib
 
                     outputWriter.WriteLine("\ncmdLine:" + executable + " Timed Out");
                     errorWriter.WriteLine("\ncmdLine:" + executable + " Timed Out");
+
+                    if (collectCrashDumps)
+                    {
+                        string crashDumpFolder = Environment.GetEnvironmentVariable(CRASH_DUMP_FOLDER_ENVIRONMENT_VAR);
+                        if (crashDumpFolder != null)
+                        {
+                            Process childProcess;
+                            if (TryFindChildProcessByName(process, "corerun", out childProcess))
+                            {
+                                string crashDumpPath = Path.Combine(Path.GetFullPath(crashDumpFolder), string.Format("crashdump_{0}.dmp", childProcess.Id));
+                                if (CollectCrashDump(childProcess, crashDumpPath))
+                                {
+                                    Console.WriteLine("Collected crash dump {0}", crashDumpPath);
+                                }
+                            }
+                        }
+                    }
                 }
 
                outputWriter.WriteLine("Test Harness Exitcode is : " + exitCode.ToString());
index b70f332..c3cd97f 100644 (file)
@@ -14,6 +14,7 @@
     <SkipSigning>true</SkipSigning>
     <FileAlignment>512</FileAlignment>
     <CLRTestKind>BuildOnly</CLRTestKind>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
     <DebugSymbols>true</DebugSymbols>