From 37dae35dd08adfa97f514c822f4333019d821b00 Mon Sep 17 00:00:00 2001 From: Maoni Stephens Date: Fri, 5 Jul 2019 12:48:34 -0700 Subject: [PATCH] many core (dotnet/coreclr#25350) Commit migrated from https://github.com/dotnet/coreclr/commit/c7d12d2b91f79802a12484d4d56c41ba355b7058 --- src/coreclr/src/gc/env/gcenv.interlocked.h | 6 + src/coreclr/src/gc/env/gcenv.interlocked.inl | 31 + src/coreclr/src/gc/env/gcenv.os.h | 19 +- src/coreclr/src/gc/gc.cpp | 940 ++++++++++++++--- src/coreclr/src/gc/gcpriv.h | 35 +- src/coreclr/src/gc/unix/gcenv.unix.cpp | 8 +- src/coreclr/src/gc/windows/gcenv.windows.cpp | 83 +- src/coreclr/src/inc/utilcode.h | 3 + src/coreclr/src/tools/GCLogParser/App.config | 6 + .../tools/GCLogParser/Properties/AssemblyInfo.cs | 36 + src/coreclr/src/tools/GCLogParser/parse-hb-log.cs | 1099 ++++++++++++++++++++ .../src/tools/GCLogParser/parse-hb-log.csproj | 73 ++ src/coreclr/src/tools/GCLogParser/parse-hb-log.sln | 31 + src/coreclr/src/utilcode/util.cpp | 50 + src/coreclr/src/vm/gcenv.os.cpp | 94 +- 15 files changed, 2346 insertions(+), 168 deletions(-) create mode 100644 src/coreclr/src/tools/GCLogParser/App.config create mode 100644 src/coreclr/src/tools/GCLogParser/Properties/AssemblyInfo.cs create mode 100644 src/coreclr/src/tools/GCLogParser/parse-hb-log.cs create mode 100644 src/coreclr/src/tools/GCLogParser/parse-hb-log.csproj create mode 100644 src/coreclr/src/tools/GCLogParser/parse-hb-log.sln diff --git a/src/coreclr/src/gc/env/gcenv.interlocked.h b/src/coreclr/src/gc/env/gcenv.interlocked.h index 1da222d..eb73a36 100644 --- a/src/coreclr/src/gc/env/gcenv.interlocked.h +++ b/src/coreclr/src/gc/env/gcenv.interlocked.h @@ -78,6 +78,12 @@ public: template static T ExchangeAdd(T volatile *addend, T value); + template + static T ExchangeAdd64(T volatile* addend, T value); + + template + static T ExchangeAddPtr(T volatile* addend, T value); + // Performs an atomic compare-and-exchange operation on the specified values. 
// Parameters: // destination - value to be exchanged diff --git a/src/coreclr/src/gc/env/gcenv.interlocked.inl b/src/coreclr/src/gc/env/gcenv.interlocked.inl index 1df2700..401c4da 100644 --- a/src/coreclr/src/gc/env/gcenv.interlocked.inl +++ b/src/coreclr/src/gc/env/gcenv.interlocked.inl @@ -116,6 +116,37 @@ __forceinline T Interlocked::ExchangeAdd(T volatile *addend, T value) #endif } +template +__forceinline T Interlocked::ExchangeAdd64(T volatile* addend, T value) +{ +#ifdef _MSC_VER + static_assert(sizeof(int64_t) == sizeof(T), "Size of LONGLONG must be the same as size of T"); + return _InterlockedExchangeAdd64((int64_t*)addend, value); +#else + T result = __sync_fetch_and_add(addend, value); + ArmInterlockedOperationBarrier(); + return result; +#endif +} + +template +__forceinline T Interlocked::ExchangeAddPtr(T volatile* addend, T value) +{ +#ifdef _MSC_VER +#ifdef BIT64 + static_assert(sizeof(int64_t) == sizeof(T), "Size of LONGLONG must be the same as size of T"); + return _InterlockedExchangeAdd64((int64_t*)addend, value); +#else + static_assert(sizeof(long) == sizeof(T), "Size of long must be the same as size of T"); + return _InterlockedExchangeAdd((long*)addend, value); +#endif +#else + T result = __sync_fetch_and_add(addend, value); + ArmInterlockedOperationBarrier(); + return result; +#endif +} + // Perform an atomic AND operation on the specified values values // Parameters: // destination - the first operand and the destination diff --git a/src/coreclr/src/gc/env/gcenv.os.h b/src/coreclr/src/gc/env/gcenv.os.h index 393bd1f..e56321c 100644 --- a/src/coreclr/src/gc/env/gcenv.os.h +++ b/src/coreclr/src/gc/env/gcenv.os.h @@ -144,8 +144,10 @@ typedef void (*GCThreadFunction)(void* param); // Right now we support maximum 1024 procs - meaning that we will create at most // that many GC threads and GC heaps. #define MAX_SUPPORTED_CPUS 1024 +#define MAX_SUPPORTED_NODES 64 #else #define MAX_SUPPORTED_CPUS 64 +#define MAX_SUPPORTED_NODES 16 #endif // BIT64 // Add of processor indices used to store affinity. @@ -253,6 +255,7 @@ public: // size - size of the virtual memory range // alignment - requested memory alignment // flags - flags to control special settings like write watching + // node - the NUMA node to reserve memory on // Return: // Starting virtual address of the reserved range // Notes: @@ -264,7 +267,7 @@ public: // // Windows guarantees that the returned mapping will be aligned to the allocation // granularity. - static void* VirtualReserve(size_t size, size_t alignment, uint32_t flags); + static void* VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node = NUMA_NODE_UNDEFINED); // Release virtual memory range previously reserved using VirtualReserve // Parameters: @@ -360,6 +363,8 @@ public: // true if it has succeeded, false if it has failed static bool SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t dstProcNo); + static bool GetCurrentThreadIdealProc(uint16_t* procNo); + // Get numeric id of the current thread if possible on the // current platform. It is indended for logging purposes only. // Return: @@ -484,6 +489,15 @@ public: // Is NUMA support available static bool CanEnableGCNumaAware(); + // TODO: add Linux implementation. + // For no NUMA this returns false. + static bool GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node); + + // Is CPU Group enabled + // This only applies on Windows and only used by instrumentation but is on the + // interface due to LocalGC. 
+ static bool CanEnableGCCPUGroups(); + // Get processor number and optionally its NUMA node number for the specified heap number // Parameters: // heap_number - heap number to get the result for @@ -493,6 +507,9 @@ public: // true if it succeeded static bool GetProcessorForHeap(uint16_t heap_number, uint16_t* proc_no, uint16_t* node_no); + // For no CPU groups this returns false. + static bool GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group); + // Parse the confing string describing affinitization ranges and update the passed in affinitySet accordingly // Parameters: // config_string - string describing the affinitization range, platform specific diff --git a/src/coreclr/src/gc/gc.cpp b/src/coreclr/src/gc/gc.cpp index 0917527..f4d877f 100644 --- a/src/coreclr/src/gc/gc.cpp +++ b/src/coreclr/src/gc/gc.cpp @@ -56,6 +56,8 @@ BOOL bgc_heap_walk_for_etw_p = FALSE; #define MAX_PTR ((uint8_t*)(~(ptrdiff_t)0)) #define commit_min_th (16*OS_PAGE_SIZE) +static size_t smoothed_desired_per_heap = 0; + #ifdef SERVER_GC #define partial_size_th 100 #define num_partial_refs 64 @@ -205,6 +207,12 @@ size_t GetHighPrecisionTimeStamp() return (size_t)(ts / (qpf / 1000)); } + +uint64_t RawGetHighPrecisionTimeStamp() +{ + return (uint64_t)GCToOSInterface::QueryPerformanceCounter(); +} + #endif #ifdef GC_STATS @@ -436,6 +444,7 @@ void c_write (uint32_t& place, uint32_t value) } #ifndef DACCESS_COMPILE + // If every heap's gen2 or gen3 size is less than this threshold we will do a blocking GC. const size_t bgc_min_per_heap = 4*1024*1024; @@ -682,6 +691,7 @@ process_sync_log_stats() #ifdef MULTIPLE_HEAPS #ifndef DACCESS_COMPILE +uint32_t g_num_active_processors = 0; enum gc_join_stage { @@ -2442,6 +2452,7 @@ sorted_table* gc_heap::seg_table; #ifdef MULTIPLE_HEAPS GCEvent gc_heap::ee_suspend_event; +size_t gc_heap::min_gen0_balance_delta = 0; size_t gc_heap::min_balance_threshold = 0; #endif //MULTIPLE_HEAPS @@ -2537,6 +2548,7 @@ uint64_t gc_heap::entry_available_physical_mem = 0; size_t gc_heap::heap_hard_limit = 0; +bool affinity_config_specified_p = false; #ifdef BACKGROUND_GC GCEvent gc_heap::bgc_start_event; @@ -2576,6 +2588,7 @@ int gc_heap::spinlock_info_index = 0; spinlock_info gc_heap::last_spinlock_info[max_saved_spinlock_info + 8]; #endif //SPINLOCK_HISTORY +uint32_t gc_heap::fgn_maxgen_percent = 0; size_t gc_heap::fgn_last_alloc = 0; int gc_heap::generation_skip_ratio = 100; @@ -2834,6 +2847,9 @@ size_t gc_heap::eph_gen_starts_size = 0; heap_segment* gc_heap::segment_standby_list; bool gc_heap::use_large_pages_p = 0; size_t gc_heap::last_gc_index = 0; +#ifdef HEAP_BALANCE_INSTRUMENTATION +size_t gc_heap::last_gc_end_time_ms = 0; +#endif //HEAP_BALANCE_INSTRUMENTATION #ifdef SEG_MAPPING_TABLE size_t gc_heap::min_segment_size = 0; size_t gc_heap::min_segment_size_shr = 0; @@ -2859,8 +2875,6 @@ GCEvent gc_heap::full_gc_approach_event; GCEvent gc_heap::full_gc_end_event; -uint32_t gc_heap::fgn_maxgen_percent = 0; - uint32_t gc_heap::fgn_loh_percent = 0; #ifdef BACKGROUND_GC @@ -4283,10 +4297,10 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h // should only be called once assert (memory_details.initial_memory == 0); - memory_details.initial_memory = new (nothrow) imemory_data[num_heaps*2]; + memory_details.initial_memory = new (nothrow) imemory_data[num_heaps * 2]; if (memory_details.initial_memory == 0) { - dprintf (2, ("failed to reserve %Id bytes for imemory_data", num_heaps*2*sizeof(imemory_data))); + dprintf (2, ("failed to reserve %Id bytes for 
imemory_data", num_heaps * 2 * sizeof (imemory_data))); return FALSE; } @@ -4320,15 +4334,15 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h uint8_t* allatonce_block = (uint8_t*)virtual_alloc (requestedMemory, use_large_pages_p); if (allatonce_block) { - g_gc_lowest_address = allatonce_block; + g_gc_lowest_address = allatonce_block; g_gc_highest_address = allatonce_block + requestedMemory; memory_details.allocation_pattern = initial_memory_details::ALLATONCE; - for(size_t i = 0; i < memory_details.block_count; i++) + for (size_t i = 0; i < memory_details.block_count; i++) { - memory_details.initial_normal_heap[i].memory_base = allatonce_block + (i*normal_size); + memory_details.initial_normal_heap[i].memory_base = allatonce_block + (i * normal_size); memory_details.initial_large_heap[i].memory_base = allatonce_block + - (memory_details.block_count*normal_size) + (i*large_size); + (memory_details.block_count * normal_size) + (i * large_size); reserve_success = TRUE; } } @@ -4344,13 +4358,13 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h if (b2) { memory_details.allocation_pattern = initial_memory_details::TWO_STAGE; - g_gc_lowest_address = min(b1,b2); - g_gc_highest_address = max(b1 + memory_details.block_count*normal_size, - b2 + memory_details.block_count*large_size); - for(size_t i = 0; i < memory_details.block_count; i++) + g_gc_lowest_address = min (b1, b2); + g_gc_highest_address = max (b1 + memory_details.block_count * normal_size, + b2 + memory_details.block_count * large_size); + for (size_t i = 0; i < memory_details.block_count; i++) { - memory_details.initial_normal_heap[i].memory_base = b1 + (i*normal_size); - memory_details.initial_large_heap[i].memory_base = b2 + (i*large_size); + memory_details.initial_normal_heap[i].memory_base = b1 + (i * normal_size); + memory_details.initial_large_heap[i].memory_base = b2 + (i * large_size); reserve_success = TRUE; } } @@ -4362,28 +4376,28 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h } } - if ((b2==NULL) && ( memory_details.block_count > 1)) + if ((b2 == NULL) && (memory_details.block_count > 1)) { memory_details.allocation_pattern = initial_memory_details::EACH_BLOCK; - imemory_data *current_block = memory_details.initial_memory; - for(size_t i = 0; i < (memory_details.block_count*2); i++, current_block++) + imemory_data* current_block = memory_details.initial_memory; + for (size_t i = 0; i < (memory_details.block_count * 2); i++, current_block++) { size_t block_size = ((i < memory_details.block_count) ? - memory_details.block_size_normal : - memory_details.block_size_large); + memory_details.block_size_normal : + memory_details.block_size_large); current_block->memory_base = (uint8_t*)virtual_alloc (block_size, use_large_pages_p); if (current_block->memory_base == 0) { // Free the blocks that we've allocated so far current_block = memory_details.initial_memory; - for(size_t j = 0; j < i; j++, current_block++){ - if (current_block->memory_base != 0){ + for (size_t j = 0; j < i; j++, current_block++) { + if (current_block->memory_base != 0) { block_size = ((j < memory_details.block_count) ? 
- memory_details.block_size_normal : - memory_details.block_size_large); - virtual_free (current_block->memory_base , block_size); + memory_details.block_size_normal : + memory_details.block_size_large); + virtual_free (current_block->memory_base, block_size); } } reserve_success = FALSE; @@ -4392,8 +4406,8 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h else { if (current_block->memory_base < g_gc_lowest_address) - g_gc_lowest_address = current_block->memory_base; - if (((uint8_t *) current_block->memory_base + block_size) > g_gc_highest_address) + g_gc_lowest_address = current_block->memory_base; + if (((uint8_t*)current_block->memory_base + block_size) > g_gc_highest_address) g_gc_highest_address = (current_block->memory_base + block_size); } reserve_success = TRUE; @@ -4507,7 +4521,9 @@ void* virtual_alloc (size_t size, bool use_large_pages_p) } #endif // !FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP - void* prgmem = use_large_pages_p ? GCToOSInterface::VirtualReserveAndCommitLargePages(requested_size) : GCToOSInterface::VirtualReserve(requested_size, card_size * card_word_width, flags); + void* prgmem = use_large_pages_p ? + GCToOSInterface::VirtualReserveAndCommitLargePages(requested_size) : + GCToOSInterface::VirtualReserve(requested_size, card_size * card_word_width, flags); void *aligned_mem = prgmem; // We don't want (prgmem + size) to be right at the end of the address space @@ -5053,9 +5069,18 @@ extern "C" uint64_t __rdtsc(); } #endif //_TARGET_X86_ +// We may not be on contiguous numa nodes so need to store +// the node index as well. +struct node_heap_count +{ + int node_no; + int heap_count; +}; + class heap_select { heap_select() {} +public: static uint8_t* sniff_buffer; static unsigned n_sniff_buffers; static unsigned cur_sniff_index; @@ -5063,7 +5088,12 @@ class heap_select static uint16_t proc_no_to_heap_no[MAX_SUPPORTED_CPUS]; static uint16_t heap_no_to_proc_no[MAX_SUPPORTED_CPUS]; static uint16_t heap_no_to_numa_node[MAX_SUPPORTED_CPUS]; + static uint16_t proc_no_to_numa_node[MAX_SUPPORTED_CPUS]; static uint16_t numa_node_to_heap_map[MAX_SUPPORTED_CPUS+4]; + // Note this is the total numa nodes GC heaps are on. There might be + // more on the machine if GC threads aren't using all of them. + static uint16_t total_numa_nodes; + static node_heap_count heaps_on_node[MAX_SUPPORTED_NODES]; static int access_time(uint8_t *sniff_buffer, int heap_number, unsigned sniff_index, unsigned n_sniff_buffers) { @@ -5104,14 +5134,11 @@ public: return TRUE; } - static void init_cpu_mapping(gc_heap * /*heap*/, int heap_number) + static void init_cpu_mapping(int heap_number) { if (GCToOSInterface::CanGetCurrentProcessorNumber()) { - uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber() % gc_heap::n_heaps; - // We can safely cast heap_number to a uint16_t 'cause GetCurrentProcessCpuCount - // only returns up to MAX_SUPPORTED_CPUS procs right now. We only ever create at most - // MAX_SUPPORTED_CPUS GC threads. 
+ uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber(); proc_no_to_heap_no[proc_no] = (uint16_t)heap_number; } } @@ -5125,12 +5152,15 @@ public: sniff_buffer[(1 + heap_number*n_sniff_buffers + sniff_index)*HS_CACHE_LINE_SIZE] &= 1; } - static int select_heap(alloc_context* acontext, int /*hint*/) + static int select_heap(alloc_context* acontext) { UNREFERENCED_PARAMETER(acontext); // only referenced by dprintf if (GCToOSInterface::CanGetCurrentProcessorNumber()) - return proc_no_to_heap_no[GCToOSInterface::GetCurrentProcessorNumber() % gc_heap::n_heaps]; + { + uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber(); + return proc_no_to_heap_no[proc_no]; + } unsigned sniff_index = Interlocked::Increment(&cur_sniff_index); sniff_index %= n_sniff_buffers; @@ -5175,6 +5205,11 @@ public: return GCToOSInterface::CanGetCurrentProcessorNumber(); } + static uint16_t find_heap_no_from_proc_no(uint16_t proc_no) + { + return proc_no_to_heap_no[proc_no]; + } + static uint16_t find_proc_no_from_heap_no(int heap_number) { return heap_no_to_proc_no[heap_number]; @@ -5190,33 +5225,102 @@ public: return heap_no_to_numa_node[heap_number]; } - static void set_numa_node_for_heap(int heap_number, uint16_t numa_node) + static uint16_t find_numa_node_from_proc_no (uint16_t proc_no) + { + return proc_no_to_numa_node[proc_no]; + } + + static void set_numa_node_for_heap_and_proc(int heap_number, uint16_t proc_no, uint16_t numa_node) { heap_no_to_numa_node[heap_number] = numa_node; + proc_no_to_numa_node[proc_no] = numa_node; } static void init_numa_node_to_heap_map(int nheaps) - { // Called right after GCHeap::Init() for each heap + { + // Called right after GCHeap::Init() for each heap // For each NUMA node used by the heaps, the // numa_node_to_heap_map[numa_node] is set to the first heap number on that node and // numa_node_to_heap_map[numa_node + 1] is set to the first heap number not on that node - // Set the start of the heap number range for the first NUMA node numa_node_to_heap_map[heap_no_to_numa_node[0]] = 0; + total_numa_nodes = 0; + memset (heaps_on_node, 0, sizeof (heaps_on_node)); + heaps_on_node[0].node_no = heap_no_to_numa_node[0]; + heaps_on_node[0].heap_count = 1; for (int i=1; i < nheaps; i++) { if (heap_no_to_numa_node[i] != heap_no_to_numa_node[i-1]) { + total_numa_nodes++; + heaps_on_node[total_numa_nodes].node_no = heap_no_to_numa_node[i]; + // Set the end of the heap number range for the previous NUMA node numa_node_to_heap_map[heap_no_to_numa_node[i-1] + 1] = // Set the start of the heap number range for the current NUMA node numa_node_to_heap_map[heap_no_to_numa_node[i]] = (uint16_t)i; } + (heaps_on_node[total_numa_nodes].heap_count)++; } // Set the end of the heap range for the last NUMA node numa_node_to_heap_map[heap_no_to_numa_node[nheaps-1] + 1] = (uint16_t)nheaps; //mark the end with nheaps + total_numa_nodes++; + } + + // TODO: curently this doesn't work with GCHeapAffinitizeMask/GCHeapAffinitizeRanges + // because the heaps may not be on contiguous active procs. + // + // This is for scenarios where GCHeapCount is specified as something like + // (g_num_active_processors - 2) to allow less randomization to the Server GC threads. + // In this case we want to assign the right heaps to those procs, ie if they share + // the same numa node we want to assign local heaps to those procs. Otherwise we + // let the heap balancing mechanism take over for now. 
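// As a concrete illustration (numbers hypothetical): on a 2-node machine with 32 active
// procs and GCHeapCount=30, the two procs that did not get a dedicated heap are walked
// through the heap range of their own NUMA node, e.g. proc_no_to_heap_no[30] gets the first
// heap on proc 30's node and proc_no_to_heap_no[31] the next one, rather than every such
// proc falling back to heap 0.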
+ static void distribute_other_procs() + { + if (affinity_config_specified_p) + return; + + uint16_t proc_no = 0; + uint16_t node_no = 0; + bool res = false; + int start_heap = -1; + int end_heap = -1; + int current_node_no = -1; + int current_heap_on_node = -1; + + for (int i = gc_heap::n_heaps; i < (int)g_num_active_processors; i++) + { + if (!GCToOSInterface::GetProcessorForHeap (i, &proc_no, &node_no)) + break; + + int start_heap = (int)numa_node_to_heap_map[node_no]; + int end_heap = (int)(numa_node_to_heap_map[node_no + 1]); + + if ((end_heap - start_heap) > 0) + { + if (node_no == current_node_no) + { + // We already iterated through all heaps on this node, don't add more procs to these + // heaps. + if (current_heap_on_node >= end_heap) + { + continue; + } + } + else + { + current_node_no = node_no; + current_heap_on_node = start_heap; + } + + proc_no_to_heap_no[proc_no] = current_heap_on_node; + proc_no_to_numa_node[proc_no] = node_no; + + current_heap_on_node++; + } + } } static void get_heap_range_for_heap(int hn, int* start, int* end) @@ -5225,6 +5329,43 @@ public: *start = (int)numa_node_to_heap_map[numa_node]; *end = (int)(numa_node_to_heap_map[numa_node+1]); } + + // This gets the next valid numa node index starting at current_index+1. + // It assumes that current_index is a valid node index. + // If current_index+1 is at the end this will start at the beginning. So this will + // always return a valid node index, along with that node's start/end heaps. + static uint16_t get_next_numa_node (uint16_t current_index, int* start, int* end) + { + int start_index = current_index + 1; + int nheaps = gc_heap::n_heaps; + + bool found_node_with_heaps_p = false; + do + { + int start_heap = (int)numa_node_to_heap_map[start_index]; + int end_heap = (int)numa_node_to_heap_map[start_index + 1]; + if (start_heap == nheaps) + { + // This is the last node. + start_index = 0; + continue; + } + + if ((end_heap - start_heap) == 0) + { + // This node has no heaps. + start_index++; + } + else + { + found_node_with_heaps_p = true; + *start = start_heap; + *end = end_heap; + } + } while (!found_node_with_heaps_p); + + return start_index; + } }; uint8_t* heap_select::sniff_buffer; unsigned heap_select::n_sniff_buffers; @@ -5232,7 +5373,318 @@ unsigned heap_select::cur_sniff_index; uint16_t heap_select::proc_no_to_heap_no[MAX_SUPPORTED_CPUS]; uint16_t heap_select::heap_no_to_proc_no[MAX_SUPPORTED_CPUS]; uint16_t heap_select::heap_no_to_numa_node[MAX_SUPPORTED_CPUS]; +uint16_t heap_select::proc_no_to_numa_node[MAX_SUPPORTED_CPUS]; uint16_t heap_select::numa_node_to_heap_map[MAX_SUPPORTED_CPUS+4]; +uint16_t heap_select::total_numa_nodes; +node_heap_count heap_select::heaps_on_node[MAX_SUPPORTED_NODES]; + +#ifdef HEAP_BALANCE_INSTRUMENTATION +// This records info we use to look at effect of different strategies +// for heap balancing. +struct heap_balance_info +{ + uint64_t timestamp; + // This also encodes when we detect the thread runs on + // different proc during a balance attempt. Sometimes + // I observe this happens multiple times during one attempt! + // If this happens, I just record the last proc we observe + // and set MSB. + int tid; + // This records the final alloc_heap for the thread. + // + // This also encodes the reason why we needed to set_home_heap + // in balance_heaps. + // If we set it because the home heap is not the same as the proc, + // we set MSB. + // + // If we set ideal proc, we set the 2nd MSB. 
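// Illustration of this encoding (values hypothetical, assuming a 32-bit int): a record whose
// final alloc_heap is 5, logged because the home heap differed from the proc's heap, is stored
// as 5 | (1 << 31) == 0x80000005; if we also set the ideal proc it becomes
// 5 | (1 << 31) | (1 << 30) == 0xC0000005. hb_log_balance_activities masks these bits back off
// before printing.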
+ int alloc_heap; + int ideal_proc_no; +}; + +// This means inbetween each GC we can log at most this many entries per proc. +// This is usually enough. Most of the time we only need to log something every 128k +// of allocations in balance_heaps and gen0 budget is <= 200mb. +#define default_max_hb_heap_balance_info 4096 + +struct heap_balance_info_proc +{ + int count; + int index; + heap_balance_info hb_info[default_max_hb_heap_balance_info]; +}; + +struct heap_balance_info_numa +{ + heap_balance_info_proc* hb_info_procs; +}; + +uint64_t start_raw_ts = 0; +bool cpu_group_enabled_p = false; +uint32_t procs_per_numa_node = 0; +uint16_t total_numa_nodes_on_machine = 0; +uint32_t procs_per_cpu_group = 0; +uint16_t total_cpu_groups_on_machine = 0; +// Note this is still on one of the numa nodes, so we'll incur a remote access +// no matter what. +heap_balance_info_numa* hb_info_numa_nodes = NULL; + +// TODO: This doesn't work for multiple nodes per CPU group yet. +int get_proc_index_numa (int proc_no, int* numa_no) +{ + if (total_numa_nodes_on_machine == 1) + { + *numa_no = 0; + return proc_no; + } + else + { + if (cpu_group_enabled_p) + { + // see vm\gcenv.os.cpp GroupProcNo implementation. + *numa_no = proc_no >> 6; + return (proc_no % 64); + } + else + { + *numa_no = proc_no / procs_per_numa_node; + return (proc_no % procs_per_numa_node); + } + } +} + +// We could consider optimizing it so we don't need to get the tid +// everytime but it's not very expensive to get. +void add_to_hb_numa ( + int proc_no, + int ideal_proc_no, + int alloc_heap, + bool multiple_procs_p, + bool alloc_count_p, + bool set_ideal_p) +{ + int tid = (int)GCToOSInterface::GetCurrentThreadIdForLogging (); + uint64_t timestamp = RawGetHighPrecisionTimeStamp (); + + int saved_proc_no = proc_no; + int numa_no = -1; + proc_no = get_proc_index_numa (proc_no, &numa_no); + + heap_balance_info_numa* hb_info_numa_node = &hb_info_numa_nodes[numa_no]; + + heap_balance_info_proc* hb_info_proc = &(hb_info_numa_node->hb_info_procs[proc_no]); + int index = hb_info_proc->index; + int count = hb_info_proc->count; + + if (index == count) + { + // Too much info inbetween GCs. This can happen if the thread is scheduled on a different + // processor very often so it caused us to log many entries due to that reason. You could + // increase default_max_hb_heap_balance_info but this usually indicates a problem that + // should be investigated. 
+ dprintf (HEAP_BALANCE_LOG, ("too much info between GCs, already logged %d entries", index)); + GCToOSInterface::DebugBreak (); + } + heap_balance_info* hb_info = &(hb_info_proc->hb_info[index]); + + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP[p%3d->%3d(i:%3d), N%d] #%4d: %I64d, tid %d, ah: %d, m: %d, p: %d, i: %d", + saved_proc_no, proc_no, ideal_proc_no, numa_no, index, + (timestamp - start_raw_ts), tid, alloc_heap, (int)multiple_procs_p, (int)(!alloc_count_p), (int)set_ideal_p)); + + if (multiple_procs_p) + { + tid |= (1 << (sizeof (tid) * 8 - 1)); + } + + if (!alloc_count_p) + { + alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 1)); + } + + if (set_ideal_p) + { + alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 2)); + } + + hb_info->timestamp = timestamp; + hb_info->tid = tid; + hb_info->alloc_heap = alloc_heap; + hb_info->ideal_proc_no = ideal_proc_no; + (hb_info_proc->index)++; +} + +const int hb_log_buffer_size = 1024; +static char hb_log_buffer[hb_log_buffer_size]; +int last_hb_recorded_gc_index = -1; +#endif //HEAP_BALANCE_INSTRUMENTATION + +// This logs what we recorded in balance_heaps +// The format for this is +// +// [ms since last GC end] +// [cpu index] +// all elements we stored before this GC for this CPU in the format +// timestamp,tid, alloc_heap_no +// repeat this for each CPU +// +// the timestamp here is just the result of calling QPC, +// it's not converted to ms. The conversion will be done when we process +// the log. +void gc_heap::hb_log_balance_activities() +{ +#ifdef HEAP_BALANCE_INSTRUMENTATION + char* log_buffer = hb_log_buffer; + + size_t now = GetHighPrecisionTimeStamp (); + size_t time_since_last_gc_ms = now - last_gc_end_time_ms; + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP%Id - %Id = %Id", now, last_gc_end_time_ms, time_since_last_gc_ms)); + + // We want to get the min and the max timestamp for all procs because it helps with our post processing + // to know how big an array to allocate to display the history inbetween the GCs. + uint64_t min_timestamp = 0xffffffffffffffff; + uint64_t max_timestamp = 0; + + for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++) + { + heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs; + for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++) + { + heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index]; + int total_entries_on_proc = hb_info_proc->index; + + if (total_entries_on_proc > 0) + { + min_timestamp = min (min_timestamp, hb_info_proc->hb_info[0].timestamp); + max_timestamp = max (max_timestamp, hb_info_proc->hb_info[total_entries_on_proc - 1].timestamp); + } + } + } + + dprintf (HEAP_BALANCE_LOG, ("[GCA#%Id %Id-%I64d-%I64d]", + settings.gc_index, time_since_last_gc_ms, (min_timestamp - start_raw_ts), (max_timestamp - start_raw_ts))); + + if (last_hb_recorded_gc_index == (int)settings.gc_index) + { + GCToOSInterface::DebugBreak (); + } + + last_hb_recorded_gc_index = (int)settings.gc_index; + + // When we print out the proc index we need to convert it to the actual proc index (this is contiguous). + // It helps with post processing. 
+ for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++) + { + heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs; + for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++) + { + heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index]; + int total_entries_on_proc = hb_info_proc->index; + if (total_entries_on_proc > 0) + { + int total_exec_time_ms = (int)((hb_info_proc->hb_info[total_entries_on_proc - 1].timestamp - hb_info_proc->hb_info[0].timestamp) / (qpf / 1000)); + dprintf (HEAP_BALANCE_LOG, ("[p%d]-%d-%dms", (proc_index + numa_node_index * procs_per_numa_node), total_entries_on_proc, total_exec_time_ms)); + } + + for (int i = 0; i < hb_info_proc->index; i++) + { + heap_balance_info* hb_info = &hb_info_proc->hb_info[i]; + bool multiple_procs_p = false; + bool alloc_count_p = true; + bool set_ideal_p = false; + int tid = hb_info->tid; + int alloc_heap = hb_info->alloc_heap; + + if (tid & (1 << (sizeof (tid) * 8 - 1))) + { + multiple_procs_p = true; + tid &= ~(1 << (sizeof (tid) * 8 - 1)); + } + + if (alloc_heap & (1 << (sizeof (alloc_heap) * 8 - 1))) + { + alloc_count_p = false; + alloc_heap &= ~(1 << (sizeof (alloc_heap) * 8 - 1)); + } + + if (alloc_heap & (1 << (sizeof (alloc_heap) * 8 - 2))) + { + set_ideal_p = true; + alloc_heap &= ~(1 << (sizeof (alloc_heap) * 8 - 2)); + } + + // TODO - This assumes ideal proc is in the same cpu group which is not true + // when we don't have CPU groups. + int ideal_proc_no = hb_info->ideal_proc_no; + int ideal_node_no = -1; + ideal_proc_no = get_proc_index_numa (ideal_proc_no, &ideal_node_no); + ideal_proc_no = ideal_proc_no + ideal_node_no * procs_per_numa_node; + + dprintf (HEAP_BALANCE_LOG, ("%I64d,%d,%d,%d%s%s%s", + (hb_info->timestamp - start_raw_ts), + tid, + ideal_proc_no, + (int)alloc_heap, + (multiple_procs_p ? "|m" : ""), (!alloc_count_p ? "|p" : ""), (set_ideal_p ? "|i" : ""))); + } + } + } + + for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++) + { + heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs; + for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++) + { + heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index]; + hb_info_proc->index = 0; + } + } +#endif //HEAP_BALANCE_INSTRUMENTATION +} + +// The format for this is +// +// [GC_alloc_mb] +// h0_new_alloc, h1_new_alloc, ... +// +void gc_heap::hb_log_new_allocation() +{ +#ifdef HEAP_BALANCE_INSTRUMENTATION + char* log_buffer = hb_log_buffer; + + int desired_alloc_mb = (int)(dd_desired_allocation (g_heaps[0]->dynamic_data_of (0)) / 1024 / 1024); + + int buffer_pos = sprintf_s (hb_log_buffer, hb_log_buffer_size, "[GC_alloc_mb]\n"); + for (int numa_node_index = 0; numa_node_index < heap_select::total_numa_nodes; numa_node_index++) + { + int node_allocated_mb = 0; + + // I'm printing out the budget here instead of the numa node index so we know how much + // of the budget we consumed. 
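// With the format strings below, each node's line comes out roughly as
// [N#200]12,13,11,12,
// i.e. the gen0 budget in MB where the node index would otherwise go, followed by the MB
// already allocated on each heap of that node (numbers hypothetical).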
+ buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "[N#%3d]", + //numa_node_index); + desired_alloc_mb); + + int heaps_on_node = heap_select::heaps_on_node[numa_node_index].heap_count; + + for (int heap_index = 0; heap_index < heaps_on_node; heap_index++) + { + int actual_heap_index = heap_index + numa_node_index * heaps_on_node; + gc_heap* hp = g_heaps[actual_heap_index]; + dynamic_data* dd0 = hp->dynamic_data_of (0); + int allocated_mb = (int)((dd_desired_allocation (dd0) - dd_new_allocation (dd0)) / 1024 / 1024); + node_allocated_mb += allocated_mb; + buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "%d,", + allocated_mb); + } + + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPN#%d a %dmb(%dmb)", numa_node_index, node_allocated_mb, desired_alloc_mb)); + + buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "\n"); + } + + dprintf (HEAP_BALANCE_LOG, ("%s", hb_log_buffer)); +#endif //HEAP_BALANCE_INSTRUMENTATION +} BOOL gc_heap::create_thread_support (unsigned number_of_heaps) { @@ -5274,23 +5726,30 @@ void gc_heap::destroy_thread_support () } } -void set_thread_affinity_for_heap(int heap_number) +bool get_proc_and_numa_for_heap (int heap_number) { uint16_t proc_no; uint16_t node_no; - if (GCToOSInterface::GetProcessorForHeap(heap_number, &proc_no, &node_no)) + bool res = GCToOSInterface::GetProcessorForHeap (heap_number, &proc_no, &node_no); + if (res) { - heap_select::set_proc_no_for_heap(heap_number, proc_no); + heap_select::set_proc_no_for_heap (heap_number, proc_no); if (node_no != NUMA_NODE_UNDEFINED) { - heap_select::set_numa_node_for_heap(heap_number, node_no); - } - if (!GCToOSInterface::SetThreadAffinity(proc_no)) - { - dprintf(1, ("Failed to set thread affinity for server GC thread")); + heap_select::set_numa_node_for_heap_and_proc (heap_number, proc_no, node_no); } } + + return res; +} + +void set_thread_affinity_for_heap (int heap_number, uint16_t proc_no) +{ + if (!GCToOSInterface::SetThreadAffinity (proc_no)) + { + dprintf (1, ("Failed to set thread affinity for GC thread %d on proc #%d", heap_number, proc_no)); + } } bool gc_heap::create_gc_thread () @@ -5308,7 +5767,7 @@ void gc_heap::gc_thread_function () assert (gc_start_event.IsValid()); dprintf (3, ("gc thread started")); - heap_select::init_cpu_mapping(this, heap_number); + heap_select::init_cpu_mapping(heap_number); while (1) { @@ -5430,7 +5889,7 @@ bool gc_heap::virtual_alloc_commit_for_heap (void* addr, size_t size, int h_numb if (GCToOSInterface::CanEnableGCNumaAware()) { uint16_t numa_node = heap_select::find_numa_node_from_heap_no(h_number); - if (GCToOSInterface::VirtualCommit(addr, size, numa_node)) + if (GCToOSInterface::VirtualCommit (addr, size, numa_node)) return true; } } @@ -9029,6 +9488,7 @@ retry: inline size_t my_get_size (Object* ob) { MethodTable* mT = header(ob)->GetMethodTable(); + return (mT->GetBaseSize() + (mT->HasComponentSize() ? 
((size_t)((CObjectHeader*)ob)->GetNumComponents() * mT->RawGetComponentSize()) : 0)); @@ -9317,8 +9777,8 @@ void gc_heap::decommit_heap_segment_pages (heap_segment* seg, size -= max (extra_space, 32*OS_PAGE_SIZE); virtual_decommit (page_start, size, heap_number); - dprintf (3, ("Decommitting heap segment [%Ix, %Ix[(%d)", - (size_t)page_start, + dprintf (3, ("Decommitting heap segment [%Ix, %Ix[(%d)", + (size_t)page_start, (size_t)(page_start + size), size)); heap_segment_committed (seg) = page_start; @@ -10025,7 +10485,7 @@ HRESULT gc_heap::initialize_gc (size_t segment_size, check_commit_cs.Initialize(); } - if (!reserve_initial_memory(segment_size,heap_size,block_count,use_large_pages_p)) + if (!reserve_initial_memory (segment_size,heap_size,block_count,use_large_pages_p)) return E_OUTOFMEMORY; #ifdef CARD_BUNDLE @@ -10182,7 +10642,6 @@ gc_heap::init_semi_shared() goto cleanup; } - fgn_maxgen_percent = 0; fgn_loh_percent = 0; full_gc_approach_event_set = false; @@ -10315,7 +10774,7 @@ gc_heap::wait_for_gc_done(int32_t timeOut) while (gc_heap::gc_started) { #ifdef MULTIPLE_HEAPS - wait_heap = GCHeap::GetHeap(heap_select::select_heap(NULL, 0))->pGenGCHeap; + wait_heap = GCHeap::GetHeap(heap_select::select_heap(NULL))->pGenGCHeap; dprintf(2, ("waiting for the gc_done_event on heap %d", wait_heap->heap_number)); #endif // MULTIPLE_HEAPS @@ -10680,6 +11139,7 @@ gc_heap::init_gc_heap (int h_number) allocation_running_amount = dd_min_size (dynamic_data_of (0)); #endif //!MULTIPLE_HEAPS + fgn_maxgen_percent = 0; fgn_last_alloc = dd_min_size (dynamic_data_of (0)); mark* arr = new (nothrow) (mark [MARK_STACK_INITIAL_LENGTH]); @@ -10721,8 +11181,7 @@ gc_heap::init_gc_heap (int h_number) #endif //MARK_ARRAY #ifdef MULTIPLE_HEAPS - //register the heap in the heaps array - + get_proc_and_numa_for_heap (heap_number); if (!create_gc_thread ()) return 0; @@ -11654,12 +12113,13 @@ size_t gc_heap::new_allocation_limit (size_t size, size_t physical_limit, int ge dynamic_data* dd = dynamic_data_of (gen_number); ptrdiff_t new_alloc = dd_new_allocation (dd); assert (new_alloc == (ptrdiff_t)Align (new_alloc, - get_alignment_constant (!(gen_number == (max_generation+1))))); + get_alignment_constant (!(gen_number == (max_generation + 1))))); ptrdiff_t logical_limit = max (new_alloc, (ptrdiff_t)size); size_t limit = min (logical_limit, (ptrdiff_t)physical_limit); assert (limit == Align (limit, get_alignment_constant (!(gen_number == (max_generation+1))))); dd_new_allocation (dd) = (new_alloc - limit); + return limit; } @@ -11917,7 +12377,13 @@ void gc_heap::send_full_gc_notification (int gen_num, BOOL due_to_alloc_p) wait_full_gc_status gc_heap::full_gc_wait (GCEvent *event, int time_out_ms) { - if (fgn_maxgen_percent == 0) + uint32_t maxgen_percent = 0; +#ifdef MULTIPLE_HEAPS + maxgen_percent = g_heaps[0]->fgn_maxgen_percent; +#else + maxgen_percent = fgn_maxgen_percent; +#endif //MULTIPLE_HEAPS + if (maxgen_percent == 0) { return wait_full_gc_na; } @@ -11926,7 +12392,7 @@ wait_full_gc_status gc_heap::full_gc_wait (GCEvent *event, int time_out_ms) if ((wait_result == WAIT_OBJECT_0) || (wait_result == WAIT_TIMEOUT)) { - if (fgn_maxgen_percent == 0) + if (maxgen_percent == 0) { return wait_full_gc_cancelled; } @@ -12351,10 +12817,9 @@ found_fit: VolatileStore(((void**)allocated - 1), (void*)0); //clear the sync block #endif //VERIFY_HEAP && _DEBUG - dprintf (3, ("found fit at end of seg: %Ix", old_alloc)); - uint8_t* old_alloc; old_alloc = allocated; + dprintf (3, ("found fit at end of seg: %Ix", old_alloc)); 
#ifdef BACKGROUND_GC if (cookie != -1) @@ -12471,7 +12936,7 @@ BOOL gc_heap::trigger_ephemeral_gc (gc_reason gr) BOOL did_full_compact_gc = FALSE; - dprintf (2, ("triggering a gen1 GC")); + dprintf (1, ("h%d triggering a gen1 GC", heap_number)); size_t last_full_compact_gc_count = get_full_compact_gc_count(); vm_heap->GarbageCollectGeneration(max_generation - 1, gr); @@ -13443,7 +13908,7 @@ allocation_state gc_heap::try_allocate_more_space (alloc_context* acontext, size #ifdef SYNCHRONIZATION_STATS bad_suspension++; #endif //SYNCHRONIZATION_STATS - dprintf (/*100*/ 2, ("running out of budget on gen%d, gc", gen_number)); + dprintf (2, ("h%d running out of budget on gen%d, gc", heap_number, gen_number)); if (!settings.concurrent || (gen_number == 0)) { @@ -13497,70 +13962,138 @@ void gc_heap::balance_heaps (alloc_context* acontext) { if (acontext->alloc_count == 0) { - acontext->set_home_heap(GCHeap::GetHeap( heap_select::select_heap(acontext, 0) )); - gc_heap* hp = acontext->get_home_heap()->pGenGCHeap; - dprintf (3, ("First allocation for context %Ix on heap %d\n", (size_t)acontext, (size_t)hp->heap_number)); - acontext->set_alloc_heap(acontext->get_home_heap()); + int home_hp_num = heap_select::select_heap (acontext); + acontext->set_home_heap (GCHeap::GetHeap (home_hp_num)); + gc_heap* hp = acontext->get_home_heap ()->pGenGCHeap; + acontext->set_alloc_heap (acontext->get_home_heap ()); hp->alloc_context_count++; + +#ifdef HEAP_BALANCE_INSTRUMENTATION + uint16_t ideal_proc_no = 0; + GCToOSInterface::GetCurrentThreadIdealProc (&ideal_proc_no); + + uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber (); + + add_to_hb_numa (proc_no, ideal_proc_no, + home_hp_num, false, true, false); + + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPafter GC: 1st alloc on p%3d, h%d, ip: %d", + proc_no, home_hp_num, ideal_proc_no)); +#endif //HEAP_BALANCE_INSTRUMENTATION } } else { BOOL set_home_heap = FALSE; - int hint = 0; + gc_heap* home_hp = NULL; + int proc_hp_num = 0; + +#ifdef HEAP_BALANCE_INSTRUMENTATION + bool alloc_count_p = true; + bool multiple_procs_p = false; + bool set_ideal_p = false; + uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber (); + uint32_t last_proc_no = proc_no; +#endif //HEAP_BALANCE_INSTRUMENTATION - if (heap_select::can_find_heap_fast()) + if (heap_select::can_find_heap_fast ()) { - if (acontext->get_home_heap() != NULL) - hint = acontext->get_home_heap()->pGenGCHeap->heap_number; - if (acontext->get_home_heap() != GCHeap::GetHeap(hint = heap_select::select_heap(acontext, hint)) || ((acontext->alloc_count & 15) == 0)) + assert (acontext->get_home_heap () != NULL); + home_hp = acontext->get_home_heap ()->pGenGCHeap; + proc_hp_num = heap_select::select_heap (acontext); + + if (acontext->get_home_heap () != GCHeap::GetHeap (proc_hp_num)) { +#ifdef HEAP_BALANCE_INSTRUMENTATION + alloc_count_p = false; +#endif //HEAP_BALANCE_INSTRUMENTATION + set_home_heap = TRUE; + } + else if ((acontext->alloc_count & 15) == 0) set_home_heap = TRUE; + + if (set_home_heap) + { } } else { - // can't use gdt if ((acontext->alloc_count & 3) == 0) set_home_heap = TRUE; } if (set_home_heap) { -/* - // Since we are balancing up to MAX_SUPPORTED_CPUS, no need for this. - if (n_heaps > MAX_SUPPORTED_CPUS) - { - // on machines with many processors cache affinity is really king, so don't even try - // to balance on these. 
- acontext->home_heap = GCHeap::GetHeap( heap_select::select_heap(acontext, hint) ); - acontext->alloc_heap = acontext->home_heap; - } - else -*/ + /* + // Since we are balancing up to MAX_SUPPORTED_CPUS, no need for this. + if (n_heaps > MAX_SUPPORTED_CPUS) + { + // on machines with many processors cache affinity is really king, so don't even try + // to balance on these. + acontext->home_heap = GCHeap::GetHeap( heap_select::select_heap(acontext)); + acontext->alloc_heap = acontext->home_heap; + } + else + */ { - gc_heap* org_hp = acontext->get_alloc_heap()->pGenGCHeap; + gc_heap* org_hp = acontext->get_alloc_heap ()->pGenGCHeap; + int org_hp_num = org_hp->heap_number; + int final_alloc_hp_num = org_hp_num; dynamic_data* dd = org_hp->dynamic_data_of (0); ptrdiff_t org_size = dd_new_allocation (dd); + ptrdiff_t total_size = (ptrdiff_t)dd_desired_allocation (dd); + +#ifdef HEAP_BALANCE_INSTRUMENTATION + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP[p%3d] ph h%3d, hh: %3d, ah: %3d (%dmb-%dmb), ac: %5d(%s)", + proc_no, proc_hp_num, home_hp->heap_number, + org_hp_num, (total_size / 1024 / 1024), (org_size / 1024 / 1024), + acontext->alloc_count, + ((proc_hp_num == home_hp->heap_number) ? "AC" : "H"))); +#endif //HEAP_BALANCE_INSTRUMENTATION + int org_alloc_context_count; int max_alloc_context_count; gc_heap* max_hp; + int max_hp_num = 0; ptrdiff_t max_size; - size_t delta = dd_min_size (dd)/4; + size_t local_delta = max (((size_t)org_size >> 6), min_gen0_balance_delta); + size_t delta = local_delta; + + if (((size_t)org_size + 2 * delta) >= (size_t)total_size) + { + acontext->alloc_count++; + return; + } int start, end, finish; - heap_select::get_heap_range_for_heap(org_hp->heap_number, &start, &end); + heap_select::get_heap_range_for_heap (org_hp->heap_number, &start, &end); finish = start + n_heaps; try_again: + gc_heap* new_home_hp = 0; + do { max_hp = org_hp; + max_hp_num = org_hp_num; max_size = org_size + delta; - acontext->set_home_heap(GCHeap::GetHeap( heap_select::select_heap(acontext, hint) )); +#ifdef HEAP_BALANCE_INSTRUMENTATION + proc_no = GCToOSInterface::GetCurrentProcessorNumber (); + if (proc_no != last_proc_no) + { + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSP: %d->%d", last_proc_no, proc_no)); + multiple_procs_p = true; + last_proc_no = proc_no; + } - if (org_hp == acontext->get_home_heap()->pGenGCHeap) + int current_hp_num = heap_select::proc_no_to_heap_no[proc_no]; + acontext->set_home_heap (GCHeap::GetHeap (current_hp_num)); +#else + acontext->set_home_heap (GCHeap::GetHeap (heap_select::select_heap (acontext))); +#endif //HEAP_BALANCE_INSTRUMENTATION + new_home_hp = acontext->get_home_heap ()->pGenGCHeap; + if (org_hp == new_home_hp) max_size = max_size + delta; org_alloc_context_count = org_hp->alloc_context_count; @@ -13568,62 +14101,98 @@ try_again: if (max_alloc_context_count > 1) max_size /= max_alloc_context_count; + int actual_start = start; + int actual_end = (end - 1); + for (int i = start; i < end; i++) { - gc_heap* hp = GCHeap::GetHeap(i%n_heaps)->pGenGCHeap; + gc_heap* hp = GCHeap::GetHeap (i % n_heaps)->pGenGCHeap; dd = hp->dynamic_data_of (0); ptrdiff_t size = dd_new_allocation (dd); - if (hp == acontext->get_home_heap()->pGenGCHeap) + + if (hp == new_home_hp) + { size = size + delta; + } int hp_alloc_context_count = hp->alloc_context_count; + if (hp_alloc_context_count > 0) + { size /= (hp_alloc_context_count + 1); + } if (size > max_size) { +#ifdef HEAP_BALANCE_INSTRUMENTATION + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPorg h%d(%dmb), m h%d(%dmb)", + org_hp_num, 
(max_size / 1024 / 1024), + hp->heap_number, (size / 1024 / 1024))); +#endif //HEAP_BALANCE_INSTRUMENTATION + max_hp = hp; max_size = size; + max_hp_num = max_hp->heap_number; max_alloc_context_count = hp_alloc_context_count; } } - } + } while (org_alloc_context_count != org_hp->alloc_context_count || - max_alloc_context_count != max_hp->alloc_context_count); + max_alloc_context_count != max_hp->alloc_context_count); if ((max_hp == org_hp) && (end < finish)) - { - start = end; end = finish; - delta = dd_min_size(dd)/2; // Make it twice as hard to balance to remote nodes on NUMA. + { + start = end; end = finish; + delta = local_delta * 2; // Make it twice as hard to balance to remote nodes on NUMA. goto try_again; } +#ifdef HEAP_BALANCE_INSTRUMENTATION + uint16_t ideal_proc_no_before_set_ideal = 0; + GCToOSInterface::GetCurrentThreadIdealProc (&ideal_proc_no_before_set_ideal); +#endif //HEAP_BALANCE_INSTRUMENTATION + if (max_hp != org_hp) { + final_alloc_hp_num = max_hp->heap_number; + org_hp->alloc_context_count--; max_hp->alloc_context_count++; - acontext->set_alloc_heap(GCHeap::GetHeap(max_hp->heap_number)); + + acontext->set_alloc_heap (GCHeap::GetHeap (final_alloc_hp_num)); if (!gc_thread_no_affinitize_p) { - uint16_t src_proc_no = heap_select::find_proc_no_from_heap_no(org_hp->heap_number); - uint16_t dst_proc_no = heap_select::find_proc_no_from_heap_no(max_hp->heap_number); + uint16_t src_proc_no = heap_select::find_proc_no_from_heap_no (org_hp->heap_number); + uint16_t dst_proc_no = heap_select::find_proc_no_from_heap_no (max_hp->heap_number); + + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSW! h%d(p%d)->h%d(p%d)", + org_hp_num, src_proc_no, final_alloc_hp_num, dst_proc_no)); + +#ifdef HEAP_BALANCE_INSTRUMENTATION + int current_proc_no_before_set_ideal = GCToOSInterface::GetCurrentProcessorNumber (); + if (current_proc_no_before_set_ideal != last_proc_no) + { + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSPa: %d->%d", last_proc_no, current_proc_no_before_set_ideal)); + multiple_procs_p = true; + } +#endif //HEAP_BALANCE_INSTRUMENTATION - if (!GCToOSInterface::SetCurrentThreadIdealAffinity(src_proc_no, dst_proc_no)) + if (!GCToOSInterface::SetCurrentThreadIdealAffinity (src_proc_no, dst_proc_no)) + { + dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPFailed to set the ideal processor for heap %d %d->%d", + org_hp->heap_number, (int)src_proc_no, (int)dst_proc_no)); + } +#ifdef HEAP_BALANCE_INSTRUMENTATION + else { - dprintf (3, ("Failed to set the ideal processor for heap %d.", - org_hp->heap_number)); + set_ideal_p = true; } +#endif //HEAP_BALANCE_INSTRUMENTATION } - dprintf (3, ("Switching context %p (home heap %d) ", - acontext, - acontext->get_home_heap()->pGenGCHeap->heap_number)); - dprintf (3, (" from heap %d (%Id free bytes, %d contexts) ", - org_hp->heap_number, - org_size, - org_alloc_context_count)); - dprintf (3, (" to heap %d (%Id free bytes, %d contexts)\n", - max_hp->heap_number, - dd_new_allocation(max_hp->dynamic_data_of(0)), - max_alloc_context_count)); } + +#ifdef HEAP_BALANCE_INSTRUMENTATION + add_to_hb_numa (proc_no, ideal_proc_no_before_set_ideal, + final_alloc_hp_num, multiple_procs_p, alloc_count_p, set_ideal_p); +#endif //HEAP_BALANCE_INSTRUMENTATION } } } @@ -13650,8 +14219,8 @@ ptrdiff_t gc_heap::get_balance_heaps_loh_effective_budget () gc_heap* gc_heap::balance_heaps_loh (alloc_context* acontext, size_t alloc_size) { - const int home_hp_num = heap_select::select_heap(acontext, 0); - dprintf (3, ("[h%d] LA: %Id", home_heap, alloc_size)); + const int home_hp_num = 
heap_select::select_heap(acontext); + dprintf (3, ("[h%d] LA: %Id", home_hp_num, alloc_size)); gc_heap* home_hp = GCHeap::GetHeap(home_hp_num)->pGenGCHeap; dynamic_data* dd = home_hp->dynamic_data_of (max_generation + 1); const ptrdiff_t home_hp_size = home_hp->get_balance_heaps_loh_effective_budget (); @@ -13705,7 +14274,7 @@ try_again: gc_heap* gc_heap::balance_heaps_loh_hard_limit_retry (alloc_context* acontext, size_t alloc_size) { assert (heap_hard_limit); - const int home_heap = heap_select::select_heap(acontext, 0); + const int home_heap = heap_select::select_heap(acontext); dprintf (3, ("[h%d] balance_heaps_loh_hard_limit_retry alloc_size: %d", home_heap, alloc_size)); int start, end; heap_select::get_heap_range_for_heap (home_heap, &start, &end); @@ -15056,13 +15625,13 @@ int gc_heap::generation_to_condemn (int n_initial, generation_free_obj_space (large_object_generation); //save new_allocation - for (i = 0; i <= max_generation+1; i++) + for (i = 0; i <= max_generation + 1; i++) { dynamic_data* dd = dynamic_data_of (i); - dprintf (GTC_LOG, ("h%d: g%d: l: %Id (%Id)", - heap_number, i, - dd_new_allocation (dd), - dd_desired_allocation (dd))); + dprintf (GTC_LOG, ("h%d: g%d: l: %Id (%Id)", + heap_number, i, + dd_new_allocation (dd), + dd_desired_allocation (dd))); dd_gc_new_allocation (dd) = dd_new_allocation (dd); } @@ -15788,6 +16357,14 @@ void gc_heap::gc1() dynamic_data* dd = dynamic_data_of (n); dd_gc_elapsed_time (dd) = end_gc_time - dd_time_clock (dd); +#ifdef HEAP_BALANCE_INSTRUMENTATION + if (heap_number == 0) + { + last_gc_end_time_ms = end_gc_time; + dprintf (HEAP_BALANCE_LOG, ("[GC#%Id-%Id-BGC]", settings.gc_index, dd_gc_elapsed_time (dd))); + } +#endif //HEAP_BALANCE_INSTRUMENTATION + free_list_info (max_generation, "after computing new dynamic data"); gc_history_per_heap* current_gc_data_per_heap = get_gc_data_per_heap(); @@ -15829,10 +16406,16 @@ void gc_heap::gc1() if (heap_number == 0) { - dprintf (GTC_LOG, ("GC#%d(gen%d) took %Idms", + size_t gc_elapsed_time = dd_gc_elapsed_time (dynamic_data_of (0)); +#ifdef HEAP_BALANCE_INSTRUMENTATION + last_gc_end_time_ms = end_gc_time; + dprintf (HEAP_BALANCE_LOG, ("[GC#%Id-%Id-%Id]", settings.gc_index, gc_elapsed_time, dd_time_clock (dynamic_data_of (0)))); +#endif //HEAP_BALANCE_INSTRUMENTATION + + dprintf (GTC_LOG, ("GC#%d(gen%d) took %Idms", dd_collection_count (dynamic_data_of (0)), settings.condemned_generation, - dd_gc_elapsed_time (dynamic_data_of (0)))); + gc_elapsed_time)); } for (int gen_number = 0; gen_number <= (max_generation + 1); gen_number++) @@ -16095,12 +16678,9 @@ void gc_heap::gc1() #if 1 //subsumed by the linear allocation model // to avoid spikes in mem usage due to short terms fluctuations in survivorship, // apply some smoothing. 
- static size_t smoothed_desired_per_heap = 0; size_t smoothing = 3; // exponential smoothing factor - if (smoothing > VolatileLoad(&settings.gc_index)) - smoothing = VolatileLoad(&settings.gc_index); smoothed_desired_per_heap = desired_per_heap / smoothing + ((smoothed_desired_per_heap / smoothing) * (smoothing-1)); - dprintf (1, ("sn = %Id n = %Id", smoothed_desired_per_heap, desired_per_heap)); + dprintf (HEAP_BALANCE_LOG, ("TEMPsn = %Id n = %Id", smoothed_desired_per_heap, desired_per_heap)); desired_per_heap = Align(smoothed_desired_per_heap, get_alignment_constant (true)); #endif //0 @@ -17162,6 +17742,11 @@ void gc_heap::garbage_collect (int n) settings.gc_index = (uint32_t)dd_collection_count (dynamic_data_of (0)) + 1; +#ifdef MULTIPLE_HEAPS + hb_log_balance_activities(); + hb_log_new_allocation(); +#endif //MULTIPLE_HEAPS + // Call the EE for start of GC work // just one thread for MP GC GCToEEInterface::GcStartWork (settings.condemned_generation, @@ -25445,7 +26030,7 @@ void gc_heap::gc_thread_stub (void* arg) // We are about to set affinity for GC threads. It is a good place to set up NUMA and // CPU groups because the process mask, processor number, and group number are all // readily available. - set_thread_affinity_for_heap(heap->heap_number); + set_thread_affinity_for_heap (heap->heap_number, heap_select::find_proc_no_from_heap_no (heap->heap_number)); } // server GC threads run at a higher priority than normal. @@ -27492,9 +28077,6 @@ void gc_heap::bgc_thread_function() dprintf (3, ("no concurrent GC needed, exiting")); break; } -#ifdef TRACE_GC - //trace_gc = TRUE; -#endif //TRACE_GC recursive_gc_sync::begin_background(); dprintf (2, ("beginning of bgc: gen2 FL: %d, FO: %d, frag: %d", generation_free_list_space (generation_of (max_generation)), @@ -27505,10 +28087,6 @@ void gc_heap::bgc_thread_function() current_bgc_state = bgc_not_in_process; -#ifdef TRACE_GC - //trace_gc = FALSE; -#endif //TRACE_GC - enable_preemptive (); #ifdef MULTIPLE_HEAPS bgc_t_join.join(this, gc_join_done); @@ -30060,7 +30638,6 @@ void gc_heap::init_static_data() } gen0_max_size = Align (gen0_max_size); - gen0_min_size = min (gen0_min_size, gen0_max_size); // TODO: gen0_max_size has a 200mb cap; gen1_max_size should also have a cap. 
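// A minimal standalone sketch of the gen0 budget smoothing used in gc1() above
// (the helper name is illustrative, not part of this patch):
#include <cstddef>

static size_t smooth_desired (size_t smoothed, size_t desired, size_t smoothing = 3)
{
    // keep (smoothing - 1)/smoothing of the previous value and blend in 1/smoothing of the new one
    return (desired / smoothing) + (smoothed / smoothing) * (smoothing - 1);
}

// For example, if the previous smoothed budget is 60 (MB) and a GC computes a desired_per_heap
// of 120, the new smoothed value is 120/3 + (60/3)*2 = 80, so a one-GC spike in survivorship only
// moves the budget a third of the way. The patch also seeds smoothed_desired_per_heap with gen0's
// min_size in init_dynamic_data, which replaces the old gc_index-based clamp on the smoothing factor.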
@@ -30086,12 +30663,23 @@ void gc_heap::init_static_data() bool gc_heap::init_dynamic_data() { - qpf = GCToOSInterface::QueryPerformanceFrequency(); - - uint32_t now = (uint32_t)GetHighPrecisionTimeStamp(); + uint64_t now_raw_ts = RawGetHighPrecisionTimeStamp (); +#ifdef HEAP_BALANCE_INSTRUMENTATION + start_raw_ts = now_raw_ts; +#endif //HEAP_BALANCE_INSTRUMENTATION + uint32_t now = (uint32_t)(now_raw_ts / (qpf / 1000)); set_static_data(); + if (heap_number == 0) + { + smoothed_desired_per_heap = dynamic_data_of (0)->min_size; +#ifdef HEAP_BALANCE_INSTRUMENTATION + last_gc_end_time_ms = now; + dprintf (HEAP_BALANCE_LOG, ("qpf=%I64d, start: %I64d(%d)", qpf, start_raw_ts, now)); +#endif //HEAP_BALANCE_INSTRUMENTATION + } + for (int i = 0; i <= max_generation+1; i++) { dynamic_data* dd = dynamic_data_of (i); @@ -34129,6 +34717,8 @@ HRESULT GCHeap::Initialize() { HRESULT hr = S_OK; + qpf = GCToOSInterface::QueryPerformanceFrequency(); + g_gc_pFreeObjectMethodTable = GCToEEInterface::GetFreeObjectMethodTable(); g_num_processors = GCToOSInterface::GetTotalProcessorCount(); assert(g_num_processors != 0); @@ -34191,18 +34781,23 @@ HRESULT GCHeap::Initialize() return CLR_E_GC_BAD_AFFINITY_CONFIG; } + if ((cpu_index_ranges_holder.Get() != nullptr) || (config_affinity_mask != 0)) + { + affinity_config_specified_p = true; + } + nhp_from_config = static_cast(GCConfig::GetHeapCount()); - uint32_t nhp_from_process = GCToOSInterface::GetCurrentProcessCpuCount(); + g_num_active_processors = GCToOSInterface::GetCurrentProcessCpuCount(); if (nhp_from_config) { // Even when the user specifies a heap count, it should not be more // than the number of procs this process can use. - nhp_from_config = min (nhp_from_config, nhp_from_process); + nhp_from_config = min (nhp_from_config, g_num_active_processors); } - nhp = ((nhp_from_config == 0) ? nhp_from_process : nhp_from_config); + nhp = ((nhp_from_config == 0) ? g_num_active_processors : nhp_from_config); nhp = min (nhp, MAX_SUPPORTED_CPUS); #ifndef FEATURE_REDHAWK @@ -34350,8 +34945,62 @@ HRESULT GCHeap::Initialize() return hr; } } - // initialize numa node to heap map - heap_select::init_numa_node_to_heap_map(nhp); + + heap_select::init_numa_node_to_heap_map (nhp); + + // If we have more active processors than heaps we still want to initialize some of the + // mapping for the rest of the active processors because user threads can still run on + // them which means it's important to know their numa nodes and map them to a reasonable + // heap, ie, we wouldn't want to have all such procs go to heap 0. + if (g_num_active_processors > nhp) + heap_select::distribute_other_procs(); + + gc_heap* hp = gc_heap::g_heaps[0]; + + dynamic_data* gen0_dd = hp->dynamic_data_of (0); + gc_heap::min_gen0_balance_delta = (dd_min_size (gen0_dd) >> 3); + +#ifdef HEAP_BALANCE_INSTRUMENTATION + cpu_group_enabled_p = GCToOSInterface::CanEnableGCCPUGroups(); + + if (!GCToOSInterface::GetNumaInfo (&total_numa_nodes_on_machine, &procs_per_numa_node)) + { + total_numa_nodes_on_machine = 1; + + // Note that if we are in cpu groups we need to take the way proc index is calculated + // into consideration. It would mean we have more than 64 procs on one numa node - + // this is mostly for testing (if we want to simulate no numa on a numa system). + // see vm\gcenv.os.cpp GroupProcNo implementation. 
+ if (GCToOSInterface::GetCPUGroupInfo (&total_cpu_groups_on_machine, &procs_per_cpu_group)) + procs_per_numa_node = procs_per_cpu_group + ((total_cpu_groups_on_machine - 1) << 6); + else + procs_per_numa_node = g_num_processors; + } + hb_info_numa_nodes = new (nothrow) heap_balance_info_numa[total_numa_nodes_on_machine]; + dprintf (HEAP_BALANCE_LOG, ("total: %d, numa: %d", g_num_processors, total_numa_nodes_on_machine)); + + int hb_info_size_per_proc = sizeof (heap_balance_info_proc); + + for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++) + { + int hb_info_size_per_node = hb_info_size_per_proc * procs_per_numa_node; + uint8_t* numa_mem = (uint8_t*)GCToOSInterface::VirtualReserve (hb_info_size_per_node, 0, 0, numa_node_index); + if (!numa_mem) + return E_FAIL; + if (!GCToOSInterface::VirtualCommit (numa_mem, hb_info_size_per_node, numa_node_index)) + return E_FAIL; + + heap_balance_info_proc* hb_info_procs = (heap_balance_info_proc*)numa_mem; + hb_info_numa_nodes[numa_node_index].hb_info_procs = hb_info_procs; + + for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++) + { + heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index]; + hb_info_proc->count = default_max_hb_heap_balance_info; + hb_info_proc->index = 0; + } + } +#endif //HEAP_BALANCE_INSTRUMENTATION #else hr = Init (0); #endif //MULTIPLE_HEAPS @@ -36106,13 +36755,14 @@ size_t GCHeap::ApproxTotalBytesInUse(BOOL small_heap_only) void GCHeap::AssignHeap (alloc_context* acontext) { // Assign heap based on processor - acontext->set_alloc_heap(GetHeap(heap_select::select_heap(acontext, 0))); + acontext->set_alloc_heap(GetHeap(heap_select::select_heap(acontext))); acontext->set_home_heap(acontext->get_alloc_heap()); } + GCHeap* GCHeap::GetHeap (int n) { assert (n < gc_heap::n_heaps); - return gc_heap::g_heaps [n]->vm_heap; + return gc_heap::g_heaps[n]->vm_heap; } #endif //MULTIPLE_HEAPS @@ -36244,6 +36894,7 @@ bool GCHeap::RegisterForFullGCNotification(uint32_t gen2Percentage, { gc_heap* hp = gc_heap::g_heaps [hn]; hp->fgn_last_alloc = dd_new_allocation (hp->dynamic_data_of (0)); + hp->fgn_maxgen_percent = gen2Percentage; } #else //MULTIPLE_HEAPS pGenGCHeap->fgn_last_alloc = dd_new_allocation (pGenGCHeap->dynamic_data_of (0)); @@ -36253,7 +36904,6 @@ bool GCHeap::RegisterForFullGCNotification(uint32_t gen2Percentage, pGenGCHeap->full_gc_end_event.Reset(); pGenGCHeap->full_gc_approach_event_set = false; - pGenGCHeap->fgn_maxgen_percent = gen2Percentage; pGenGCHeap->fgn_loh_percent = lohPercentage; return TRUE; diff --git a/src/coreclr/src/gc/gcpriv.h b/src/coreclr/src/gc/gcpriv.h index 7c8286b..c194c54 100644 --- a/src/coreclr/src/gc/gcpriv.h +++ b/src/coreclr/src/gc/gcpriv.h @@ -40,6 +40,13 @@ inline void FATAL_GC_ERROR() GCToEEInterface::HandleFatalError((unsigned int)COR_E_EXECUTIONENGINE); } +#ifdef MULTIPLE_HEAPS +// This turns on instrumentation that collects info for heap balancing. +// Define it and make sure you have HEAP_BALANCE_LOG/HEAP_BALANCE_TEMP_LOG +// level logging enabled *only*. +//#define HEAP_BALANCE_INSTRUMENTATION +#endif //MULTIPLE_HEAPS + #ifdef _MSC_VER #pragma inline_depth(20) #endif @@ -255,6 +262,15 @@ const int policy_expand = 2; #define SPINLOCK_LOG (DT_LOG_0 + 5) #define SNOOP_LOG (DT_LOG_0 + 6) +// NOTE! This is for HEAP_BALANCE_INSTRUMENTATION +// This particular one is special and needs to be well formatted because we +// do post processing on it with tools\GCLogParser. 
If you need to add some +// detail to help with investigation that's not 't processed by tooling +// prefix it with TEMP so that line will be written to the results as is in +// the result. I have some already logged with HEAP_BALANCE_TEMP_LOG. +#define HEAP_BALANCE_LOG (DT_LOG_0 + 7) +#define HEAP_BALANCE_TEMP_LOG (DT_LOG_0 + 8) + #ifndef DACCESS_COMPILE #ifdef SIMPLE_DPRINTF @@ -267,6 +283,7 @@ void GCLog (const char *fmt, ... ); //#define dprintf(l,x) {if (trace_gc && ((l <= 2) || (l == BGC_LOG) || (l==GTC_LOG))) {GCLog x;}} //#define dprintf(l,x) {if ((l == 1) || (l == 2222)) {GCLog x;}} #define dprintf(l,x) {if ((l <= 1) || (l == GTC_LOG)) {GCLog x;}} +//#define dprintf(l,x) {if (l == HEAP_BALANCE_LOG) {GCLog x;}} //#define dprintf(l,x) {if ((l==GTC_LOG) || (l <= 1)) {GCLog x;}} //#define dprintf(l,x) {if (trace_gc && ((l <= print_level) || (l==GTC_LOG))) {GCLog x;}} //#define dprintf(l,x) {if (l==GTC_LOG) {printf ("\n");printf x ; fflush(stdout);}} @@ -1222,6 +1239,12 @@ public: uint32_t flags); #ifdef MULTIPLE_HEAPS + PER_HEAP_ISOLATED + void hb_log_new_allocation(); + + PER_HEAP_ISOLATED + void hb_log_balance_activities(); + static void balance_heaps (alloc_context* acontext); PER_HEAP @@ -2965,7 +2988,7 @@ public: GCEvent full_gc_end_event; // Full GC Notification percentages. - PER_HEAP_ISOLATED + PER_HEAP uint32_t fgn_maxgen_percent; PER_HEAP_ISOLATED @@ -3026,6 +3049,9 @@ public: PER_HEAP heap_segment* new_heap_segment; + PER_HEAP_ISOLATED + size_t min_gen0_balance_delta; + #define alloc_quantum_balance_units (16) PER_HEAP_ISOLATED @@ -3091,7 +3117,7 @@ public: PER_HEAP_ISOLATED uint64_t total_physical_mem; - + PER_HEAP_ISOLATED uint64_t entry_available_physical_mem; @@ -3175,6 +3201,11 @@ public: PER_HEAP_ISOLATED size_t last_gc_index; +#ifdef HEAP_BALANCE_INSTRUMENTATION + PER_HEAP_ISOLATED + size_t last_gc_end_time_ms; +#endif //HEAP_BALANCE_INSTRUMENTATION + #ifdef SEG_MAPPING_TABLE PER_HEAP_ISOLATED size_t min_segment_size; diff --git a/src/coreclr/src/gc/unix/gcenv.unix.cpp b/src/coreclr/src/gc/unix/gcenv.unix.cpp index 3d7442a..4a48a4a 100644 --- a/src/coreclr/src/gc/unix/gcenv.unix.cpp +++ b/src/coreclr/src/gc/unix/gcenv.unix.cpp @@ -527,9 +527,10 @@ static void* VirtualReserveInner(size_t size, size_t alignment, uint32_t flags, // size - size of the virtual memory range // alignment - requested memory alignment, 0 means no specific alignment requested // flags - flags to control special settings like write watching +// node - the NUMA node to reserve memory on // Return: // Starting virtual address of the reserved range -void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags) +void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node) { return VirtualReserveInner(size, alignment, flags); } @@ -910,6 +911,11 @@ bool GCToOSInterface::CanEnableGCNumaAware() return g_numaAvailable; } +bool GCToOSInterface::CanEnableGCCPUGroups() +{ + return false; +} + // Get processor number and optionally its NUMA node number for the specified heap number // Parameters: // heap_number - heap number to get the result for diff --git a/src/coreclr/src/gc/windows/gcenv.windows.cpp b/src/coreclr/src/gc/windows/gcenv.windows.cpp index e5c4d7a..e8f0035 100644 --- a/src/coreclr/src/gc/windows/gcenv.windows.cpp +++ b/src/coreclr/src/gc/windows/gcenv.windows.cpp @@ -37,6 +37,7 @@ typedef BOOL (WINAPI *PQUERY_INFORMATION_JOB_OBJECT)(HANDLE jobHandle, JOBOBJECT namespace { static bool g_fEnableGCNumaAware; +static uint32_t 
g_nNodes; class GroupProcNo { @@ -91,6 +92,7 @@ void InitNumaNodeInfo() if (!GetNumaHighestNodeNumber(&highest) || (highest == 0)) return; + g_nNodes = highest + 1; g_fEnableGCNumaAware = true; return; } @@ -659,6 +661,21 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t return success; } +bool GCToOSInterface::GetCurrentThreadIdealProc(uint16_t* procNo) +{ + PROCESSOR_NUMBER proc; + + bool success = GetThreadIdealProcessorEx (GetCurrentThread (), &proc); + + if (success) + { + GroupProcNo groupProcNo(proc.Group, proc.Number); + *procNo = groupProcNo.GetCombinedValue(); + } + + return success; +} + // Get the number of the current processor uint32_t GCToOSInterface::GetCurrentProcessorNumber() { @@ -713,17 +730,26 @@ void GCToOSInterface::YieldThread(uint32_t switchCount) // size - size of the virtual memory range // alignment - requested memory alignment, 0 means no specific alignment requested // flags - flags to control special settings like write watching +// node - the NUMA node to reserve memory on // Return: // Starting virtual address of the reserved range -void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags) +void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node) { // Windows already ensures 64kb alignment on VirtualAlloc. The current CLR // implementation ignores it on Windows, other than making some sanity checks on it. UNREFERENCED_PARAMETER(alignment); assert((alignment & (alignment - 1)) == 0); assert(alignment <= 0x10000); - DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE; - return ::VirtualAlloc(nullptr, size, memFlags, PAGE_READWRITE); + + if (node == NUMA_NODE_UNDEFINED) + { + DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? 
(MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE; + return ::VirtualAlloc (nullptr, size, memFlags, PAGE_READWRITE); + } + else + { + return ::VirtualAllocExNuma (::GetCurrentProcess (), NULL, size, MEM_RESERVE, PAGE_READWRITE, node); + } } // Release virtual memory range previously reserved using VirtualReserve @@ -1286,6 +1312,57 @@ bool GCToOSInterface::CanEnableGCNumaAware() return g_fEnableGCNumaAware; } +bool GCToOSInterface::GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node) +{ + if (g_fEnableGCNumaAware) + { + DWORD currentProcsOnNode = 0; + for (uint32_t i = 0; i < g_nNodes; i++) + { + GROUP_AFFINITY processorMask; + if (GetNumaNodeProcessorMaskEx(i, &processorMask)) + { + DWORD procsOnNode = 0; + uintptr_t mask = (uintptr_t)processorMask.Mask; + while (mask) + { + procsOnNode++; + mask &= mask - 1; + } + + currentProcsOnNode = max(currentProcsOnNode, procsOnNode); + } + *max_procs_per_node = currentProcsOnNode; + *total_nodes = g_nNodes; + } + return true; + } + + return false; +} + +bool GCToOSInterface::CanEnableGCCPUGroups() +{ + return g_fEnableGCCPUGroups; +} + +bool GCToOSInterface::GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group) +{ + if (g_fEnableGCCPUGroups) + { + *total_groups = (uint16_t)g_nGroups; + DWORD currentProcsInGroup = 0; + for (WORD i = 0; i < g_nGroups; i++) + { + currentProcsInGroup = max(currentProcsInGroup, g_CPUGroupInfoArray[i].nr_active); + } + *max_procs_per_group = currentProcsInGroup; + return true; + } + + return false; +} + // Get processor number and optionally its NUMA node number for the specified heap number // Parameters: // heap_number - heap number to get the result for diff --git a/src/coreclr/src/inc/utilcode.h b/src/coreclr/src/inc/utilcode.h index 1021a1d..5b8107c 100644 --- a/src/coreclr/src/inc/utilcode.h +++ b/src/coreclr/src/inc/utilcode.h @@ -1306,6 +1306,7 @@ class NumaNodeInfo { private: static BOOL m_enableGCNumaAware; + static uint16_t m_nNodes; static BOOL InitNumaNodeInfoAPI(); public: @@ -1319,6 +1320,7 @@ public: // functions DWORD allocType, DWORD prot, DWORD node); #ifndef FEATURE_PAL static BOOL GetNumaProcessorNodeEx(PPROCESSOR_NUMBER proc_no, PUSHORT node_no); + static bool GetNumaInfo(PUSHORT total_nodes, DWORD* max_procs_per_node); #else // !FEATURE_PAL static BOOL GetNumaProcessorNodeEx(USHORT proc_no, PUSHORT node_no); #endif // !FEATURE_PAL @@ -1363,6 +1365,7 @@ public: static void GetGroupForProcessor(WORD processor_number, WORD *group_number, WORD *group_processor_number); static DWORD CalculateCurrentProcessorNumber(); + static bool GetCPUGroupInfo(PUSHORT total_groups, DWORD* max_procs_per_group); //static void PopulateCPUUsageArray(void * infoBuffer, ULONG infoSize); #if !defined(FEATURE_REDHAWK) diff --git a/src/coreclr/src/tools/GCLogParser/App.config b/src/coreclr/src/tools/GCLogParser/App.config new file mode 100644 index 0000000..56efbc7 --- /dev/null +++ b/src/coreclr/src/tools/GCLogParser/App.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/src/coreclr/src/tools/GCLogParser/Properties/AssemblyInfo.cs b/src/coreclr/src/tools/GCLogParser/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..047ddd6 --- /dev/null +++ b/src/coreclr/src/tools/GCLogParser/Properties/AssemblyInfo.cs @@ -0,0 +1,36 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. 
Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("parse-hb-log")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("parse-hb-log")] +[assembly: AssemblyCopyright("Copyright © 2019")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("6ae26cd6-c971-48d6-8c03-2ffa272b942c")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/src/coreclr/src/tools/GCLogParser/parse-hb-log.cs b/src/coreclr/src/tools/GCLogParser/parse-hb-log.cs new file mode 100644 index 0000000..44ac067 --- /dev/null +++ b/src/coreclr/src/tools/GCLogParser/parse-hb-log.cs @@ -0,0 +1,1099 @@ +// parse-hb-log.exe -ia 1 -l gclog.pid.log +// where gclog.pid.log is the log you get from GC with HEAP_BALANCE_LOG. See +// comments for it in gcpriv.h +// +// -ia 1 means to include all timestamp instead of only timestamps where we +// observed samples. This is usually what you want as it makes it really +// obvious when a thread is not active enough. +// +// This prints out the following in the console: +// Samples observed on each thread and thread mapping (from id to index) +// and it tells you the BFR (Budget Fill Ratio) like this: +// +//Printing out proc [0,[28 +//Total 1074 GCs, avg 0ms, node 0: BFR %90.83 +//Printing out proc [28,[56 +//Total 1074 GCs, avg 0ms, node 1: BFR %89.43 +// +// and generates 2 logs per numa node - +// pass-zero-pass1-nX-ia1-alloc.txt +// pass-zero-pass1-nX-ia1-thread.txt +// +// thread tells you the thread index running on each proc at each timestamp. +// 4240| 63₉ | 65₉ | 62₉ | 56₁₀| 87₁₀|109₁₀| 59₉ | 70₁₀| 78₉ | 64₉ | 71₁₀|107₁₀| +// +// 4240 is the 4240th ms since we started recording. +// the numbers following are the thread indices and the subscript is the # of samples +// observed during that ms. The tool can do a time unit that's larger than 1ms. +// +// alloc tells you which alloc heap the each proc, for the same timestamp +// 4240| 56 | 57 | 58ⁱ | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | +// 56 means heap 56. The subscript i means we did a SetIdealProcessor during this +// ms. You may also see +// ᵖ meaning we went through the balancing logic due to the proc for the thread changed +// from the home heap. +// ᵐ meaning while we were going through balancing logic the proc switched. + +using System; +using System.Collections.Generic; +using System.Text; +using System.IO; +using System.Linq; + +namespace parse_hb_log +{ + enum HeapBalanceFlagMask + { + MutipleProcs = 0xf, + EnterDueToProc = 0xf0, + SetIdeal = 0xf00, + }; + struct SampleInfo + { + // -1 means uninit. 
+ public int tid; + public int allocHeap; + public int countSamples; + // This is to present m/p/i - each takes 4 bits, mask for them + // is defined in HeapBalanceFlagMask. Most of the time the flags are + // 0s. + public int flags; + public int idealProcNo; + + public SampleInfo(int _tid, int _allocHeap, int _countSamples, int _flags, int _idealProcNo) + { + tid = _tid; + allocHeap = _allocHeap; + countSamples = _countSamples; + flags = _flags; + idealProcNo = _idealProcNo; + } + }; + + enum PassOneViewType + { + Thread = 0, + AllocHeap = 1, + MaxType = 2 + }; + class Program + { + static bool fLogging = false; + + static string ParseString(string str, string strStart, string strEnd, out string strRemaining) + { + int startIndex = 0; + if (strStart != null) + { + startIndex = str.IndexOf(strStart); + if (startIndex == -1) + { + Console.WriteLine("couldn't find {0} in \"{1}\"!!!", strStart, str); + strRemaining = str; + return null; + } + + startIndex += strStart.Length; + } + + //string strRest = str.Substring(startIndex + strStart.Length); + string strRest = str.Substring(startIndex); + string strRet = null; + //Console.WriteLine(strRest); + if (strEnd == null) + { + strRet = strRest; + strRemaining = null; + } + else + { + int endIndex = strRest.IndexOf(strEnd); + strRet = strRest.Substring(0, endIndex); + strRemaining = strRest.Substring(endIndex); + } + + //Console.WriteLine("string is --{0}--, remaining --{1}--", strRet, strRemaining); + return strRet; + } + + static StreamWriter swPassZero = null; + static string strPassZeroLog = "pass-zero.txt"; + static int totalProcs = 0; + static int totalNodes = 0; + static int procsPerNode = 0; + // this is qpf / 1000 so we calculate ms instead of s. + static UInt64 qpfAdjusted = 0; + // we log the current qpc so substract by this. + static UInt64 qpcStart = 0; + static Int32 totalAllocThreads = 0; + static Dictionary threadMapping = new Dictionary(112); + // We do compress the samples, this stores the aggregated sample counts for all procs. + static int[] aggregatedSampleCount = null; + + static int timeUnitMS = 1; + // We wanna compress the log by compressing the samples in the same unit of time. + // We get samples by procs, so on the same proc if we observe the same thread + // allocating on the same alloc heap in the same time unit we simply count the total + // samples. However we do want to remember if any of the p/m/i is set. 
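To make that compression rule concrete before the state machine below (the strLastTID/lastThread* statics), here is a small illustrative C# sketch, not part of the tool: consecutive samples from the same thread on the same alloc heap within one time unit collapse into a single record that keeps the sample count plus how many times m/p/i were observed.

using System;

// Illustrative aggregator only: same thread + same alloc heap within one time
// unit collapses to one record; m/p/i occurrences are counted along the way.
sealed class UnitAggregator
{
    readonly int timeUnitMs;
    readonly Action<string> emit;
    int lastTimeUnit = -1, tid = -1, allocHeap = -1;
    int samples, mCount, pCount, iCount;

    public UnitAggregator(int timeUnitMs, Action<string> emit)
    {
        this.timeUnitMs = timeUnitMs;
        this.emit = emit;
    }

    public void Add(int timeMs, int threadIndex, int heap, bool m, bool p, bool i)
    {
        bool sameRun = (threadIndex == tid) && (heap == allocHeap) &&
                       ((timeMs - lastTimeUnit) < timeUnitMs);
        if (!sameRun)
        {
            Flush();
            lastTimeUnit = timeMs;
            tid = threadIndex;
            allocHeap = heap;
        }
        samples++;
        if (m) mCount++;
        if (p) pCount++;
        if (i) iCount++;
    }

    // Emits one aggregated record per run (the real tool also appends the
    // ideal proc in parentheses).
    public void Flush()
    {
        if (tid != -1)
            emit($"{lastTimeUnit}ms,{tid}({samples}),{allocHeap},m:{mCount},p:{pCount},i:{iCount}");
        samples = mCount = pCount = iCount = 0;
        tid = -1;
    }
}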
+ static string strLastTID = null; + static string strTID = null; + static string strLastThreadAllocHeap = null; + static string strAllocHeap = null; + static string strIdealProc = null; + static int lastTimeUnit = 0; + static int lastThreadSampleCount = 0; + static int lastThreadPCount = 0; + static int lastThreadMCount = 0; + static int lastThreadICount = 0; + static int lastProcIndex = -1; + static int largestTimeInBetweenGCs = 0; + + static void InitSampleInfoPassZero() + { + strLastTID = null; + strTID = null; + strLastThreadAllocHeap = null; + strAllocHeap = null; + lastTimeUnit = 0; + lastThreadSampleCount = 0; + lastThreadPCount = 0; + lastThreadMCount = 0; + lastThreadICount = 0; + } + + static void LogPassZeroAggregatedSample(int _lastTimeUnit, int _tid, int _sampleCount, string _strAllocHeap) + { + swPassZero.WriteLine("{0}ms,{1}({2}),{3},m:{4},p:{5},i:{6}({7})", + _lastTimeUnit, + _tid, + _sampleCount, + _strAllocHeap, + lastThreadMCount, lastThreadPCount, lastThreadICount, + strIdealProc); + + (aggregatedSampleCount[lastProcIndex])++; + } + + // Aside from writing out a new log this also prints out some stats for further processing: + // How many samples are observed on each proc. Because we could use this for many proc analysis, + // it's impractical to have a wide enough window to display all procs. + static void PassZero(string strLog) + { + swPassZero = new StreamWriter(strPassZeroLog); + string strTemp, strRemaining; + using (StreamReader sr = new StreamReader(strLog)) + { + string s; + + while ((s = sr.ReadLine()) != null) + { + if (s.StartsWith("[")) + { + string strAfterTID = ParseString(s, "]", null, out strRemaining); + //Console.WriteLine(s); + + if (strAfterTID.StartsWith("qpf")) + { + // [15900]qpf=10000000, start: 17262240813778(1726224081) + strTemp = ParseString(s, "qpf=", ",", out strRemaining); + qpfAdjusted = UInt64.Parse(strTemp) / 1000; + strTemp = ParseString(strRemaining, "start:", "(", out strRemaining); + qpcStart = UInt64.Parse(strTemp); + Console.WriteLine("QPF adjusted: {0}, init QPC: {1}", qpfAdjusted, qpcStart); + } + else if (strAfterTID.StartsWith("total: ")) + { + // [15900]total: 112, numa: 2 + strTemp = ParseString(strAfterTID, "total: ", ",", out strRemaining); + totalProcs = Int32.Parse(strTemp); + strTemp = ParseString(strRemaining, "numa: ", null, out strRemaining); + totalNodes = Int32.Parse(strTemp); + Console.WriteLine("total procs: {0}, nodes: {1}", totalProcs, totalNodes); + procsPerNode = totalProcs / totalNodes; + swPassZero.WriteLine("P: {0}, N: {1}", totalProcs, totalNodes); + aggregatedSampleCount = new int[totalProcs]; + } + else if (strAfterTID == "[GC_alloc_mb]") + { + if (strTID != null) + { + LogPassZeroAggregatedSample( + //swPassZero.WriteLine("last sample before GC: {0}ms,{1}({2}),{3},m:{4},p:{5},i:{6}", + lastTimeUnit, + threadMapping[strTID], + lastThreadSampleCount, + strAllocHeap); + InitSampleInfoPassZero(); + } + + swPassZero.WriteLine("[GC_alloc_mb]"); + //[36812][GC_alloc_mb] + //[N# 24]0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + //[N# 24]0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + int totalNodesRead = 0; + while ((s = sr.ReadLine()) != null) + { + if (s.StartsWith("[N")) + { + //Console.WriteLine(s); + swPassZero.WriteLine(s); + totalNodesRead++; + if (totalNodesRead == totalNodes) + break; + } + } + } + else if (strAfterTID.StartsWith("[p")) + { + if (strTID != null) + { 
+ //swPassZero.WriteLine("last sample before p: {0}ms,{1}({2}),{3},m:{4},p:{5},i:{6}", + LogPassZeroAggregatedSample( + lastTimeUnit, + threadMapping[strTID], + lastThreadSampleCount, + strAllocHeap); + InitSampleInfoPassZero(); + } + + // [36812][p22]-192-254ms + //Console.WriteLine(strAfterTID); + swPassZero.WriteLine(strAfterTID); + strTemp = ParseString(strAfterTID, "p", "]", out strRemaining); + lastProcIndex = Int32.Parse(strTemp); + } + else if (strAfterTID.StartsWith("[GCA#")) + { + // 270 is the time elapsed in ms since the last GC end. + // 18280955362126/18280957621306 are the min/max timestamps observed + // on all procs for this period. + // [39652][GCA#1 270-18280955362126-18280957621306] + //Console.WriteLine(strAfterTID); + // convert the raw ts to relative ms. + strTemp = ParseString(strAfterTID, "-", "-", out strRemaining); + //Console.WriteLine("min ts is {0}", strTemp); + UInt64 minTimestampMS = UInt64.Parse(strTemp); + minTimestampMS /= qpfAdjusted; + strTemp = ParseString(strRemaining, "-", "]", out strRemaining); + //Console.WriteLine("max ts is {0}", strTemp); + UInt64 maxTimestampMS = UInt64.Parse(strTemp); + maxTimestampMS /= qpfAdjusted; + strTemp = ParseString(strAfterTID, null, "-", out strRemaining); + swPassZero.WriteLine("{0}-{1}-{2}", strTemp, minTimestampMS, maxTimestampMS); + + largestTimeInBetweenGCs = Math.Max(largestTimeInBetweenGCs, (int)(maxTimestampMS - minTimestampMS)); + } + else if (strAfterTID.StartsWith("[GC#")) + { + // 12 is GC duration in ms + // [15628][GC#1-12] + //Console.WriteLine(strAfterTID); + swPassZero.WriteLine(strAfterTID); + } + else if (strAfterTID.StartsWith("TEMP")) + { + // If I want to only log something temporarily I prefix it with TEMP so we know to just + // write it out as is. + swPassZero.WriteLine(strAfterTID); + } + else + { + // This is the majority of the log - + // qpc,tid,ideal_proc,alloc_heap + // |m means we observed the thread changed the proc it was running on + // during the iterations we do in balance_heaps since we check at various + // points for the current proc. + // |p means we detected proc != home_heap proc + // |i means we did SetIdeal + //[38328]12791000,80,24540|m,101|p + //[38328]12792000,80,24540,101 + //[38328]12801000,90,33356,111|p + //[38328]12802000,90,33356,111 + // + // this will be converted to + //1279ms,2(2),80,101,m:1,p:1,i:0 + //1280ms,3(2),90,111,m:0,p:1,i:0 + // + // the format is: + // relative time from start in ms, thread index(sample count during the unit of time), + // count for m/p/i observed during this unit of time. 
+ // + strTemp = ParseString(s, "]", ",", out strRemaining); + //UInt64 currentQPC = UInt64.Parse(strTemp); + UInt64 currentQPC = 0; + if (!UInt64.TryParse(strTemp, out currentQPC)) + { + continue; + } + if (fLogging) + { + //Console.WriteLine("ts: {0}-{1}={2}", currentQPC, qpcStart, (currentQPC - qpcStart)); + Console.WriteLine(); + Console.WriteLine(strAfterTID); + } + //double timestamp = (double)(currentQPC - qpcStart) / qpfAdjusted; + double timestamp = (double)currentQPC / qpfAdjusted; + + strTID = ParseString(strRemaining, ",", ",", out strRemaining); + if (strTID == "15580|m") + { + Console.WriteLine(s); + } + if (!threadMapping.ContainsKey(strTID)) + { + threadMapping.Add(strTID, totalAllocThreads); + if (fLogging) + { + Console.WriteLine("Adding {0} as T#{1}", strTID, totalAllocThreads); + } + totalAllocThreads++; + } + + strIdealProc = ParseString(strRemaining, ",", ",", out strRemaining); + + bool multiple_procs_p = false; + bool alloc_count_p = true; + bool set_ideal_p = false; + + if (strRemaining.Contains("|")) + { + strAllocHeap = ParseString(strRemaining, ",", "|", out strRemaining); + if (strRemaining.Contains("m")) + multiple_procs_p = true; + if (strRemaining.Contains("p")) + alloc_count_p = false; + if (strRemaining.Contains("i")) + set_ideal_p = true; + } + else + { + strAllocHeap = ParseString(strRemaining, ",", null, out strRemaining); + } + + if (fLogging) + { + Console.WriteLine("{0:0.00}ms, tid: {1}/{2}, alloc heap: {3}, m: {4}, p: {5}, i: {6}, ideal {7}", + timestamp, strTID, threadMapping[strTID], + strAllocHeap, multiple_procs_p, !alloc_count_p, set_ideal_p, + strIdealProc); + } + //swPassZero.WriteLine("{0:0.00}ms,{1},{2},m:{3},p:{4},i:{5}", + // timestamp, + // threadMapping[strTID], + // strAllocHeap, + // (multiple_procs_p ? "T" : ""), + // (!alloc_count_p ? "T" : ""), + // (set_ideal_p? "T" : "")); + + if ((strTID == strLastTID) && (strLastThreadAllocHeap == strAllocHeap)) + { + if (((int)timestamp - lastTimeUnit) >= timeUnitMS) + { + if (fLogging) + { + Console.WriteLine("time is now {7}>{0}+{8},{1}({2}),{3},m:{4},p:{5},i:{6}", + lastTimeUnit, + threadMapping[strTID], + lastThreadSampleCount, + strAllocHeap, + lastThreadMCount, lastThreadPCount, lastThreadICount, + (int)timestamp, timeUnitMS); + } + LogPassZeroAggregatedSample( + lastTimeUnit, + threadMapping[strTID], + lastThreadSampleCount, + strAllocHeap); + if (fLogging) + { + Console.WriteLine("Set last time to {0} for thread {1}, alloc {2}", + (int)timestamp, strLastTID, strLastThreadAllocHeap); + } + lastTimeUnit = (int)timestamp; + lastThreadSampleCount = 0; + lastThreadMCount = lastThreadPCount = lastThreadICount = 0; + } + } + else + { + if (strLastTID != null) + { + // If we detect the thread or alloc heap changed, we always want to + // log. 
+ if (fLogging) + { + Console.WriteLine("{0},{1}({2}),{3},m:{4},p:{5},i:{6} -> {7}/{8}", + lastTimeUnit, + threadMapping[strLastTID], + lastThreadSampleCount, + strLastThreadAllocHeap, + lastThreadMCount, lastThreadPCount, lastThreadICount, + strTID, strAllocHeap); + } + LogPassZeroAggregatedSample( + lastTimeUnit, + threadMapping[strLastTID], + lastThreadSampleCount, + strLastThreadAllocHeap); + } + + if (fLogging) + { + Console.WriteLine("last tid {0}, last alloc heap {1}, diff, set last time to {2}", + strLastTID, strLastThreadAllocHeap, (int)timestamp); + } + strLastTID = strTID; + strLastThreadAllocHeap = strAllocHeap; + lastTimeUnit = (int)timestamp; + lastThreadSampleCount = 0; + lastThreadPCount = 0; + lastThreadMCount = 0; + lastThreadICount = 0; + } + + lastThreadSampleCount++; + if (multiple_procs_p) lastThreadMCount++; + if (!alloc_count_p) lastThreadPCount++; + if (set_ideal_p) lastThreadICount++; + + //if ((lastThreadMCount > 1) || (lastThreadPCount > 1) || (lastThreadICount > 1)) + //{ + // swPassZero.WriteLine("DETECTED: m: {0}, p: {1}, i: {2}", + // lastThreadMCount, lastThreadPCount, lastThreadICount); + //} + + if (fLogging) + { + Console.WriteLine("timeunit {0}, sample->{1}, MCount->{2}, PCount->{3}, ICount->{4}", + lastTimeUnit, lastThreadSampleCount, + lastThreadMCount, lastThreadPCount, lastThreadICount); + } + } + } + } + } + + Console.WriteLine("{0,-3} | {1,-10}", "p", "samples"); + for (int procIndex = 0; procIndex < totalProcs; procIndex++) + { + if (aggregatedSampleCount[procIndex] != 0) + { + Console.WriteLine("{0,-3} | {1,-10} ", procIndex, aggregatedSampleCount[procIndex]); + } + } + + Console.WriteLine("---Threading mapping---"); + foreach (var t in threadMapping) + { + Console.WriteLine("t#{0,-3} - {1}", t.Key, t.Value); + } + + Console.WriteLine("Largest time span inbetween GCs: {0}ms", largestTimeInBetweenGCs); + swPassZero.Close(); + } + + //========================================================================= + // Pass one. TODO: should separate this from pass zero. + //========================================================================= + static StreamWriter[] swPassOneFiles = null; + // It's difficult to print all of the procs on one line so we only print per node. + static int nodeIndexToPrint = -1; + static bool fIncludeAllTime = false; + static bool fPrintThreadInfoPerTimeUnit = false; + + // This represents the samples for all the procs inbetween GCs so it doesn't grow very large. + // Cleared every GC. + static SampleInfo[][] samples; + // This is the min/max time for inbetween each GC. + static int startTimeMS = 0; + static int endTimeMS = 0; + static int currentProcIndex = -1; + static int lastTimeIndex = -1; + + // This respresents the threads we see in samples so we can get info such as when threads + // start running and how active they are. + // Cleared every GC. 
+ static List threadsSeenPerTimeUnit = new List(56); + static Dictionary threadsSeen = new Dictionary(56); + static Dictionary threadsSeenTotal = new Dictionary(56); + static List threadsToPrint = new List(8); + static string strThreadIndices = null; + static int gcIndexToPrintStart = -1; + static int gcIndexToPrintEnd = 1000000; + static string strGCRange = null; + static int totalGCCount = 0; + static int totalGCDurationMS = 0; + static int totalAllocMB = 0; + static int totalBudgetMB = 0; + + // This represents allocated MB on each heap; parse from the lines following [GC_alloc_mb] + static int[] AllocMB; + static int budgetMB = 0; + // These are subscript chars for 0-9. + static char[] unicodeChars = { '\x2080', '\x2081', '\x2082', '\x2083', '\x2084', '\x2085', '\x2086', '\x2087', '\x2088', '\x2089' }; + static string[] passOneFileTypes = { "thread", "alloc" }; + + static void PrintToAllPassOneFiles(string strLine) + { + for (int fileIndex = 0; fileIndex < (int)PassOneViewType.MaxType; fileIndex++) + { + swPassOneFiles[fileIndex].WriteLine(strLine); + } + } + + static void PrintAllocToAllPassOneFiles() + { + string strAlloc = string.Format("{0,6}", nodeIndexToPrint); + int procStart = procsPerNode * nodeIndexToPrint; + int procEnd = procsPerNode * (nodeIndexToPrint + 1); + + int currentGCAllocMBAllHeaps = 0; + int currentGCAllocMB = 0; + for (int procIndex = procStart; procIndex < procEnd; procIndex++) + { + currentGCAllocMB += AllocMB[procIndex]; + //Console.WriteLine("P#{0,2} alloc {1,3}mb, total is now {2,3}mb", + // procIndex, AllocMB[procIndex], + // currentGCAllocMB); + } + + currentGCAllocMBAllHeaps = currentGCAllocMB; + currentGCAllocMB /= procsPerNode; + totalAllocMB += currentGCAllocMB; + totalBudgetMB += budgetMB; + + //Console.WriteLine("N#{0} GC#{1,2} alloc {2,3}mb, budget {3,3}mb, total alloc {4,3}mb, total budget {5,3}mb", + // nodeIndexToPrint, totalGCCount, currentGCAllocMB, budgetMB, totalAllocMB, totalBudgetMB); + + //Console.WriteLine("{0,3}mb - {1,3}mb (%{2:f2} - {3}ms ({4:f2}mb/ms)", + // budgetMB, currentGCAllocMB, ((currentGCAllocMB * 100.0) / budgetMB), + // (endTimeMS - startTimeMS), + // ((double)currentGCAllocMBAllHeaps / (endTimeMS - startTimeMS))); + + int procIndexBase = nodeIndexToPrint * procsPerNode; + for (int procIndex = procIndexBase; procIndex < (procIndexBase + procsPerNode); procIndex++) + { + strAlloc += string.Format("|{0,5}", AllocMB[procIndex]); + } + strAlloc += string.Format("|"); + PrintToAllPassOneFiles(strAlloc); + } + static void CloseAllPassOneFiles() + { + Console.WriteLine("Total {0} GCs, avg {1}ms, node {2}: BFR %{3:f2}", + totalGCCount, + (totalGCDurationMS / totalGCCount), + nodeIndexToPrint, + (totalAllocMB * 100.0 / totalBudgetMB)); + + for (int fileIndex = 0; fileIndex < (int)PassOneViewType.MaxType; fileIndex++) + { + swPassOneFiles[fileIndex].Close(); + } + } + + static void PrintHeader() + { + // TEMP.. 
+ //procsPerNode = 16; + int procStart = procsPerNode * nodeIndexToPrint; + int procEnd = procsPerNode * (nodeIndexToPrint + 1); + + Console.WriteLine("Printing out proc [{0},[{1}", procStart, procEnd); + + swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("##########Thread view - indices of threads currently running on each proc in between GCs\n"); + swPassOneFiles[(int)PassOneViewType.AllocHeap].WriteLine("##########Alloc heap view - alloc heaps for threads currently running on each proc in between GCs\n"); + string strHeader = string.Format("{0,6}", "ms"); + for (int procIndex = procStart; procIndex < procEnd; procIndex++) + { + strHeader += string.Format("|{0,5}", procIndex); + } + strHeader += string.Format("|"); + PrintToAllPassOneFiles(strHeader); + } + + // Note that I'm only allocating 2 chars for count and we don't expect to have more than 99 of them. + static string FormatCount(int count) + { + if (count > 99) + { + Console.WriteLine("actually have {0} samples in {1}ms!!!!", count, timeUnitMS); + count = 99; + } + + string strFormattedCount = null; + + if (count >= 10) + { + strFormattedCount += unicodeChars[count / 10]; + } + + strFormattedCount += unicodeChars[count % 10]; + return strFormattedCount; + } + + static string FormatFlags(int flags) + { + string strFormattedFlags = ""; + if ((flags & (int)HeapBalanceFlagMask.MutipleProcs) != 0) + { + strFormattedFlags += "ᵐ"; + } + if ((flags & (int)HeapBalanceFlagMask.EnterDueToProc) != 0) + { + strFormattedFlags += "ᵖ"; + } + if ((flags & (int)HeapBalanceFlagMask.SetIdeal) != 0) + { + strFormattedFlags += "ⁱ"; + } + + return strFormattedFlags; + } + + // We display the following - + // Each row represents a time unit, each column represents a proc. + // Each element represents a piece of info on that proc for that time. + // This info could be the thread index, or the alloc heap, along with + // things like m/p/i or count of samples for that time period for that + // thread and alloc heap. + // + // There could be multiple entries for the same time on a proc because + // we happen to observe multiple threads running during that time. + // But we only store one piece of info. Here are the rules to decide + // what we store - + // We take the last one on the same time unless we see one with interesting + // info (ie, one of m/p/i isn't 0); or one with higher sample count. + // + static void PrintProcActivityOnNode() + { + // TEMP.. + //procsPerNode = 16; + int totalTimeUnits = (endTimeMS - startTimeMS) / timeUnitMS + 1; + int procStart = procsPerNode * nodeIndexToPrint; + int procEnd = procsPerNode * (nodeIndexToPrint + 1); + + bool fCheckThreadIndex = (threadsToPrint.Count != 0); + + threadsSeen.Clear(); + + for (int timeIndex = 0; timeIndex < totalTimeUnits; timeIndex++) + { + bool fPrint = fIncludeAllTime; + + if (!fPrint) + { + // Only write something if there's samples observed on at least one proc. 
+ for (int procIndex = procStart; procIndex < procEnd; procIndex++) + { + if (samples[timeIndex][procIndex].tid != -1) + { + fPrint = true; + break; + } + } + } + + // see https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts + // for subscript characters + // ᵐ,ᵖ,ⁱ + if (fPrint) + { + int procsHadSamples = 0; + if (fPrintThreadInfoPerTimeUnit) + threadsSeenPerTimeUnit.Clear(); + + string strCellFormat = "|{0,3}{1,-2}"; + string strThread = string.Format("{0,6}", (timeIndex + startTimeMS)); + string strAllocHeap = string.Format("{0,6}", (timeIndex + startTimeMS)); + + for (int procIndex = procStart; procIndex < procEnd; procIndex++) + { + SampleInfo currentSample = samples[timeIndex][procIndex]; + int tid = currentSample.tid; + bool fIncludeThisSample = true; + + if (fCheckThreadIndex) + { + if (!threadsToPrint.Contains(tid)) + { + fIncludeThisSample = false; + } + } + + if ((tid == -1) || !fIncludeThisSample) + { + strThread += string.Format(strCellFormat, "", ""); + strAllocHeap += string.Format(strCellFormat, "", ""); + } + else + { + if (fPrintThreadInfoPerTimeUnit) + { + if (!threadsSeenPerTimeUnit.Contains(tid)) + threadsSeenPerTimeUnit.Add(tid); + procsHadSamples++; + } + + if (!threadsSeen.ContainsKey(tid)) + threadsSeen.Add(tid, currentSample.countSamples); + else + threadsSeen[tid] += currentSample.countSamples; + + string strCount = ((currentSample.countSamples > 0) ? FormatCount(currentSample.countSamples) : ""); + strThread += string.Format(strCellFormat, tid.ToString(), strCount); + + string strFlags = FormatFlags(currentSample.flags); + strAllocHeap += string.Format(strCellFormat, currentSample.allocHeap.ToString(), strFlags); + } + } + //swPassOne.WriteLine("|"); + strThread += "|"; + swPassOneFiles[(int)PassOneViewType.Thread].WriteLine(strThread); + + if (fPrintThreadInfoPerTimeUnit) + { + swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("----{0,3} threads on {1,3} procs----", + threadsSeenPerTimeUnit.Count, procsHadSamples); + } + + strAllocHeap += "|"; + swPassOneFiles[(int)PassOneViewType.AllocHeap].WriteLine(strAllocHeap); + } + } + + //var threadsSeenOrdered = threadsSeen.OrderByDescending(i => i.Value); + //foreach (var item in threadsSeenOrdered) + //{ + // int k = item.Key; + // int v = item.Value; + // swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("tid: {0,5}-{1,-5}", k, v); + // if (!threadsSeenTotal.ContainsKey(k)) + // threadsSeenTotal.Add(k, v); + // else + // threadsSeenTotal[k] += v; + //} + } + + static int GetAdjustedProcIndex(int pIndex) + { + return (pIndex % procsPerNode); + } + + static void ParseThreadIndices(string _strThreadIndices) + { + strThreadIndices = _strThreadIndices; + string[] fields = strThreadIndices.Split(new Char[] { ',' }, StringSplitOptions.RemoveEmptyEntries); + for (int i = 0; i < fields.Length; i++) + { + threadsToPrint.Add(Int32.Parse(fields[i])); + } + + int len = threadsToPrint.Count; + Console.WriteLine("getting info for {0} threads", len); + for (int i = 0; i < len; i++) + { + Console.WriteLine(threadsToPrint[i]); + } + } + + static void ParseGCRange(string _strGCRange) + { + strGCRange = _strGCRange; + int dashIndex= strGCRange.IndexOf('-'); + if (dashIndex== -1) + { + Console.WriteLine("Invalid GC range {0}", strGCRange); + return; + } + + string strStart = strGCRange.Substring(0, dashIndex); + string strEnd = strGCRange.Substring(dashIndex + 1); + + if (strStart != "start") + gcIndexToPrintStart = Int32.Parse(strStart); + if (strEnd != "end") + gcIndexToPrintEnd = Int32.Parse(strEnd); + + 
Console.WriteLine("printing GC {0}->{1}", gcIndexToPrintStart, gcIndexToPrintEnd); + } + + static void PassOne(string strPassZeroLog) + { + // This generates 2 files - thread view and alloc heap view. This is just so you could + // compare them SxS. + string strLogNameWithoutExtension = Path.GetFileNameWithoutExtension(strPassZeroLog); + string strIncludeAll = (fIncludeAllTime ? "-ia1-" : "-ia0-"); + string strTI = ((strThreadIndices == null) ? "" : strThreadIndices); + string strPassOneLog = strLogNameWithoutExtension + "-pass1-n" + nodeIndexToPrint + strIncludeAll + strTI; + + if (strGCRange != null) + strPassOneLog += "-" + strGCRange + "-"; + + swPassOneFiles = new StreamWriter[(int)PassOneViewType.MaxType]; + for (int fileIndex = 0; fileIndex < (int)PassOneViewType.MaxType; fileIndex++) + { + swPassOneFiles[fileIndex] = new StreamWriter(strPassOneLog + passOneFileTypes[fileIndex] + ".txt"); + } + + string strTemp, strRemaining; + using (StreamReader sr = new StreamReader(strPassZeroLog)) + { + string s; + bool fSkip = false; + + while ((s = sr.ReadLine()) != null) + { + if (s.StartsWith("P:")) + { + //P: 112, N: 2 + strTemp = ParseString(s, "P: ", ",", out strRemaining); + totalProcs = Int32.Parse(strTemp); + strTemp = ParseString(strRemaining, "N: ", null, out strRemaining); + totalNodes = Int32.Parse(strTemp); + procsPerNode = totalProcs / totalNodes; + PrintHeader(); + + int timeUnits = largestTimeInBetweenGCs / timeUnitMS + 1; + //Console.WriteLine("allocating {0} columns, {1} rows", totalProcs, timeUnits); + samples = new SampleInfo[timeUnits][]; + for (int timeIndex = 0; timeIndex < timeUnits; timeIndex++) + { + samples[timeIndex] = new SampleInfo[totalProcs]; + for (int procIndex = 0; procIndex < totalProcs; procIndex++) + { + samples[timeIndex][procIndex].tid = -1; + } + } + + // Note - I'm allocating totalProcs elements but there are not necessarily that many + // heaps. But there can't be more heaps than procs so we just waste some space here which + // is ok. + AllocMB = new int[totalProcs]; + + totalGCCount = 0; + totalGCDurationMS = 0; + totalAllocMB = 0; + totalBudgetMB = 0; + } + else if (s.StartsWith("TEMP")) + { + // If I want to only log something temporarily I prefix it with TEMP so we know to just + // write it out as is. + //PrintToAllPassOneFiles(s); + } + else if (s.StartsWith("[GCA#")) + { + // Clear what we recorded for the last GC. 
+ int usedEntryCount = (endTimeMS - startTimeMS) / timeUnitMS + 1; + //Console.WriteLine("Clearing {0} time entries", usedEntryCount); + for (int timeIndex = 0; timeIndex < usedEntryCount; timeIndex++) + { + for (int procIndex = 0; procIndex < totalProcs; procIndex++) + { + samples[timeIndex][procIndex].tid = -1; + } + } + + // GCA#gc_index time_since_last_gc-min_time-max_time + //[GCA#1 270-42-268 + strTemp = ParseString(s, "A#", " ", out strRemaining); + int gcIndex = Int32.Parse(strTemp); + + fSkip = !((gcIndex >= gcIndexToPrintStart) && (gcIndex <= gcIndexToPrintEnd)); + + if (fSkip) + { + //Console.WriteLine("GC#{0} is skipped", gcIndex); + continue; + } + else + { + //Console.WriteLine("GC#{0} is NOT skipped", gcIndex); + } + //swPassOne.WriteLine("[GC#{0}-GC#{1}]", (gcIndex - 1), gcIndex); + strTemp = ParseString(s, "-", "-", out strRemaining); + startTimeMS = Int32.Parse(strTemp); + strTemp = ParseString(strRemaining, "-", null, out strRemaining); + endTimeMS = Int32.Parse(strTemp); + //Console.WriteLine("Before GC#{0} time {1} {2}, {3} entries", + // gcIndex, startTimeMS, endTimeMS, ((endTimeMS - startTimeMS) / timeUnitMS)); + } + else if (s.StartsWith("[p")) + { + if (fSkip) + continue; + + //[p78]-192-225ms + strTemp = ParseString(s, "[p", "]", out strRemaining); + currentProcIndex = Int32.Parse(strTemp); + lastTimeIndex = -1; + //currentProcIndex = GetAdjustedProcIndex(currentProcIndex); + } + else if (s.StartsWith("[GC_")) + { + if (fSkip) + continue; + + //[GC_alloc_mb] + //[N# 24]17,16,16,15,16,16,17,16,16,15,17,16,15,16,17,16,17,15,16,16,16,16,17,16,16,16,16,16,16,16,16,16,17,15,16,16,16,16,16,16,16,15,16,16,16,16,16,16,16,16,16,15,16,16,16,16, + //[N# 24]21,22,22,23,22,22,21,22,22,22,23,21,22,22,22,22,22,22,22,21,21,24,4,22,21,22,22,22,22,22,22,22,22,23,21,21,22,23,23,22,21,22,23,21,22,22,22,22,22,22,22,21,21,22,21,22, + int totalNodesRead = 0; + while ((s = sr.ReadLine()) != null) + { + if (s.StartsWith("[N")) + { + //Console.WriteLine(s); + int procIndexBase = totalNodesRead * procsPerNode; + strTemp = ParseString(s, "[N#", "]", out strRemaining); + budgetMB = Int32.Parse(strTemp); + string strAllocLine = s.Substring(7); + //Console.WriteLine("spliting {0}", strAllocLine); + string[] fieldsAlloc = strAllocLine.Split(new Char[] { ',' }, StringSplitOptions.RemoveEmptyEntries); + for (int fieldIndex = 0; fieldIndex < procsPerNode; fieldIndex++) + { + AllocMB[procIndexBase + fieldIndex] = Int32.Parse(fieldsAlloc[fieldIndex]); + } + totalNodesRead++; + if (totalNodesRead == totalNodes) + break; + } + } + } + else if (s.StartsWith("[GC#")) + { + if (fSkip) + continue; + + //Console.WriteLine(s); + PrintProcActivityOnNode(); + PrintAllocToAllPassOneFiles(); + + // gc index-duration + //[GC#1-15-2911711] + strTemp = ParseString(s, "-", "-", out strRemaining); + totalGCCount++; + totalGCDurationMS += Int32.Parse(strTemp); + + //Console.WriteLine(s); + //Console.WriteLine("GC#{0} {1}ms ->total {2}ms", totalGCCount, Int32.Parse(strTemp), totalGCDurationMS); + + PrintToAllPassOneFiles(s); + } + else if (s.StartsWith("[N#")) + { + if (fSkip) + continue; + } + else + { + if (fSkip) + continue; + + // Majority of the log, (83) is the ideal proc for that thread. 
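For reference, the m:/p:/i: counts parsed just below end up packed into the single flags field of SampleInfo, four bits per field as laid out by HeapBalanceFlagMask. A short illustrative C# sketch of that packing (helper names are hypothetical, not part of the tool):

static class FlagsSketch
{
    // m occupies bits 0-3, p bits 4-7, i bits 8-11, matching HeapBalanceFlagMask.
    // Counts are assumed to stay below 16 so the nibbles do not overflow.
    public static int Pack(int mCount, int pCount, int iCount)
        => mCount | (pCount << 4) | (iCount << 8);

    public static bool HadMultipleProcs(int flags) => (flags & 0xf) != 0;
    public static bool EnteredDueToProc(int flags) => (flags & 0xf0) != 0;
    public static bool DidSetIdeal(int flags) => (flags & 0xf00) != 0;
}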
+ //1051ms,95(1),83,m:0,p:0,i:0(83) + strTemp = ParseString(s, null, "ms", out strRemaining); + int currentTimeIndex = Int32.Parse(strTemp); + + if (currentTimeIndex < lastTimeIndex) + { + //Console.WriteLine("!!bad time stamp at p{0} -{1}-", currentProcIndex, s); + continue; + } + + lastTimeIndex = currentTimeIndex; + + currentTimeIndex -= startTimeMS; + currentTimeIndex /= timeUnitMS; + strTemp = ParseString(strRemaining, ",", "(", out strRemaining); + int tid = Int32.Parse(strTemp); + strTemp = ParseString(strRemaining, "(", ")", out strRemaining); + int countSamples = Int32.Parse(strTemp); + strTemp = ParseString(strRemaining, ",", ",", out strRemaining); + int allocHeap = Int32.Parse(strTemp); + strTemp = ParseString(strRemaining, ",m:", ",", out strRemaining); + int flags = Int32.Parse(strTemp); + strTemp = ParseString(strRemaining, ",p:", ",", out strRemaining); + flags |= Int32.Parse(strTemp) << 4; + strTemp = ParseString(strRemaining, ",i:", "(", out strRemaining); + flags |= Int32.Parse(strTemp) << 8; + strTemp = ParseString(strRemaining, "(", ")", out strRemaining); + int idealProcNo = Int32.Parse(strTemp); + //Console.WriteLine("ADDING time {0}ms, entry {1}, thread #{2}, ah {3}, ideal {4}", + // currentTimeIndex, currentProcIndex, tid, allocHeap, idealProcNo); + samples[currentTimeIndex][currentProcIndex] = new SampleInfo(tid, allocHeap, countSamples, flags, idealProcNo); + } + } + } + + var threadsSeenTotalOrdered = threadsSeenTotal.OrderByDescending(i => i.Value); + swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("\n-----------Total samples per thread-----------"); + foreach (var item in threadsSeenTotalOrdered) + { + int k = item.Key; + int v = item.Value; + swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("tid: {0,5}-{1,-5}", k, v); + } + + CloseAllPassOneFiles(); + } + + // TODO: in pass zero there's merit in assigning thread indices based on the first CPU they appear on, + // instead of just assigning one as we come across them. + static void Main(string[] args) + { + int len = args.Length; + string strLog = null; + + for (int i = 0; i < args.Length; ++i) + { + string currentArg = args[i]; + string currentArgValue; + if (currentArg.Equals("-l") || currentArg.Equals("-Log")) + { + strLog = args[++i]; + } + else if (currentArg.Equals("-ia") || currentArg.Equals("-IncludeAll")) + { + currentArgValue = args[++i]; + fIncludeAllTime = (Int32.Parse(currentArgValue) == 1); + } + else if (currentArg.Equals("-ti") || currentArg.Equals("-ThreadIndices")) + { + // This is a comma separated indices + // -ti 88,110 + currentArgValue = args[++i]; + ParseThreadIndices(currentArgValue); + } + else if (currentArg.Equals("-gr") || currentArg.Equals("-GCRange")) + { + // This specifies a range of GCs, inclusive + //10-20 + //10-end + //start-10 + currentArgValue = args[++i]; + ParseGCRange(currentArgValue); + } + else if (currentArg.Equals("-pti") || currentArg.Equals("-PrintThreadInfo")) + { + // prints the # of threads we see for each unit of time we display and how many + // total procs they ran on during that time. This shows us how volatile threads + // are jumping between procs. + currentArgValue = args[++i]; + if (Int32.Parse(currentArgValue) == 1) + fPrintThreadInfoPerTimeUnit = true; + } + } + + Console.WriteLine("Processing {0}, {1}", strLog, (fIncludeAllTime ? "full time view" : "compressed time view")); + PassZero(strLog); + // Feel free to convert PassOne to print out all node's activity at once. + // I used pass in the node # to print so did this the lazy way. 
+ for (int i = 0; i < totalNodes; i++) + { + nodeIndexToPrint = i; + PassOne(strPassZeroLog); + } + } + } +} diff --git a/src/coreclr/src/tools/GCLogParser/parse-hb-log.csproj b/src/coreclr/src/tools/GCLogParser/parse-hb-log.csproj new file mode 100644 index 0000000..2c8953b --- /dev/null +++ b/src/coreclr/src/tools/GCLogParser/parse-hb-log.csproj @@ -0,0 +1,73 @@ + + + + + Debug + AnyCPU + {6AE26CD6-C971-48D6-8C03-2FFA272B942C} + Exe + parse_hb_log + parse-hb-log + v4.7.2 + 512 + true + true + + + AnyCPU + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + AnyCPU + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + true + bin\x64\Debug\ + DEBUG;TRACE + full + x64 + prompt + MinimumRecommendedRules.ruleset + true + + + bin\x64\Release\ + TRACE + true + pdbonly + x64 + prompt + MinimumRecommendedRules.ruleset + true + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/coreclr/src/tools/GCLogParser/parse-hb-log.sln b/src/coreclr/src/tools/GCLogParser/parse-hb-log.sln new file mode 100644 index 0000000..5ae5dcb --- /dev/null +++ b/src/coreclr/src/tools/GCLogParser/parse-hb-log.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.28803.452 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "parse-hb-log", "parse-hb-log.csproj", "{6AE26CD6-C971-48D6-8C03-2FFA272B942C}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|x64.ActiveCfg = Debug|x64 + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|x64.Build.0 = Debug|x64 + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|Any CPU.Build.0 = Release|Any CPU + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|x64.ActiveCfg = Release|x64 + {6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {7A3E16A0-5AC4-4DB6-A016-161E57874740} + EndGlobalSection +EndGlobal diff --git a/src/coreclr/src/utilcode/util.cpp b/src/coreclr/src/utilcode/util.cpp index e709160..0e142cf 100644 --- a/src/coreclr/src/utilcode/util.cpp +++ b/src/coreclr/src/utilcode/util.cpp @@ -738,6 +738,34 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr, { return ::GetNumaProcessorNodeEx(proc_no, node_no); } +/*static*/ bool NumaNodeInfo::GetNumaInfo(PUSHORT total_nodes, DWORD* max_procs_per_node) +{ + if (m_enableGCNumaAware) + { + DWORD currentProcsOnNode = 0; + for (int i = 0; i < m_nNodes; i++) + { + GROUP_AFFINITY processorMask; + if (GetNumaNodeProcessorMaskEx(i, &processorMask)) + { + DWORD procsOnNode = 0; + uintptr_t mask = (uintptr_t)processorMask.Mask; + while (mask) + { + procsOnNode++; + mask &= mask - 1; + } + + currentProcsOnNode = max(currentProcsOnNode, procsOnNode); + } + *max_procs_per_node = currentProcsOnNode; + *total_nodes = m_nNodes; + } + return true; + } + + return 
false; +} #else // !FEATURE_PAL /*static*/ BOOL NumaNodeInfo::GetNumaProcessorNodeEx(USHORT proc_no, PUSHORT node_no) { @@ -747,6 +775,7 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr, #endif /*static*/ BOOL NumaNodeInfo::m_enableGCNumaAware = FALSE; +/*static*/ uint16_t NumaNodeInfo::m_nNodes = 0; /*static*/ BOOL NumaNodeInfo::InitNumaNodeInfoAPI() { #if !defined(FEATURE_REDHAWK) @@ -760,6 +789,8 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr, if (!::GetNumaHighestNodeNumber(&highest) || (highest == 0)) return FALSE; + m_nNodes = (USHORT)(highest + 1); + return TRUE; #else return FALSE; @@ -1058,6 +1089,7 @@ retry: { *group_number = i; *group_processor_number = bDiff; + break; } bDiff = processor_number - bTemp; @@ -1098,6 +1130,24 @@ retry: #endif } +// There can be different numbers of procs in groups. We take the max. +/*static*/ bool CPUGroupInfo::GetCPUGroupInfo(PUSHORT total_groups, DWORD* max_procs_per_group) +{ + if (m_enableGCCPUGroups) + { + *total_groups = m_nGroups; + DWORD currentProcsInGroup = 0; + for (WORD i = 0; i < m_nGroups; i++) + { + currentProcsInGroup = max(currentProcsInGroup, m_CPUGroupInfoArray[i].nr_active); + } + *max_procs_per_group = currentProcsInGroup; + return true; + } + + return false; +} + #if !defined(FEATURE_REDHAWK) //Lock ThreadStore before calling this function, so that updates of weights/counts are consistent /*static*/ void CPUGroupInfo::ChooseCPUGroupAffinity(GROUP_AFFINITY *gf) diff --git a/src/coreclr/src/vm/gcenv.os.cpp b/src/coreclr/src/vm/gcenv.os.cpp index 5658b19..5fb3745 100644 --- a/src/coreclr/src/vm/gcenv.os.cpp +++ b/src/coreclr/src/vm/gcenv.os.cpp @@ -38,7 +38,7 @@ class GroupProcNo public: - static const uint16_t NoGroup = 0x3ff; + static const uint16_t NoGroup = 0; GroupProcNo(uint16_t groupProc) : m_groupProc(groupProc) { @@ -46,7 +46,8 @@ public: GroupProcNo(uint16_t group, uint16_t procIndex) : m_groupProc((group << 6) | procIndex) { - assert(group <= 0x3ff); + // Making this the same as the # of NUMA node we support. 
+ assert(group < 0x40); assert(procIndex <= 0x3f); } @@ -208,13 +209,13 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t #else PROCESSOR_NUMBER proc; - if (dstGroupProcNo.GetGroup() != GroupProcNo::NoGroup) + if (CPUGroupInfo::CanEnableGCCPUGroups()) { proc.Group = (WORD)dstGroupProcNo.GetGroup(); proc.Number = (BYTE)dstGroupProcNo.GetProcIndex(); proc.Reserved = 0; - success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, NULL); + success = !!SetThreadIdealProcessorEx(GetCurrentThread (), &proc, NULL); } else { @@ -235,13 +236,38 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t #endif // !FEATURE_PAL } +bool GCToOSInterface::GetCurrentThreadIdealProc(uint16_t* procNo) +{ + LIMITED_METHOD_CONTRACT; + bool success = false; +#ifndef FEATURE_PAL + PROCESSOR_NUMBER proc; + success = !!GetThreadIdealProcessorEx(GetCurrentThread(), &proc); + if (success) + { + GroupProcNo groupProcNo(proc.Group, proc.Number); + *procNo = groupProcNo.GetCombinedValue(); + } +#endif //FEATURE_PAL + return success; +} + // Get the number of the current processor uint32_t GCToOSInterface::GetCurrentProcessorNumber() { LIMITED_METHOD_CONTRACT; _ASSERTE(CanGetCurrentProcessorNumber()); + +#ifndef FEATURE_PAL + PROCESSOR_NUMBER proc_no_cpu_group; + GetCurrentProcessorNumberEx(&proc_no_cpu_group); + + GroupProcNo groupProcNo(proc_no_cpu_group.Group, proc_no_cpu_group.Number); + return groupProcNo.GetCombinedValue(); +#else return ::GetCurrentProcessorNumber(); +#endif //!FEATURE_PAL } // Check if the OS supports getting current processor number @@ -295,25 +321,33 @@ void GCToOSInterface::YieldThread(uint32_t switchCount) // size - size of the virtual memory range // alignment - requested memory alignment // flags - flags to control special settings like write watching +// node - the NUMA node to reserve memory on // Return: // Starting virtual address of the reserved range -void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags) +void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node) { LIMITED_METHOD_CONTRACT; - DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE; - - // This is not strictly necessary for a correctness standpoint. Windows already guarantees - // allocation granularity alignment when using MEM_RESERVE, so aligning the size here has no effect. - // However, ClrVirtualAlloc does expect the size to be aligned to the allocation granularity. - size_t aligned_size = (size + g_SystemInfo.dwAllocationGranularity - 1) & ~static_cast(g_SystemInfo.dwAllocationGranularity - 1); - if (alignment == 0) + if (node == NUMA_NODE_UNDEFINED) { - return ::ClrVirtualAlloc(0, aligned_size, memFlags, PAGE_READWRITE); + DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE; + + // This is not strictly necessary for a correctness standpoint. Windows already guarantees + // allocation granularity alignment when using MEM_RESERVE, so aligning the size here has no effect. + // However, ClrVirtualAlloc does expect the size to be aligned to the allocation granularity. 
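The rounding performed just below is the usual power-of-two align-up. A tiny C# sketch with worked values, assuming the allocation granularity is a power of two such as 64KB (names are illustrative):

static class AlignSketch
{
    // AlignUp(5000, 65536) == 65536; AlignUp(65537, 65536) == 131072.
    public static ulong AlignUp(ulong size, ulong granularity)
        => (size + granularity - 1) & ~(granularity - 1);
}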
+ size_t aligned_size = (size + g_SystemInfo.dwAllocationGranularity - 1) & ~static_cast(g_SystemInfo.dwAllocationGranularity - 1); + if (alignment == 0) + { + return ::ClrVirtualAlloc (0, aligned_size, memFlags, PAGE_READWRITE); + } + else + { + return ::ClrVirtualAllocAligned (0, aligned_size, memFlags, PAGE_READWRITE, alignment); + } } else { - return ::ClrVirtualAllocAligned(0, aligned_size, memFlags, PAGE_READWRITE, alignment); + return NumaNodeInfo::VirtualAllocExNuma (::GetCurrentProcess (), NULL, size, MEM_RESERVE, PAGE_READWRITE, node); } } @@ -425,7 +459,7 @@ bool GCToOSInterface::SupportsWriteWatch() // check if the OS supports write-watch. // Drawbridge does not support write-watch so we still need to do the runtime detection for them. // Otherwise, all currently supported OSes do support write-watch. - void* mem = VirtualReserve (g_SystemInfo.dwAllocationGranularity, 0, VirtualReserveFlags::WriteWatch); + void* mem = VirtualReserve(g_SystemInfo.dwAllocationGranularity, 0, VirtualReserveFlags::WriteWatch); if (mem != NULL) { VirtualRelease (mem, g_SystemInfo.dwAllocationGranularity); @@ -493,7 +527,7 @@ bool GCToOSInterface::SetThreadAffinity(uint16_t procNo) #ifndef FEATURE_PAL GroupProcNo groupProcNo(procNo); - if (groupProcNo.GetGroup() != GroupProcNo::NoGroup) + if (CPUGroupInfo::CanEnableGCCPUGroups()) { GROUP_AFFINITY ga; ga.Group = (WORD)groupProcNo.GetGroup(); @@ -937,6 +971,34 @@ bool GCToOSInterface::CanEnableGCNumaAware() return NumaNodeInfo::CanEnableGCNumaAware() != FALSE; } +bool GCToOSInterface::GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node) +{ +#ifndef FEATURE_PAL + return NumaNodeInfo::GetNumaInfo(total_nodes, (DWORD*)max_procs_per_node); +#else + return false; +#endif //!FEATURE_PAL +} + +bool GCToOSInterface::GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group) +{ +#ifndef FEATURE_PAL + return CPUGroupInfo::GetCPUGroupInfo(total_groups, (DWORD*)max_procs_per_group); +#else + return false; +#endif //!FEATURE_PAL +} + +bool GCToOSInterface::CanEnableGCCPUGroups() +{ + LIMITED_METHOD_CONTRACT; +#ifndef FEATURE_PAL + return CPUGroupInfo::CanEnableGCCPUGroups() != FALSE; +#else + return false; +#endif //!FEATURE_PAL +} + // Get processor number and optionally its NUMA node number for the specified heap number // Parameters: // heap_number - heap number to get the result for -- 2.7.4
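As a closing illustration of the processor counting used by the new GetNumaInfo implementations above: the per-node count is a population count of the node's affinity mask, obtained by repeatedly clearing the lowest set bit. A minimal C# equivalent, illustrative only:

static class MaskSketch
{
    public static int CountProcsInMask(ulong mask)
    {
        int procs = 0;
        while (mask != 0)
        {
            procs++;
            mask &= mask - 1;   // clear the lowest set bit
        }
        return procs;
    }
}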