From: Jack Pappas
Date: Tue, 6 Nov 2018 23:07:47 +0000 (-0500)
Subject: Loop-free GC rounding helpers with _BitScanReverse. (dotnet/coreclr#20157)
X-Git-Tag: submit/tizen/20210909.063632~11030^2~3500
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ca47a87eb96af1f6e1e0a773976c3f8d20965b1f;p=platform%2Fupstream%2Fdotnet%2Fruntime.git

Loop-free GC rounding helpers with _BitScanReverse. (dotnet/coreclr#20157)

Commit migrated from https://github.com/dotnet/coreclr/commit/2bf55bc5ca8b09dd26e32a9ee259ab22fb69806b
---

diff --git a/src/coreclr/src/gc/env/gcenv.base.h b/src/coreclr/src/gc/env/gcenv.base.h
index da66e9b..187d9ca 100644
--- a/src/coreclr/src/gc/env/gcenv.base.h
+++ b/src/coreclr/src/gc/env/gcenv.base.h
@@ -220,8 +220,10 @@ typedef DWORD (WINAPI *PTHREAD_START_ROUTINE)(void* lpThreadParameter);
 
 #ifdef _MSC_VER
 #pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
 #if _WIN64
  #pragma intrinsic(_BitScanForward64)
+ #pragma intrinsic(_BitScanReverse64)
 #endif
 #endif // _MSC_VER
 
@@ -281,6 +283,56 @@ inline uint8_t BitScanForward64(uint32_t *bitIndex, uint64_t mask)
 #endif // _MSC_VER
 }
 
+// Cross-platform wrapper for the _BitScanReverse compiler intrinsic.
+inline uint8_t BitScanReverse(uint32_t *bitIndex, uint32_t mask)
+{
+#ifdef _MSC_VER
+    return _BitScanReverse((unsigned long*)bitIndex, mask);
+#else // _MSC_VER
+    // The result of __builtin_clz is undefined when mask is zero,
+    // but it's still OK to call the intrinsic in that case (just don't use the output).
+    // Unconditionally calling the intrinsic in this way allows the compiler to
+    // emit branchless code for this function when possible (depending on how the
+    // intrinsic is implemented for the target platform).
+    int lzcount = __builtin_clz(mask);
+    *bitIndex = static_cast<uint32_t>(31 - lzcount);
+    return mask != 0 ? TRUE : FALSE;
+#endif // _MSC_VER
+}
+
+// Cross-platform wrapper for the _BitScanReverse64 compiler intrinsic.
+inline uint8_t BitScanReverse64(uint32_t *bitIndex, uint64_t mask)
+{
+#ifdef _MSC_VER
+ #if _WIN64
+    return _BitScanReverse64((unsigned long*)bitIndex, mask);
+ #else
+    // MSVC targeting a 32-bit target does not support this intrinsic.
+    // We can fake it by checking whether the upper 32 bits are nonzero
+    // and then calling _BitScanReverse() on either the upper or lower 32 bits.
+    uint32_t upper = static_cast<uint32_t>(mask >> 32);
+
+    if (upper != 0)
+    {
+        uint8_t result = _BitScanReverse((unsigned long*)bitIndex, upper);
+        *bitIndex += 32;
+        return result;
+    }
+
+    return _BitScanReverse((unsigned long*)bitIndex, static_cast<uint32_t>(mask));
+ #endif // _WIN64
+#else
+    // The result of __builtin_clzll is undefined when mask is zero,
+    // but it's still OK to call the intrinsic in that case (just don't use the output).
+    // Unconditionally calling the intrinsic in this way allows the compiler to
+    // emit branchless code for this function when possible (depending on how the
+    // intrinsic is implemented for the target platform).
+    int lzcount = __builtin_clzll(mask);
+    *bitIndex = static_cast<uint32_t>(63 - lzcount);
+    return mask != 0 ? TRUE : FALSE;
+#endif // _MSC_VER
+}
+
 // Aligns a size_t to the specified alignment. Alignment must be a power
 // of two.
 inline size_t ALIGN_UP(size_t val, size_t alignment)
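
The 32-bit MSVC fallback above splits the 64-bit scan into two 32-bit scans: try the upper half first, and only fall back to the lower half when the upper half is zero. A standalone sketch (not part of the patch; the helper names are illustrative, and a portable bit loop stands in for the _BitScanReverse intrinsic) that checks this splitting logic against a reference implementation, compilable with any C++11 compiler:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Portable reference: 0-based index of the most-significant set bit, -1 for zero.
static int MsbIndexReference(uint64_t mask)
{
    int index = -1;
    while (mask != 0) { mask >>= 1; ++index; }
    return index;
}

// Emulation of a 64-bit reverse scan built from two 32-bit scans,
// mirroring the 32-bit MSVC branch of BitScanReverse64 above.
static bool EmulatedBitScanReverse64(uint32_t* bitIndex, uint64_t mask)
{
    uint32_t upper = static_cast<uint32_t>(mask >> 32);
    uint32_t half = (upper != 0) ? upper : static_cast<uint32_t>(mask);
    if (half == 0)
        return false;                       // no set bits anywhere

    // Portable stand-in for a 32-bit _BitScanReverse.
    uint32_t index = 31;
    while ((half & 0x80000000u) == 0) { half <<= 1; --index; }

    // Bits found in the upper half sit 32 positions higher.
    *bitIndex = index + ((upper != 0) ? 32 : 0);
    return true;
}

int main()
{
    uint32_t index;
    for (uint64_t mask : { 0x1ull, 0x80000000ull, 0x100000000ull, ~0ull })
    {
        assert(EmulatedBitScanReverse64(&index, mask));
        assert(static_cast<int>(index) == MsbIndexReference(mask));
    }
    assert(!EmulatedBitScanReverse64(&index, 0));  // zero input reports failure
    return 0;
}
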
diff --git a/src/coreclr/src/gc/gc.cpp b/src/coreclr/src/gc/gc.cpp
index c5c178b..a8a47c7 100644
--- a/src/coreclr/src/gc/gc.cpp
+++ b/src/coreclr/src/gc/gc.cpp
@@ -308,78 +308,69 @@ void GCStatistics::DisplayAndUpdate()
 
 #endif // GC_STATS
 
-#ifdef BIT64
-#define TOTAL_TIMES_TO_SHIFT 6
-#else
-#define TOTAL_TIMES_TO_SHIFT 5
-#endif // BIT64
-
+inline
 size_t round_up_power2 (size_t size)
 {
-    unsigned short shift = 1;
-    size_t shifted = 0;
-
-    size--;
-    for (unsigned short i = 0; i < TOTAL_TIMES_TO_SHIFT; i++)
-    {
-        shifted = size | (size >> shift);
-        if (shifted == size)
-        {
-            break;
-        }
-
-        size = shifted;
-        shift <<= 1;
-    }
-    shifted++;
+    // Get the 0-based index of the most-significant bit in size-1.
+    // If the call failed (because size-1 is zero), size must be 1,
+    // so return 1 (because 1 rounds up to itself).
+    DWORD highest_set_bit_index;
+    if (0 ==
+#ifdef BIT64
+        BitScanReverse64(
+#else
+        BitScanReverse(
+#endif
+            &highest_set_bit_index, size - 1)) { return 1; }
 
-    return shifted;
+    // The size == 0 case (which would have overflowed to SIZE_MAX when decremented)
+    // is handled below by relying on the fact that highest_set_bit_index is the maximum value
+    // (31 or 63, depending on sizeof(size_t)) and left-shifting a value >= 2 by that
+    // number of bits shifts in zeros from the right, resulting in an output of zero.
+    return static_cast<size_t>(2) << highest_set_bit_index;
 }
 
 inline
 size_t round_down_power2 (size_t size)
 {
-    size_t power2 = round_up_power2 (size);
-
-    if (power2 != size)
-    {
-        power2 >>= 1;
-    }
+    // Get the 0-based index of the most-significant bit in size.
+    // If the call failed, size must be zero, so return zero.
+    DWORD highest_set_bit_index;
+    if (0 ==
+#ifdef BIT64
+        BitScanReverse64(
+#else
+        BitScanReverse(
+#endif
+            &highest_set_bit_index, size)) { return 0; }
 
-    return power2;
+    // Left-shift 1 by highest_set_bit_index to get back a value containing only
+    // the most-significant set bit of size, i.e. size rounded down
+    // to the next power-of-two value.
+    return static_cast<size_t>(1) << highest_set_bit_index;
 }
 
-// the index starts from 0.
-int index_of_set_bit (size_t power2)
+// Get the 0-based index of the most-significant bit in the value.
+// Returns -1 if the input value is zero (i.e. has no set bits).
+inline
+int index_of_highest_set_bit (size_t value)
 {
-    int low = 0;
-    int high = sizeof (size_t) * 8 - 1;
-    int mid;
-    while (low <= high)
-    {
-        mid = ((low + high)/2);
-        size_t temp = (size_t)1 << mid;
-        if (power2 & temp)
-        {
-            return mid;
-        }
-        else if (power2 < temp)
-        {
-            high = mid - 1;
-        }
-        else
-        {
-            low = mid + 1;
-        }
-    }
-
-    return -1;
+    // Get the 0-based index of the most-significant bit in the value.
+    // If the call failed (because value is zero), return -1.
+    DWORD highest_set_bit_index;
+    return (0 ==
+#ifdef BIT64
+        BitScanReverse64(
+#else
+        BitScanReverse(
+#endif
+            &highest_set_bit_index, value)) ? -1 : static_cast<int>(highest_set_bit_index);
 }
 
 inline
 int relative_index_power2_plug (size_t power2)
 {
-    int index = index_of_set_bit (power2);
+    int index = index_of_highest_set_bit (power2);
     assert (index <= MAX_INDEX_POWER2);
 
     return ((index < MIN_INDEX_POWER2) ? 0 : (index - MIN_INDEX_POWER2));
@@ -388,7 +379,7 @@ int relative_index_power2_plug (size_t power2)
 inline
 int relative_index_power2_free_space (size_t power2)
 {
-    int index = index_of_set_bit (power2);
+    int index = index_of_highest_set_bit (power2);
     assert (index <= MAX_INDEX_POWER2);
 
     return ((index < MIN_INDEX_POWER2) ?
             -1 : (index - MIN_INDEX_POWER2));
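
The essence of the new round_up_power2 is the identity round_up_power2(size) == 2 << msb_index(size - 1) for size >= 2, with size == 1 handled by the failed-scan branch and size == 0 wrapping to zero as the comment describes. A standalone sketch (not part of the patch; msb_index is a portable stand-in for the BitScanReverse/BitScanReverse64 wrappers) that exercises the identity and its edge cases:

#include <cassert>
#include <cstddef>

// Portable stand-in for BitScanReverse[64]: 0-based MSB index, -1 for zero.
static int msb_index(size_t value)
{
    int index = -1;
    while (value != 0) { value >>= 1; ++index; }
    return index;
}

static size_t round_up_power2_sketch(size_t size)
{
    int index = msb_index(size - 1);
    if (index < 0)
        return 1;                            // size == 1 rounds up to itself
    return static_cast<size_t>(2) << index;  // 2 << MSB index of size-1
}

int main()
{
    assert(round_up_power2_sketch(1) == 1);
    assert(round_up_power2_sketch(2) == 2);      // exact powers map to themselves
    assert(round_up_power2_sketch(3) == 4);      // msb_index(2) == 1, so 2 << 1 == 4
    assert(round_up_power2_sketch(4) == 4);
    assert(round_up_power2_sketch(5) == 8);
    assert(round_up_power2_sketch(1000) == 1024);
    // size == 0: size-1 wraps to SIZE_MAX, the shift pushes the 2 off the top,
    // and the result is zero, matching the comment in round_up_power2.
    assert(round_up_power2_sketch(0) == 0);
    return 0;
}
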
@@ -8688,7 +8679,7 @@ public:
             }
         }
 
-        int bucket_power2 = index_of_set_bit (round_down_power2 (size));
+        int bucket_power2 = index_of_highest_set_bit (size);
         if (bucket_power2 < base_power2)
         {
             return;
@@ -8791,7 +8782,7 @@ public:
         plug_size_to_fit += (pad_in_front ? Align(min_obj_size) : 0);
 #endif //SHORT_PLUGS
 
-        int plug_power2 = index_of_set_bit (round_up_power2 (plug_size_to_fit + Align(min_obj_size)));
+        int plug_power2 = index_of_highest_set_bit (round_up_power2 (plug_size_to_fit + Align(min_obj_size)));
         ptrdiff_t i;
         uint8_t* new_address = 0;
@@ -8873,9 +8864,9 @@ retry:
                     (plug_size - pad),
                     pad,
                     pinned_plug (m),
-                    index_of_set_bit (round_down_power2 (free_space_size)),
+                    index_of_highest_set_bit (free_space_size),
                     (pinned_plug (m) - pinned_len (m)),
-                    index_of_set_bit (round_down_power2 (new_free_space_size))));
+                    index_of_highest_set_bit (new_free_space_size)));
 #endif //SIMPLE_DPRINTF
 
 #ifdef SHORT_PLUGS
@@ -8919,9 +8910,9 @@ retry:
                     old_loc,
                     new_address,
                     (plug_size - pad),
-                    index_of_set_bit (round_down_power2 (free_space_size)),
+                    index_of_highest_set_bit (free_space_size),
                     heap_segment_plan_allocated (seg),
-                    index_of_set_bit (round_down_power2 (new_free_space_size))));
+                    index_of_highest_set_bit (new_free_space_size)));
 #endif //SIMPLE_DPRINTF
 
                 if (realign_padding_p)
@@ -8953,7 +8944,7 @@ retry:
                  (!chosen_power2) && (i < free_space_count));
     }
 
-    int new_bucket_power2 = index_of_set_bit (round_down_power2 (new_free_space_size));
+    int new_bucket_power2 = index_of_highest_set_bit (new_free_space_size);
 
     if (new_bucket_power2 < base_power2)
     {
@@ -33465,7 +33456,7 @@ HRESULT GCHeap::Initialize ()
     gc_heap::min_loh_segment_size = large_seg_size;
     gc_heap::min_segment_size = min (seg_size, large_seg_size);
 #ifdef SEG_MAPPING_TABLE
-    gc_heap::min_segment_size_shr = index_of_set_bit (gc_heap::min_segment_size);
+    gc_heap::min_segment_size_shr = index_of_highest_set_bit (gc_heap::min_segment_size);
 #endif //SEG_MAPPING_TABLE
 
 #ifdef MULTIPLE_HEAPS
diff --git a/src/coreclr/src/gc/gcrecord.h b/src/coreclr/src/gc/gcrecord.h
index 9c3282b..3096695 100644
--- a/src/coreclr/src/gc/gcrecord.h
+++ b/src/coreclr/src/gc/gcrecord.h
@@ -311,7 +311,9 @@ static gc_mechanism_descr gc_mechanisms_descr[max_mechanism_per_heap] =
 };
 #endif //DT_LOG
 
-int index_of_set_bit (size_t power2);
+// Get the 0-based index of the most-significant bit in the value.
+// Returns -1 if the input value is zero (i.e. has no set bits).
+int index_of_highest_set_bit (size_t value);
 
 #define mechanism_mask (1 << (sizeof (uint32_t) * 8 - 1))
 // interesting per heap data we want to record for each GC.
@@ -372,7 +374,7 @@ public:
 
         if (mechanism & mechanism_mask)
         {
-            int index = index_of_set_bit ((size_t)(mechanism & (~mechanism_mask)));
+            int index = index_of_highest_set_bit ((size_t)(mechanism & (~mechanism_mask)));
             assert (index != -1);
             return index;
         }
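
The call-site simplification above (index_of_highest_set_bit (size) in place of index_of_set_bit (round_down_power2 (size))) relies on rounding a value down to a power of two never changing the index of its most-significant bit. A standalone sketch (not part of the patch; portable stand-ins replace the intrinsic-based helpers) that checks this over a small range:

#include <cassert>
#include <cstddef>

// Portable stand-in for index_of_highest_set_bit: 0-based MSB index, -1 for zero.
static int msb_index(size_t value)
{
    int index = -1;
    while (value != 0) { value >>= 1; ++index; }
    return index;
}

// Portable stand-in for round_down_power2: keep only the most-significant set bit.
static size_t round_down_power2(size_t value)
{
    int index = msb_index(value);
    return (index < 0) ? 0 : (static_cast<size_t>(1) << index);
}

int main()
{
    // Rounding down to a power of two preserves the MSB index,
    // so the extra rounding step at the call sites was redundant.
    for (size_t value = 1; value < 100000; ++value)
    {
        assert(msb_index(value) == msb_index(round_down_power2(value)));
    }
    return 0;
}
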
diff --git a/src/coreclr/src/pal/inc/pal.h b/src/coreclr/src/pal/inc/pal.h
index 2a51d58..c4a1d64 100644
--- a/src/coreclr/src/pal/inc/pal.h
+++ b/src/coreclr/src/pal/inc/pal.h
@@ -3302,6 +3302,55 @@ BitScanForward64(
     return qwMask != 0 ? TRUE : FALSE;
 }
 
+// Define BitScanReverse64 and BitScanReverse.
+// Per MSDN, BitScanReverse64 will search the mask data from MSB to LSB for a set bit.
+// If one is found, its bit position is stored in the out PDWORD argument and 1 is returned.
+// Otherwise, an undefined value is stored in the out PDWORD argument and 0 is returned.
+//
+// GCC/clang don't have a directly equivalent intrinsic; they do provide the __builtin_clzll
+// intrinsic, which returns the number of leading 0-bits in x starting at the most significant
+// bit position (the result is undefined when x = 0).
+//
+// The same is true for BitScanReverse, except that the GCC builtin is __builtin_clz.
+
+EXTERN_C
+PALIMPORT
+inline
+unsigned char
+PALAPI
+BitScanReverse(
+    IN OUT PDWORD Index,
+    IN UINT qwMask)
+{
+    // The result of __builtin_clz is undefined when qwMask is zero,
+    // but it's still OK to call the intrinsic in that case (just don't use the output).
+    // Unconditionally calling the intrinsic in this way allows the compiler to
+    // emit branchless code for this function when possible (depending on how the
+    // intrinsic is implemented for the target platform).
+    int lzcount = __builtin_clz(qwMask);
+    *Index = (DWORD)(31 - lzcount);
+    return qwMask != 0;
+}
+
+EXTERN_C
+PALIMPORT
+inline
+unsigned char
+PALAPI
+BitScanReverse64(
+    IN OUT PDWORD Index,
+    IN UINT64 qwMask)
+{
+    // The result of __builtin_clzll is undefined when qwMask is zero,
+    // but it's still OK to call the intrinsic in that case (just don't use the output).
+    // Unconditionally calling the intrinsic in this way allows the compiler to
+    // emit branchless code for this function when possible (depending on how the
+    // intrinsic is implemented for the target platform).
+    int lzcount = __builtin_clzll(qwMask);
+    *Index = (DWORD)(63 - lzcount);
+    return qwMask != 0;
+}
+
 FORCEINLINE void PAL_ArmInterlockedOperationBarrier()
 {
 #ifdef _ARM64_
diff --git a/src/coreclr/src/pal/src/include/pal/palinternal.h b/src/coreclr/src/pal/src/include/pal/palinternal.h
index a66ef7e..67236aa 100644
--- a/src/coreclr/src/pal/src/include/pal/palinternal.h
+++ b/src/coreclr/src/pal/src/include/pal/palinternal.h
@@ -342,6 +342,7 @@ function_name() to call the system's implementation
 
 #if !defined(_MSC_VER) && defined(_WIN64)
 #undef _BitScanForward64
+#undef _BitScanReverse64
 #endif
 
 /* pal.h defines alloca(3) as a compiler builtin.
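
The PAL implementations lean on the identity Index = 31 - clz(mask) (or 63 - clzll(mask)) for nonzero masks. A standalone sketch (not part of the patch; requires GCC or clang for __builtin_clz) verifying that the computed index really points at the most-significant set bit:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main()
{
    for (uint32_t mask : { 0x1u, 0x2u, 0x80u, 0x80000000u, 0xDEADBEEFu })
    {
        int lzcount = __builtin_clz(mask);             // leading zeros, counted from the MSB
        uint32_t index = (uint32_t)(31 - lzcount);     // same computation as PAL BitScanReverse
        assert((mask >> index) == 1u);                 // the bit at 'index' is the highest set bit
    }
    return 0;
}
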