From 6fe7effad7fddf8d5dc0b3ac3d5be5ec80e158ff Mon Sep 17 00:00:00 2001 From: Jack Pappas Date: Thu, 1 Nov 2018 01:50:23 -0400 Subject: [PATCH] Make BitScanForward/BitScanForward64 PAL wrappers branchless. (#20412) The BitScanForward/BitScanForward64 wrapper functions from the PAL and gcenv have been modified so they're faster (and branchless), while also adhering more closely to the behavior of the MSVC intrinsics. Use _BitScanForward64 when targeting 64-bit Windows. The _WIN32 macro is always defined by MSVC, even when targeting 64-bit versions of Windows. Use the _WIN64 macro instead to check whether the build is targeting 64-bit Windows, and if so, use the _BitScanForward64 intrinsic for the BitScanForward64 wrapper instead of the 32-bit-based fallback. --- src/gc/env/gcenv.base.h | 44 ++++++++++++++++++++++---------------------- src/pal/inc/pal.h | 38 +++++++++++++++----------------------- 2 files changed, 37 insertions(+), 45 deletions(-) diff --git a/src/gc/env/gcenv.base.h b/src/gc/env/gcenv.base.h index 15a81d7..da66e9b 100644 --- a/src/gc/env/gcenv.base.h +++ b/src/gc/env/gcenv.base.h @@ -220,34 +220,40 @@ typedef DWORD (WINAPI *PTHREAD_START_ROUTINE)(void* lpThreadParameter); #ifdef _MSC_VER #pragma intrinsic(_BitScanForward) -#if WIN64 +#if _WIN64 #pragma intrinsic(_BitScanForward64) #endif #endif // _MSC_VER // Cross-platform wrapper for the _BitScanForward compiler intrinsic. +// A value is unconditionally stored through the bitIndex argument, +// but callers should only rely on it when the function returns TRUE; +// otherwise, the stored value is undefined and varies by implementation +// and hardware platform. inline uint8_t BitScanForward(uint32_t *bitIndex, uint32_t mask) { #ifdef _MSC_VER return _BitScanForward((unsigned long*)bitIndex, mask); #else // _MSC_VER - unsigned char ret = FALSE; - int iIndex = __builtin_ffsl(mask); - if (iIndex != 0) - { - *bitIndex = (uint32_t)(iIndex - 1); - ret = TRUE; - } - - return ret; + int iIndex = __builtin_ffs(mask); + *bitIndex = static_cast(iIndex - 1); + // Both GCC and Clang generate better, smaller code if we check whether the + // mask was/is zero rather than the equivalent check that iIndex is zero. + return mask != 0 ? TRUE : FALSE; #endif // _MSC_VER } // Cross-platform wrapper for the _BitScanForward64 compiler intrinsic. +// A value is unconditionally stored through the bitIndex argument, +// but callers should only rely on it when the function returns TRUE; +// otherwise, the stored value is undefined and varies by implementation +// and hardware platform. inline uint8_t BitScanForward64(uint32_t *bitIndex, uint64_t mask) { #ifdef _MSC_VER - #if _WIN32 + #if _WIN64 + return _BitScanForward64((unsigned long*)bitIndex, mask); + #else // MSVC targeting a 32-bit target does not support this intrinsic. // We can fake it using two successive invocations of _BitScanForward. uint32_t hi = (mask >> 32) & 0xFFFFFFFF; @@ -265,19 +271,13 @@ inline uint8_t BitScanForward64(uint32_t *bitIndex, uint64_t mask) } return result; - #else - return _BitScanForward64((unsigned long*)bitIndex, mask); - #endif // _WIN32 + #endif // _WIN64 #else - unsigned char ret = FALSE; int iIndex = __builtin_ffsll(mask); - if (iIndex != 0) - { - *bitIndex = (uint32_t)(iIndex - 1); - ret = TRUE; - } - - return ret; + *bitIndex = static_cast(iIndex - 1); + // Both GCC and Clang generate better, smaller code if we check whether the + // mask was/is zero rather than the equivalent check that iIndex is zero. + return mask != 0 ? TRUE : FALSE; #endif // _MSC_VER } diff --git a/src/pal/inc/pal.h b/src/pal/inc/pal.h index f117a6e..6908040 100644 --- a/src/pal/inc/pal.h +++ b/src/pal/inc/pal.h @@ -3254,10 +3254,10 @@ typedef EXCEPTION_DISPOSITION (PALAPI *PVECTORED_EXCEPTION_HANDLER)( // Define BitScanForward64 and BitScanForward // Per MSDN, BitScanForward64 will search the mask data from LSB to MSB for a set bit. -// If one is found, its bit position is returned in the out PDWORD argument and 1 is returned. -// Otherwise, 0 is returned. +// If one is found, its bit position is stored in the out PDWORD argument and 1 is returned; +// otherwise, an undefined value is stored in the out PDWORD argument and 0 is returned. // -// On GCC, the equivalent function is __builtin_ffsl. It returns 1+index of the least +// On GCC, the equivalent function is __builtin_ffsll. It returns 1+index of the least // significant set bit, or 0 if if mask is zero. // // The same is true for BitScanForward, except that the GCC function is __builtin_ffs. @@ -3270,16 +3270,12 @@ BitScanForward( IN OUT PDWORD Index, IN UINT qwMask) { - unsigned char bRet = FALSE; - int iIndex = __builtin_ffsl(qwMask); - if (iIndex != 0) - { - // Set the Index after deducting unity - *Index = (DWORD)(iIndex - 1); - bRet = TRUE; - } - - return bRet; + int iIndex = __builtin_ffs(qwMask); + // Set the Index after deducting unity + *Index = (DWORD)(iIndex - 1); + // Both GCC and Clang generate better, smaller code if we check whether the + // mask was/is zero rather than the equivalent check that iIndex is zero. + return qwMask != 0 ? TRUE : FALSE; } EXTERN_C @@ -3291,16 +3287,12 @@ BitScanForward64( IN OUT PDWORD Index, IN UINT64 qwMask) { - unsigned char bRet = FALSE; - int iIndex = __builtin_ffsl(qwMask); - if (iIndex != 0) - { - // Set the Index after deducting unity - *Index = (DWORD)(iIndex - 1); - bRet = TRUE; - } - - return bRet; + int iIndex = __builtin_ffsll(qwMask); + // Set the Index after deducting unity + *Index = (DWORD)(iIndex - 1); + // Both GCC and Clang generate better, smaller code if we check whether the + // mask was/is zero rather than the equivalent check that iIndex is zero. + return qwMask != 0 ? TRUE : FALSE; } FORCEINLINE void PAL_ArmInterlockedOperationBarrier() -- 2.7.4