From 387b72731dbf79450050987b9b36bf70f286b098 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Cristian=20Rodr=C3=ADguez?= Date: Tue, 8 May 2012 23:58:19 -0400 Subject: [PATCH] bitmath: Finish up optimizations This patch adds support for other compilers and systems including MSVC, Intel C compiler etc.. Signed-off-by: Erik de Castro Lopo --- src/libFLAC/bitmath.c | 48 -------------- src/libFLAC/bitreader.c | 54 ++------------- src/libFLAC/include/private/bitmath.h | 120 ++++++++++++++++++++++++++++++---- 3 files changed, 116 insertions(+), 106 deletions(-) diff --git a/src/libFLAC/bitmath.c b/src/libFLAC/bitmath.c index 189977c..4fdde4b 100644 --- a/src/libFLAC/bitmath.c +++ b/src/libFLAC/bitmath.c @@ -36,54 +36,6 @@ #include "private/bitmath.h" #include "FLAC/assert.h" -/* An example of what FLAC__bitmath_ilog2() computes: - * - * ilog2( 0) = assertion failure - * ilog2( 1) = 0 - * ilog2( 2) = 1 - * ilog2( 3) = 1 - * ilog2( 4) = 2 - * ilog2( 5) = 2 - * ilog2( 6) = 2 - * ilog2( 7) = 2 - * ilog2( 8) = 3 - * ilog2( 9) = 3 - * ilog2(10) = 3 - * ilog2(11) = 3 - * ilog2(12) = 3 - * ilog2(13) = 3 - * ilog2(14) = 3 - * ilog2(15) = 3 - * ilog2(16) = 4 - * ilog2(17) = 4 - * ilog2(18) = 4 - */ - -#ifndef __GNUC__ - -/* For GNUC, use static inline version in include/private/bitmath.h. 
*/ - -unsigned FLAC__bitmath_ilog2(FLAC__uint32 v) -{ - unsigned l = 0; - if (v == 0) - return 0; - while(v >>= 1) - l++; - return l; -} - -unsigned FLAC__bitmath_ilog2_wide(FLAC__uint64 v) -{ - unsigned l = 0; - if (v == 0) - return 0; - while(v >>= 1) - l++; - return l; -} -#endif - /* An example of what FLAC__bitmath_silog2() computes: * * silog2(-10) = 5 diff --git a/src/libFLAC/bitreader.c b/src/libFLAC/bitreader.c index dcd9e42..9e15db0 100644 --- a/src/libFLAC/bitreader.c +++ b/src/libFLAC/bitreader.c @@ -43,7 +43,7 @@ #include "share/endswap.h" /* Things should be fastest when this matches the machine word size */ -/* WATCHOUT: if you change this you must also change the following #defines down to COUNT_ZERO_MSBS below to match */ +/* WATCHOUT: if you change this you must also change the following #defines down to FLAC__clz_uint32 below to match */ /* WATCHOUT: there are a few places where the code will not work unless uint32_t is >= 32 bits wide */ /* also, some sections currently only have fast versions for 4 or 8 bytes per word */ #define FLAC__BYTES_PER_WORD 4 /* sizeof uint32_t */ @@ -56,27 +56,6 @@ #define SWAP_BE_WORD_TO_HOST(x) ENDSWAP_32(x) #endif -#if defined(__GNUC__) -/* "int __builtin_clz (unsigned int x) If x is 0, the result is undefined" */ -static inline uint32_t -COUNT_ZERO_MSBS (uint32_t word) -{ - if (word == 0) - return 32; - return __builtin_clz (word); -} -#else -/* counts the # of zero MSBs in a word */ -#define COUNT_ZERO_MSBS(word) ( \ - (word) > 0xffffff ? byte_to_unary_table[(word) >> 24] : \ - !(word) ? 32 : \ - (word) > 0xffff ? byte_to_unary_table[(word) >> 16] + 8 : \ - (word) > 0xff ? 
byte_to_unary_table[(word) >> 8] + 16 : \ - byte_to_unary_table[(word)] + 24 \ -) -#endif - - /* * This should be at least twice as large as the largest number of words * required to represent any 'number' (in any encoding) you are going to @@ -93,25 +72,6 @@ COUNT_ZERO_MSBS (uint32_t word) */ static const unsigned FLAC__BITREADER_DEFAULT_CAPACITY = 65536u / FLAC__BITS_PER_WORD; /* in words */ -static const unsigned char byte_to_unary_table[] = { - 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - /* adjust for compilers that can't understand using LLU suffix for uint64_t literals */ #ifdef _MSC_VER #define FLAC__U64L(x) x @@ -679,7 +639,7 @@ FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, unsigned *va while(br->consumed_words < br->words) { /* if we've not consumed up to a partial tail word... 
*/ uint32_t b = br->buffer[br->consumed_words] << br->consumed_bits; if(b) { - i = COUNT_ZERO_MSBS(b); + i = FLAC__clz_uint32(b); *val += i; i++; br->consumed_bits += i; @@ -709,7 +669,7 @@ FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, unsigned *va const unsigned end = br->bytes * 8; uint32_t b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << br->consumed_bits; if(b) { - i = COUNT_ZERO_MSBS(b); + i = FLAC__clz_uint32(b); *val += i; i++; br->consumed_bits += i; @@ -800,7 +760,7 @@ FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[ mov i, eax } #else - i = COUNT_ZERO_MSBS(b); + i = FLAC__clz_uint32(b); #endif uval += i; bits = parameter; @@ -832,7 +792,7 @@ FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[ const unsigned end = br->bytes * 8; uint32_t b = (br->buffer[cwords] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << cbits; if(b) { - i = COUNT_ZERO_MSBS(b); + i = FLAC__clz_uint32(b); uval += i; bits = parameter; i++; @@ -984,7 +944,7 @@ break2: : "r"(b) ); #else - i = COUNT_ZERO_MSBS(b); + i = FLAC__clz_uint32(b); #endif uval += i; cbits += i; @@ -1015,7 +975,7 @@ break2: const unsigned end = br->bytes * 8; uint32_t b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits; if(b) { - i = COUNT_ZERO_MSBS(b); + i = FLAC__clz_uint32(b); uval += i; cbits += i; cbits++; /* skip over stop bit */ diff --git a/src/libFLAC/include/private/bitmath.h b/src/libFLAC/include/private/bitmath.h index b3d18f4..61b0e03 100644 --- a/src/libFLAC/include/private/bitmath.h +++ b/src/libFLAC/include/private/bitmath.h @@ -34,28 +34,126 @@ #include "FLAC/ordinals.h" +/* for CHAR_BIT */ +#include <limits.h> -#if defined(__GNUC__) +#if defined(_MSC_VER) && (_MSC_VER >= 1400) +#include <intrin.h> /* for _BitScanReverse* */ +#endif + +/* Table-driven count-leading-zeros fallback for compilers without an intrinsic (yields 32 for a zero word). Declared static inline so every translation unit including this header gets its own internal copy; it will never be emitted for MSVC, GCC, Intel compilers */ +static inline unsigned int FLAC__clz_soft_uint32(unsigned int word) +{ + static const 
unsigned char byte_to_unary_table[] = { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + return (word) > 0xffffff ? byte_to_unary_table[(word) >> 24] : + !(word) ? 32 : + (word) > 0xffff ? byte_to_unary_table[(word) >> 16] + 8 : + (word) > 0xff ? byte_to_unary_table[(word) >> 8] + 16 : + byte_to_unary_table[(word)] + 24; +} + +static inline unsigned int FLAC__clz_uint32(FLAC__uint32 v) +{ +/* Counts the zero bits above the highest set bit of v. Never used with input 0 */ +#if defined(__INTEL_COMPILER) + return _bit_scan_reverse(v) ^ 31U; +#elif defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +/* This will translate either to (bsr ^ 31U), clz , ctlz, cntlz, lzcnt depending on + * -march= setting or to a software routine in exotic machines. 
*/ + return __builtin_clz(v); +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + unsigned long idx; /* _BitScanReverse stores the bit index through an unsigned long pointer */ + _BitScanReverse(&idx, v); + return idx ^ 31U; +#else + return FLAC__clz_soft_uint32(v); +#endif +} + +/* An example of what FLAC__bitmath_ilog2() computes: + * + * ilog2( 0) = undefined + * ilog2( 1) = 0 + * ilog2( 2) = 1 + * ilog2( 3) = 1 + * ilog2( 4) = 2 + * ilog2( 5) = 2 + * ilog2( 6) = 2 + * ilog2( 7) = 2 + * ilog2( 8) = 3 + * ilog2( 9) = 3 + * ilog2(10) = 3 + * ilog2(11) = 3 + * ilog2(12) = 3 + * ilog2(13) = 3 + * ilog2(14) = 3 + * ilog2(15) = 3 + * ilog2(16) = 4 + * ilog2(17) = 4 + * ilog2(18) = 4 + */ static inline unsigned FLAC__bitmath_ilog2(FLAC__uint32 v) { - if (v == 0) - return 0; - return sizeof(FLAC__uint32) * __CHAR_BIT__ - 1 - __builtin_clz(v); + return sizeof(FLAC__uint32) * CHAR_BIT - 1 - FLAC__clz_uint32(v); } + +#ifdef FLAC__INTEGER_ONLY_LIBRARY /*Unused otherwise */ + static inline unsigned FLAC__bitmath_ilog2_wide(FLAC__uint64 v) { if (v == 0) return 0; - return sizeof(FLAC__uint64) * __CHAR_BIT__ - 1 - __builtin_clzll(v); -} - +#if defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) + return sizeof(FLAC__uint64) * CHAR_BIT - 1 - __builtin_clzll(v); +/* Sorry, only supported in win64/Itanium.. */ +#elif (defined(_MSC_VER) && (_MSC_VER >= 1400)) && (defined(_M_IA64) || defined(_WIN64)) + unsigned long idx; /* _BitScanReverse64 stores the bit index through an unsigned long pointer */ + _BitScanReverse64(&idx, v); + return (unsigned)idx; /* index of the highest set bit is ilog2 itself; xor with 63 would give the leading-zero count */ #else - -unsigned FLAC__bitmath_ilog2(FLAC__uint32 v); -unsigned FLAC__bitmath_ilog2_wide(FLAC__uint64 v); - +/* Brain-damaged compilers will use the fastest possible way that is, + de Bruijn sequences (http://supertech.csail.mit.edu/papers/debruijn.pdf) + (C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 LGPL (v2 or later). 
+*/ + static const unsigned char DEBRUIJN_IDX64[64]={ + 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40, + 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57, + 63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56, + 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58 + }; + int ret; + ret= 0; /* v is nonzero here; the table lookup already yields the index of the top set bit, so no +1 bias */ + v|= v>>1; + v|= v>>2; + v|= v>>4; + v|= v>>8; + v|= v>>16; + v|= v>>32; + v= (v>>1)+1; /* smeared value collapses to the single highest set bit */ + ret+=DEBRUIJN_IDX64[v*0x218A392CD3D5DBF>>58&0x3F]; + return ret; +#endif +} #endif unsigned FLAC__bitmath_silog2(int v); -- 2.7.4