From 5a03596c39fdf326ad8ef534de33756c8abbb2b2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=E3=81=A1=E3=83=BC=E3=81=9A=28=EF=BD=A58=EF=BD=A5=29?= =?utf8?q?=E3=81=91=E3=83=BC=E3=81=8D?= <31585494+MineCake147E@users.noreply.github.com> Date: Fri, 7 Jul 2023 23:57:03 +0900 Subject: [PATCH] Optimized conversions between `Half` and `Single`. (#81632) Closes #69667. --- .../System.Private.CoreLib/src/System/Half.cs | 279 +++++++++++++++++---- 1 file changed, 231 insertions(+), 48 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Half.cs b/src/libraries/System.Private.CoreLib/src/System/Half.cs index 6415acc..4b1c947 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Half.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Half.cs @@ -8,6 +8,7 @@ using System.Globalization; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; namespace System { @@ -614,30 +615,165 @@ namespace System /// converted to its nearest representable half-precision floating-point value. public static explicit operator Half(float value) { - const int SingleMaxExponent = 0xFF; - - uint floatInt = BitConverter.SingleToUInt32Bits(value); - bool sign = (floatInt & float.SignMask) >> float.SignShift != 0; - int exp = (int)(floatInt & float.BiasedExponentMask) >> float.BiasedExponentShift; - uint sig = floatInt & float.TrailingSignificandMask; - - if (exp == SingleMaxExponent) - { - if (sig != 0) // NaN - { - return CreateHalfNaN(sign, (ulong)sig << 41); // Shift the significand bits to the left end - } - return sign ? NegativeInfinity : PositiveInfinity; - } - - uint sigHalf = sig >> 9 | ((sig & 0x1FFU) != 0 ? 1U : 0U); // RightShiftJam - - if ((exp | (int)sigHalf) == 0) - { - return new Half(sign, 0, 0); - } - - return new Half(RoundPackToHalf(sign, (short)(exp - 0x71), (ushort)(sigHalf | 0x4000))); + #region Explanation of this algorithm + // This algorithm converts a single-precision floating-point number to a half-precision floating-point number by multiplying it as a floating-point number and rearranging the bit sequence. + // However, it introduces some tricks to implement rounding correctly, to avoid multiplying denormalized numbers and to deal with exceptions such as infinity and NaN without using branch instructions. + // + // The bit sequence of a half-precision floating-point number is as follows + // seee_eeff_ffff_ffff + // The bit sequence of a single-precision floating-point number is as follows + // seee_eeee_efff_ffff_ffff_ffff_ffff_ffff + // In both cases, "_" is the hexadecimal separator, "s" is the sign, "e" is the exponent part, and "f" is the mantissa part. + // In half-precision, the exponent part is 5 bits and the mantissa part is 10 bits. In single precision, the exponent is 8 bits and the mantissa is 23 bits. + // Both formats use an offset binary representation for the exponent part: the exponent part for 1.0 is half of the maximum value for either precision, i.e., 127 for single-precision and 15 for half-precision. + // The mantissa part is normalized when the exponent part is nonzero, since in binary numbers, 1 appears as the most significant digit for any nonzero number. + // + // This conversion algorithm takes advantage of the similarity between the two formats. + // By isolating the sign part from the single-precision bitstring, limiting the range of absolute value, rounding the lower bits to match the half-precision, and shifting it 13 bits to the right, the boundary between the exponent and mantissa parts matches with that of half-precision. + // In other words, + // sEEEeeeeeffffffffffxxxxxxxxxxxxx is rearranged to + // seeeeeffffffffff + // The x is the part that certainly gets rounded. + // + // When you operate with floating-point number, rounding occurs after every single floating-point operation. + // For example, when you add 1.1f with MathF.PI, the internal representation of both value is: + // 0 01111111 00011001100110011001101 for 1.1f, and + // 0 10000000 10010010000111111011011 for MathF.PI (3.1415927f). + // And raw binary representation of both numbers is: + // 1.00011001100110011001101 for 1.1f, and + // 11.0010010000111111011011 for 3.1415927f. + // We matched the point for adding them properly. + // Adding these numbers results: + // 100.00111101110110010000011 + // After normalizing the number: + // 1.0000111101110110010000011 x 2^2 + // But it has 25 bits below the point. So we should round the number to 23bits by the method called "Round to nearest, ties to even" + // - Round to the nearest value + // - If the number is at the midway, round it to the nearest value with an even least significant digit. + // So we apply this: + // 1.00001111011101100100001 x 2^2 + // And the result is: + // 0 10000001 00001111011101100100001 + // Which matches the ground truth of `BitConverter.SingleToUInt32Bits(MathF.PI + 1.3f)`: + // 0 10000001 00001111011101100100001 + // + // When we want to round the number to a certain precision, we can take advantage of this specification. + // If we craft a value to add carefully, the result of addition is rounded wherever we expect. + // For instance, MathF.PI (3.1415927f) is: + // 0 10000000 10010010000111111011011 + // We craft the adding value to round the MathF.PI into half-precision by adding (exponentOffset0 in the actual code) by: + // - Making sure that both the exponentOffset0 and the value is smaller than MaxHalfValueBelowInfinity(65520.0f) as larger values goes infinity in Half, while letting NaN be as it is + // - Making sure that the exponentOffset0 is larger than MinExp (0x3880_0000u) as smaller values goes subnormal in Half + // - Clearing the fraction bits in exponentOffset0 + // - Adding Exponent13 (0x0680_0000u) to exponentOffset0 with integer ALU, effectively adding 13 to the exponent part of exponentOffset0 + // For 3.1415927f, the exponentOffset0 is: + // 0 10001101 00000000000000000000000 (16384f) + // Adding these numbers with floating-point arithmetic unit results: + // 0 10001101 00000000000011001001000 (16387.14f) + // You can see the first 11 bits of 11.0010010000111111011011 rounded appears at the bottom of the fraction part of the result. + // By subtracting the 16384f from this with floating-point arithmetic unit, we get this: + // 0 10000000 10010010000000000000000 (3.140625f) + // And here is the `BitConverter.HalfToUInt16Bits((Half)MathF.PI)` in binary: + // 0 10000 1001001000 (3.14) + // + // Now we have to resolve the difference of the exponent parts. + // We can simply multiply the 1.92593E-34f in the floating-point number multiplication unit, to adjust the exponent part. + // However, most hardware cannot efficiently handle the multiplication of denormalized numbers. + // Adding the exponentOffset0 (16384f) to 3.1415927f with floating-point arithmetic unit results: + // 0 10001101 00000000000011001001000 (16387.14f) + // Then subtract the Exponent126 (0x3f00_0000u) from it with integer ALU: + // 0 00001111 00000000000011001001000 (1.9262991E-34f) + // And here is the `BitConverter.HalfToUInt16Bits((Half)MathF.PI)` in binary: + // 0 10000 1001001000 (3.14) + // Note that we left the leading 1 in fraction on top of the 10 lowest significant bits. + // Now we have to rearrange the bitstring. + // By shifting the internal representation of 1.9262991E-34f right by 13 bits, we get this: + // 0 01111 0000000000 ((Half)1.0f) + // By adding it to the internal representation of 1.9262991E-34f if the value isn't NaN, we get this: + // (0 11110000 000) 0 10000 1001001000 (3.14 in Half with some garbage on top of it) + // Now we have to merge the sign bit at the right position, and clear the garbage on top of 16-bit final bitstring: + // 0 10000 1001001000 (3.14 in Half) + // And here is the `BitConverter.HalfToUInt16Bits((Half)MathF.PI)` in binary: + // 0 10000 1001001000 (3.14 in Half) + // + // If the value is NaN in Half, we should further modify the exponent part of the intermediate value. + // For the 0xffbf_ffffu (NaN, + // 1 11111111 01111111111111111111111 in binary), the exponentOffset0 is: + // 1 00001100 00000000000000000000000 (-2.4074124E-35f) + // It doesn't look correct! But don't worry. + // And the result of `value + exponentOffset0` is: + // 0 11111111 11111111111111111111111 (NaN) + // As the sign part is isolated at the beginning, the sign bit is 0 here. + // The exponent don't seem to be changed at all, and the only difference here from the original value 0xffbf_ffffu is the sign bit and the highest bit of fraction part. + // Setting the highest bit of fraction part is an expected behavior. + // After subtracting the Exponent126 from it, we get this: + // 0 10000001 11111111111111111111111 (7.9999995f) + // By shifting the internal representation of it right by 13 bits, we get this: + // 0010 0 00001 1111111111 + // By adding it to the internal representation of 7.9999995f if the original value isn't NaN, we get this: + // 0010 0 00001 1111111111 + // Here we changed nothing because the original value is NaN, so 7.9999995f is thrown away from scope already. + // The maskedHalfExponentForNaN was generated before checking for the underflow. The value of maskedHalfExponentForNaN here is: + // - ExponentMask (0x7c00u) if the value is NaN, 0 otherwise + // Then the signAndMaskedExponent is also generated by ORing the maskedHalfExponentForNaN and the isolated sign bit shifted 16 bits right (0x8000u in this case): + // 1 11111 0000000000 (Half.NegativeInfinity) + // The exponent part here is also a complete gibberish, so we clear them by ANDing the ~maskedHalfExponentForNaN: + // 0010 0 00000 1111111111 (6.1E-05 in Half with some garbage on top of it) + // Then merge the signAndMaskedExponent with it, and clear the garbage on top of 16-bit final bitstring: + // 1 11111 1111111111 (NaN) + // And here is the `BitConverter.HalfToUInt16Bits((Half)BitConverter.UInt32BitsToSingle(0xffbf_ffffu))` in binary: + // 1 11111 1111111111 (NaN) + // + // This code does all of above steps, without any single branches. + #endregion + // Minimum exponent for rounding + const uint MinExp = 0x3880_0000u; + // Exponent displacement #1 + const uint Exponent126 = 0x3f00_0000u; + // Exponent mask + const uint SingleBiasedExponentMask = float.BiasedExponentMask; + // Exponent displacement #2 + const uint Exponent13 = 0x0680_0000u; + // Maximum value that is not Infinity in Half + const float MaxHalfValueBelowInfinity = 65520.0f; + // Mask for exponent bits in Half + const uint ExponentMask = BiasedExponentMask; + uint bitValue = BitConverter.SingleToUInt32Bits(value); + // Extract sign bit + uint sign = (bitValue & float.SignMask) >> 16; + // Detecting NaN (~0u if a is not NaN) + uint realMask = (uint)(Unsafe.BitCast(float.IsNaN(value)) - 1); + // Clear sign bit + value = float.Abs(value); + // Rectify values that are Infinity in Half. (float.Min now emits vminps instruction if one of two arguments is a constant) + value = float.Min(MaxHalfValueBelowInfinity, value); + // Rectify lower exponent + uint exponentOffset0 = BitConverter.SingleToUInt32Bits(float.Max(value, BitConverter.UInt32BitsToSingle(MinExp))); + // Extract exponent + exponentOffset0 &= SingleBiasedExponentMask; + // Add exponent by 13 + exponentOffset0 += Exponent13; + // Round Single into Half's precision (NaN also gets modified here, just setting the MSB of fraction) + value += BitConverter.UInt32BitsToSingle(exponentOffset0); + bitValue = BitConverter.SingleToUInt32Bits(value); + // Only exponent bits will be modified if NaN + uint maskedHalfExponentForNaN = ~realMask & ExponentMask; + // Subtract exponent by 126 + bitValue -= Exponent126; + // Shift bitValue right by 13 bits to match the boundary of exponent part and fraction part. + uint newExponent = bitValue >> 13; + // Clear the fraction parts if the value was NaN. + bitValue &= realMask; + // Merge the exponent part with fraction part, and add the exponent part and fraction part's overflow. + bitValue += newExponent; + // Clear exponents if value is NaN + bitValue &= ~maskedHalfExponentForNaN; + // Merge sign bit with possible NaN exponent + uint signAndMaskedExponent = maskedHalfExponentForNaN | sign; + // Merge sign bit and possible NaN exponent + bitValue |= signAndMaskedExponent; + // The final result + return BitConverter.UInt16BitsToHalf((ushort)bitValue); } /// Explicitly converts a value to its nearest representable half-precision floating-point value. @@ -883,30 +1019,77 @@ namespace System /// converted to its nearest representable value. public static explicit operator float(Half value) { - bool sign = IsNegative(value); - int exp = value.BiasedExponent; - uint sig = value.TrailingSignificand; - - if (exp == MaxBiasedExponent) - { - if (sig != 0) - { - return CreateSingleNaN(sign, (ulong)sig << 54); - } - return sign ? float.NegativeInfinity : float.PositiveInfinity; - } - - if (exp == 0) - { - if (sig == 0) - { - return BitConverter.UInt32BitsToSingle(sign ? float.SignMask : 0); // Positive / Negative zero - } - (exp, sig) = NormSubnormalF16Sig(sig); - exp -= 1; - } - - return CreateSingle(sign, (byte)(exp + 0x70), sig << 13); + #region Explanation of this algorithm + // This algorithm converts a half-precision floating-point number to a single-precision floating-point number by rearranging the bit sequence and multiplying it as a floating-point number. + // However, it introduces some tricks to avoid multiplying denormalized numbers and to deal with exceptions such as infinity and NaN without using branch instructions. + // + // The bit sequence of a half-precision floating-point number is as follows + // seee_eeff_ffff_ffff + // The bit sequence of a single-precision floating-point number is as follows + // seee_eeee_efff_ffff_ffff_ffff_ffff_ffff + // In both cases, "_" is the hexadecimal separator, "s" is the sign, "e" is the exponent part, and "f" is the mantissa part. + // In half-precision, the exponent part is 5 bits and the mantissa part is 10 bits. In single precision, the exponent is 8 bits and the mantissa is 23 bits. + // Both formats use an offset binary representation for the exponent part: the exponent part for 1.0 is half of the maximum value for either precision, i.e., 127 for single-precision and 15 for half-precision. + // The mantissa part is normalized when the exponent part is nonzero, since in binary numbers, 1 appears as the most significant digit for any nonzero number. + // + // This conversion algorithm takes advantage of the similarity between the two formats. + // By isolating the sign part from the half-precision bitstring and shifting it 13 bits to the left, the boundary between the exponent and mantissa parts matches with that of single-precision. + // In other words, + // 0eeeeeffffffffff is rearranged to + // 0000eeeeeffffffffff0000000000000 + // which matches the boundary between the exponent and mantissa parts of single-precision floating-point number: + // seeeeeeeefffffffffffffffffffffff + // + // After rearrangement, this bit sequence is multiplied by the constant 5.192297E+33f in the floating-point number multiplication unit. + // However, most hardware cannot efficiently handle the multiplication of denormalized numbers. + // Denormalized numbers are more common in half-precision than in single-precision, so they cannot be ignored. + // + // First, if the value is a denormalized number, the constant 0x3880_0000u is added beforehand in the integer addition unit to make it behave as a normalized number. + // For Infinity or NaN, the constant 0x7000_0000u is added beforehand in the integer adder. + // These numbers are then converted to single-precision floating-point numbers as per the IEEE754 specification by the following operations. + // Next, regardless of whether the value is a denormalized number or not, add the constant 0x3800_0000u to this bit string in the integer addition unit. The constant is chosen to add 112 to the exponent part; 112 is 127 subtracted by 15. + // Then, if the value is a denormalized number, the constant 6.1035156E-05f is subtracted in the floating-point number subtraction unit. + // The above operation produces the same result as if the rearranged bit sequence were multiplied by the constant 5.192297E+33f. + // Finally, merging the isolated sign bits completes the conversion. + #endregion + // The smallest positive normal number in Half, converted to Single + const uint ExponentLowerBound = 0x3880_0000u; + // BitConverter.SingleToUInt32Bits(1.0f) - ((uint)BitConverter.HalfToUInt16Bits((Half)1.0f) << 13) + const uint ExponentOffset = 0x3800_0000u; + // Mask for sign bit in Single + const uint FloatSignMask = float.SignMask; + // Mask for exponent bits in Half + const uint HalfExponentMask = BiasedExponentMask; + // Mask for bits in Single converted from Half + const int HalfToSingleBitsMask = 0x0FFF_E000; + // Extract the internal representation of value + short valueInInt16Bits = BitConverter.HalfToInt16Bits(value); + // Extract sign bit of value + uint sign = (uint)(int)valueInInt16Bits & FloatSignMask; + // Copy sign bit to upper bits + uint bitValueInProcess = (uint)valueInInt16Bits; + // Extract exponent bits of value (BiasedExponent is not for here as it performs unnecessary shift) + uint offsetExponent = bitValueInProcess & HalfExponentMask; + // ~0u when value is subnormal, 0 otherwise + uint subnormalMask = (uint)-Unsafe.BitCast(offsetExponent == 0u); + // ~0u when value is either Infinity or NaN, 0 otherwise + int infinityOrNaNMask = Unsafe.BitCast(offsetExponent == HalfExponentMask); + // 0x3880_0000u if value is subnormal, 0 otherwise + uint maskedExponentLowerBound = subnormalMask & ExponentLowerBound; + // 0x3880_0000u if value is subnormal, 0x3800_0000u otherwise + uint offsetMaskedExponentLowerBound = ExponentOffset | maskedExponentLowerBound; + // Match the position of the boundary of exponent bits and fraction bits with IEEE 754 Binary32(Single) + bitValueInProcess <<= 13; + // Double the offsetMaskedExponentLowerBound if value is either Infinity or NaN + offsetMaskedExponentLowerBound <<= infinityOrNaNMask; + // Extract exponent bits and fraction bits of value + bitValueInProcess &= HalfToSingleBitsMask; + // Adjust exponent to match the range of exponent + bitValueInProcess += offsetMaskedExponentLowerBound; + // If value is subnormal, remove unnecessary 1 on top of fraction bits. + uint absoluteValue = BitConverter.SingleToUInt32Bits(BitConverter.UInt32BitsToSingle(bitValueInProcess) - BitConverter.UInt32BitsToSingle(maskedExponentLowerBound)); + // Merge sign bit with rest + return BitConverter.UInt32BitsToSingle(absoluteValue | sign); } // IEEE 754 specifies NaNs to be propagated -- 2.7.4