using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
namespace System
{
/// <returns><paramref name="value" /> converted to its nearest representable half-precision floating-point value.</returns>
public static explicit operator Half(float value)
{
- const int SingleMaxExponent = 0xFF;
-
- uint floatInt = BitConverter.SingleToUInt32Bits(value);
- bool sign = (floatInt & float.SignMask) >> float.SignShift != 0;
- int exp = (int)(floatInt & float.BiasedExponentMask) >> float.BiasedExponentShift;
- uint sig = floatInt & float.TrailingSignificandMask;
-
- if (exp == SingleMaxExponent)
- {
- if (sig != 0) // NaN
- {
- return CreateHalfNaN(sign, (ulong)sig << 41); // Shift the significand bits to the left end
- }
- return sign ? NegativeInfinity : PositiveInfinity;
- }
-
- uint sigHalf = sig >> 9 | ((sig & 0x1FFU) != 0 ? 1U : 0U); // RightShiftJam
-
- if ((exp | (int)sigHalf) == 0)
- {
- return new Half(sign, 0, 0);
- }
-
- return new Half(RoundPackToHalf(sign, (short)(exp - 0x71), (ushort)(sigHalf | 0x4000)));
+ #region Explanation of this algorithm
+ // This algorithm converts a single-precision floating-point number to a half-precision floating-point number by multiplying it as a floating-point number and rearranging the bit sequence.
+ // However, it introduces some tricks to implement rounding correctly, to avoid multiplying denormalized numbers and to deal with exceptions such as infinity and NaN without using branch instructions.
+ //
+ // The bit sequence of a half-precision floating-point number is as follows
+ // seee_eeff_ffff_ffff
+ // The bit sequence of a single-precision floating-point number is as follows
+ // seee_eeee_efff_ffff_ffff_ffff_ffff_ffff
+ // In both cases, "_" is the hexadecimal separator, "s" is the sign, "e" is the exponent part, and "f" is the mantissa part.
+ // In half-precision, the exponent part is 5 bits and the mantissa part is 10 bits. In single precision, the exponent is 8 bits and the mantissa is 23 bits.
+ // Both formats use an offset binary representation for the exponent part: the exponent part for 1.0 is half of the maximum value for either precision, i.e., 127 for single-precision and 15 for half-precision.
+ // The mantissa part is normalized when the exponent part is nonzero, since in binary numbers, 1 appears as the most significant digit for any nonzero number.
+ //
+ // This conversion algorithm takes advantage of the similarity between the two formats.
+ // By isolating the sign part from the single-precision bitstring, limiting the range of absolute value, rounding the lower bits to match the half-precision, and shifting it 13 bits to the right, the boundary between the exponent and mantissa parts matches with that of half-precision.
+ // In other words,
+ // sEEEeeeeeffffffffffxxxxxxxxxxxxx is rearranged to
+ // seeeeeffffffffff
+ // The x is the part that certainly gets rounded.
+ //
+ // When you operate with floating-point number, rounding occurs after every single floating-point operation.
+ // For example, when you add 1.1f with MathF.PI, the internal representation of both value is:
+ // 0 01111111 00011001100110011001101 for 1.1f, and
+ // 0 10000000 10010010000111111011011 for MathF.PI (3.1415927f).
+ // And raw binary representation of both numbers is:
+ // 1.00011001100110011001101 for 1.1f, and
+ // 11.0010010000111111011011 for 3.1415927f.
+ // We matched the point for adding them properly.
+ // Adding these numbers results:
+ // 100.00111101110110010000011
+ // After normalizing the number:
+ // 1.0000111101110110010000011 x 2^2
+ // But it has 25 bits below the point. So we should round the number to 23bits by the method called "Round to nearest, ties to even"
+ // - Round to the nearest value
+ // - If the number is at the midway, round it to the nearest value with an even least significant digit.
+ // So we apply this:
+ // 1.00001111011101100100001 x 2^2
+ // And the result is:
+ // 0 10000001 00001111011101100100001
+ // Which matches the ground truth of `BitConverter.SingleToUInt32Bits(MathF.PI + 1.3f)`:
+ // 0 10000001 00001111011101100100001
+ //
+ // When we want to round the number to a certain precision, we can take advantage of this specification.
+ // If we craft a value to add carefully, the result of addition is rounded wherever we expect.
+ // For instance, MathF.PI (3.1415927f) is:
+ // 0 10000000 10010010000111111011011
+ // We craft the adding value to round the MathF.PI into half-precision by adding (exponentOffset0 in the actual code) by:
+ // - Making sure that both the exponentOffset0 and the value is smaller than MaxHalfValueBelowInfinity(65520.0f) as larger values goes infinity in Half, while letting NaN be as it is
+ // - Making sure that the exponentOffset0 is larger than MinExp (0x3880_0000u) as smaller values goes subnormal in Half
+ // - Clearing the fraction bits in exponentOffset0
+ // - Adding Exponent13 (0x0680_0000u) to exponentOffset0 with integer ALU, effectively adding 13 to the exponent part of exponentOffset0
+ // For 3.1415927f, the exponentOffset0 is:
+ // 0 10001101 00000000000000000000000 (16384f)
+ // Adding these numbers with floating-point arithmetic unit results:
+ // 0 10001101 00000000000011001001000 (16387.14f)
+ // You can see the first 11 bits of 11.0010010000111111011011 rounded appears at the bottom of the fraction part of the result.
+ // By subtracting the 16384f from this with floating-point arithmetic unit, we get this:
+ // 0 10000000 10010010000000000000000 (3.140625f)
+ // And here is the `BitConverter.HalfToUInt16Bits((Half)MathF.PI)` in binary:
+ // 0 10000 1001001000 (3.14)
+ //
+ // Now we have to resolve the difference of the exponent parts.
+ // We can simply multiply the 1.92593E-34f in the floating-point number multiplication unit, to adjust the exponent part.
+ // However, most hardware cannot efficiently handle the multiplication of denormalized numbers.
+ // Adding the exponentOffset0 (16384f) to 3.1415927f with floating-point arithmetic unit results:
+ // 0 10001101 00000000000011001001000 (16387.14f)
+ // Then subtract the Exponent126 (0x3f00_0000u) from it with integer ALU:
+ // 0 00001111 00000000000011001001000 (1.9262991E-34f)
+ // And here is the `BitConverter.HalfToUInt16Bits((Half)MathF.PI)` in binary:
+ // 0 10000 1001001000 (3.14)
+ // Note that we left the leading 1 in fraction on top of the 10 lowest significant bits.
+ // Now we have to rearrange the bitstring.
+ // By shifting the internal representation of 1.9262991E-34f right by 13 bits, we get this:
+ // 0 01111 0000000000 ((Half)1.0f)
+ // By adding it to the internal representation of 1.9262991E-34f if the value isn't NaN, we get this:
+ // (0 11110000 000) 0 10000 1001001000 (3.14 in Half with some garbage on top of it)
+ // Now we have to merge the sign bit at the right position, and clear the garbage on top of 16-bit final bitstring:
+ // 0 10000 1001001000 (3.14 in Half)
+ // And here is the `BitConverter.HalfToUInt16Bits((Half)MathF.PI)` in binary:
+ // 0 10000 1001001000 (3.14 in Half)
+ //
+ // If the value is NaN in Half, we should further modify the exponent part of the intermediate value.
+ // For the 0xffbf_ffffu (NaN,
+ // 1 11111111 01111111111111111111111 in binary), the exponentOffset0 is:
+ // 1 00001100 00000000000000000000000 (-2.4074124E-35f)
+ // It doesn't look correct! But don't worry.
+ // And the result of `value + exponentOffset0` is:
+ // 0 11111111 11111111111111111111111 (NaN)
+ // As the sign part is isolated at the beginning, the sign bit is 0 here.
+ // The exponent don't seem to be changed at all, and the only difference here from the original value 0xffbf_ffffu is the sign bit and the highest bit of fraction part.
+ // Setting the highest bit of fraction part is an expected behavior.
+ // After subtracting the Exponent126 from it, we get this:
+ // 0 10000001 11111111111111111111111 (7.9999995f)
+ // By shifting the internal representation of it right by 13 bits, we get this:
+ // 0010 0 00001 1111111111
+ // By adding it to the internal representation of 7.9999995f if the original value isn't NaN, we get this:
+ // 0010 0 00001 1111111111
+ // Here we changed nothing because the original value is NaN, so 7.9999995f is thrown away from scope already.
+ // The maskedHalfExponentForNaN was generated before checking for the underflow. The value of maskedHalfExponentForNaN here is:
+ // - ExponentMask (0x7c00u) if the value is NaN, 0 otherwise
+ // Then the signAndMaskedExponent is also generated by ORing the maskedHalfExponentForNaN and the isolated sign bit shifted 16 bits right (0x8000u in this case):
+ // 1 11111 0000000000 (Half.NegativeInfinity)
+ // The exponent part here is also a complete gibberish, so we clear them by ANDing the ~maskedHalfExponentForNaN:
+ // 0010 0 00000 1111111111 (6.1E-05 in Half with some garbage on top of it)
+ // Then merge the signAndMaskedExponent with it, and clear the garbage on top of 16-bit final bitstring:
+ // 1 11111 1111111111 (NaN)
+ // And here is the `BitConverter.HalfToUInt16Bits((Half)BitConverter.UInt32BitsToSingle(0xffbf_ffffu))` in binary:
+ // 1 11111 1111111111 (NaN)
+ //
+ // This code does all of above steps, without any single branches.
+ #endregion
+ // Minimum exponent for rounding
+ const uint MinExp = 0x3880_0000u;
+ // Exponent displacement #1
+ const uint Exponent126 = 0x3f00_0000u;
+ // Exponent mask
+ const uint SingleBiasedExponentMask = float.BiasedExponentMask;
+ // Exponent displacement #2
+ const uint Exponent13 = 0x0680_0000u;
+ // Maximum value that is not Infinity in Half
+ const float MaxHalfValueBelowInfinity = 65520.0f;
+ // Mask for exponent bits in Half
+ const uint ExponentMask = BiasedExponentMask;
+ uint bitValue = BitConverter.SingleToUInt32Bits(value);
+ // Extract sign bit
+ uint sign = (bitValue & float.SignMask) >> 16;
+ // Detecting NaN (~0u if a is not NaN)
+ uint realMask = (uint)(Unsafe.BitCast<bool, sbyte>(float.IsNaN(value)) - 1);
+ // Clear sign bit
+ value = float.Abs(value);
+ // Rectify values that are Infinity in Half. (float.Min now emits vminps instruction if one of two arguments is a constant)
+ value = float.Min(MaxHalfValueBelowInfinity, value);
+ // Rectify lower exponent
+ uint exponentOffset0 = BitConverter.SingleToUInt32Bits(float.Max(value, BitConverter.UInt32BitsToSingle(MinExp)));
+ // Extract exponent
+ exponentOffset0 &= SingleBiasedExponentMask;
+ // Add exponent by 13
+ exponentOffset0 += Exponent13;
+ // Round Single into Half's precision (NaN also gets modified here, just setting the MSB of fraction)
+ value += BitConverter.UInt32BitsToSingle(exponentOffset0);
+ bitValue = BitConverter.SingleToUInt32Bits(value);
+ // Only exponent bits will be modified if NaN
+ uint maskedHalfExponentForNaN = ~realMask & ExponentMask;
+ // Subtract exponent by 126
+ bitValue -= Exponent126;
+ // Shift bitValue right by 13 bits to match the boundary of exponent part and fraction part.
+ uint newExponent = bitValue >> 13;
+ // Clear the fraction parts if the value was NaN.
+ bitValue &= realMask;
+ // Merge the exponent part with fraction part, and add the exponent part and fraction part's overflow.
+ bitValue += newExponent;
+ // Clear exponents if value is NaN
+ bitValue &= ~maskedHalfExponentForNaN;
+ // Merge sign bit with possible NaN exponent
+ uint signAndMaskedExponent = maskedHalfExponentForNaN | sign;
+ // Merge sign bit and possible NaN exponent
+ bitValue |= signAndMaskedExponent;
+ // The final result
+ return BitConverter.UInt16BitsToHalf((ushort)bitValue);
}
/// <summary>Explicitly converts a <see cref="ushort" /> value to its nearest representable half-precision floating-point value.</summary>
/// <returns><paramref name="value" /> converted to its nearest representable <see cref="float" /> value.</returns>
public static explicit operator float(Half value)
{
- bool sign = IsNegative(value);
- int exp = value.BiasedExponent;
- uint sig = value.TrailingSignificand;
-
- if (exp == MaxBiasedExponent)
- {
- if (sig != 0)
- {
- return CreateSingleNaN(sign, (ulong)sig << 54);
- }
- return sign ? float.NegativeInfinity : float.PositiveInfinity;
- }
-
- if (exp == 0)
- {
- if (sig == 0)
- {
- return BitConverter.UInt32BitsToSingle(sign ? float.SignMask : 0); // Positive / Negative zero
- }
- (exp, sig) = NormSubnormalF16Sig(sig);
- exp -= 1;
- }
-
- return CreateSingle(sign, (byte)(exp + 0x70), sig << 13);
+ #region Explanation of this algorithm
+ // This algorithm converts a half-precision floating-point number to a single-precision floating-point number by rearranging the bit sequence and multiplying it as a floating-point number.
+ // However, it introduces some tricks to avoid multiplying denormalized numbers and to deal with exceptions such as infinity and NaN without using branch instructions.
+ //
+ // The bit sequence of a half-precision floating-point number is as follows
+ // seee_eeff_ffff_ffff
+ // The bit sequence of a single-precision floating-point number is as follows
+ // seee_eeee_efff_ffff_ffff_ffff_ffff_ffff
+ // In both cases, "_" is the hexadecimal separator, "s" is the sign, "e" is the exponent part, and "f" is the mantissa part.
+ // In half-precision, the exponent part is 5 bits and the mantissa part is 10 bits. In single precision, the exponent is 8 bits and the mantissa is 23 bits.
+ // Both formats use an offset binary representation for the exponent part: the exponent part for 1.0 is half of the maximum value for either precision, i.e., 127 for single-precision and 15 for half-precision.
+ // The mantissa part is normalized when the exponent part is nonzero, since in binary numbers, 1 appears as the most significant digit for any nonzero number.
+ //
+ // This conversion algorithm takes advantage of the similarity between the two formats.
+ // By isolating the sign part from the half-precision bitstring and shifting it 13 bits to the left, the boundary between the exponent and mantissa parts matches with that of single-precision.
+ // In other words,
+ // 0eeeeeffffffffff is rearranged to
+ // 0000eeeeeffffffffff0000000000000
+ // which matches the boundary between the exponent and mantissa parts of single-precision floating-point number:
+ // seeeeeeeefffffffffffffffffffffff
+ //
+ // After rearrangement, this bit sequence is multiplied by the constant 5.192297E+33f in the floating-point number multiplication unit.
+ // However, most hardware cannot efficiently handle the multiplication of denormalized numbers.
+ // Denormalized numbers are more common in half-precision than in single-precision, so they cannot be ignored.
+ //
+ // First, if the value is a denormalized number, the constant 0x3880_0000u is added beforehand in the integer addition unit to make it behave as a normalized number.
+ // For Infinity or NaN, the constant 0x7000_0000u is added beforehand in the integer adder.
+ // These numbers are then converted to single-precision floating-point numbers as per the IEEE754 specification by the following operations.
+ // Next, regardless of whether the value is a denormalized number or not, add the constant 0x3800_0000u to this bit string in the integer addition unit. The constant is chosen to add 112 to the exponent part; 112 is 127 subtracted by 15.
+ // Then, if the value is a denormalized number, the constant 6.1035156E-05f is subtracted in the floating-point number subtraction unit.
+ // The above operation produces the same result as if the rearranged bit sequence were multiplied by the constant 5.192297E+33f.
+ // Finally, merging the isolated sign bits completes the conversion.
+ #endregion
+ // The smallest positive normal number in Half, converted to Single
+ const uint ExponentLowerBound = 0x3880_0000u;
+ // BitConverter.SingleToUInt32Bits(1.0f) - ((uint)BitConverter.HalfToUInt16Bits((Half)1.0f) << 13)
+ const uint ExponentOffset = 0x3800_0000u;
+ // Mask for sign bit in Single
+ const uint FloatSignMask = float.SignMask;
+ // Mask for exponent bits in Half
+ const uint HalfExponentMask = BiasedExponentMask;
+ // Mask for bits in Single converted from Half
+ const int HalfToSingleBitsMask = 0x0FFF_E000;
+ // Extract the internal representation of value
+ short valueInInt16Bits = BitConverter.HalfToInt16Bits(value);
+ // Extract sign bit of value
+ uint sign = (uint)(int)valueInInt16Bits & FloatSignMask;
+ // Copy sign bit to upper bits
+ uint bitValueInProcess = (uint)valueInInt16Bits;
+ // Extract exponent bits of value (BiasedExponent is not for here as it performs unnecessary shift)
+ uint offsetExponent = bitValueInProcess & HalfExponentMask;
+ // ~0u when value is subnormal, 0 otherwise
+ uint subnormalMask = (uint)-Unsafe.BitCast<bool, byte>(offsetExponent == 0u);
+ // ~0u when value is either Infinity or NaN, 0 otherwise
+ int infinityOrNaNMask = Unsafe.BitCast<bool, byte>(offsetExponent == HalfExponentMask);
+ // 0x3880_0000u if value is subnormal, 0 otherwise
+ uint maskedExponentLowerBound = subnormalMask & ExponentLowerBound;
+ // 0x3880_0000u if value is subnormal, 0x3800_0000u otherwise
+ uint offsetMaskedExponentLowerBound = ExponentOffset | maskedExponentLowerBound;
+ // Match the position of the boundary of exponent bits and fraction bits with IEEE 754 Binary32(Single)
+ bitValueInProcess <<= 13;
+ // Double the offsetMaskedExponentLowerBound if value is either Infinity or NaN
+ offsetMaskedExponentLowerBound <<= infinityOrNaNMask;
+ // Extract exponent bits and fraction bits of value
+ bitValueInProcess &= HalfToSingleBitsMask;
+ // Adjust exponent to match the range of exponent
+ bitValueInProcess += offsetMaskedExponentLowerBound;
+ // If value is subnormal, remove unnecessary 1 on top of fraction bits.
+ uint absoluteValue = BitConverter.SingleToUInt32Bits(BitConverter.UInt32BitsToSingle(bitValueInProcess) - BitConverter.UInt32BitsToSingle(maskedExponentLowerBound));
+ // Merge sign bit with rest
+ return BitConverter.UInt32BitsToSingle(absoluteValue | sign);
}
// IEEE 754 specifies NaNs to be propagated