Initial commit for System.Text.Rune (#20935)
authorLevi Broderick <GrabYourPitchforks@users.noreply.github.com>
Wed, 14 Nov 2018 01:05:02 +0000 (17:05 -0800)
committerGitHub <noreply@github.com>
Wed, 14 Nov 2018 01:05:02 +0000 (17:05 -0800)
This type represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; and [ U+E000..U+10FFFF ], inclusive). The primary scenario is for having a consistent representation of Unicode data regardless of the underlying input encoding type, including abstracting away surrogate code points.

src/System.Private.CoreLib/Resources/Strings.resx
src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
src/System.Private.CoreLib/shared/System/Text/Rune.cs [new file with mode: 0644]
src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs [new file with mode: 0644]
src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs [new file with mode: 0644]
src/System.Private.CoreLib/src/System/ThrowHelper.cs

index 42dcde3e2fe12414a55d825347bb1da8cabb0db7..4f17a289c846a429f27f4f593b1a6b243a4ab8cf 100644 (file)
   <data name="Argument_MethodRedefined" xml:space="preserve">
     <value>Method has been already defined.</value>
   </data>
+  <data name="Argument_CannotExtractScalar" xml:space="preserve">
+    <value>Cannot extract a Unicode scalar value from the specified index in the input.</value>
+  </data>
   <data name="Argument_CannotParsePrecision" xml:space="preserve">
     <value>Characters following the format symbol must be a number of {0} or less.</value>
   </data>
index 79956c4342b0d9e59f850ae63cbd7fab1825e3b6..8768d19223d41f602d798517aeba7b724573fa62 100644 (file)
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingProvider.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Latin1Encoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\NormalizationForm.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Rune.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.Debug.cs" Condition="'$(Configuration)' == 'Debug'" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf16Utility.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF32Encoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
new file mode 100644 (file)
index 0000000..a4ef3a3
--- /dev/null
@@ -0,0 +1,731 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Globalization;
+using System.Runtime.CompilerServices;
+
+namespace System.Text
+{
+    /// <summary>
+    /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
+    /// </summary>
+    /// <remarks>
+    /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
+    /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
+    /// </remarks>
+    [DebuggerDisplay("{DebuggerDisplay,nq}")]
+    public readonly struct Rune : IComparable<Rune>, IEquatable<Rune>
+    {
+        private const byte IsWhiteSpaceFlag = 0x80;
+        private const byte IsLetterOrDigitFlag = 0x40;
+        private const byte UnicodeCategoryMask = 0x1F;
+
+        // Lookup table indexed by code point for the ASCII range [ U+0000..U+007F ]. Each entry packs:
+        // - 0x80 bit (IsWhiteSpaceFlag): set if the character is whitespace
+        // - 0x40 bit (IsLetterOrDigitFlag): set if the character is a letter or digit
+        // - 0x20 bit: reserved for future use
+        // - bottom 5 bits (UnicodeCategoryMask): the UnicodeCategory of the character
+        private static ReadOnlySpan<byte> AsciiCharInfo => new byte[]
+        {
+            0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E,
+            0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E,
+            0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18,
+            0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18,
+            0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
+            0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12,
+            0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
+            0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E
+        };
+
+        private readonly uint _value;
+
+        /// <summary>
+        /// Initializes a <see cref="Rune"/> from a single UTF-16 code unit.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="ch"/> is a UTF-16 surrogate code point
+        /// (U+D800..U+DFFF, inclusive) and so is not a scalar value on its own.
+        /// </exception>
+        public Rune(char ch)
+        {
+            uint codeUnit = ch;
+
+            // A lone surrogate is never a valid scalar value.
+            if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch);
+            }
+
+            _value = codeUnit;
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
+        /// </exception>
+        public Rune(int value)
+            : this((uint)value)
+        {
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
+        /// </exception>
+        [CLSCompliant(false)]
+        public Rune(uint value)
+        {
+            if (!UnicodeUtility.IsValidUnicodeScalar(value))
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
+            }
+            _value = value;
+        }
+
+        // non-validating ctor (debug-assert only); 'unused' is never read and just disambiguates this overload from Rune(uint)
+        private Rune(uint scalarValue, bool unused)
+        {
+            UnicodeDebug.AssertIsValidScalar(scalarValue);
+            _value = scalarValue;
+        }
+
+        // Equality and ordering are defined purely by the underlying scalar value.
+
+        public static bool operator ==(Rune left, Rune right) => left._value == right._value;
+
+        public static bool operator !=(Rune left, Rune right) => left._value != right._value;
+
+        public static bool operator <(Rune left, Rune right) => left._value < right._value;
+
+        public static bool operator <=(Rune left, Rune right) => left._value <= right._value;
+
+        public static bool operator >(Rune left, Rune right) => right._value < left._value;
+
+        public static bool operator >=(Rune left, Rune right) => right._value <= left._value;
+
+        // The conversions below are explicit (not implicit) because each one forwards to a
+        // validating constructor and may therefore throw.
+
+        public static explicit operator Rune(char ch)
+        {
+            return new Rune(ch);
+        }
+
+        [CLSCompliant(false)]
+        public static explicit operator Rune(uint value)
+        {
+            return new Rune(value);
+        }
+
+        public static explicit operator Rune(int value)
+        {
+            return new Rune(value);
+        }
+
+        // Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'"
+        private string DebuggerDisplay => FormattableString.Invariant($"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
+
+        /// <summary>
+        /// Gets whether this scalar value lies in the ASCII range ([ U+0000..U+007F ]),
+        /// i.e. whether it can be represented by a single UTF-8 code unit.
+        /// </summary>
+        public bool IsAscii
+        {
+            get => UnicodeUtility.IsAsciiCodePoint(_value);
+        }
+
+        /// <summary>
+        /// Gets whether this scalar value lies in the BMP ([ U+0000..U+FFFF ]),
+        /// i.e. whether it can be represented by a single UTF-16 code unit.
+        /// </summary>
+        public bool IsBmp
+        {
+            get => UnicodeUtility.IsBmpCodePoint(_value);
+        }
+
+        /// <summary>
+        /// Gets the Unicode plane (0 to 16, inclusive) that contains this scalar.
+        /// </summary>
+        public int Plane
+        {
+            get => UnicodeUtility.GetPlane(_value);
+        }
+
+        /// <summary>
+        /// Gets a <see cref="Rune"/> representing the Unicode replacement character U+FFFD.
+        /// </summary>
+        public static Rune ReplacementChar
+        {
+            get => UnsafeCreate(UnicodeUtility.ReplacementChar);
+        }
+
+        /// <summary>
+        /// Gets the number of UTF-16 code units (<see cref="Char"/>) needed
+        /// to represent this scalar value.
+        /// </summary>
+        /// <remarks>
+        /// The return value will be 1 or 2.
+        /// </remarks>
+        public int Utf16SequenceLength
+        {
+            get => UnicodeUtility.GetUtf16SequenceLength(_value);
+        }
+
+        /// <summary>
+        /// Gets the number of UTF-8 code units (<see cref="Utf8Char"/>) needed
+        /// to represent this scalar value.
+        /// </summary>
+        /// <remarks>
+        /// The return value will be 1 through 4, inclusive.
+        /// </remarks>
+        public int Utf8SequenceLength
+        {
+            get => UnicodeUtility.GetUtf8SequenceLength(_value);
+        }
+
+        /// <summary>
+        /// Gets the Unicode scalar value as an integer.
+        /// </summary>
+        public int Value
+        {
+            get => (int)_value;
+        }
+
+        // Shared implementation of ToLower / ToUpper: round-trips the rune through UTF-16 and
+        // lets the culture's TextInfo perform the case conversion.
+        private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper)
+        {
+            if (culture is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
+            }
+
+            TextInfo casing = culture.TextInfo;
+
+            // Two code units is the worst case (a surrogate pair); the case change is expected
+            // to preserve the UTF-16 code unit count, so both buffers are sized identically.
+            Span<char> source = stackalloc char[2];
+            Span<char> converted = stackalloc char[2];
+
+            int length = rune.EncodeToUtf16(source);
+            source = source.Slice(0, length);
+            converted = converted.Slice(0, length);
+
+            if (toUpper)
+            {
+                casing.ChangeCaseToUpper(source, converted);
+            }
+            else
+            {
+                casing.ChangeCaseToLower(source, converted);
+            }
+
+            // We use simple case folding rules, which disallows moving between the BMP and supplementary
+            // planes when performing a case conversion. The helper methods which reconstruct a Rune
+            // contain debug asserts for this condition.
+
+            return rune.IsBmp
+                ? UnsafeCreate(converted[0])
+                : UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(converted[0], converted[1]));
+        }
+
+        // Ordering follows the numeric scalar value.
+        public int CompareTo(Rune other)
+        {
+            return _value.CompareTo(other._value);
+        }
+
+        // Writes this rune as UTF-16 into a buffer the caller guarantees is large enough;
+        // returns the number of chars written.
+        private int EncodeToUtf16(Span<char> destination)
+        {
+            Debug.Assert(destination.Length >= Utf16SequenceLength, "Caller should've provided a large enough buffer.");
+
+            bool encoded = TryEncode(destination, out int written);
+            Debug.Assert(encoded, "TryEncode should never fail given a large enough buffer.");
+
+            return written;
+        }
+
+        public override bool Equals(object obj)
+        {
+            return obj is Rune other && Equals(other);
+        }
+
+        public bool Equals(Rune other)
+        {
+            return this == other;
+        }
+
+        // The scalar value itself serves as the hash code.
+        public override int GetHashCode()
+        {
+            return Value;
+        }
+
+        /// <summary>
+        /// Reads the <see cref="Rune"/> that starts at position <paramref name="index"/> of
+        /// <paramref name="input"/>.
+        /// </summary>
+        /// <remarks>
+        /// Throws if <paramref name="input"/> is null, if <paramref name="index"/> is out of range, or
+        /// if <paramref name="index"/> does not reference the start of a valid scalar value within <paramref name="input"/>.
+        /// </remarks>
+        public static Rune GetRuneAt(string input, int index)
+        {
+            // Null / range failures throw inside the helper; a negative result means "bad data".
+            int scalarOrError = ReadRuneFromString(input, index);
+
+            if (scalarOrError < 0)
+            {
+                ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index);
+            }
+
+            return UnsafeCreate((uint)scalarOrError);
+        }
+
+        /// <summary>
+        /// Determines whether <paramref name="value"/> is a valid Unicode scalar value, i.e.,
+        /// is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
+        /// </summary>
+        public static bool IsValid(int value)
+        {
+            // Reinterpreting as unsigned lets the unsigned overload handle negative inputs.
+            return IsValid((uint)value);
+        }
+
+        /// <summary>
+        /// Determines whether <paramref name="value"/> is a valid Unicode scalar value, i.e.,
+        /// is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
+        /// </summary>
+        [CLSCompliant(false)]
+        public static bool IsValid(uint value)
+        {
+            return UnicodeUtility.IsValidUnicodeScalar(value);
+        }
+
+        // Decodes the scalar value starting at input[index]. Throws for a null input or an
+        // out-of-range index; returns a negative number if the data at that position is not
+        // the start of a well-formed scalar (lone or mismatched surrogate).
+        private static int ReadRuneFromString(string input, int index)
+        {
+            if (input is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
+            }
+
+            if ((uint)index >= (uint)input.Length)
+            {
+                ThrowHelper.ThrowArgumentOutOfRange_IndexException();
+            }
+
+            // Optimistically assume input is within BMP.
+
+            uint first = input[index];
+            if (!UnicodeUtility.IsSurrogateCodePoint(first))
+            {
+                return (int)first;
+            }
+
+            // A surrogate can only begin a scalar if it is a high surrogate.
+
+            if (!UnicodeUtility.IsHighSurrogateCodePoint(first))
+            {
+                return -1;
+            }
+
+            // If this becomes a hot code path, we can skip the below bounds check by reading
+            // off the end of the string using unsafe code. Since strings are null-terminated,
+            // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if
+            // the string terminates unexpectedly.
+
+            int nextIndex = index + 1;
+            if ((uint)nextIndex >= (uint)input.Length)
+            {
+                return -1; // not an argument exception - just a "bad data" failure
+            }
+
+            uint second = input[nextIndex];
+            if (!UnicodeUtility.IsLowSurrogateCodePoint(second))
+            {
+                return -1;
+            }
+
+            return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(first, second);
+        }
+
+        /// <summary>
+        /// Returns this <see cref="Rune"/> as a <see cref="string"/> of one or two UTF-16 code units.
+        /// </summary>
+        public override string ToString()
+        {
+            Span<char> buffer = stackalloc char[2]; // worst case
+            int length = EncodeToUtf16(buffer);
+            return new string(buffer.Slice(0, length));
+        }
+
+        /// <summary>
+        /// Attempts to create a <see cref="Rune"/> from a single UTF-16 code unit.
+        /// Fails (returning <see langword="false"/>) if <paramref name="ch"/> is a surrogate.
+        /// </summary>
+        public static bool TryCreate(char ch, out Rune result)
+        {
+            uint codeUnit = ch;
+
+            if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
+            {
+                result = default;
+                return false;
+            }
+
+            result = UnsafeCreate(codeUnit);
+            return true;
+        }
+
+        /// <summary>
+        /// Attempts to create a <see cref="Rune"/> from the provided input value.
+        /// </summary>
+        public static bool TryCreate(int value, out Rune result)
+        {
+            // Reinterpret as unsigned and defer to the unsigned overload.
+            return TryCreate((uint)value, out result);
+        }
+
+        /// <summary>
+        /// Attempts to create a <see cref="Rune"/> from the provided input value.
+        /// Fails (returning <see langword="false"/>) if the value is not a valid scalar.
+        /// </summary>
+        [CLSCompliant(false)]
+        public static bool TryCreate(uint value, out Rune result)
+        {
+            if (!UnicodeUtility.IsValidUnicodeScalar(value))
+            {
+                result = default;
+                return false;
+            }
+
+            result = UnsafeCreate(value);
+            return true;
+        }
+
+        /// <summary>
+        /// Writes this <see cref="Rune"/> to <paramref name="destination"/> as UTF-16.
+        /// </summary>
+        /// <param name="destination">The buffer to which to write this value as UTF-16.</param>
+        /// <param name="charsWritten">
+        /// The number of <see cref="char"/>s written to <paramref name="destination"/>,
+        /// or 0 if the destination buffer is not large enough to contain the output.</param>
+        /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
+        /// <remarks>
+        /// The <see cref="Utf16SequenceLength"/> property can be queried ahead of time to determine
+        /// the required size of the <paramref name="destination"/> buffer.
+        /// </remarks>
+        public bool TryEncode(Span<char> destination, out int charsWritten)
+        {
+            if (IsBmp)
+            {
+                // BMP scalar: exactly one code unit.
+                if (destination.Length >= 1)
+                {
+                    destination[0] = (char)_value;
+                    charsWritten = 1;
+                    return true;
+                }
+            }
+            else if (destination.Length >= 2)
+            {
+                // Supplementary-plane scalar: a surrogate pair.
+                UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
+                charsWritten = 2;
+                return true;
+            }
+
+            // Destination buffer not large enough
+
+            charsWritten = default;
+            return false;
+        }
+
+        /// <summary>
+        /// Encodes this <see cref="Rune"/> to a destination buffer as UTF-8 bytes.
+        /// </summary>
+        /// <param name="destination">The buffer to which to write this value as UTF-8.</param>
+        /// <param name="bytesWritten">
+        /// The number of <see cref="byte"/>s written to <paramref name="destination"/>,
+        /// or 0 if the destination buffer is not large enough to contain the output.</param>
+        /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
+        /// <remarks>
+        /// The <see cref="Utf8SequenceLength"/> property can be queried ahead of time to determine
+        /// the required size of the <paramref name="destination"/> buffer.
+        /// </remarks>
+        // ** This is public so it can be unit tested but isn't yet exposed via the reference assemblies. **
+        public bool TryEncodeToUtf8Bytes(Span<byte> destination, out int bytesWritten)
+        {
+            // TODO: Optimize some of these writes by using BMI2 instructions.
+
+            // The bit patterns below come from the Unicode Standard, Table 3-6.
+
+            if (destination.Length >= 1)
+            {
+                if (IsAscii)
+                {
+                    destination[0] = (byte)_value;
+                    bytesWritten = 1;
+                    return true;
+                }
+
+                if (destination.Length >= 2)
+                {
+                    if (_value <= 0x7FFu)
+                    {
+                        // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
+                        destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
+                        destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
+                        bytesWritten = 2;
+                        return true;
+                    }
+
+                    if (destination.Length >= 3)
+                    {
+                        if (_value <= 0xFFFFu)
+                        {
+                            // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
+                            destination[0] = (byte)((_value + (0b1110 << 16)) >> 12);
+                            destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
+                            destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
+                            bytesWritten = 3;
+                            return true;
+                        }
+
+                        if (destination.Length >= 4)
+                        {
+                            // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
+                            destination[0] = (byte)((_value + (0b11110 << 21)) >> 18);
+                            destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
+                            destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
+                            destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
+                            bytesWritten = 4;
+                            return true;
+                        }
+                    }
+                }
+            }
+
+            // Destination buffer not large enough
+
+            bytesWritten = default;
+            return false;
+        }
+
+        /// <summary>
+        /// Attempts to read the <see cref="Rune"/> that starts at position <paramref name="index"/> of
+        /// <paramref name="input"/>.
+        /// </summary>
+        /// <returns><see langword="true"/> if a scalar value was successfully extracted from the specified index,
+        /// <see langword="false"/> if a value could not be extracted due to invalid data.</returns>
+        /// <remarks>
+        /// Throws only if <paramref name="input"/> is null or <paramref name="index"/> is out of range.
+        /// </remarks>
+        public static bool TryGetRuneAt(string input, int index, out Rune value)
+        {
+            int scalarOrError = ReadRuneFromString(input, index);
+
+            // A negative result signals malformed data at the requested position.
+            if (scalarOrError < 0)
+            {
+                value = default;
+                return false;
+            }
+
+            value = UnsafeCreate((uint)scalarOrError);
+            return true;
+        }
+
+        // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without
+        // validation. It is the caller's responsibility to have performed manual validation
+        // before calling this method. If a Rune instance is forcibly constructed
+        // from invalid input, the APIs on this type have undefined behavior, potentially including
+        // introducing a security hole in the consuming application.
+        //
+        // An example of a security hole resulting from an invalid Rune value, which could result
+        // in a stack overflow.
+        //
+        // public int GetMarvin32HashCode(Rune r) {
+        //   Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
+        //   r.TryEncode(buffer, ...);
+        //   return Marvin32.ComputeHash(buffer.AsBytes());
+        // }
+
+        /// <summary>
+        /// Creates a <see cref="Rune"/> from <paramref name="scalarValue"/> without validating it;
+        /// the caller must already have established that the value is a valid scalar.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Rune UnsafeCreate(uint scalarValue)
+        {
+            return new Rune(scalarValue, unused: false);
+        }
+
+        // These are analogs of APIs on System.Char
+
+        public static double GetNumericValue(Rune value)
+        {
+            if (!value.IsAscii)
+            {
+                // not an ASCII char; fall back to globalization table
+                return CharUnicodeInfo.InternalGetNumericValue(value.Value);
+            }
+
+            // Unsigned subtraction wraps for anything below '0', so one comparison
+            // covers both ends of the '0'..'9' range.
+            uint digit = value._value - '0';
+            if (digit <= 9)
+            {
+                return (double)digit;
+            }
+
+            return -1;
+        }
+
+        public static UnicodeCategory GetUnicodeCategory(Rune value)
+        {
+            if (!value.IsAscii)
+            {
+                return GetUnicodeCategoryNonAscii(value);
+            }
+
+            // ASCII fast path: the category lives in the low bits of the lookup table entry.
+            int packedInfo = AsciiCharInfo[value.Value];
+            return (UnicodeCategory)(packedInfo & UnicodeCategoryMask);
+        }
+
+        // Category lookup for non-ASCII scalars via CharUnicodeInfo; ASCII callers are expected
+        // to use the AsciiCharInfo table instead.
+        private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
+        {
+            Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");
+
+            int scalar = value.Value;
+            return CharUnicodeInfo.GetUnicodeCategory(scalar);
+        }
+
+        // The helpers below classify a UnicodeCategory by testing whether its numeric value
+        // falls within a contiguous range of enum members.
+
+        // True iff the category is one of the letter categories (UppercaseLetter..OtherLetter).
+        private static bool IsCategoryLetter(UnicodeCategory category)
+            => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter);
+
+        // True iff the category is a letter category or DecimalDigitNumber.
+        private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category)
+            => IsCategoryLetter(category) || (category == UnicodeCategory.DecimalDigitNumber);
+
+        // True iff the category is one of the number categories (DecimalDigitNumber..OtherNumber).
+        private static bool IsCategoryNumber(UnicodeCategory category)
+            => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.DecimalDigitNumber, (uint)UnicodeCategory.OtherNumber);
+
+        // True iff the category is one of the punctuation categories (ConnectorPunctuation..OtherPunctuation).
+        private static bool IsCategoryPunctuation(UnicodeCategory category)
+            => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.ConnectorPunctuation, (uint)UnicodeCategory.OtherPunctuation);
+
+        // True iff the category is one of the separator categories (SpaceSeparator..ParagraphSeparator).
+        private static bool IsCategorySeparator(UnicodeCategory category)
+            => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.SpaceSeparator, (uint)UnicodeCategory.ParagraphSeparator);
+
+        // True iff the category is one of the symbol categories (MathSymbol..OtherSymbol).
+        private static bool IsCategorySymbol(UnicodeCategory category)
+            => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.MathSymbol, (uint)UnicodeCategory.OtherSymbol);
+
+        public static bool IsControl(Rune value)
+        {
+            // Per the Unicode stability policy, the set of control characters
+            // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No
+            // characters will ever be added to the "control characters" group.
+            // See http://www.unicode.org/policies/stability_policy.html.
+
+            // Logic below depends on Rune.Value never being -1 (since Rune is a validating type)
+            // 00..1F (+1) => 01..20 (&~80) => 01..20
+            // 7F..9F (+1) => 80..A0 (&~80) => 00..20
+
+            uint shifted = value._value + 1;
+            uint folded = shifted & ~0x80u;
+            return folded <= 0x20u;
+        }
+
+        public static bool IsDigit(Rune value)
+        {
+            // ASCII fast path is a range check; everything else consults the category data.
+            return value.IsAscii
+                ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
+                : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber);
+        }
+
+        public static bool IsLetter(Rune value)
+        {
+            if (!value.IsAscii)
+            {
+                return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
+            }
+
+            // Clearing the 0x20 bit after rebasing at 'A' folds lowercase onto uppercase,
+            // so a single unsigned comparison matches [A-Za-z].
+            uint folded = (value._value - 'A') & ~0x20u;
+            return folded <= (uint)('Z' - 'A');
+        }
+
+        public static bool IsLetterOrDigit(Rune value)
+        {
+            // ASCII answers come straight from the flag bit in the lookup table.
+            return value.IsAscii
+                ? ((AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0)
+                : IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
+        }
+
+        public static bool IsLower(Rune value)
+        {
+            // ASCII fast path is a range check; everything else consults the category data.
+            return value.IsAscii
+                ? UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z')
+                : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter);
+        }
+
+        public static bool IsNumber(Rune value)
+        {
+            // For ASCII input only '0'..'9' are treated as numbers here; non-ASCII input
+            // is classified by its Unicode category.
+            return value.IsAscii
+                ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
+                : IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
+        }
+
+        // The three predicates below classify purely by Unicode category.
+
+        public static bool IsPunctuation(Rune value) => IsCategoryPunctuation(GetUnicodeCategory(value));
+
+        public static bool IsSeparator(Rune value) => IsCategorySeparator(GetUnicodeCategory(value));
+
+        public static bool IsSymbol(Rune value) => IsCategorySymbol(GetUnicodeCategory(value));
+
+        public static bool IsUpper(Rune value)
+        {
+            // ASCII fast path is a range check; everything else consults the category data.
+            return value.IsAscii
+                ? UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z')
+                : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter);
+        }
+
+        public static bool IsWhiteSpace(Rune value)
+        {
+            if (value.IsAscii)
+            {
+                return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0;
+            }
+
+            // U+0085 is special since it's a whitespace character but is in the Control category
+            // instead of a normal separator category. No other code point outside the ASCII range
+            // has this mismatch.
+
+            return (value._value == 0x0085u)
+                || IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
+        }
+
+        // Lowercases 'value' using the casing rules of 'culture'.
+        public static Rune ToLower(Rune value, CultureInfo culture)
+        {
+            return ChangeCase(value, culture, toUpper: false);
+        }
+
+        /// <summary>
+        /// Returns the lowercase form of <paramref name="value"/> using invariant-culture casing.
+        /// </summary>
+        public static Rune ToLowerInvariant(Rune value)
+        {
+            // Handle the most common case (ASCII data) first. Within the common case, we expect
+            // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
+
+            if (value.IsAscii)
+            {
+                // It's ok for us to use the UTF-16 conversion utility for this since an ASCII
+                // scalar never has any of its high 16 bits set, so they will be left unchanged.
+                return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value));
+            }
+
+            if (GlobalizationMode.Invariant)
+            {
+                // Non-ASCII input must not be routed through the ASCII-only helper: its
+                // "all chars are ASCII" precondition does not hold (supplementary-plane scalars
+                // even have bits set in the high 16 bits), so it could alter the scalar
+                // arbitrarily. In invariant globalization mode case changes are applied to
+                // ASCII only, so a non-ASCII value is returned unchanged.
+                return value;
+            }
+
+            // Non-ASCII data requires going through the case folding tables.
+
+            return ToLower(value, CultureInfo.InvariantCulture);
+        }
+
+        // Uppercases 'value' using the casing rules of 'culture'.
+        public static Rune ToUpper(Rune value, CultureInfo culture)
+        {
+            return ChangeCase(value, culture, toUpper: true);
+        }
+
+        /// <summary>
+        /// Returns the uppercase form of <paramref name="value"/> using invariant-culture casing.
+        /// </summary>
+        public static Rune ToUpperInvariant(Rune value)
+        {
+            // Handle the most common case (ASCII data) first. Within the common case, we expect
+            // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
+
+            if (value.IsAscii)
+            {
+                // It's ok for us to use the UTF-16 conversion utility for this since an ASCII
+                // scalar never has any of its high 16 bits set, so they will be left unchanged.
+                return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value));
+            }
+
+            if (GlobalizationMode.Invariant)
+            {
+                // Non-ASCII input must not be routed through the ASCII-only helper: its
+                // "all chars are ASCII" precondition does not hold (supplementary-plane scalars
+                // even have bits set in the high 16 bits), so it could alter the scalar
+                // arbitrarily. In invariant globalization mode case changes are applied to
+                // ASCII only, so a non-ASCII value is returned unchanged.
+                return value;
+            }
+
+            // Non-ASCII data requires going through the case folding tables.
+
+            return ToUpper(value, CultureInfo.InvariantCulture);
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs
new file mode 100644 (file)
index 0000000..dedfbe2
--- /dev/null
@@ -0,0 +1,53 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace System.Text
+{
+    /// <summary>
+    /// Debug-only assertion helpers for Unicode code point and scalar value checks.
+    /// Every assertion is marked <c>[Conditional("DEBUG")]</c>, so calls to it are
+    /// removed from builds that do not define DEBUG.
+    /// </summary>
+    /// <remarks>
+    /// Each helper evaluates its condition first and only formats the failure message
+    /// when the condition is false. This keeps the passing path free of string
+    /// formatting and allocation, which matters because these helpers are called
+    /// from low-level utility routines.
+    /// </remarks>
+    internal static class UnicodeDebug
+    {
+        /// <summary>
+        /// Fails a debug assertion if <paramref name="codePoint"/> is not a UTF-16 high surrogate code point.
+        /// </summary>
+        [Conditional("DEBUG")]
+        internal static void AssertIsHighSurrogateCodePoint(uint codePoint)
+        {
+            if (!UnicodeUtility.IsHighSurrogateCodePoint(codePoint))
+            {
+                Debug.Fail($"The value {ToHexString(codePoint)} is not a valid UTF-16 high surrogate code point.");
+            }
+        }
+
+        /// <summary>
+        /// Fails a debug assertion if <paramref name="codePoint"/> is not a UTF-16 low surrogate code point.
+        /// </summary>
+        [Conditional("DEBUG")]
+        internal static void AssertIsLowSurrogateCodePoint(uint codePoint)
+        {
+            if (!UnicodeUtility.IsLowSurrogateCodePoint(codePoint))
+            {
+                Debug.Fail($"The value {ToHexString(codePoint)} is not a valid UTF-16 low surrogate code point.");
+            }
+        }
+
+        /// <summary>
+        /// Fails a debug assertion if <paramref name="codePoint"/> is not a valid Unicode code point.
+        /// </summary>
+        [Conditional("DEBUG")]
+        internal static void AssertIsValidCodePoint(uint codePoint)
+        {
+            if (!UnicodeUtility.IsValidCodePoint(codePoint))
+            {
+                Debug.Fail($"The value {ToHexString(codePoint)} is not a valid Unicode code point.");
+            }
+        }
+
+        /// <summary>
+        /// Fails a debug assertion if <paramref name="scalarValue"/> is not a valid Unicode scalar value.
+        /// </summary>
+        [Conditional("DEBUG")]
+        internal static void AssertIsValidScalar(uint scalarValue)
+        {
+            if (!UnicodeUtility.IsValidUnicodeScalar(scalarValue))
+            {
+                Debug.Fail($"The value {ToHexString(scalarValue)} is not a valid Unicode scalar value.");
+            }
+        }
+
+        /// <summary>
+        /// Fails a debug assertion if <paramref name="scalarValue"/> is not a valid Unicode scalar value
+        /// outside the Basic Multilingual Plane.
+        /// </summary>
+        [Conditional("DEBUG")]
+        internal static void AssertIsValidSupplementaryPlaneScalar(uint scalarValue)
+        {
+            if (!UnicodeUtility.IsValidUnicodeScalar(scalarValue) || UnicodeUtility.IsBmpCodePoint(scalarValue))
+            {
+                Debug.Fail($"The value {ToHexString(scalarValue)} is not a valid supplementary plane Unicode scalar value.");
+            }
+        }
+
+        /// <summary>
+        /// Formats a code point as the hex string "U+XXXX".
+        /// </summary>
+        /// <remarks>
+        /// The input value doesn't have to be a real code point in the Unicode codespace. It can be any integer.
+        /// </remarks>
+        private static string ToHexString(uint codePoint)
+        {
+            return FormattableString.Invariant($"U+{codePoint:X4}");
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
new file mode 100644 (file)
index 0000000..c1dcefd
--- /dev/null
@@ -0,0 +1,180 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+
+namespace System.Text
+{
+    /// <summary>
+    /// Stateless helpers for classifying Unicode code points and converting between
+    /// scalar values and UTF-16 surrogate pairs. Input validation is performed only
+    /// through <c>UnicodeDebug</c> assertions, which are compiled out of non-DEBUG builds.
+    /// </summary>
+    internal static class UnicodeUtility
+    {
+        /// <summary>
+        /// The Unicode replacement character U+FFFD.
+        /// </summary>
+        public const uint ReplacementChar = 0xFFFDU;
+
+        /// <summary>
+        /// Returns the Unicode plane (0 through 16, inclusive) which contains this code point.
+        /// </summary>
+        /// <param name="codePoint">A code point; asserted (DEBUG only) to be a valid code point.</param>
+        /// <returns>The code point shifted right by 16 bits.</returns>
+        public static int GetPlane(uint codePoint)
+        {
+            UnicodeDebug.AssertIsValidCodePoint(codePoint);
+
+            return (int)(codePoint >> 16);
+        }
+
+        /// <summary>
+        /// Returns a Unicode scalar value from two code points representing a UTF-16 surrogate pair.
+        /// </summary>
+        /// <param name="highSurrogateCodePoint">Asserted (DEBUG only) to be a high surrogate code point.</param>
+        /// <param name="lowSurrogateCodePoint">Asserted (DEBUG only) to be a low surrogate code point.</param>
+        /// <returns>The combined scalar value.</returns>
+        public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint)
+        {
+            UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint);
+            UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint);
+
+            // This calculation comes from the Unicode specification, Table 3-5.
+            // Need to remove the D800 marker from the high surrogate and the DC00 marker from the low surrogate,
+            // then fix up the "wwww = uuuuu - 1" section of the bit distribution. The code is written as below
+            // to become just two instructions: shl, lea.
+            //
+            // The parenthesized term is a compile-time constant that folds all three adjustments
+            // (the shifted D800 marker, the DC00 marker, and the +0x10000 fix-up) into one subtraction.
+
+            return (highSurrogateCodePoint << 10) + lowSurrogateCodePoint - ((0xD800U << 10) + 0xDC00U - (1 << 16));
+        }
+
+        /// <summary>
+        /// Given a Unicode scalar value, gets the number of UTF-16 code units required to represent this value.
+        /// </summary>
+        /// <param name="value">Asserted (DEBUG only) to be a valid Unicode scalar value.</param>
+        /// <returns>1 if <paramref name="value"/> is below 0x10000; otherwise 2.</returns>
+        public static int GetUtf16SequenceLength(uint value)
+        {
+            UnicodeDebug.AssertIsValidScalar(value);
+
+            // Branchless: the unsigned subtraction wraps for BMP values, and the wraparound
+            // is what distinguishes the two cases in the high byte.
+            value -= 0x10000;   // if value < 0x10000, high byte = 0xFF; else high byte = 0x00
+            value += (2 << 24); // if value < 0x10000, high byte = 0x01; else high byte = 0x02
+            value >>= 24;       // shift high byte down
+            return (int)value;  // and return it
+        }
+
+        /// <summary>
+        /// Decomposes an astral Unicode scalar into UTF-16 high and low surrogate code units.
+        /// </summary>
+        /// <param name="value">Asserted (DEBUG only) to be a valid scalar value outside the BMP.</param>
+        /// <param name="highSurrogateCodePoint">Receives the high surrogate code unit.</param>
+        /// <param name="lowSurrogateCodePoint">Receives the low surrogate code unit.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint)
+        {
+            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value);
+
+            // This calculation comes from the Unicode specification, Table 3-5.
+            //
+            // High surrogate: adding ((0xD800 - 0x40) << 10) before the shift both applies the D800
+            // marker and subtracts 0x10000 (0x40 << 10) from the scalar in a single addition.
+            // Low surrogate: the low 10 bits of the scalar plus the DC00 marker.
+
+            highSurrogateCodePoint = (char)((value + ((0xD800u - 0x40u) << 10)) >> 10);
+            lowSurrogateCodePoint = (char)((value & 0x3FFu) + 0xDC00u);
+        }
+
+        /// <summary>
+        /// Given a Unicode scalar value, gets the number of UTF-8 code units required to represent this value.
+        /// </summary>
+        /// <param name="value">Asserted (DEBUG only) to be a valid Unicode scalar value.</param>
+        /// <returns>A value from 1 through 4, per the table in the method body.</returns>
+        public static int GetUtf8SequenceLength(uint value)
+        {
+            UnicodeDebug.AssertIsValidScalar(value);
+
+            // The logic below can handle all valid scalar values branchlessly.
+            // It gives generally good performance across all inputs, and on x86
+            // it's only six instructions: lea, sar, xor, add, shr, lea.
+
+            // 'a' will be -1 if input is < 0x800; else 'a' will be 0
+            // => 'a' will be -1 if input is 1 or 2 UTF-8 code units; else 'a' will be 0
+
+            int a = ((int)value - 0x0800) >> 31;
+
+            // The number of UTF-8 code units for a given scalar is as follows:
+            // - U+0000..U+007F => 1 code unit
+            // - U+0080..U+07FF => 2 code units
+            // - U+0800..U+FFFF => 3 code units
+            // - U+10000+       => 4 code units
+            //
+            // If we XOR the incoming scalar with 0xF800, the chart mutates:
+            // - U+0000..U+F7FF => 3 code units
+            // - U+F800..U+F87F => 1 code unit
+            // - U+F880..U+FFFF => 2 code units
+            // - U+10000+       => 4 code units
+            //
+            // Since the 1- and 3-code unit cases are now clustered, they can
+            // both be checked together very cheaply.
+
+            value ^= 0xF800u;
+            value -= 0xF880u;   // if scalar is 1 or 3 code units, high byte = 0xFF; else high byte = 0x00
+            value += (4 << 24); // if scalar is 1 or 3 code units, high byte = 0x03; else high byte = 0x04
+            value >>= 24;       // shift high byte down
+
+            // Final return value:
+            // - U+0000..U+007F => 3 + (-1) * 2 = 1
+            // - U+0080..U+07FF => 4 + (-1) * 2 = 2
+            // - U+0800..U+FFFF => 3 + ( 0) * 2 = 3
+            // - U+10000+       => 4 + ( 0) * 2 = 4
+            return (int)value + (a * 2);
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is an ASCII
+        /// character ([ U+0000..U+007F ]).
+        /// </summary>
+        /// <remarks>
+        /// Per http://www.unicode.org/glossary/#ASCII, ASCII is only U+0000..U+007F.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsAsciiCodePoint(uint value) => (value <= 0x7Fu);
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is in the
+        /// Basic Multilingual Plane (BMP).
+        /// </summary>
+        /// <remarks>
+        /// This is a plain upper-bound check against 0xFFFF; surrogate code points are included.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsBmpCodePoint(uint value) => (value <= 0xFFFFu);
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-16 high surrogate code point,
+        /// i.e., is in [ U+D800..U+DBFF ], inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsHighSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xD800U, 0xDBFFU);
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is between
+        /// <paramref name="lowerBound"/> and <paramref name="upperBound"/>, inclusive.
+        /// </summary>
+        /// <remarks>
+        /// Implemented as a single unsigned comparison: when <paramref name="value"/> is below
+        /// <paramref name="lowerBound"/>, the subtraction wraps around to a large value and the
+        /// comparison fails. NOTE(review): the result is only meaningful when
+        /// <paramref name="lowerBound"/> is not greater than <paramref name="upperBound"/>; this
+        /// is not checked here.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound) => ((value - lowerBound) <= (upperBound - lowerBound));
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-16 low surrogate code point,
+        /// i.e., is in [ U+DC00..U+DFFF ], inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsLowSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xDC00U, 0xDFFFU);
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-16 surrogate code point,
+        /// i.e., is in [ U+D800..U+DFFF ], inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xD800U, 0xDFFFU);
+        
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="codePoint"/> is a valid Unicode code
+        /// point, i.e., is in [ U+0000..U+10FFFF ], inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsValidCodePoint(uint codePoint) => (codePoint <= 0x10FFFFU);
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
+        /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsValidUnicodeScalar(uint value)
+        {
+            // By XORing the incoming value with 0xD800, surrogate code points
+            // are moved to the range [ U+0000..U+07FF ], and all valid scalar
+            // values are clustered into the single range [ U+0800..U+10FFFF ],
+            // which allows performing a single fast range check.
+
+            return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU);
+        }
+    }
+}
index 41745e78013cc72d1fc59c5e3bf55a0817a6739c..551f87e7c55f5c94e715a01c4982abc89e2fef22 100644 (file)
@@ -76,6 +76,11 @@ namespace System
             throw new ArgumentException(SR.Argument_OverlapAlignmentMismatch);
         }
 
+        /// <summary>
+        /// Throws the exception produced by <c>GetArgumentException</c> for the
+        /// <c>Argument_CannotExtractScalar</c> resource and the given <paramref name="argument"/>.
+        /// This method never returns normally.
+        /// </summary>
+        /// <param name="argument">Identifies the offending argument for the exception.</param>
+        internal static void ThrowArgumentException_CannotExtractScalar(ExceptionArgument argument)
+            => throw GetArgumentException(ExceptionResource.Argument_CannotExtractScalar, argument);
+
         internal static void ThrowArgumentOutOfRange_IndexException()
         {
             throw GetArgumentOutOfRangeException(ExceptionArgument.index,
@@ -490,6 +495,7 @@ namespace System
         pHandle,
         values,
         task,
+        ch,
         s,
         input,
         pointer,
@@ -528,6 +534,7 @@ namespace System
         ArgumentOutOfRange_Index,
         Argument_InvalidOffLen,
         Argument_ItemNotExist,
+        Argument_CannotExtractScalar,
         ArgumentOutOfRange_Count,
         ArgumentOutOfRange_InvalidThreshold,
         ArgumentOutOfRange_ListInsert,