From 17c7414efc4808b5189a76c8b77fc37faf29d1d2 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Tue, 13 Nov 2018 17:05:02 -0800 Subject: [PATCH] Initial commit for System.Text.Rune (dotnet/coreclr#20935) This type represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; and [ U+E000..U+10FFFF ], inclusive). The primary scenario is for having a consistent representation of Unicode data regardless of the underlying input encoding type, including abstracting away surrogate code points. Commit migrated from https://github.com/dotnet/coreclr/commit/7fcd8a86732f2a57b17d3848f113b5a8afca8d6f --- .../System.Private.CoreLib/Resources/Strings.resx | 3 + .../src/System/ThrowHelper.cs | 7 + .../src/System.Private.CoreLib.Shared.projitems | 3 + .../System.Private.CoreLib/src/System/Text/Rune.cs | 731 +++++++++++++++++++++ .../src/System/Text/UnicodeDebug.cs | 53 ++ .../src/System/Text/UnicodeUtility.cs | 180 +++++ 6 files changed, 977 insertions(+) create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/UnicodeDebug.cs create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs diff --git a/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx b/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx index 42dcde3..4f17a28 100644 --- a/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx +++ b/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx @@ -3652,6 +3652,9 @@ Method has been already defined. + + Cannot extract a Unicode scalar value from the specified index in the input. + Characters following the format symbol must be a number of {0} or less. 
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/ThrowHelper.cs b/src/coreclr/src/System.Private.CoreLib/src/System/ThrowHelper.cs index 41745e7..551f87e 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/ThrowHelper.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/ThrowHelper.cs @@ -76,6 +76,11 @@ namespace System throw new ArgumentException(SR.Argument_OverlapAlignmentMismatch); } + internal static void ThrowArgumentException_CannotExtractScalar(ExceptionArgument argument) + { + throw GetArgumentException(ExceptionResource.Argument_CannotExtractScalar, argument); + } + internal static void ThrowArgumentOutOfRange_IndexException() { throw GetArgumentOutOfRangeException(ExceptionArgument.index, @@ -490,6 +495,7 @@ namespace System pHandle, values, task, + ch, s, input, pointer, @@ -528,6 +534,7 @@ namespace System ArgumentOutOfRange_Index, Argument_InvalidOffLen, Argument_ItemNotExist, + Argument_CannotExtractScalar, ArgumentOutOfRange_Count, ArgumentOutOfRange_InvalidThreshold, ArgumentOutOfRange_ListInsert, diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index 79956c4..8768d19 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -651,9 +651,12 @@ + + + diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs new file mode 100644 index 0000000..a4ef3a3 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs @@ -0,0 +1,731 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Diagnostics; +using System.Globalization; +using System.Runtime.CompilerServices; + +namespace System.Text +{ + /// + /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive). + /// + /// + /// This type's constructors and conversion operators validate the input, so consumers can call the APIs + /// assuming that the underlying instance is well-formed. + /// + [DebuggerDisplay("{DebuggerDisplay,nq}")] + public readonly struct Rune : IComparable, IEquatable + { + private const byte IsWhiteSpaceFlag = 0x80; + private const byte IsLetterOrDigitFlag = 0x40; + private const byte UnicodeCategoryMask = 0x1F; + + // Contains information about the ASCII character range [ U+0000..U+007F ], with: + // - 0x80 bit if set means 'is whitespace' + // - 0x40 bit if set means 'is letter or digit' + // - 0x20 bit is reserved for future use + // - bottom 5 bits are the UnicodeCategory of the character + private static ReadOnlySpan AsciiCharInfo => new byte[] + { + 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, + 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, + 0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, + 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18, + 0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12, + 0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E + }; + + private readonly uint _value; + + /// + /// Creates a from the provided UTF-16 code unit. + /// + /// + /// If represents a UTF-16 surrogate code point + /// U+D800..U+DFFF, inclusive. 
+ /// + public Rune(char ch) + { + uint expanded = ch; + if (UnicodeUtility.IsSurrogateCodePoint(expanded)) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch); + } + _value = expanded; + } + + /// <summary> + /// Creates a <see cref="Rune"/> from the provided Unicode scalar value. + /// </summary> + /// <exception cref="ArgumentOutOfRangeException"> + /// If <paramref name="value"/> does not represent a valid Unicode scalar value. + /// </exception> + public Rune(int value) + : this((uint)value) + { + } + + /// <summary> + /// Creates a <see cref="Rune"/> from the provided Unicode scalar value. + /// </summary> + /// <exception cref="ArgumentOutOfRangeException"> + /// If <paramref name="value"/> does not represent a valid Unicode scalar value. + /// </exception> + [CLSCompliant(false)] + public Rune(uint value) + { + if (!UnicodeUtility.IsValidUnicodeScalar(value)) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value); + } + _value = value; + } + + // non-validating ctor + private Rune(uint scalarValue, bool unused) + { + UnicodeDebug.AssertIsValidScalar(scalarValue); + _value = scalarValue; + } + + public static bool operator ==(Rune left, Rune right) => (left._value == right._value); + + public static bool operator !=(Rune left, Rune right) => (left._value != right._value); + + public static bool operator <(Rune left, Rune right) => (left._value < right._value); + + public static bool operator <=(Rune left, Rune right) => (left._value <= right._value); + + public static bool operator >(Rune left, Rune right) => (left._value > right._value); + + public static bool operator >=(Rune left, Rune right) => (left._value >= right._value); + + // Operators below are explicit because they may throw. + + public static explicit operator Rune(char ch) => new Rune(ch); + + [CLSCompliant(false)] + public static explicit operator Rune(uint value) => new Rune(value); + + public static explicit operator Rune(int value) => new Rune(value); + + // Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'" + private string DebuggerDisplay => FormattableString.Invariant($"U+{_value:X4} '{(IsValid(_value) ? 
ToString() : "\uFFFD")}'"); + + /// + /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) + /// and therefore representable by a single UTF-8 code unit. + /// + public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value); + + /// + /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ]) + /// and therefore representable by a single UTF-16 code unit. + /// + public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value); + + /// + /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar. + /// + public int Plane => UnicodeUtility.GetPlane(_value); + + /// + /// A instance that represents the Unicode replacement character U+FFFD. + /// + public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar); + + /// + /// Returns the length in code units () of the + /// UTF-16 sequence required to represent this scalar value. + /// + /// + /// The return value will be 1 or 2. + /// + public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(_value); + + /// + /// Returns the length in code units () of the + /// UTF-8 sequence required to represent this scalar value. + /// + /// + /// The return value will be 1 through 4, inclusive. + /// + public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(_value); + + /// + /// Returns the Unicode scalar value as an integer. 
+ /// + public int Value => (int)_value; + + private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper) + { + if (culture == null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture); + } + + var textInfo = culture.TextInfo; + + Span original = stackalloc char[2]; // worst case scenario = 2 code units (for a surrogate pair) + Span modified = stackalloc char[2]; // case change should preserve UTF-16 code unit count + + int charCount = rune.EncodeToUtf16(original); + original = original.Slice(0, charCount); + modified = modified.Slice(0, charCount); + + if (toUpper) + { + textInfo.ChangeCaseToUpper(original, modified); + } + else + { + textInfo.ChangeCaseToLower(original, modified); + } + + // We use simple case folding rules, which disallows moving between the BMP and supplementary + // planes when performing a case conversion. The helper methods which reconstruct a Rune + // contain debug asserts for this condition. + + if (rune.IsBmp) + { + return UnsafeCreate(modified[0]); + } + else + { + return UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(modified[0], modified[1])); + } + } + + public int CompareTo(Rune other) => this._value.CompareTo(other._value); + + // returns the number of chars written + private int EncodeToUtf16(Span destination) + { + Debug.Assert(destination.Length >= Utf16SequenceLength, "Caller should've provided a large enough buffer."); + bool success = TryEncode(destination, out int charsWritten); + Debug.Assert(success, "TryEncode should never fail given a large enough buffer."); + return charsWritten; + } + + public override bool Equals(object obj) => (obj is Rune other) && this.Equals(other); + + public bool Equals(Rune other) => (this == other); + + public override int GetHashCode() => Value; + + /// + /// Gets the which begins at index in + /// string . + /// + /// + /// Throws if is null, if is out of range, or + /// if does not reference the start of a valid scalar value within . 
+ /// + public static Rune GetRuneAt(string input, int index) + { + int runeValue = ReadRuneFromString(input, index); + if (runeValue < 0) + { + ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index); + } + + return UnsafeCreate((uint)runeValue); + } + + /// + /// Returns iff is a valid Unicode scalar + /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. + /// + public static bool IsValid(int value) => IsValid((uint)value); + + /// + /// Returns iff is a valid Unicode scalar + /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. + /// + [CLSCompliant(false)] + public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); + + // returns a negative number on failure + private static int ReadRuneFromString(string input, int index) + { + if (input is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); + } + + if ((uint)index >= (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRange_IndexException(); + } + + // Optimistically assume input is within BMP. + + uint returnValue = input[index]; + if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) + { + if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) + { + return -1; + } + + // Treat 'returnValue' as the high surrogate. + // + // If this becomes a hot code path, we can skip the below bounds check by reading + // off the end of the string using unsafe code. Since strings are null-terminated, + // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if + // the string terminates unexpectedly. 
+ + index++; + if ((uint)index >= (uint)input.Length) + { + return -1; // not an argument exception - just a "bad data" failure + } + + uint potentialLowSurrogate = input[index]; + if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) + { + return -1; + } + + returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); + } + + return (int)returnValue; + } + + /// + /// Returns a representation of this instance. + /// + public override string ToString() + { + Span chars = stackalloc char[2]; // worst case + return new string(chars.Slice(0, EncodeToUtf16(chars))); + } + + /// + /// Attempts to create a from the provided input value. + /// + public static bool TryCreate(char ch, out Rune result) + { + uint extendedValue = ch; + if (!UnicodeUtility.IsSurrogateCodePoint(extendedValue)) + { + result = UnsafeCreate(extendedValue); + return true; + } + else + { + result = default; + return false; + } + } + + /// + /// Attempts to create a from the provided input value. + /// + public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result); + + /// + /// Attempts to create a from the provided input value. + /// + [CLSCompliant(false)] + public static bool TryCreate(uint value, out Rune result) + { + if (UnicodeUtility.IsValidUnicodeScalar(value)) + { + result = UnsafeCreate(value); + return true; + } + else + { + result = default; + return false; + } + } + + /// + /// Encodes this to a UTF-16 destination buffer. + /// + /// The buffer to which to write this value as UTF-16. + /// + /// The number of s written to , + /// or 0 if the destination buffer is not large enough to contain the output. + /// True if the value was written to the buffer; otherwise, false. + /// + /// The property can be queried ahead of time to determine + /// the required size of the buffer. 
+ /// + public bool TryEncode(Span destination, out int charsWritten) + { + if (destination.Length >= 1) + { + if (IsBmp) + { + destination[0] = (char)_value; + charsWritten = 1; + return true; + } + else if (destination.Length >= 2) + { + UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]); + charsWritten = 2; + return true; + } + } + + // Destination buffer not large enough + + charsWritten = default; + return false; + } + + /// + /// Encodes this to a destination buffer as UTF-8 bytes. + /// + /// The buffer to which to write this value as UTF-8. + /// + /// The number of s written to , + /// or 0 if the destination buffer is not large enough to contain the output. + /// True if the value was written to the buffer; otherwise, false. + /// + /// The property can be queried ahead of time to determine + /// the required size of the buffer. + /// + // ** This is public so it can be unit tested but isn't yet exposed via the reference assemblies. ** + public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) + { + // TODO: Optimize some of these writes by using BMI2 instructions. + + // The bit patterns below come from the Unicode Standard, Table 3-6. 
+ + if (destination.Length >= 1) + { + if (IsAscii) + { + destination[0] = (byte)_value; + bytesWritten = 1; + return true; + } + + if (destination.Length >= 2) + { + if (_value <= 0x7FFu) + { + // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ] + destination[0] = (byte)((_value + (0b110u << 11)) >> 6); + destination[1] = (byte)((_value & 0x3Fu) + 0x80u); + bytesWritten = 2; + return true; + } + + if (destination.Length >= 3) + { + if (_value <= 0xFFFFu) + { + // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ] + destination[0] = (byte)((_value + (0b1110 << 16)) >> 12); + destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); + destination[2] = (byte)((_value & 0x3Fu) + 0x80u); + bytesWritten = 3; + return true; + } + + if (destination.Length >= 4) + { + // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] + destination[0] = (byte)((_value + (0b11110 << 21)) >> 18); + destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u); + destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); + destination[3] = (byte)((_value & 0x3Fu) + 0x80u); + bytesWritten = 4; + return true; + } + } + } + } + + // Destination buffer not large enough + + bytesWritten = default; + return false; + } + + /// + /// Attempts to get the which begins at index in + /// string . + /// + /// if a scalar value was successfully extracted from the specified index, + /// if a value could not be extracted due to invalid data. + /// + /// Throws only if is null or is out of range. + /// + public static bool TryGetRuneAt(string input, int index, out Rune value) + { + int runeValue = ReadRuneFromString(input, index); + if (runeValue >= 0) + { + value = UnsafeCreate((uint)runeValue); + return true; + } + else + { + value = default; + return false; + } + } + + // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without + // validation. 
It is the caller's responsibility to have performed manual validation + // before calling this method. If a Rune instance is forcibly constructed + // from invalid input, the APIs on this type have undefined behavior, potentially including + // introducing a security hole in the consuming application. + // + // Below is an example of a security hole resulting from an invalid Rune value, which could result + // in a stack overflow. + // + // public int GetMarvin32HashCode(Rune r) { + // Span<char> buffer = stackalloc char[r.Utf16SequenceLength]; + // r.TryEncode(buffer, ...); + // return Marvin32.ComputeHash(buffer.AsBytes()); + // } + + /// <summary> + /// Creates a <see cref="Rune"/> without performing validation on the input. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false); + + // These are analogs of APIs on System.Char + + public static double GetNumericValue(Rune value) + { + if (value.IsAscii) + { + uint baseNum = value._value - '0'; + return (baseNum <= 9) ? 
(double)baseNum : -1; + } + else + { + // not an ASCII char; fall back to globalization table + return CharUnicodeInfo.InternalGetNumericValue(value.Value); + } + } + + public static UnicodeCategory GetUnicodeCategory(Rune value) + { + if (value.IsAscii) + { + return (UnicodeCategory)(AsciiCharInfo[value.Value] & UnicodeCategoryMask); + } + else + { + return GetUnicodeCategoryNonAscii(value); + } + } + + private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value) + { + Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters."); + return CharUnicodeInfo.GetUnicodeCategory(value.Value); + } + + // Returns true iff this Unicode category represents a letter + private static bool IsCategoryLetter(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter); + } + + // Returns true iff this Unicode category represents a letter or a decimal digit + private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter) + || (category == UnicodeCategory.DecimalDigitNumber); + } + + // Returns true iff this Unicode category represents a number + private static bool IsCategoryNumber(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.DecimalDigitNumber, (uint)UnicodeCategory.OtherNumber); + } + + // Returns true iff this Unicode category represents a punctuation mark + private static bool IsCategoryPunctuation(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.ConnectorPunctuation, (uint)UnicodeCategory.OtherPunctuation); + } + + // Returns true iff this Unicode category represents a separator + private static bool IsCategorySeparator(UnicodeCategory category) + { + 
return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.SpaceSeparator, (uint)UnicodeCategory.ParagraphSeparator); + } + + // Returns true iff this Unicode category represents a symbol + private static bool IsCategorySymbol(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.MathSymbol, (uint)UnicodeCategory.OtherSymbol); + } + + public static bool IsControl(Rune value) + { + // Per the Unicode stability policy, the set of control characters + // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No + // characters will ever be added to the "control characters" group. + // See http://www.unicode.org/policies/stability_policy.html. + + // Logic below depends on Rune.Value never being -1 (since Rune is a validating type) + // 00..1F (+1) => 01..20 (&~80) => 01..20 + // 7F..9F (+1) => 80..A0 (&~80) => 00..20 + + return (((value._value + 1) & ~0x80u) <= 0x20u); + } + + public static bool IsDigit(Rune value) + { + if (value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9'); + } + else + { + return (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber); + } + } + + public static bool IsLetter(Rune value) + { + if (value.IsAscii) + { + return (((value._value - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z] + } + else + { + return IsCategoryLetter(GetUnicodeCategoryNonAscii(value)); + } + } + + public static bool IsLetterOrDigit(Rune value) + { + if (value.IsAscii) + { + return ((AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0); + } + else + { + return IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value)); + } + } + + public static bool IsLower(Rune value) + { + if (value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z'); + } + else + { + return (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter); + } + } + + public static bool IsNumber(Rune value) + { + if 
(value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9'); + } + else + { + return IsCategoryNumber(GetUnicodeCategoryNonAscii(value)); + } + } + + public static bool IsPunctuation(Rune value) + { + return IsCategoryPunctuation(GetUnicodeCategory(value)); + } + + public static bool IsSeparator(Rune value) + { + return IsCategorySeparator(GetUnicodeCategory(value)); + } + + public static bool IsSymbol(Rune value) + { + return IsCategorySymbol(GetUnicodeCategory(value)); + } + + public static bool IsUpper(Rune value) + { + if (value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z'); + } + else + { + return (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter); + } + } + + public static bool IsWhiteSpace(Rune value) + { + if (value.IsAscii) + { + return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0; + } + + // U+0085 is special since it's a whitespace character but is in the Control category + // instead of a normal separator category. No other code point outside the ASCII range + // has this mismatch. + + if (value._value == 0x0085u) + { + return true; + } + + return IsCategorySeparator(GetUnicodeCategoryNonAscii(value)); + } + + public static Rune ToLower(Rune value, CultureInfo culture) => ChangeCase(value, culture, toUpper: false); + + public static Rune ToLowerInvariant(Rune value) + { + // Handle the most common case (ASCII data) first. Within the common case, we expect + // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless. + + if (value.IsAscii || GlobalizationMode.Invariant) + { + // It's ok for us to use the UTF-16 conversion utility for this since the high + // 16 bits of the value will never be set so will be left unchanged. NOTE(review): that only holds for BMP values; the GlobalizationMode.Invariant branch also sends non-ASCII and supplementary-plane scalars (value > 0xFFFF) through this "all ASCII" utility - confirm it leaves them unchanged. + return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value)); + } + + // Non-ASCII data requires going through the case folding tables. 
+ + return ToLower(value, CultureInfo.InvariantCulture); + } + + public static Rune ToUpper(Rune value, CultureInfo culture) => ChangeCase(value, culture, toUpper: true); + + public static Rune ToUpperInvariant(Rune value) + { + // Handle the most common case (ASCII data) first. Within the common case, we expect + // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless. + + if (value.IsAscii || GlobalizationMode.Invariant) + { + // It's ok for us to use the UTF-16 conversion utility for this since the high + // 16 bits of the value will never be set so will be left unchanged. NOTE(review): that only holds for BMP values; the GlobalizationMode.Invariant branch also sends non-ASCII and supplementary-plane scalars (value > 0xFFFF) through this "all ASCII" utility - confirm it leaves them unchanged. + return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value)); + } + + // Non-ASCII data requires going through the case folding tables. + + return ToUpper(value, CultureInfo.InvariantCulture); + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeDebug.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeDebug.cs new file mode 100644 index 0000000..dedfbe2 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeDebug.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Diagnostics; +using System.Runtime.InteropServices; + +namespace System.Text +{ + internal static class UnicodeDebug + { + [Conditional("DEBUG")] + internal static void AssertIsHighSurrogateCodePoint(uint codePoint) + { + Debug.Assert(UnicodeUtility.IsHighSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 high surrogate code point."); + } + + [Conditional("DEBUG")] + internal static void AssertIsLowSurrogateCodePoint(uint codePoint) + { + Debug.Assert(UnicodeUtility.IsLowSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 low surrogate code point."); + } + + [Conditional("DEBUG")] + internal static void AssertIsValidCodePoint(uint codePoint) + { + Debug.Assert(UnicodeUtility.IsValidCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid Unicode code point."); + } + + [Conditional("DEBUG")] + internal static void AssertIsValidScalar(uint scalarValue) + { + Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid Unicode scalar value."); + } + + [Conditional("DEBUG")] + internal static void AssertIsValidSupplementaryPlaneScalar(uint scalarValue) + { + Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue) && !UnicodeUtility.IsBmpCodePoint(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid supplementary plane Unicode scalar value."); + } + + /// + /// Formats a code point as the hex string "U+XXXX". + /// + /// + /// The input value doesn't have to be a real code point in the Unicode codespace. It can be any integer. 
+ /// + private static string ToHexString(uint codePoint) + { + return FormattableString.Invariant($"U+{codePoint:X4}"); + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs new file mode 100644 index 0000000..c1dcefd --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs @@ -0,0 +1,180 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; + +namespace System.Text +{ + internal static class UnicodeUtility + { + /// + /// The Unicode replacement character U+FFFD. + /// + public const uint ReplacementChar = 0xFFFDU; + + /// + /// Returns the Unicode plane (0 through 16, inclusive) which contains this code point. + /// + public static int GetPlane(uint codePoint) + { + UnicodeDebug.AssertIsValidCodePoint(codePoint); + + return (int)(codePoint >> 16); + } + + /// + /// Returns a Unicode scalar value from two code points representing a UTF-16 surrogate pair. + /// + public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint) + { + UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint); + UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint); + + // This calculation comes from the Unicode specification, Table 3-5. + // Need to remove the D800 marker from the high surrogate and the DC00 marker from the low surrogate, + // then fix up the "wwww = uuuuu - 1" section of the bit distribution. The code is written as below + // to become just two instructions: shl, lea. 
+ + return (highSurrogateCodePoint << 10) + lowSurrogateCodePoint - ((0xD800U << 10) + 0xDC00U - (1 << 16)); + } + + /// + /// Given a Unicode scalar value, gets the number of UTF-16 code units required to represent this value. + /// + public static int GetUtf16SequenceLength(uint value) + { + UnicodeDebug.AssertIsValidScalar(value); + + value -= 0x10000; // if value < 0x10000, high byte = 0xFF; else high byte = 0x00 + value += (2 << 24); // if value < 0x10000, high byte = 0x01; else high byte = 0x02 + value >>= 24; // shift high byte down + return (int)value; // and return it + } + + /// + /// Decomposes an astral Unicode scalar into UTF-16 high and low surrogate code units. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint) + { + UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value); + + // This calculation comes from the Unicode specification, Table 3-5. + + highSurrogateCodePoint = (char)((value + ((0xD800u - 0x40u) << 10)) >> 10); + lowSurrogateCodePoint = (char)((value & 0x3FFu) + 0xDC00u); + } + + /// + /// Given a Unicode scalar value, gets the number of UTF-8 code units required to represent this value. + /// + public static int GetUtf8SequenceLength(uint value) + { + UnicodeDebug.AssertIsValidScalar(value); + + // The logic below can handle all valid scalar values branchlessly. + // It gives generally good performance across all inputs, and on x86 + // it's only six instructions: lea, sar, xor, add, shr, lea. 
+ + // 'a' will be -1 if input is < 0x800; else 'a' will be 0 + // => 'a' will be -1 if input is 1 or 2 UTF-8 code units; else 'a' will be 0 + + int a = ((int)value - 0x0800) >> 31; + + // The number of UTF-8 code units for a given scalar is as follows: + // - U+0000..U+007F => 1 code unit + // - U+0080..U+07FF => 2 code units + // - U+0800..U+FFFF => 3 code units + // - U+10000+ => 4 code units + // + // If we XOR the incoming scalar with 0xF800, the chart mutates: + // - U+0000..U+F7FF => 3 code units + // - U+F800..U+F87F => 1 code unit + // - U+F880..U+FFFF => 2 code units + // - U+10000+ => 4 code units + // + // Since the 1- and 3-code unit cases are now clustered, they can + // both be checked together very cheaply. + + value ^= 0xF800u; + value -= 0xF880u; // if scalar is 1 or 3 code units, high byte = 0xFF; else high byte = 0x00 + value += (4 << 24); // if scalar is 1 or 3 code units, high byte = 0x03; else high byte = 0x04 + value >>= 24; // shift high byte down + + // Final return value: + // - U+0000..U+007F => 3 + (-1) * 2 = 1 + // - U+0080..U+07FF => 4 + (-1) * 2 = 2 + // - U+0800..U+FFFF => 3 + ( 0) * 2 = 3 + // - U+10000+ => 4 + ( 0) * 2 = 4 + return (int)value + (a * 2); + } + + /// + /// Returns iff is an ASCII + /// character ([ U+0000..U+007F ]). + /// + /// + /// Per http://www.unicode.org/glossary/#ASCII, ASCII is only U+0000..U+007F. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsAsciiCodePoint(uint value) => (value <= 0x7Fu); + + /// + /// Returns iff is in the + /// Basic Multilingual Plane (BMP). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsBmpCodePoint(uint value) => (value <= 0xFFFFu); + + /// + /// Returns iff is a UTF-16 high surrogate code point, + /// i.e., is in [ U+D800..U+DBFF ], inclusive. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsHighSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xD800U, 0xDBFFU); + + /// + /// Returns iff is between + /// and , inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound) => ((value - lowerBound) <= (upperBound - lowerBound)); + + /// + /// Returns iff is a UTF-16 low surrogate code point, + /// i.e., is in [ U+DC00..U+DFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsLowSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xDC00U, 0xDFFFU); + + /// + /// Returns iff is a UTF-16 surrogate code point, + /// i.e., is in [ U+D800..U+DFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xD800U, 0xDFFFU); + + /// + /// Returns iff is a valid Unicode code + /// point, i.e., is in [ U+0000..U+10FFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsValidCodePoint(uint codePoint) => (codePoint <= 0x10FFFFU); + + /// + /// Returns iff is a valid Unicode scalar + /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsValidUnicodeScalar(uint value) + { + // By XORing the incoming value with 0xD800, surrogate code points + // are moved to the range [ U+0000..U+07FF ], and all valid scalar + // values are clustered into the single range [ U+0800..U+10FFFF ], + // which allows performing a single fast range check. + + return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU); + } + } +} -- 2.7.4