From bdb4b4b0ac1f06ea9dd290c8057d78f7fa676b84 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Wed, 25 Sep 2019 17:36:44 -0700 Subject: [PATCH] Introduce Utf8Span, which is a span of UTF-8 text (dotnet/coreclr#26711) Commit migrated from https://github.com/dotnet/coreclr/commit/9dd5b1a0346a6876616541177a7ffb883b810b16 --- .../System.Private.CoreLib/Resources/Strings.resx | 12 + .../System.Private.CoreLib.csproj | 8 + .../src/System/Text/Utf8Span.Comparison.cs | 228 +++++++++ .../src/System/Text/Utf8Span.Conversion.cs | 321 ++++++++++++ .../src/System/Text/Utf8Span.Enumeration.cs | 137 ++++++ .../src/System/Text/Utf8Span.Manipulation.cs | 536 +++++++++++++++++++++ .../src/System/Text/Utf8Span.Searching.cs | 505 +++++++++++++++++++ .../src/System/Text/Utf8Span.cs | 288 +++++++++++ .../src/System/Utf8Extensions.cs | 82 +++- .../src/System/Utf8String.Comparison.cs | 142 ++++++ .../src/System/Utf8String.Construction.cs | 11 + .../src/System/Utf8String.cs | 18 +- .../src/System/Utf8StringSplitOptions.cs | 17 + .../src/System.Private.CoreLib.Shared.projitems | 2 + .../src/System/Globalization/CompareInfo.cs | 59 ++- .../src/System/String.Manipulation.cs | 15 +- .../src/System/Text/TrimType.cs | 30 ++ .../src/System/Text/Unicode/Utf8Utility.Helpers.cs | 2 +- .../System/Text/Unicode/Utf8Utility.WhiteSpace.cs | 139 ++++++ .../src/System/Text/Unicode/Utf8Utility.cs | 22 +- 20 files changed, 2522 insertions(+), 52 deletions(-) create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs create mode 100644 
src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs diff --git a/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx b/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx index f5563a2..e2afd26 100644 --- a/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx +++ b/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx @@ -3766,4 +3766,16 @@ Type '{0}' has more than one COM unregistration function. + + Cannot create the desired substring because it would split a multi-byte UTF-8 subsequence. + + + Cannot call Utf8Span.Equals(object). Use Equals(Utf8Span) or operator == instead. + + + UTF-16 surrogate code points (U+D800..U+DFFF) are disallowed. + + + Argument cannot be an empty span. + diff --git a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj index 335de4b..604deae 100644 --- a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj +++ b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj @@ -303,9 +303,17 @@ + + + + + + + + diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs new file mode 100644 index 0000000..317927f --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs @@ -0,0 +1,228 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System.Text.Unicode; + +namespace System.Text +{ + public readonly ref partial struct Utf8Span + { + public static bool operator ==(Utf8Span left, Utf8Span right) => Equals(left, right); + public static bool operator !=(Utf8Span left, Utf8Span right) => !Equals(left, right); + + public int CompareTo(Utf8Span other) + { + // TODO_UTF8STRING: This is ordinal, but String.CompareTo uses CurrentCulture. + // Is this acceptable? + + // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted. + + return StringComparer.Ordinal.Compare(this.ToString(), other.ToString()); + } + + public int CompareTo(Utf8Span other, StringComparison comparison) + { + // TODO_UTF8STRING: We can avoid the virtual dispatch by moving the switch into this method. + + // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted. + + return StringComparer.FromComparison(comparison).Compare(this.ToString(), other.ToString()); + } + + /// + /// Returns a value stating whether the current instance contains + /// . An ordinal comparison is used. + /// + public bool Contains(char value) + { + return Rune.TryCreate(value, out Rune rune) && Contains(rune); + } + + /// + /// Returns a value stating whether the current instance contains + /// . The specified comparison is used. + /// + public bool Contains(char value, StringComparison comparison) + { + return Rune.TryCreate(value, out Rune rune) && Contains(rune, comparison); + } + + /// + /// Returns a value stating whether the current instance contains + /// the specified . An ordinal comparison is used. + /// + public bool Contains(Rune value) + { + // TODO_UTF8STRING: This should be split into two methods: + // One which operates on a single-byte (ASCII) search value, + // the other which operates on a multi-byte (non-ASCII) search value. 
+ + Span runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int runeBytesWritten = value.EncodeToUtf8(runeBytes); + + return (this.Bytes.IndexOf(runeBytes.Slice(0, runeBytesWritten)) >= 0); + } + + /// + /// Returns a value stating whether the current instance contains + /// the specified . The specified comparison is used. + /// + public bool Contains(Rune value, StringComparison comparison) + { + // TODO_UTF8STRING: Optimize me to avoid allocations. + + return this.ToString().Contains(value.ToString(), comparison); + } + + /// + /// Returns a value stating whether the current instance contains . + /// An ordinal comparison is used. + /// + public bool Contains(Utf8Span value) + { + return (this.Bytes.IndexOf(value.Bytes) >= 0); + } + + /// + /// Returns a value stating whether the current instance contains . + /// The specified comparison is used. + /// + public bool Contains(Utf8Span value, StringComparison comparison) + { + // TODO_UTF8STRING: Optimize me to avoid allocations. + + return this.ToString().Contains(value.ToString(), comparison); + } + + /// + /// Returns a value stating whether the current instance ends with + /// . An ordinal comparison is used. + /// + public bool EndsWith(char value) + { + return Rune.TryCreate(value, out Rune rune) && EndsWith(rune); + } + + /// + /// Returns a value stating whether the current instance ends with + /// . The specified comparison is used. + /// + public bool EndsWith(char value, StringComparison comparison) + { + return Rune.TryCreate(value, out Rune rune) && EndsWith(rune, comparison); + } + + /// + /// Returns a value stating whether the current instance ends with + /// the specified . An ordinal comparison is used. + /// + public bool EndsWith(Rune value) + { + // TODO_UTF8STRING: This should be split into two methods: + // One which operates on a single-byte (ASCII) search value, + // the other which operates on a multi-byte (non-ASCII) search value. 
+ + Span runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int runeBytesWritten = value.EncodeToUtf8(runeBytes); + + return this.Bytes.EndsWith(runeBytes.Slice(0, runeBytesWritten)); + } + + /// + /// Returns a value stating whether the current instance ends with + /// the specified . The specified comparison is used. + /// + public bool EndsWith(Rune value, StringComparison comparison) + { + // TODO_UTF8STRING: Optimize me to avoid allocations. + + return this.ToString().EndsWith(value.ToString(), comparison); + } + + /// + /// Returns a value stating whether the current instance ends with . + /// An ordinal comparison is used. + /// + public bool EndsWith(Utf8Span value) + { + return this.Bytes.EndsWith(value.Bytes); + } + + /// + /// Returns a value stating whether the current instance ends with . + /// The specified comparison is used. + /// + public bool EndsWith(Utf8Span value, StringComparison comparison) + { + // TODO_UTF8STRING: Optimize me to avoid allocations. + + return this.ToString().EndsWith(value.ToString(), comparison); + } + + /// + /// Returns a value stating whether the current instance begins with + /// . An ordinal comparison is used. + /// + public bool StartsWith(char value) + { + return Rune.TryCreate(value, out Rune rune) && StartsWith(rune); + } + + /// + /// Returns a value stating whether the current instance begins with + /// . The specified comparison is used. + /// + public bool StartsWith(char value, StringComparison comparison) + { + return Rune.TryCreate(value, out Rune rune) && StartsWith(rune, comparison); + } + + /// + /// Returns a value stating whether the current instance begins with + /// the specified . An ordinal comparison is used. + /// + public bool StartsWith(Rune value) + { + // TODO_UTF8STRING: This should be split into two methods: + // One which operates on a single-byte (ASCII) search value, + // the other which operates on a multi-byte (non-ASCII) search value. 
+ + Span runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int runeBytesWritten = value.EncodeToUtf8(runeBytes); + + return this.Bytes.StartsWith(runeBytes.Slice(0, runeBytesWritten)); + } + + /// + /// Returns a value stating whether the current instance begins with + /// the specified . The specified comparison is used. + /// + public bool StartsWith(Rune value, StringComparison comparison) + { + // TODO_UTF8STRING: Optimize me to avoid allocations. + + return this.ToString().StartsWith(value.ToString(), comparison); + } + + /// + /// Returns a value stating whether the current instance begins with . + /// An ordinal comparison is used. + /// + public bool StartsWith(Utf8Span value) + { + return this.Bytes.StartsWith(value.Bytes); + } + + /// + /// Returns a value stating whether the current instance begins with . + /// The specified comparison is used. + /// + public bool StartsWith(Utf8Span value, StringComparison comparison) + { + // TODO_UTF8STRING: Optimize me to avoid allocations. + + return this.ToString().StartsWith(value.ToString(), comparison); + } + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs new file mode 100644 index 0000000..a4211e7 --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs @@ -0,0 +1,321 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.Diagnostics; +using System.Globalization; +using System.Runtime.CompilerServices; +using System.Text.Unicode; + +namespace System.Text +{ + public readonly ref partial struct Utf8Span + { + /// + /// Returns a new instance which represents this instance + /// normalized using the specified Unicode normalization form. 
+ /// + /// + /// The original is left unchanged by this operation. + /// + public Utf8String Normalize(NormalizationForm normalizationForm = NormalizationForm.FormC) + { + // TODO_UTF8STRING: Reduce allocations in this code path. + + return new Utf8String(this.ToString().Normalize(normalizationForm)); + } + + /// + /// Converts this to the desired Unicode normalization form, writing the + /// UTF-8 result to the buffer . + /// + /// + /// The number of bytes written to , or -1 if + /// is not large enough to hold the result of the normalization operation. + /// + /// + /// The original is left unchanged by this operation. Note that the required + /// length of may be longer or shorter (in terms of UTF-8 byte count) + /// than the input . + /// + public int Normalize(Span destination, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + // TODO_UTF8STRING: Reduce allocations in this code path. + + ReadOnlySpan normalized = this.ToString().Normalize(normalizationForm); + OperationStatus status = Utf8.FromUtf16(normalized, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true); + + Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "Normalize shouldn't have produced malformed Unicode string."); + + if (status != OperationStatus.Done) + { + bytesWritten = -1; // "destination too small" + } + + return bytesWritten; + } + + /// + /// Converts this to a . + /// + public unsafe char[] ToCharArray() + { + if (IsEmpty) + { + return Array.Empty(); + } + + // TODO_UTF8STRING: Since we know the underlying data is immutable, well-formed UTF-8, + // we can perform transcoding using an optimized code path that skips all safety checks. + // We should also consider skipping the two-pass if possible. 
+ + fixed (byte* pbUtf8 = &DangerousGetMutableReference()) + { + byte* pbUtf8Invalid = Utf8Utility.GetPointerToFirstInvalidByte(pbUtf8, this.Length, out int utf16CodeUnitCountAdjustment, out _); + Debug.Assert(pbUtf8Invalid == pbUtf8 + this.Length, "Invalid UTF-8 data seen in buffer."); + + char[] asUtf16 = new char[this.Length + utf16CodeUnitCountAdjustment]; + fixed (byte* pbUtf16 = &asUtf16.GetRawSzArrayData()) + { + OperationStatus status = Utf8Utility.TranscodeToUtf16(pbUtf8, this.Length, (char*)pbUtf16, asUtf16.Length, out byte* pbUtf8End, out char* pchUtf16End); + Debug.Assert(status == OperationStatus.Done, "The buffer changed out from under us unexpectedly?"); + Debug.Assert(pbUtf8End == pbUtf8 + this.Length, "The buffer changed out from under us unexpectedly?"); + Debug.Assert(pchUtf16End == ((char*)pbUtf16) + asUtf16.Length, "The buffer changed out from under us unexpectedly?"); + + return asUtf16; + } + } + } + + /// + /// Converts this instance to its UTF-16 equivalent, writing the result into + /// the buffer . + /// + /// + /// The number of chars written to , or -1 if + /// is not large enough to hold the result of the transcoding operation. + /// + public int ToChars(Span destination) + { + OperationStatus status = Utf8.ToUtf16(Bytes, destination, out int _, out int charsWritten, replaceInvalidSequences: false, isFinalBlock: true); + + Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "Utf8Spans shouldn't contain ill-formed UTF-8 data."); + + if (status != OperationStatus.Done) + { + charsWritten = -1; // "destination too small" + } + + return charsWritten; + } + + /// + /// Returns a new instance which represents this instance + /// converted to lowercase using . + /// + /// + /// The original is left unchanged by this operation. Note that the returned + /// instance may be longer or shorter (in terms of UTF-8 byte count) than the + /// input . 
+ /// + public Utf8String ToLower(CultureInfo culture) + { + // TODO_UTF8STRING: Avoid intermediate allocations. + + if (culture is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture); + } + + return new Utf8String(this.ToString().ToLower(culture)); + } + + /// + /// Converts this to lowercase using , writing the + /// UTF-8 result to the buffer . + /// + /// + /// The number of bytes written to , or -1 if + /// is not large enough to hold the result of the case conversion operation. + /// + /// + /// The original is left unchanged by this operation. Note that the required + /// length of may be longer or shorter (in terms of UTF-8 byte count) + /// than the input . + /// + public int ToLower(Span destination, CultureInfo culture) + { + // TODO_UTF8STRING: Avoid intermediate allocations. + + if (culture is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture); + } + + ReadOnlySpan asLower = this.ToString().ToLower(culture); + OperationStatus status = Utf8.FromUtf16(asLower, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true); + + Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToLower shouldn't have produced malformed Unicode string."); + + if (status != OperationStatus.Done) + { + bytesWritten = -1; // "destination too small" + } + + return bytesWritten; + } + + /// + /// Returns a new instance which represents this instance + /// converted to lowercase using the invariant culture. + /// + /// + /// The original is left unchanged by this operation. For more information on the + /// invariant culture, see the property. Note that the returned + /// instance may be longer or shorter (in terms of UTF-8 byte count) than the + /// input . + /// + public Utf8String ToLowerInvariant() + { + // TODO_UTF8STRING: Avoid intermediate allocations. 
+ + return new Utf8String(this.ToString().ToLowerInvariant()); + } + + /// + /// Converts this to lowercase using the invariant culture, writing the + /// UTF-8 result to the buffer . + /// + /// + /// The number of bytes written to , or -1 if + /// is not large enough to hold the result of the case conversion operation. + /// + /// + /// The original is left unchanged by this operation. For more information on the + /// invariant culture, see the property. Note that the required + /// length of may be longer or shorter (in terms of UTF-8 byte count) + /// than the input . + /// + public int ToLowerInvariant(Span destination) + { + // TODO_UTF8STRING: Avoid intermediate allocations. + + ReadOnlySpan asLowerInvariant = this.ToString().ToLowerInvariant(); + OperationStatus status = Utf8.FromUtf16(asLowerInvariant, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true); + + Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToLowerInvariant shouldn't have produced malformed Unicode string."); + + if (status != OperationStatus.Done) + { + bytesWritten = -1; // "destination too small" + } + + return bytesWritten; + } + + /// + /// Returns a new instance which represents this instance + /// converted to uppercase using . + /// + /// + /// The original is left unchanged by this operation. Note that the returned + /// instance may be longer or shorter (in terms of UTF-8 byte count) than the + /// input . + /// + public Utf8String ToUpper(CultureInfo culture) + { + // TODO_UTF8STRING: Avoid intermediate allocations. + + if (culture is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture); + } + + return new Utf8String(this.ToString().ToUpper(culture)); + } + + /// + /// Converts this to uppercase using , writing the + /// UTF-8 result to the buffer . 
+ /// + /// + /// The number of bytes written to , or -1 if + /// is not large enough to hold the result of the case conversion operation. + /// + /// + /// The original is left unchanged by this operation. Note that the required + /// length of may be longer or shorter (in terms of UTF-8 byte count) + /// than the input . + /// + public int ToUpper(Span destination, CultureInfo culture) + { + // TODO_UTF8STRING: Avoid intermediate allocations. + + if (culture is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture); + } + + ReadOnlySpan asUpper = this.ToString().ToUpper(culture); + OperationStatus status = Utf8.FromUtf16(asUpper, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true); + + Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToUpper shouldn't have produced malformed Unicode string."); + + if (status != OperationStatus.Done) + { + bytesWritten = -1; // "destination too small" + } + + return bytesWritten; + } + + /// + /// Returns a new instance which represents this instance + /// converted to uppercase using the invariant culture. + /// + /// + /// The original is left unchanged by this operation. For more information on the + /// invariant culture, see the property. Note that the returned + /// instance may be longer or shorter (in terms of UTF-8 byte count) than the + /// input . + /// + public Utf8String ToUpperInvariant() + { + // TODO_UTF8STRING: Avoid intermediate allocations. + + return new Utf8String(this.ToString().ToUpperInvariant()); + } + + /// + /// Converts this to uppercase using the invariant culture, writing the + /// UTF-8 result to the buffer . + /// + /// + /// The number of bytes written to , or -1 if + /// is not large enough to hold the result of the case conversion operation. + /// + /// + /// The original is left unchanged by this operation. For more information on the + /// invariant culture, see the property. 
Note that the required + /// length of may be longer or shorter (in terms of UTF-8 byte count) + /// than the input . + /// + public int ToUpperInvariant(Span destination) + { + // TODO_UTF8STRING: Avoid intermediate allocations. + + ReadOnlySpan asUpperInvariant = this.ToString().ToUpperInvariant(); + OperationStatus status = Utf8.FromUtf16(asUpperInvariant, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true); + + Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToUpperInvariant shouldn't have produced malformed Unicode string."); + + if (status != OperationStatus.Done) + { + bytesWritten = -1; // "destination too small" + } + + return bytesWritten; + } + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs new file mode 100644 index 0000000..6fba6ed --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs @@ -0,0 +1,137 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Buffers; +using System.Diagnostics; + +namespace System.Text +{ + public readonly ref partial struct Utf8Span + { + public CharEnumerable Chars => new CharEnumerable(this); + public RuneEnumerable Runes => new RuneEnumerable(this); + + public readonly ref struct CharEnumerable + { + private readonly Utf8Span _span; + + internal CharEnumerable(Utf8Span span) + { + _span = span; + } + + public Enumerator GetEnumerator() => new Enumerator(_span); + + public ref struct Enumerator + { + private uint _currentCharPair; + private ReadOnlySpan _remainingUtf8Bytes; + + internal Enumerator(Utf8Span span) + { + _currentCharPair = default; + _remainingUtf8Bytes = span.Bytes; + } + + public char Current => (char)_currentCharPair; + + public bool MoveNext() + { + // We don't need to worry about tearing since this enumerator is a ref struct. + + if (_currentCharPair > char.MaxValue) + { + // There was a surrogate pair smuggled in here from a previous operation. + // Shift out the high surrogate value and return immediately. + + _currentCharPair >>= 16; + return true; + } + + if (_remainingUtf8Bytes.IsEmpty) + { + return false; + } + + // TODO_UTF8STRING: Since we assume Utf8String instances are well-formed, we may instead + // call an optimized version of the "decode" routine below which skips well-formedness checks. + + OperationStatus status = Rune.DecodeFromUtf8(_remainingUtf8Bytes, out Rune currentRune, out int bytesConsumed); + Debug.Assert(status == OperationStatus.Done, "Somebody fed us invalid data?"); + + if (currentRune.IsBmp) + { + // Common case - BMP scalar value. + + _currentCharPair = (uint)currentRune.Value; + } + else + { + // Uncommon case - supplementary plane (astral) scalar value. + // We'll smuggle the two UTF-16 code units into a single 32-bit value, + // with the leading surrogate packed into the low 16 bits of the value, + // and the trailing surrogate packed into the high 16 bits of the value. 
+ + UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar((uint)currentRune.Value, out char leadingCodeUnit, out char trailingCodeUnit); + _currentCharPair = (uint)leadingCodeUnit + ((uint)trailingCodeUnit << 16); + } + + // TODO_UTF8STRING: We can consider unsafe slicing below if we wish since we know we're + // not going to overrun the end of the span. + + _remainingUtf8Bytes = _remainingUtf8Bytes.Slice(bytesConsumed); + return true; + } + } + } + + public readonly ref struct RuneEnumerable + { + private readonly Utf8Span _span; + + internal RuneEnumerable(Utf8Span span) + { + _span = span; + } + + public Enumerator GetEnumerator() => new Enumerator(_span); + + public ref struct Enumerator + { + private Rune _currentRune; + private ReadOnlySpan _remainingUtf8Bytes; + + internal Enumerator(Utf8Span span) + { + _currentRune = default; + _remainingUtf8Bytes = span.Bytes; + } + + public Rune Current => _currentRune; + + public bool MoveNext() + { + // We don't need to worry about tearing since this enumerator is a ref struct. + + if (_remainingUtf8Bytes.IsEmpty) + { + return false; + } + + // TODO_UTF8STRING: Since we assume Utf8Span instances are well-formed, we may instead + // call an optimized version of the "decode" routine below which skips well-formedness checks. + + OperationStatus status = Rune.DecodeFromUtf8(_remainingUtf8Bytes, out _currentRune, out int bytesConsumed); + Debug.Assert(status == OperationStatus.Done, "Somebody fed us invalid data?"); + + // TODO_UTF8STRING: We can consider unsafe slicing below if we wish since we know we're + // not going to overrun the end of the span. 
+ + _remainingUtf8Bytes = _remainingUtf8Bytes.Slice(bytesConsumed); + return true; + } + } + } + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs new file mode 100644 index 0000000..ff177ae --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs @@ -0,0 +1,536 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text.Unicode; + +namespace System.Text +{ + public readonly ref partial struct Utf8Span + { + [StackTraceHidden] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void CheckSplitOptions(Utf8StringSplitOptions options) + { + if ((uint)options > (uint)(Utf8StringSplitOptions.RemoveEmptyEntries | Utf8StringSplitOptions.TrimEntries)) + { + CheckSplitOptions_Throw(options); + } + } + + [StackTraceHidden] + private static void CheckSplitOptions_Throw(Utf8StringSplitOptions options) + { + throw new ArgumentOutOfRangeException( + paramName: nameof(options), + message: SR.Format(SR.Arg_EnumIllegalVal, (int)options)); + } + + public SplitResult Split(char separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None) + { + if (!Rune.TryCreate(separator, out Rune rune)) + { + throw new ArgumentOutOfRangeException( + paramName: nameof(separator), + message: SR.ArgumentOutOfRange_Utf16SurrogatesDisallowed); + } + + CheckSplitOptions(options); + + return new SplitResult(this, rune, options); + } + + public SplitResult Split(Rune separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None) + { + CheckSplitOptions(options); + + return new SplitResult(this, 
separator, options); + } + + public SplitResult Split(Utf8Span separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None) + { + if (separator.IsEmpty) + { + throw new ArgumentException( + paramName: nameof(separator), + message: SR.Argument_CannotBeEmptySpan); + } + + CheckSplitOptions(options); + + return new SplitResult(this, separator, options); + } + + /// + /// Locates within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// An ordinal search is performed. + /// + public SplitOnResult SplitOn(char separator) + { + return TryFind(separator, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// The search is performed using the specified . + /// + public SplitOnResult SplitOn(char separator, StringComparison comparisonType) + { + return TryFind(separator, comparisonType, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// An ordinal search is performed. + /// + public SplitOnResult SplitOn(Rune separator) + { + return TryFind(separator, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// The search is performed using the specified . 
+ /// + public SplitOnResult SplitOn(Rune separator, StringComparison comparisonType) + { + return TryFind(separator, comparisonType, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// An ordinal search is performed. + /// + public SplitOnResult SplitOn(Utf8Span separator) + { + return TryFind(separator, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// The search is performed using the specified . + /// + public SplitOnResult SplitOn(Utf8Span separator, StringComparison comparisonType) + { + return TryFind(separator, comparisonType, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates the last occurrence of within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// An ordinal search is performed. + /// + public SplitOnResult SplitOnLast(char separator) + { + return TryFindLast(separator, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates the last occurrence of within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// The search is performed using the specified . 
+ /// + public SplitOnResult SplitOnLast(char separator, StringComparison comparisonType) + { + return TryFindLast(separator, comparisonType, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates the last occurrence of within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// An ordinal search is performed. + /// + public SplitOnResult SplitOnLast(Rune separator) + { + return TryFindLast(separator, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates the last occurrence of within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// The search is performed using the specified . + /// + public SplitOnResult SplitOnLast(Rune separator, StringComparison comparisonType) + { + return TryFindLast(separator, comparisonType, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates the last occurrence of within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// An ordinal search is performed. + /// + public SplitOnResult SplitOnLast(Utf8Span separator) + { + return TryFindLast(separator, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Locates the last occurrence of within this instance, creating + /// instances which represent the data on either side of the separator. If is not found + /// within this instance, returns the tuple "(this, Empty)". + /// + /// + /// The search is performed using the specified . 
+ /// + public SplitOnResult SplitOnLast(Utf8Span separator, StringComparison comparisonType) + { + return TryFindLast(separator, comparisonType, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this); + } + + /// + /// Trims whitespace from the beginning and the end of this , + /// returning a new containing the resulting slice. + /// + public Utf8Span Trim() => TrimHelper(TrimType.Both); + + /// + /// Trims whitespace from only the end of this , + /// returning a new containing the resulting slice. + /// + public Utf8Span TrimEnd() => TrimHelper(TrimType.Tail); + + internal Utf8Span TrimHelper(TrimType trimType) + { + ReadOnlySpan retSpan = Bytes; + + if ((trimType & TrimType.Head) != 0) + { + int indexOfFirstNonWhiteSpaceChar = Utf8Utility.GetIndexOfFirstNonWhiteSpaceChar(retSpan); + Debug.Assert((uint)indexOfFirstNonWhiteSpaceChar <= (uint)retSpan.Length); + + // TODO_UTF8STRING: Can use an unsafe slicing routine below if we need a perf boost. + + retSpan = retSpan.Slice(indexOfFirstNonWhiteSpaceChar); + } + + if ((trimType & TrimType.Tail) != 0) + { + int indexOfTrailingWhiteSpaceSequence = Utf8Utility.GetIndexOfTrailingWhiteSpaceSequence(retSpan); + Debug.Assert((uint)indexOfTrailingWhiteSpaceSequence <= (uint)retSpan.Length); + + // TODO_UTF8STRING: Can use an unsafe slicing routine below if we need a perf boost. + + retSpan = retSpan.Slice(0, indexOfTrailingWhiteSpaceSequence); + } + + return UnsafeCreateWithoutValidation(retSpan); + } + + /// + /// Trims whitespace from only the beginning of this , + /// returning a new containing the resulting slice. 
+ /// + public Utf8Span TrimStart() => TrimHelper(TrimType.Head); + [StructLayout(LayoutKind.Auto)] + public readonly ref struct SplitResult + { + private readonly State _state; + + internal SplitResult(Utf8Span source, Rune searchRune, Utf8StringSplitOptions splitOptions) + { + _state = new State + { + RemainingSearchSpace = source, + SearchRune = searchRune.Value, + SearchTerm = default, + SplitOptions = splitOptions + }; + } + + internal SplitResult(Utf8Span source, Utf8Span searchTerm, Utf8StringSplitOptions splitOptions) + { + _state = new State + { + RemainingSearchSpace = source, + SearchRune = -1, + SearchTerm = searchTerm, + SplitOptions = splitOptions + }; + } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span item1, out Utf8Span item2) + { + _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out item2); + TrimIfNeeded(ref item2); + } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3) + { + _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span remainder); + _state.DeconstructHelper(in remainder, out item2, out item3); + TrimIfNeeded(ref item3); + } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4) + { + _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span remainder); + _state.DeconstructHelper(in remainder, out item2, out remainder); + _state.DeconstructHelper(in remainder, out item3, out item4); + TrimIfNeeded(ref item4); + } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5) + { + _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span remainder); + _state.DeconstructHelper(in remainder, out item2, out 
remainder); + _state.DeconstructHelper(in remainder, out item3, out remainder); + _state.DeconstructHelper(in remainder, out item4, out item5); + TrimIfNeeded(ref item5); + } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5, out Utf8Span item6) + { + _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span remainder); + _state.DeconstructHelper(in remainder, out item2, out remainder); + _state.DeconstructHelper(in remainder, out item3, out remainder); + _state.DeconstructHelper(in remainder, out item4, out remainder); + _state.DeconstructHelper(in remainder, out item5, out item6); + TrimIfNeeded(ref item6); + } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5, out Utf8Span item6, out Utf8Span item7) + { + _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span remainder); + _state.DeconstructHelper(in remainder, out item2, out remainder); + _state.DeconstructHelper(in remainder, out item3, out remainder); + _state.DeconstructHelper(in remainder, out item4, out remainder); + _state.DeconstructHelper(in remainder, out item5, out remainder); + _state.DeconstructHelper(in remainder, out item6, out item7); + TrimIfNeeded(ref item7); + } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5, out Utf8Span item6, out Utf8Span item7, out Utf8Span item8) + { + _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span remainder); + _state.DeconstructHelper(in remainder, out item2, out remainder); + _state.DeconstructHelper(in remainder, out item3, out remainder); + _state.DeconstructHelper(in remainder, out item4, out remainder); + 
_state.DeconstructHelper(in remainder, out item5, out remainder); + _state.DeconstructHelper(in remainder, out item6, out remainder); + _state.DeconstructHelper(in remainder, out item7, out item8); + TrimIfNeeded(ref item8); + } + + public Enumerator GetEnumerator() => new Enumerator(this); + + private void TrimIfNeeded(ref Utf8Span span) + { + if ((_state.SplitOptions & Utf8StringSplitOptions.TrimEntries) != 0) + { + span = span.Trim(); + } + } + + [StructLayout(LayoutKind.Auto)] + public ref struct Enumerator + { + private const Utf8StringSplitOptions HALT_ENUMERATION = (Utf8StringSplitOptions)int.MinValue; + + private Utf8Span _current; + private State _state; + + internal Enumerator(SplitResult result) + { + _current = default; + _state = result._state; // copy by value + } + + public Utf8Span Current => _current; + + public bool MoveNext() + { + // Happy path: if the search term was found, then the two 'out' fields below are overwritten with + // the contents of the (before, after) tuple, and we can return right away. + + if (_state.DeconstructHelper(in _state.RemainingSearchSpace, out _current, out _state.RemainingSearchSpace)) + { + return true; + } + + // At this point, the search term was not found within the search space. '_current' contains the last + // bit of data after the final occurrence of the search term. We'll also set a flag saying that we've + // completed enumeration. + + if (_current.IsEmpty && (_state.SplitOptions & Utf8StringSplitOptions.RemoveEmptyEntries) != 0) + { + return false; + } + + if ((_state.SplitOptions & HALT_ENUMERATION) != 0) + { + return false; + } + + _state.SplitOptions |= HALT_ENUMERATION; // prevents yielding forever at end of split + + return true; + } + } + + [StructLayout(LayoutKind.Auto)] + private ref struct State // fully mutable + { + internal Utf8Span RemainingSearchSpace; + internal int SearchRune; // -1 if not specified, takes less space than "Rune?" 
+ internal Utf8Span SearchTerm; + internal Utf8StringSplitOptions SplitOptions; + + // Returns 'true' if a match was found, 'false' otherwise. + internal readonly bool DeconstructHelper(in Utf8Span source, out Utf8Span firstItem, out Utf8Span remainder) + { + // n.b. Our callers might pass the same reference for 'source' and 'remainder'. + // We need to take care not to read 'source' after writing 'remainder'. + + bool wasMatchFound; + ref readonly Utf8Span searchSpan = ref source; + + while (true) + { + if (searchSpan.IsEmpty) + { + firstItem = searchSpan; + remainder = default; + wasMatchFound = false; + break; + } + + Range matchRange; + + if (SearchRune >= 0) + { + wasMatchFound = searchSpan.TryFind(Rune.UnsafeCreate((uint)SearchRune), out matchRange); + } + else + { + wasMatchFound = searchSpan.TryFind(SearchTerm, out matchRange); + } + + if (!wasMatchFound) + { + // If no match was found, we move 'source' to 'firstItem', trim if necessary, and return right away. + + firstItem = searchSpan; + + if ((SplitOptions & Utf8StringSplitOptions.TrimEntries) != 0) + { + firstItem = firstItem.Trim(); + } + + remainder = default; + } + else + { + // Otherwise, if a match was found, split the result across 'firstItem' and 'remainder', + // applying trimming if necessary. + + firstItem = searchSpan[..matchRange.Start]; // TODO_UTF8STRING: Could use unsafe slicing as optimization + remainder = searchSpan[matchRange.End..]; // TODO_UTF8STRING: Could use unsafe slicing as optimization + + if ((SplitOptions & Utf8StringSplitOptions.TrimEntries) != 0) + { + firstItem = firstItem.Trim(); + } + + // If we're asked to remove empty entries, loop until there's a real value in 'firstItem'. 
+ + if ((SplitOptions & Utf8StringSplitOptions.RemoveEmptyEntries) != 0 && firstItem.IsEmpty) + { + searchSpan = ref remainder; + continue; + } + } + + break; // loop only if explicit 'continue' statement was hit + } + + return wasMatchFound; + } + } + } + + [StructLayout(LayoutKind.Auto)] + public readonly ref struct SplitOnResult + { + // Used when there is no match. + internal SplitOnResult(Utf8Span originalSearchSpace) + { + Before = originalSearchSpace; + After = Empty; + } + + // Used when a match is found. + internal SplitOnResult(Utf8Span originalSearchSpace, Range searchTermMatchRange) + { + (int startIndex, int length) = searchTermMatchRange.GetOffsetAndLength(originalSearchSpace.Length); + + // TODO_UTF8STRING: The below indexer performs correctness checks. We can skip these checks (and even the + // bounds checks more generally) since we know the inputs are all valid and the containing struct is not + // subject to tearing. + + Before = originalSearchSpace[..startIndex]; + After = originalSearchSpace[(startIndex + length)..]; + } + + public Utf8Span After { get; } + public Utf8Span Before { get; } + + [EditorBrowsable(EditorBrowsableState.Never)] + public void Deconstruct(out Utf8Span before, out Utf8Span after) + { + before = Before; + after = After; + } + } + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs new file mode 100644 index 0000000..6be6e21 --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs @@ -0,0 +1,505 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Diagnostics; +using System.Globalization; +using System.Text.Unicode; + +namespace System.Text +{ + public readonly ref partial struct Utf8Span + { + /// + /// Attempts to locate the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// An ordinal search is performed. + /// + public bool TryFind(char value, out Range range) + { + if (Rune.TryCreate(value, out Rune rune)) + { + return TryFind(rune, out range); + } + else + { + // Surrogate chars can't exist in well-formed UTF-8 data - bail immediately. + + range = default; + return false; + } + } + + /// + /// Attempts to locate the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// The search is performed using the specified . + /// + public bool TryFind(char value, StringComparison comparisonType, out Range range) + { + if (Rune.TryCreate(value, out Rune rune)) + { + return TryFind(rune, comparisonType, out range); + } + else + { + string.CheckStringComparison(comparisonType); + + // Surrogate chars can't exist in well-formed UTF-8 data - bail immediately. + + range = default; + return false; + } + } + + /// + /// Attempts to locate the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// An ordinal search is performed. + /// + public bool TryFind(Rune value, out Range range) + { + if (value.IsAscii) + { + // Special-case ASCII since it's a simple single byte search. 
+ + int idx = Bytes.IndexOf((byte)value.Value); + if (idx < 0) + { + range = default; + return false; + } + else + { + range = idx..(idx + 1); + return true; + } + } + else + { + // Slower path: need to search a multi-byte sequence. + // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we + // know Rune instances are well-formed and slicing is safe. + + Span runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int utf8ByteLengthOfRune = value.EncodeToUtf8(runeBytes); + + return TryFind(UnsafeCreateWithoutValidation(runeBytes.Slice(0, utf8ByteLengthOfRune)), out range); + } + } + + /// + /// Attempts to locate the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// The search is performed using the specified . + /// + public bool TryFind(Rune value, StringComparison comparisonType, out Range range) + { + if (comparisonType == StringComparison.Ordinal) + { + return TryFind(value, out range); + } + else + { + // Slower path: not an ordinal comparison. + // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we + // know Rune instances are well-formed and slicing is safe. + + Span runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int utf8ByteLengthOfRune = value.EncodeToUtf8(runeBytes); + + return TryFind(UnsafeCreateWithoutValidation(runeBytes.Slice(0, utf8ByteLengthOfRune)), comparisonType, out range); + } + } + + /// + /// Attempts to locate the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// An ordinal search is performed. + /// + public bool TryFind(Utf8Span value, out Range range) + { + int idx; + + if (value.Bytes.Length == 1) + { + // Special-case ASCII since it's a simple single byte search. 
+
+                idx = this.Bytes.IndexOf(value.Bytes[0]);
+            }
+            else
+            {
+                // Slower path: need to search a multi-byte sequence.
+
+                idx = this.Bytes.IndexOf(value.Bytes);
+            }
+
+            if (idx < 0)
+            {
+                range = default;
+                return false;
+            }
+            else
+            {
+                range = idx..(idx + value.Bytes.Length);
+                return true;
+            }
+        }
+
+        /// <summary>
+        /// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public bool TryFind(Utf8Span value, StringComparison comparisonType, out Range range) => TryFind(value, comparisonType, out range, fromBeginning: true);
+
+        /// <summary>
+        /// Shared implementation behind <see cref="TryFind(Utf8Span, StringComparison, out Range)"/> and
+        /// <see cref="TryFindLast(Utf8Span, StringComparison, out Range)"/>. Searches for the first occurrence of
+        /// <paramref name="value"/> when <paramref name="fromBeginning"/> is <see langword="true"/>, otherwise
+        /// for the last occurrence.
+        /// </summary>
+        /// <remarks>
+        /// Ordinal searches are forwarded to the byte-wise overloads. Every other comparison transcodes both
+        /// operands to UTF-16, searches there, and then maps the UTF-16 match back to a UTF-8 byte range.
+        /// On failure <paramref name="range"/> is set to <see langword="default"/>.
+        /// </remarks>
+        private unsafe bool TryFind(Utf8Span value, StringComparison comparisonType, out Range range, bool fromBeginning)
+        {
+            string.CheckStringComparison(comparisonType);
+
+            if (value.IsEmpty)
+            {
+                // sourceString.IndexOf/LastIndexOf(term, comparer) should return the minimum/maximum value index
+                // for which the expression "sourceString.Substring(index).StartsWith(term, comparer)" is true.
+                // The range we return to the caller should reflect this so that they can pull out the correct index.
+
+                if (fromBeginning)
+                {
+                    range = Index.Start..Index.Start;
+                }
+                else
+                {
+                    range = Index.End..Index.End;
+                }
+                return true;
+            }
+
+            if (this.IsEmpty)
+            {
+                range = default;
+                return false;
+            }
+
+            CompareInfo compareInfo = default!; // will be overwritten if it matters
+            CompareOptions compareOptions = string.GetCaseCompareOfComparisonCulture(comparisonType);
+
+            if (GlobalizationMode.Invariant)
+            {
+                // In the Invariant globalization mode, all comparisons are normalized to Ordinal or OrdinalIgnoreCase,
+                // and even in "ignore case" we only map [a-z] <-> [A-Z]. All other code points remain unmapped.
+
+                // TODO_UTF8STRING: We should take advantage of the property described above to avoid the UTF-16
+                // transcoding step entirely.
+
+                // Only a case-sensitive request (CompareOptions.None) is a pure ordinal search. An ignore-case
+                // request must fall through to the transcoding path below, which asserts that compareOptions
+                // is IgnoreCase and performs the case-insensitive invariant search.
+                if (compareOptions == CompareOptions.None)
+                {
+                    return (fromBeginning)
+                        ? TryFind(value, out range)
+                        : TryFindLast(value, out range); // call the ordinal search routine
+                }
+            }
+            else
+            {
+                switch (comparisonType)
+                {
+                    case StringComparison.Ordinal:
+                        return (fromBeginning)
+                            ? TryFind(value, out range)
+                            : TryFindLast(value, out range);
+
+                    case StringComparison.OrdinalIgnoreCase:
+                        // TODO_UTF8STRING: Can probably optimize this case.
+                        compareInfo = CompareInfo.Invariant;
+                        break;
+
+                    case StringComparison.CurrentCulture:
+                    case StringComparison.CurrentCultureIgnoreCase:
+                        compareInfo = CultureInfo.CurrentCulture.CompareInfo;
+                        break;
+
+                    default:
+                        Debug.Assert(comparisonType == StringComparison.InvariantCulture || comparisonType == StringComparison.InvariantCultureIgnoreCase);
+                        compareInfo = CompareInfo.Invariant;
+                        break;
+                }
+            }
+
+            // TODO_UTF8STRING: Remove allocations below, and try to avoid the transcoding step if possible.
+
+            string thisTranscodedToUtf16 = this.ToStringNoReplacement();
+            string otherTranscodedToUtf16 = value.ToStringNoReplacement();
+
+            int idx, matchLength;
+
+            if (GlobalizationMode.Invariant)
+            {
+                // If we got here, it meant we're doing an OrdinalIgnoreCase comparison.
+
+                Debug.Assert(compareOptions == CompareOptions.IgnoreCase);
+
+                idx = CompareInfo.InvariantIndexOf(thisTranscodedToUtf16, otherTranscodedToUtf16, ignoreCase: true, fromBeginning);
+                matchLength = otherTranscodedToUtf16.Length; // If there was a match, it involved only simple case folding.
+            }
+            else
+            {
+                idx = compareInfo.IndexOf(thisTranscodedToUtf16, otherTranscodedToUtf16, 0, thisTranscodedToUtf16.Length, compareOptions, &matchLength, fromBeginning);
+            }
+
+            if (idx < 0)
+            {
+                // No match found. Bail out now.
+
+                range = default;
+                return false;
+            }
+
+            // If we reached this point, we found a match. The 'idx' local is the index in the source
+            // string (indexed by UTF-16 code units) where the match was found, and the 'matchLength'
+            // local is the number of chars in the source string which constitute the match. This length
+            // can be different than the length of the search string, as non-ordinal IndexOf operations
+            // follow Unicode full case folding semantics and might also normalize characters like
+            // digraphs.
+
+            fixed (char* pThisTranscodedToUtf16 = &thisTranscodedToUtf16.GetRawStringData())
+            {
+                // First, we need to convert the UTF-16 'idx' to its UTF-8 equivalent.
+
+                char* pStoppedCounting = Utf16Utility.GetPointerToFirstInvalidChar(pThisTranscodedToUtf16, idx, out long utf8CodeUnitCountAdjustment, out _);
+                Debug.Assert(pStoppedCounting == pThisTranscodedToUtf16 + idx, "We shouldn't have generated an ill-formed UTF-16 temp string.");
+                Debug.Assert((ulong)(idx + utf8CodeUnitCountAdjustment) <= (uint)this.Bytes.Length, "Start index should be within the source UTF-8 data.");
+
+                // Normally when we produce a UTF-8 code unit count from a UTF-16 source we
+                // need to perform 64-bit arithmetic so we don't overflow. But in this case
+                // we know the true original source was UTF-8, so its length is known already
+                // to fit into a signed 32-bit integer. So we'll perform an unchecked cast.
+
+                int utf8StartIdx = idx + (int)utf8CodeUnitCountAdjustment;
+
+                // Now we need to convert the UTF-16 'matchLength' to its UTF-8 equivalent.
+
+                pStoppedCounting = Utf16Utility.GetPointerToFirstInvalidChar(pThisTranscodedToUtf16 + idx, matchLength, out utf8CodeUnitCountAdjustment, out _);
+                Debug.Assert(pStoppedCounting == pThisTranscodedToUtf16 + idx + matchLength, "We shouldn't have generated an ill-formed UTF-16 temp string.");
+                Debug.Assert((ulong)(utf8StartIdx + matchLength + utf8CodeUnitCountAdjustment) <= (uint)this.Bytes.Length, "End index should be within the source UTF-8 data.");
+
+                int utf8EndIdx = utf8StartIdx + matchLength + (int)utf8CodeUnitCountAdjustment;
+
+                // Some quick sanity checks on the return value before we return.
+
+                Debug.Assert(0 <= utf8StartIdx);
+                Debug.Assert(utf8StartIdx <= utf8EndIdx);
+                Debug.Assert(utf8EndIdx <= this.Bytes.Length);
+
+                range = utf8StartIdx..utf8EndIdx;
+                return true;
+            }
+        }
+
+        /// <summary>
+        /// Attempts to locate the last occurrence of the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// An ordinal search is performed.
+        /// </remarks>
+        public bool TryFindLast(char value, out Range range)
+        {
+            if (Rune.TryCreate(value, out Rune rune))
+            {
+                return TryFindLast(rune, out range);
+            }
+            else
+            {
+                // Surrogate chars can't exist in well-formed UTF-8 data - bail immediately.
+
+                range = default;
+                return false;
+            }
+        }
+
+        ///
+        /// Attempts to locate the last occurrence of the target within this instance.
+        /// If is found, returns and sets to
+        /// the location where occurs within this instance.
+        /// If is not found, returns and sets
+        /// to .
+        ///
+        ///
+        /// The search is performed using the specified .
+ /// + public bool TryFindLast(char value, StringComparison comparisonType, out Range range) + { + if (Rune.TryCreate(value, out Rune rune)) + { + return TryFindLast(rune, comparisonType, out range); + } + else + { + string.CheckStringComparison(comparisonType); + + // Surrogate chars can't exist in well-formed UTF-8 data - bail immediately. + + range = default; + return false; + } + } + + /// + /// Attempts to locate the last occurrence of the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// An ordinal search is performed. + /// + public bool TryFindLast(Rune value, out Range range) + { + if (value.IsAscii) + { + // Special-case ASCII since it's a simple single byte search. + + int idx = Bytes.LastIndexOf((byte)value.Value); + if (idx < 0) + { + range = default; + return false; + } + else + { + range = idx..(idx + 1); + return true; + } + } + else + { + // Slower path: need to search a multi-byte sequence. + // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we + // know Rune instances are well-formed and slicing is safe. + + Span runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int utf8ByteLengthOfRune = value.EncodeToUtf8(runeBytes); + + return TryFindLast(UnsafeCreateWithoutValidation(runeBytes.Slice(0, utf8ByteLengthOfRune)), out range); + } + } + + /// + /// Attempts to locate the last occurrence of the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// The search is performed using the specified . 
+ /// + public bool TryFindLast(Rune value, StringComparison comparisonType, out Range range) + { + if (comparisonType == StringComparison.Ordinal) + { + return TryFindLast(value, out range); + } + else + { + // Slower path: not an ordinal comparison. + // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we + // know Rune instances are well-formed and slicing is safe. + + Span runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int utf8ByteLengthOfRune = value.EncodeToUtf8(runeBytes); + + return TryFindLast(UnsafeCreateWithoutValidation(runeBytes.Slice(0, utf8ByteLengthOfRune)), comparisonType, out range); + } + } + + /// + /// Attempts to locate the last occurrence of the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// An ordinal search is performed. + /// + public bool TryFindLast(Utf8Span value, out Range range) + { + int idx; + + if (value.Bytes.Length <= 1) + { + if (value.Bytes.Length == 1) + { + idx = this.Bytes.LastIndexOf(value.Bytes[0]); // special-case ASCII since it's a single byte search + } + else + { + idx = this.Length; // the last empty substring always occurs at the end of the buffer + } + } + else + { + // Slower path: need to search a multi-byte sequence. + + idx = this.Bytes.LastIndexOf(value.Bytes); + } + + if (idx < 0) + { + range = default; + return false; + } + else + { + range = idx..(idx + value.Bytes.Length); + return true; + } + } + + /// + /// Attempts to locate the last occurrence of the target within this instance. + /// If is found, returns and sets to + /// the location where occurs within this instance. + /// If is not found, returns and sets + /// to . + /// + /// + /// The search is performed using the specified . 
+ /// + public bool TryFindLast(Utf8Span value, StringComparison comparisonType, out Range range) => TryFind(value, comparisonType, out range, fromBeginning: false); + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs new file mode 100644 index 0000000..63a7770 --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs @@ -0,0 +1,288 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.ComponentModel; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text.Unicode; +using Internal.Runtime.CompilerServices; + +#pragma warning disable 0809 //warning CS0809: Obsolete member 'Utf8Span.Equals(object)' overrides non-obsolete member 'object.Equals(object)' + +#pragma warning disable SA1121 // explicitly using type aliases instead of built-in types +#if BIT64 +using nint = System.Int64; +using nuint = System.UInt64; +#else +using nint = System.Int32; +using nuint = System.UInt32; +#endif + +namespace System.Text +{ + [StructLayout(LayoutKind.Auto)] + public readonly ref partial struct Utf8Span + { + /// + /// Creates a from an existing instance. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Utf8Span(Utf8String? value) + { + Bytes = Utf8Extensions.AsBytes(value); + } + + /// + /// Ctor for internal use only. Caller _must_ validate both invariants hold: + /// (a) the buffer represents well-formed UTF-8 data, and + /// (b) the buffer is immutable. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private Utf8Span(ReadOnlySpan rawData) + { + // In debug builds, we want to ensure that the callers really did validate + // the buffer for well-formedness. 
The entire line below is removed when + // compiling release builds. + + Debug.Assert(Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(rawData, out _) == -1); + + Bytes = rawData; + } + + public ReadOnlySpan Bytes { get; } + + public static Utf8Span Empty => default; + + public bool IsEmpty => Bytes.IsEmpty; + + internal int Length => Bytes.Length; + + public Utf8Span this[Range range] + { + get + { + (int offset, int length) = range.GetOffsetAndLength(Length); + + // Check for a split across a multi-byte subsequence on the way out. + // Reminder: Unlike Utf8String, we can't safely dereference past the end of the span. + + ref byte newRef = ref DangerousGetMutableReference(offset); + if (length > 0 && Utf8Utility.IsUtf8ContinuationByte(newRef)) + { + Utf8String.ThrowImproperStringSplit(); + } + + int endIdx = offset + length; + if (endIdx < Length && Utf8Utility.IsUtf8ContinuationByte(DangerousGetMutableReference(endIdx))) + { + Utf8String.ThrowImproperStringSplit(); + } + + return UnsafeCreateWithoutValidation(new ReadOnlySpan(ref newRef, length)); + } + } + + /// + /// Returns a mutable reference to the first byte of this + /// (or, if this is empty, to where the first byte would be). + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal ref byte DangerousGetMutableReference() => ref MemoryMarshal.GetReference(Bytes); + + /// + /// Returns a mutable reference to the element at index + /// of this instance. The index is not bounds-checked. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal ref byte DangerousGetMutableReference(int index) + { + Debug.Assert(index >= 0, "Caller should've performed bounds checking."); + return ref DangerousGetMutableReference((uint)index); + } + + /// + /// Returns a mutable reference to the element at index + /// of this instance. The index is not bounds-checked. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal ref byte DangerousGetMutableReference(nuint index) + { + // Allow retrieving references to just past the end of the span (but shouldn't dereference this). + + Debug.Assert(index <= (uint)Length, "Caller should've performed bounds checking."); + return ref Unsafe.AddByteOffset(ref DangerousGetMutableReference(), index); + } + + public bool IsEmptyOrWhiteSpace() => (Utf8Utility.GetIndexOfFirstNonWhiteSpaceChar(Bytes) == Length); + + /// + /// This method is not supported as spans cannot be boxed. To compare two spans, use operator==. + /// + /// Always thrown by this method. + /// + /// + [Obsolete("Equals(object) on Utf8Span will always throw an exception. Use Equals(Utf8Span) or operator == instead.")] + [EditorBrowsable(EditorBrowsableState.Never)] + public override bool Equals(object? obj) + { + throw new NotSupportedException(SR.Utf8Span_CannotCallEqualsObject); + } + + public bool Equals(Utf8Span other) => Equals(this, other); + + public bool Equals(Utf8Span other, StringComparison comparison) => Equals(this, other, comparison); + + public static bool Equals(Utf8Span left, Utf8Span right) => left.Bytes.SequenceEqual(right.Bytes); + + public static bool Equals(Utf8Span left, Utf8Span right, StringComparison comparison) + { + // TODO_UTF8STRING: This perf can be improved, including removing + // the virtual dispatch by putting the switch directly in this method. + + // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted. + + return StringComparer.FromComparison(comparison).Equals(left.ToString(), right.ToString()); + } + + public override int GetHashCode() + { + // TODO_UTF8STRING: Consider whether this should use a different seed than String.GetHashCode. + // This method should only be called to calculate the hash code over spans that represent + // UTF-8 textual data, not over arbitrary binary sequences. 
+ + ulong seed = Marvin.DefaultSeed; + return Marvin.ComputeHash32(ref MemoryMarshal.GetReference(Bytes), (uint)Length /* in bytes */, (uint)seed, (uint)(seed >> 32)); + } + + public int GetHashCode(StringComparison comparison) + { + // TODO_UTF8STRING: This perf can be improved, including removing + // the virtual dispatch by putting the switch directly in this method. + + // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted. + + return StringComparer.FromComparison(comparison).GetHashCode(this.ToString()); + } + + /// <summary> + /// Returns <see langword="true"/> if this UTF-8 text consists of all-ASCII data, + /// <see langword="false"/> if there is any non-ASCII data within this UTF-8 text. + /// </summary> + /// <remarks> + /// ASCII text is defined as text consisting only of scalar values in the range [ U+0000..U+007F ]. + /// The runtime of this method is O(n). + /// </remarks> + public bool IsAscii() + { + // TODO_UTF8STRING: Use an API that takes 'ref byte' instead of a 'byte*' as a parameter. + + unsafe + { + fixed (byte* pData = &MemoryMarshal.GetReference(Bytes)) + { + return (ASCIIUtility.GetIndexOfFirstNonAsciiByte(pData, (uint)Length) == (uint)Length); + } + } + } + + public bool IsNormalized(NormalizationForm normalizationForm = NormalizationForm.FormC) + { + // TODO_UTF8STRING: Avoid allocations in this code path. + + return ToString().IsNormalized(normalizationForm); + } + + /// <summary> + /// Gets an immutable reference that can be used in a <see langword="fixed"/> statement. Unlike + /// <see cref="Utf8String"/>, the resulting reference is not guaranteed to be null-terminated. + /// </summary> + /// <remarks> + /// If this <see cref="Utf8Span"/> instance is empty, returns <see langword="null"/>. Dereferencing + /// such a reference will result in a <see cref="NullReferenceException"/> being generated. + /// </remarks> + [EditorBrowsable(EditorBrowsableState.Never)] + public ref readonly byte GetPinnableReference() + { + // This returns null if the underlying span is empty.
The reason for this is that unlike + // Utf8String, these buffers are not guaranteed to be null-terminated, so it's not always + // safe or meaningful to dereference the element just past the end of the buffer. + + return ref Bytes.GetPinnableReference(); + } + + public override string ToString() + { + // TODO_UTF8STRING: Since we know the underlying data is immutable, well-formed UTF-8, + // we can perform transcoding using an optimized code path that skips all safety checks. + + return Encoding.UTF8.GetString(Bytes); + } + + /// + /// Converts this instance to a . + /// + /// + /// This routine throws if the underlying instance + /// contains invalid UTF-8 data. + /// + internal unsafe string ToStringNoReplacement() + { + // TODO_UTF8STRING: Optimize the call below, potentially by avoiding the two-pass. + + fixed (byte* pData = &MemoryMarshal.GetReference(Bytes)) + { + byte* pFirstInvalidByte = Utf8Utility.GetPointerToFirstInvalidByte(pData, Length, out int utf16CodeUnitCountAdjustment, out _); + if (pFirstInvalidByte != pData + (uint)Length) + { + // Saw bad UTF-8 data. + // TODO_UTF8STRING: Throw a better exception below? + + ThrowHelper.ThrowInvalidOperationException(); + } + + int utf16CharCount = Length + utf16CodeUnitCountAdjustment; + Debug.Assert(utf16CharCount <= Length && utf16CharCount >= 0); + + // TODO_UTF8STRING: Can we call string.FastAllocate directly? + + return string.Create(utf16CharCount, (pbData: (IntPtr)pData, cbData: Length), (chars, state) => + { + OperationStatus status = Utf8.ToUtf16(new ReadOnlySpan((byte*)state.pbData, state.cbData), chars, out _, out _, replaceInvalidSequences: false); + Debug.Assert(status == OperationStatus.Done, "Did somebody mutate this Utf8String instance unexpectedly?"); + }); + } + } + + public Utf8String ToUtf8String() + { + // TODO_UTF8STRING: Since we know the underlying data is immutable, well-formed UTF-8, + // we can perform transcoding using an optimized code path that skips all safety checks. 
+ + return new Utf8String(Bytes); + } + + /// + /// Wraps a instance around the provided , + /// skipping validation of the input data. + /// + /// + /// Callers must uphold the following two invariants: + /// + /// (a) consists only of well-formed UTF-8 data and does + /// not contain invalid or incomplete UTF-8 subsequences; and + /// (b) the contents of will not change for the duration + /// of the returned 's existence. + /// + /// If these invariants are not maintained, the runtime may exhibit undefined behavior. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Utf8Span UnsafeCreateWithoutValidation(ReadOnlySpan buffer) + { + return new Utf8Span(buffer); + } + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs index e34149f..85ee986 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs @@ -4,6 +4,8 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Text; +using System.Text.Unicode; using Internal.Runtime.CompilerServices; namespace System @@ -26,7 +28,7 @@ namespace System [MethodImpl(MethodImplOptions.AggressiveInlining)] public static ReadOnlySpan AsBytes(this Utf8String? text) { - if (text == null) + if (text is null) return default; return new ReadOnlySpan(ref text.DangerousGetMutableReference(), text.Length); @@ -44,7 +46,7 @@ namespace System [MethodImpl(MethodImplOptions.AggressiveInlining)] public static ReadOnlySpan AsBytes(this Utf8String? text, int start) { - if (text == null) + if (text is null) { if (start != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -70,7 +72,7 @@ namespace System [MethodImpl(MethodImplOptions.AggressiveInlining)] public static ReadOnlySpan AsBytes(this Utf8String? 
text, int start, int length) { - if (text == null) + if (text is null) { if (start != 0 || length != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -90,21 +92,21 @@ namespace System } /// - /// Creates a new readonly span over the portion of the target . + /// Creates a new over the target . /// /// The target . /// Returns default when is null. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static ReadOnlySpan AsSpan(this Utf8String? text) + public static Utf8Span AsSpan(this Utf8String? text) { - if (text == null) + if (text is null) return default; - return new ReadOnlySpan(ref Unsafe.As(ref text.DangerousGetMutableReference()), text.Length); + return new Utf8Span(text); } /// - /// Creates a new readonly span over the portion of the target . + /// Creates a new over the portion of the target . /// /// The target . /// The index at which to begin this slice. @@ -112,10 +114,13 @@ namespace System /// /// Thrown when the specified index is not in range (<0 or >text.Length). /// + /// + /// Thrown if the resulting span would split a multi-byte UTF-8 subsequence. + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static ReadOnlySpan AsSpan(this Utf8String? text, int start) + public static Utf8Span AsSpan(this Utf8String? text, int start) { - if (text == null) + if (text is null) { if (start != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -125,11 +130,20 @@ namespace System if ((uint)start > (uint)text.Length) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); - return new ReadOnlySpan(ref Unsafe.As(ref text.DangerousGetMutableReference(start)), text.Length - start); + // It's always safe for us to read just past the end of the string (since there's a null terminator), + // so we don't need to perform any additional bounds checking. We only need to check that we're not + // splitting in the middle of a multi-byte UTF-8 subsequence. 
+ + if (Utf8Utility.IsUtf8ContinuationByte(text.DangerousGetMutableReference(start))) + { + Utf8String.ThrowImproperStringSplit(); + } + + return Utf8Span.UnsafeCreateWithoutValidation(new ReadOnlySpan(ref text.DangerousGetMutableReference(start), text.Length - start)); } /// - /// Creates a new readonly span over the portion of the target . + /// Creates a new over the portion of the target . /// /// The target . /// The index at which to begin this slice. @@ -138,10 +152,13 @@ namespace System /// /// Thrown when the specified index or is not in range. /// + /// + /// Thrown if the resulting span would split a multi-byte UTF-8 subsequence. + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static ReadOnlySpan AsSpan(this Utf8String? text, int start, int length) + public static Utf8Span AsSpan(this Utf8String? text, int start, int length) { - if (text == null) + if (text is null) { if (start != 0 || length != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -157,7 +174,17 @@ namespace System ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); #endif - return new ReadOnlySpan(ref Unsafe.As(ref text.DangerousGetMutableReference(start)), length); + // It's always safe for us to read just past the end of the string (since there's a null terminator), + // so we don't need to perform any additional bounds checking. We only need to check that we're not + // splitting in the middle of a multi-byte UTF-8 subsequence. + + if (Utf8Utility.IsUtf8ContinuationByte(text.DangerousGetMutableReference(start)) + || Utf8Utility.IsUtf8ContinuationByte(text.DangerousGetMutableReference(start + length))) + { + Utf8String.ThrowImproperStringSplit(); + } + + return Utf8Span.UnsafeCreateWithoutValidation(new ReadOnlySpan(ref text.DangerousGetMutableReference(start), length)); } /// Creates a new over the portion of the target . @@ -165,7 +192,7 @@ namespace System /// Returns default when is null. 
public static ReadOnlyMemory AsMemory(this Utf8String? text) { - if (text == null) + if (text is null) return default; return new ReadOnlyMemory(text, 0, text.Length); @@ -180,7 +207,7 @@ namespace System /// public static ReadOnlyMemory AsMemory(this Utf8String? text, int start) { - if (text == null) + if (text is null) { if (start != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -198,7 +225,7 @@ namespace System /// The index at which to begin this slice. public static ReadOnlyMemory AsMemory(this Utf8String? text, Index startIndex) { - if (text == null) + if (text is null) { if (!startIndex.Equals(Index.Start)) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text); @@ -223,7 +250,7 @@ namespace System /// public static ReadOnlyMemory AsMemory(this Utf8String? text, int start, int length) { - if (text == null) + if (text is null) { if (start != 0 || length != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -247,7 +274,7 @@ namespace System /// The range used to indicate the start and length of the sliced string. public static ReadOnlyMemory AsMemory(this Utf8String? text, Range range) { - if (text == null) + if (text is null) { Index startIndex = range.Start; Index endIndex = range.End; @@ -267,7 +294,7 @@ namespace System /// Returns default when is null. public static ReadOnlyMemory AsMemoryBytes(this Utf8String? text) { - if (text == null) + if (text is null) return default; return new ReadOnlyMemory(text, 0, text.Length); @@ -282,7 +309,7 @@ namespace System /// public static ReadOnlyMemory AsMemoryBytes(this Utf8String? text, int start) { - if (text == null) + if (text is null) { if (start != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -300,7 +327,7 @@ namespace System /// The index at which to begin this slice. public static ReadOnlyMemory AsMemoryBytes(this Utf8String? 
text, Index startIndex) { - if (text == null) + if (text is null) { if (!startIndex.Equals(Index.Start)) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text); @@ -325,7 +352,7 @@ namespace System /// public static ReadOnlyMemory AsMemoryBytes(this Utf8String? text, int start, int length) { - if (text == null) + if (text is null) { if (start != 0 || length != 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); @@ -349,7 +376,7 @@ namespace System /// The range used to indicate the start and length of the sliced string. public static ReadOnlyMemory AsMemoryBytes(this Utf8String? text, Range range) { - if (text == null) + if (text is null) { Index startIndex = range.Start; Index endIndex = range.End; @@ -363,5 +390,10 @@ namespace System (int start, int length) = range.GetOffsetAndLength(text.Length); return new ReadOnlyMemory(text, start, length); } + + /// + /// Creates a new representation of this . + /// + public static Utf8String ToUtf8String(this Rune rune) => Utf8String.CreateFromRune(rune); } } diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs new file mode 100644 index 0000000..1a20f36 --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs @@ -0,0 +1,142 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.Diagnostics; +using System.Text; + +namespace System +{ + public sealed partial class Utf8String + { + /* + * COMPARISON OF UTF-8 AGAINST UTF-16 + */ + + /// + /// Returns a value stating whether and + /// represent the same data. An ordinal comparison is performed scalar-by-scalar. + /// + /// + /// This method returns if both and + /// are null, or if both are empty. 
This method returns + /// if either input contains an ill-formed subsequence. Otherwise, this method returns + /// if and only if both arguments decode to the same Unicode scalar value sequence. + /// + public static bool AreEquivalent(Utf8String? utf8Text, string? utf16Text) + { + if (ReferenceEquals(utf8Text, utf16Text)) + { + return true; // both are null + } + + if (utf8Text is null || utf16Text is null) + { + return false; // null is never equivalent to non-null + } + + if (utf8Text.Length == 0 && utf16Text.Length == 0) + { + return true; // empty is equivalent to empty + } + + // Short-circuit: are the texts of sufficiently different lengths that + // they could never be equivalent? This check allows us to skip the + // normal decoding walk, which is O(n). + // + // The maximum length of a 'System.String' is around 1 billion elements, + // so we can perform the multiplication within an unsigned 32-bit domain. + + Debug.Assert((ulong)utf16Text.Length * MAX_UTF8_BYTES_PER_UTF16_CHAR <= uint.MaxValue, "Did somebody change the max. allowed string length?"); + + if (utf8Text.Length < utf16Text.Length + || ((uint)utf16Text.Length * MAX_UTF8_BYTES_PER_UTF16_CHAR < (uint)utf8Text.Length)) + { + return false; + } + + return AreEquivalentOrdinalSkipShortCircuitingChecks(utf8Text.AsBytes(), utf16Text); + } + + /// + /// Returns a value stating whether and + /// represent the same data. An ordinal comparison is performed scalar-by-scalar. + /// + /// + /// This method returns if both and + /// are empty. This method returns + /// if either input contains an ill-formed subsequence. Otherwise, this method returns + /// if and only if both arguments decode to the same Unicode scalar value sequence. + /// + public static bool AreEquivalent(Utf8Span utf8Text, ReadOnlySpan utf16Text) => AreEquivalent(utf8Text.Bytes, utf16Text); + + /// + /// Returns a value stating whether and + /// represent the same data. An ordinal comparison is performed scalar-by-scalar. 
+ /// + /// + /// This method returns if both and + /// are empty. This method returns + /// if either input contains an ill-formed subsequence. Otherwise, this method returns + /// if and only if both arguments decode to the same Unicode scalar value sequence. + /// + public static bool AreEquivalent(ReadOnlySpan utf8Text, ReadOnlySpan utf16Text) + { + if (utf8Text.Length == 0 && utf16Text.Length == 0) + { + // Don't use IsEmpty for this check; JIT can optimize "Length == 0" better + // for this particular scenario. + + return true; + } + + // Same check as the (Utf8String, string) overload. The primary difference is that + // since spans can be up to 2 billion elements in length, we need to perform + // the multiplication step in the unsigned 64-bit domain to avoid integer overflow. + + if (utf8Text.Length < utf16Text.Length + || ((ulong)(uint)utf16Text.Length * MAX_UTF8_BYTES_PER_UTF16_CHAR < (uint)utf8Text.Length)) + { + return false; + } + + return AreEquivalentOrdinalSkipShortCircuitingChecks(utf8Text, utf16Text); + } + + private static bool AreEquivalentOrdinalSkipShortCircuitingChecks(ReadOnlySpan utf8Text, ReadOnlySpan utf16Text) + { + while (!utf16Text.IsEmpty) + { + // If the next UTF-16 subsequence is malformed or incomplete, or if the next + // UTF-8 subsequence is malformed or incomplete, or if they don't decode to + // the exact same Unicode scalar value, fail. + // + // The Rune.DecodeFrom* APIs handle empty inputs just fine and return "Incomplete". + + // TODO_UTF8STRING: If we assume Utf8String contains well-formed UTF-8, we could + // create a version of this method that calls a faster implementation of DecodeFromUtf8. + // We'd need to be careful not to call that optimized routine if the user passed + // us a normal ROS that didn't originate from a Utf8String or similar. 
+ + if (Rune.DecodeFromUtf16(utf16Text, out Rune scalarFromUtf16, out int charsConsumedJustNow) != OperationStatus.Done + || Rune.DecodeFromUtf8(utf8Text, out Rune scalarFromUtf8, out int bytesConsumedJustNow) != OperationStatus.Done + || scalarFromUtf16 != scalarFromUtf8) + { + return false; + } + + // TODO_UTF8STRING: As an optimization, we could perform unsafe slices below. + + utf16Text = utf16Text.Slice(charsConsumedJustNow); + utf8Text = utf8Text.Slice(bytesConsumedJustNow); + } + + // We decoded the entire UTF-16 input, and so far it has matched the decoded form + // of the UTF-8 input. Now just make sure we've also decoded the entirety of the + // UTF-8 data, otherwise the input strings aren't equivalent. + + return utf8Text.IsEmpty; + } + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs index 4b678b6..bbaecf0 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; @@ -169,6 +170,16 @@ namespace System #endif private Utf8String Ctor(string? 
value) => Ctor(value.AsSpan()); + internal static Utf8String CreateFromRune(Rune value) + { + Utf8String newString = FastAllocate(value.Utf8SequenceLength); + int bytesWritten = value.EncodeToUtf8(new Span(ref newString.DangerousGetMutableReference(), newString.Length)); + + Debug.Assert(bytesWritten == value.Utf8SequenceLength); + + return newString; + } + /* * HELPER METHODS */ diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs index 4cb7816..0767abe 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs @@ -6,6 +6,7 @@ using System.ComponentModel; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Text; using Internal.Runtime.CompilerServices; @@ -19,6 +20,9 @@ namespace System IEquatable #nullable restore { + // For values beyond U+FFFF, it's 4 UTF-8 bytes per 2 UTF-16 chars (2:1 ratio) + private const int MAX_UTF8_BYTES_PER_UTF16_CHAR = 3; + /* * STATIC FIELDS */ @@ -55,7 +59,12 @@ namespace System /// /// Projects a instance as a . /// - public static implicit operator ReadOnlySpan(Utf8String? value) => value.AsSpan(); + public static implicit operator ReadOnlySpan(Utf8String? value) => MemoryMarshal.Cast(value.AsSpan().Bytes); + + /// + /// Projects a instance as a . + /// + public static implicit operator Utf8Span(Utf8String? 
value) => new Utf8Span(value); /* * INSTANCE PROPERTIES @@ -233,5 +242,12 @@ namespace System return Encoding.UTF8.GetString(new ReadOnlySpan(ref DangerousGetMutableReference(), Length)); } + + [StackTraceHidden] + internal static void ThrowImproperStringSplit() + { + throw new InvalidOperationException( + message: SR.Utf8String_CannotSplitMultibyteSubsequence); + } } } diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs new file mode 100644 index 0000000..29a00a2 --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs @@ -0,0 +1,17 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace System +{ + // TODO_UTF8STRING: This should be removed and we should use regular StringSplitOptions + // once a 'TrimEntries' flag gets added to the type. 
+ + [Flags] + public enum Utf8StringSplitOptions + { + None = 0, + RemoveEmptyEntries = 1, + TrimEntries = 2 + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index 5e8a46b..3745146 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -814,6 +814,7 @@ + @@ -828,6 +829,7 @@ + diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs index eecfc26..3b8350f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs @@ -1013,7 +1013,7 @@ namespace System.Globalization /// The following IndexOf overload is mainly used by String.Replace. This overload assumes the parameters are already validated /// and the caller is passing a valid matchLengthPtr pointer. 
/// - internal unsafe int IndexOf(string source, string value, int startIndex, int count, CompareOptions options, int* matchLengthPtr) + internal unsafe int IndexOf(string source, string value, int startIndex, int count, CompareOptions options, int* matchLengthPtr, bool fromBeginning = true) { Debug.Assert(source != null); Debug.Assert(value != null); @@ -1036,7 +1036,16 @@ namespace System.Globalization if (options == CompareOptions.OrdinalIgnoreCase) { - int res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase: true); + int res; + if (fromBeginning) + { + res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase: true); + } + else + { + res = LastIndexOfOrdinal(source, value, startIndex, count, ignoreCase: true); + } + if (res >= 0 && matchLengthPtr != null) { *matchLengthPtr = value.Length; @@ -1046,7 +1055,18 @@ namespace System.Globalization if (GlobalizationMode.Invariant) { - int res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase: (options & (CompareOptions.IgnoreCase | CompareOptions.OrdinalIgnoreCase)) != 0); + bool ignoreCase = (options & (CompareOptions.IgnoreCase | CompareOptions.OrdinalIgnoreCase)) != 0; + int res; + + if (fromBeginning) + { + res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase); + } + else + { + res = LastIndexOfOrdinal(source, value, startIndex, count, ignoreCase); + } + if (res >= 0 && matchLengthPtr != null) { *matchLengthPtr = value.Length; @@ -1056,11 +1076,24 @@ namespace System.Globalization if (options == CompareOptions.Ordinal) { - int retValue = SpanHelpers.IndexOf( - ref Unsafe.Add(ref source.GetRawStringData(), startIndex), - count, - ref value.GetRawStringData(), - value.Length); + int retValue; + + if (fromBeginning) + { + retValue = SpanHelpers.IndexOf( + ref Unsafe.Add(ref source.GetRawStringData(), startIndex), + count, + ref value.GetRawStringData(), + value.Length); + } + else + { + retValue = SpanHelpers.LastIndexOf( + ref Unsafe.Add(ref source.GetRawStringData(), 
startIndex), + count, + ref value.GetRawStringData(), + value.Length); + } if (retValue >= 0) { @@ -1075,7 +1108,15 @@ namespace System.Globalization } else { - return IndexOfCore(source, value, startIndex, count, options, matchLengthPtr); + if (fromBeginning) + { + // Call the string-based overload, as it special-cases IsFastSort as a perf optimization. + return IndexOfCore(source, value, startIndex, count, options, matchLengthPtr); + } + else + { + return IndexOfCore(source.AsSpan(startIndex, count), value, options, matchLengthPtr, fromBeginning: false); + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index 0c79ce3..4b190ee 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1784,7 +1784,7 @@ namespace System int start = 0; // Trim specified characters. - if (trimType != TrimType.Tail) + if ((trimType & TrimType.Head) != 0) { for (start = 0; start < Length; start++) { @@ -1795,7 +1795,7 @@ namespace System } } - if (trimType != TrimType.Head) + if ((trimType & TrimType.Tail) != 0) { for (end = Length - 1; end >= start; end--) { @@ -1820,7 +1820,7 @@ namespace System int start = 0; // Trim specified characters. - if (trimType != TrimType.Tail) + if ((trimType & TrimType.Head) != 0) { for (start = 0; start < Length; start++) { @@ -1841,7 +1841,7 @@ namespace System } } - if (trimType != TrimType.Head) + if ((trimType & TrimType.Tail) != 0) { for (end = Length - 1; end >= start; end--) { @@ -1873,12 +1873,5 @@ namespace System len == 0 ? 
string.Empty : InternalSubString(start, len); } - - private enum TrimType - { - Head = 0, - Tail = 1, - Both = 2 - } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs b/src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs new file mode 100644 index 0000000..db75688 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs @@ -0,0 +1,30 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +namespace System.Text +{ + /// + /// Specifies which portions of the string should be trimmed in a trimming operation. + /// + [Flags] + internal enum TrimType + { + /// + /// Trim from the beginning of the string. + /// + Head = 1 << 0, + + /// + /// Trim from the end of the string. + /// + Tail = 1 << 1, + + /// + /// Trim from both the beginning and the end of the string. + /// + Both = Head | Tail + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs index 46eb8b5..693c6f4 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -431,7 +431,7 @@ namespace System.Text.Unicode /// i.e., has binary representation 10xxxxxx, where x is any bit. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsUtf8ContinuationByte(in byte value) + internal static bool IsUtf8ContinuationByte(in byte value) { // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements // directly rather than bounce a temporary through a register. 
That is, we want the JIT to be diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs new file mode 100644 index 0000000..968144a --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs @@ -0,0 +1,139 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using Internal.Runtime.CompilerServices; + +#pragma warning disable SA1121 // explicitly using type aliases instead of built-in types +#if BIT64 +using nint = System.Int64; +using nuint = System.UInt64; +#else +using nint = System.Int32; +using nuint = System.UInt32; +#endif + +namespace System.Text.Unicode +{ + internal static partial class Utf8Utility + { + /// + /// Returns the index in where the first non-whitespace character + /// appears, or the input length if the data contains only whitespace characters. + /// + public static int GetIndexOfFirstNonWhiteSpaceChar(ReadOnlySpan utf8Data) + { + return (int)GetIndexOfFirstNonWhiteSpaceChar(ref MemoryMarshal.GetReference(utf8Data), (uint)utf8Data.Length); + } + + private static nuint GetIndexOfFirstNonWhiteSpaceChar(ref byte utf8Data, nuint length) + { + // This method is optimized for the case where the input data is ASCII, and if the + // data does need to be trimmed it's likely that only a relatively small number of + // bytes will be trimmed. + + nuint i = 0; + + while (i < length) + { + // Very quick check: see if the byte is in the range [ 21 .. 7F ]. + // If so, we can skip the more expensive logic later in this method. 
+ + if ((sbyte)Unsafe.AddByteOffset(ref utf8Data, i) > (sbyte)0x20) + { + break; + } + + uint possibleAsciiByte = Unsafe.AddByteOffset(ref utf8Data, i); + if (UnicodeUtility.IsAsciiCodePoint(possibleAsciiByte)) + { + // The simple comparison failed. Let's read the actual byte value, + // and if it's ASCII we can delegate to Rune's inlined method + // implementation. + + if (Rune.IsWhiteSpace(Rune.UnsafeCreate(possibleAsciiByte))) + { + i++; + continue; + } + } + else + { + // Not ASCII data. Go back to the slower "decode the entire scalar" + // code path, then compare it against our Unicode tables. + + Rune.DecodeFromUtf8(new ReadOnlySpan(ref utf8Data, (int)length).Slice((int)i), out Rune decodedRune, out int bytesConsumed); + if (Rune.IsWhiteSpace(decodedRune)) + { + i += (uint)bytesConsumed; + continue; + } + } + + break; // If we got here, we saw a non-whitespace subsequence. + } + + return i; + } + + /// + /// Returns the index in where the trailing whitespace sequence + /// begins, or 0 if the data contains only whitespace characters, or the span length if the + /// data does not end with any whitespace characters. + /// + public static int GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan utf8Data) + { + return (int)GetIndexOfTrailingWhiteSpaceSequence(ref MemoryMarshal.GetReference(utf8Data), (uint)utf8Data.Length); + } + + private static nuint GetIndexOfTrailingWhiteSpaceSequence(ref byte utf8Data, nuint length) + { + // This method is optimized for the case where the input data is ASCII, and if the + // data does need to be trimmed it's likely that only a relatively small number of + // bytes will be trimmed. + + while (length > 0) + { + // Very quick check: see if the byte is in the range [ 21 .. 7F ]. + // If so, we can skip the more expensive logic later in this method. 
+ + if ((sbyte)Unsafe.Add(ref Unsafe.AddByteOffset(ref utf8Data, length), -1) > (sbyte)0x20) + { + break; + } + + uint possibleAsciiByte = Unsafe.Add(ref Unsafe.AddByteOffset(ref utf8Data, length), -1); + if (UnicodeUtility.IsAsciiCodePoint(possibleAsciiByte)) + { + // The simple comparison failed. Let's read the actual byte value, + // and if it's ASCII we can delegate to Rune's inlined method + // implementation. + + if (Rune.IsWhiteSpace(Rune.UnsafeCreate(possibleAsciiByte))) + { + length--; + continue; + } + } + else + { + // Not ASCII data. Go back to the slower "decode the entire scalar" + // code path, then compare it against our Unicode tables. + + Rune.DecodeLastFromUtf8(new ReadOnlySpan(ref utf8Data, (int)length), out Rune decodedRune, out int bytesConsumed); + if (Rune.IsWhiteSpace(decodedRune)) + { + length -= (uint)bytesConsumed; + continue; + } + } + + break; // If we got here, we saw a non-whitespace subsequence. + } + + return length; + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs index 50a3e3f..01aa0a9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs @@ -4,7 +4,6 @@ using System.Buffers; using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; using System.IO; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -46,15 +45,28 @@ namespace System.Text.Unicode #if FEATURE_UTF8STRING /// + /// Returns a value stating whether contains only well-formed UTF-8 data. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe bool IsWellFormedUtf8(ReadOnlySpan utf8Data) + { + fixed (byte* pUtf8Data = &MemoryMarshal.GetReference(utf8Data)) + { + // The return value here will point to the end of the span if the data is well-formed. 
+ byte* pFirstInvalidByte = GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int _, out _); + return (pFirstInvalidByte == (pUtf8Data + (uint)utf8Data.Length)); + } + } + + /// /// Returns if it is null or contains only well-formed UTF-8 data; /// otherwises allocates a new instance containing the same data as /// but where all invalid UTF-8 sequences have been replaced - /// with U+FFD. + /// with U+FFFD. /// - [return: NotNullIfNotNull("value")] - public static Utf8String? ValidateAndFixupUtf8String(Utf8String? value) + public static Utf8String ValidateAndFixupUtf8String(Utf8String value) { - if (Utf8String.IsNullOrEmpty(value)) + if (value.Length == 0) { return value; } -- 2.7.4