Introduce Utf8Span, which is a span of UTF-8 text (dotnet/coreclr#26711)

author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>

Thu, 26 Sep 2019 00:36:44 +0000 (17:36 -0700)

committer GitHub <noreply@github.com>

Thu, 26 Sep 2019 00:36:44 +0000 (17:36 -0700)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Thu, 26 Sep 2019 00:36:44 +0000 (17:36 -0700)
committer GitHub <noreply@github.com>
Thu, 26 Sep 2019 00:36:44 +0000 (17:36 -0700)
diff --git a/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx b/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx

index f5563a2..e2afd26 100644 (file)
--- a/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx
+++ b/src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx
@@ -3766,4 +3766,16 @@
    <data name="InvalidOperation_MultipleComUnRegFunctions" xml:space="preserve">
      <value>Type '{0}' has more than one COM unregistration function.</value>
    </data>
+  <data name="Utf8String_CannotSplitMultibyteSubsequence" xml:space="preserve">
+    <value>Cannot create the desired substring because it would split a multi-byte UTF-8 subsequence.</value>
+  </data>
+  <data name="Utf8Span_CannotCallEqualsObject" xml:space="preserve">
+    <value>Cannot call Utf8Span.Equals(object). Use Equals(Utf8Span) or operator == instead.</value>
+  </data>
+  <data name="ArgumentOutOfRange_Utf16SurrogatesDisallowed" xml:space="preserve">
+    <value>UTF-16 surrogate code points (U+D800..U+DFFF) are disallowed.</value>
+  </data>
+  <data name="Argument_CannotBeEmptySpan" xml:space="preserve">
+    <value>Argument cannot be an empty span.</value>
+  </data>
  </root>
diff --git a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj

index 335de4b..604deae 100644 (file)
--- a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj
+++ b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj
@@ -303,9 +303,17 @@
      <Compile Include="$(BclSourcesRoot)\System\Char8.cs" />
      <Compile Include="$(BclSourcesRoot)\System\Utf8Extensions.cs" />
      <Compile Include="$(BclSourcesRoot)\System\Utf8String.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Utf8String.Comparison.cs" />
      <Compile Include="$(BclSourcesRoot)\System\Utf8String.Construction.cs" />
      <Compile Include="$(BclSourcesRoot)\System\Utf8String.Manipulation.cs" />
      <Compile Include="$(BclSourcesRoot)\System\Utf8String.Searching.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Utf8StringSplitOptions.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Text\Utf8Span.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Text\Utf8Span.Comparison.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Text\Utf8Span.Conversion.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Text\Utf8Span.Enumeration.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Text\Utf8Span.Manipulation.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Text\Utf8Span.Searching.cs" />
    </ItemGroup>
    <ItemGroup>
      <Compile Include="$(BclSourcesRoot)\System\Diagnostics\Eventing\XplatEventLogger.cs" Condition="'$(FeatureXplatEventSource)' == 'true'" />
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs

new file mode 100644 (file)

index 0000000..317927f
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs
@@ -0,0 +1,228 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Text.Unicode;
+
+namespace System.Text
+{
+    public readonly ref partial struct Utf8Span
+    {
+        public static bool operator ==(Utf8Span left, Utf8Span right) => Equals(left, right);
+        public static bool operator !=(Utf8Span left, Utf8Span right) => !Equals(left, right);
+
+        public int CompareTo(Utf8Span other)
+        {
+            // TODO_UTF8STRING: This is ordinal, but String.CompareTo uses CurrentCulture.
+            // Is this acceptable?
+
+            // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted.
+
+            return StringComparer.Ordinal.Compare(this.ToString(), other.ToString());
+        }
+
+        public int CompareTo(Utf8Span other, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: We can avoid the virtual dispatch by moving the switch into this method.
+
+            // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted.
+
+            return StringComparer.FromComparison(comparison).Compare(this.ToString(), other.ToString());
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance contains
+        /// <paramref name="value"/>. An ordinal comparison is used.
+        /// </summary>
+        public bool Contains(char value)
+        {
+            return Rune.TryCreate(value, out Rune rune) && Contains(rune);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance contains
+        /// <paramref name="value"/>. The specified comparison is used.
+        /// </summary>
+        public bool Contains(char value, StringComparison comparison)
+        {
+            return Rune.TryCreate(value, out Rune rune) && Contains(rune, comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance contains
+        /// the specified <see cref="Rune"/>. An ordinal comparison is used.
+        /// </summary>
+        public bool Contains(Rune value)
+        {
+            // TODO_UTF8STRING: This should be split into two methods:
+            // One which operates on a single-byte (ASCII) search value,
+            // the other which operates on a multi-byte (non-ASCII) search value.
+
+            Span<byte> runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+            int runeBytesWritten = value.EncodeToUtf8(runeBytes);
+
+            return (this.Bytes.IndexOf(runeBytes.Slice(0, runeBytesWritten)) >= 0);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance contains
+        /// the specified <see cref="Rune"/>. The specified comparison is used.
+        /// </summary>
+        public bool Contains(Rune value, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: Optimize me to avoid allocations.
+
+            return this.ToString().Contains(value.ToString(), comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance contains <paramref name="value"/>.
+        /// An ordinal comparison is used.
+        /// </summary>
+        public bool Contains(Utf8Span value)
+        {
+            return (this.Bytes.IndexOf(value.Bytes) >= 0);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance contains <paramref name="value"/>.
+        /// The specified comparison is used.
+        /// </summary>
+        public bool Contains(Utf8Span value, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: Optimize me to avoid allocations.
+
+            return this.ToString().Contains(value.ToString(), comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance ends with
+        /// <paramref name="value"/>. An ordinal comparison is used.
+        /// </summary>
+        public bool EndsWith(char value)
+        {
+            return Rune.TryCreate(value, out Rune rune) && EndsWith(rune);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance ends with
+        /// <paramref name="value"/>. The specified comparison is used.
+        /// </summary>
+        public bool EndsWith(char value, StringComparison comparison)
+        {
+            return Rune.TryCreate(value, out Rune rune) && EndsWith(rune, comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance ends with
+        /// the specified <see cref="Rune"/>. An ordinal comparison is used.
+        /// </summary>
+        public bool EndsWith(Rune value)
+        {
+            // TODO_UTF8STRING: This should be split into two methods:
+            // One which operates on a single-byte (ASCII) search value,
+            // the other which operates on a multi-byte (non-ASCII) search value.
+
+            Span<byte> runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+            int runeBytesWritten = value.EncodeToUtf8(runeBytes);
+
+            return this.Bytes.EndsWith(runeBytes.Slice(0, runeBytesWritten));
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance ends with
+        /// the specified <see cref="Rune"/>. The specified comparison is used.
+        /// </summary>
+        public bool EndsWith(Rune value, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: Optimize me to avoid allocations.
+
+            return this.ToString().EndsWith(value.ToString(), comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance ends with <paramref name="value"/>.
+        /// An ordinal comparison is used.
+        /// </summary>
+        public bool EndsWith(Utf8Span value)
+        {
+            return this.Bytes.EndsWith(value.Bytes);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance ends with <paramref name="value"/>.
+        /// The specified comparison is used.
+        /// </summary>
+        public bool EndsWith(Utf8Span value, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: Optimize me to avoid allocations.
+
+            return this.ToString().EndsWith(value.ToString(), comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance begins with
+        /// <paramref name="value"/>. An ordinal comparison is used.
+        /// </summary>
+        public bool StartsWith(char value)
+        {
+            return Rune.TryCreate(value, out Rune rune) && StartsWith(rune);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance begins with
+        /// <paramref name="value"/>. The specified comparison is used.
+        /// </summary>
+        public bool StartsWith(char value, StringComparison comparison)
+        {
+            return Rune.TryCreate(value, out Rune rune) && StartsWith(rune, comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance begins with
+        /// the specified <see cref="Rune"/>. An ordinal comparison is used.
+        /// </summary>
+        public bool StartsWith(Rune value)
+        {
+            // TODO_UTF8STRING: This should be split into two methods:
+            // One which operates on a single-byte (ASCII) search value,
+            // the other which operates on a multi-byte (non-ASCII) search value.
+
+            Span<byte> runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+            int runeBytesWritten = value.EncodeToUtf8(runeBytes);
+
+            return this.Bytes.StartsWith(runeBytes.Slice(0, runeBytesWritten));
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance begins with
+        /// the specified <see cref="Rune"/>. The specified comparison is used.
+        /// </summary>
+        public bool StartsWith(Rune value, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: Optimize me to avoid allocations.
+
+            return this.ToString().StartsWith(value.ToString(), comparison);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance begins with <paramref name="value"/>.
+        /// An ordinal comparison is used.
+        /// </summary>
+        public bool StartsWith(Utf8Span value)
+        {
+            return this.Bytes.StartsWith(value.Bytes);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether the current <see cref="Utf8Span"/> instance begins with <paramref name="value"/>.
+        /// The specified comparison is used.
+        /// </summary>
+        public bool StartsWith(Utf8Span value, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: Optimize me to avoid allocations.
+
+            return this.ToString().StartsWith(value.ToString(), comparison);
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs

new file mode 100644 (file)

index 0000000..a4211e7
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs
@@ -0,0 +1,321 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Diagnostics;
+using System.Globalization;
+using System.Runtime.CompilerServices;
+using System.Text.Unicode;
+
+namespace System.Text
+{
+    public readonly ref partial struct Utf8Span
+    {
+        /// <summary>
+        /// Returns a new <see cref="Utf8String"/> instance which represents this <see cref="Utf8Span"/> instance
+        /// normalized using the specified Unicode normalization form.
+        /// </summary>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation.
+        /// </remarks>
+        public Utf8String Normalize(NormalizationForm normalizationForm = NormalizationForm.FormC)
+        {
+            // TODO_UTF8STRING: Reduce allocations in this code path.
+
+            return new Utf8String(this.ToString().Normalize(normalizationForm));
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> to the desired Unicode normalization form, writing the
+        /// UTF-8 result to the buffer <paramref name="destination"/>.
+        /// </summary>
+        /// <returns>
+        /// The number of bytes written to <paramref name="destination"/>, or -1 if <paramref name="destination"/>
+        /// is not large enough to hold the result of the normalization operation.
+        /// </returns>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. Note that the required
+        /// length of <paramref name="destination"/> may be longer or shorter (in terms of UTF-8 byte count)
+        /// than the input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public int Normalize(Span<byte> destination, NormalizationForm normalizationForm = NormalizationForm.FormC)
+        {
+            // TODO_UTF8STRING: Reduce allocations in this code path.
+
+            ReadOnlySpan<char> normalized = this.ToString().Normalize(normalizationForm);
+            OperationStatus status = Utf8.FromUtf16(normalized, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true);
+
+            Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "Normalize shouldn't have produced malformed Unicode string.");
+
+            if (status != OperationStatus.Done)
+            {
+                bytesWritten = -1; // "destination too small"
+            }
+
+            return bytesWritten;
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> to a <see langword="char[]"/>.
+        /// </summary>
+        public unsafe char[] ToCharArray()
+        {
+            if (IsEmpty)
+            {
+                return Array.Empty<char>();
+            }
+
+            // TODO_UTF8STRING: Since we know the underlying data is immutable, well-formed UTF-8,
+            // we can perform transcoding using an optimized code path that skips all safety checks.
+            // We should also consider skipping the two-pass if possible.
+
+            fixed (byte* pbUtf8 = &DangerousGetMutableReference())
+            {
+                byte* pbUtf8Invalid = Utf8Utility.GetPointerToFirstInvalidByte(pbUtf8, this.Length, out int utf16CodeUnitCountAdjustment, out _);
+                Debug.Assert(pbUtf8Invalid == pbUtf8 + this.Length, "Invalid UTF-8 data seen in buffer.");
+
+                char[] asUtf16 = new char[this.Length + utf16CodeUnitCountAdjustment];
+                fixed (byte* pbUtf16 = &asUtf16.GetRawSzArrayData())
+                {
+                    OperationStatus status = Utf8Utility.TranscodeToUtf16(pbUtf8, this.Length, (char*)pbUtf16, asUtf16.Length, out byte* pbUtf8End, out char* pchUtf16End);
+                    Debug.Assert(status == OperationStatus.Done, "The buffer changed out from under us unexpectedly?");
+                    Debug.Assert(pbUtf8End == pbUtf8 + this.Length, "The buffer changed out from under us unexpectedly?");
+                    Debug.Assert(pchUtf16End == ((char*)pbUtf16) + asUtf16.Length, "The buffer changed out from under us unexpectedly?");
+
+                    return asUtf16;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> instance to its UTF-16 equivalent, writing the result into
+        /// the buffer <paramref name="destination"/>.
+        /// </summary>
+        /// <returns>
+        /// The number of chars written to <paramref name="destination"/>, or -1 if <paramref name="destination"/>
+        /// is not large enough to hold the result of the transcoding operation.
+        /// </returns>
+        public int ToChars(Span<char> destination)
+        {
+            OperationStatus status = Utf8.ToUtf16(Bytes, destination, out int _, out int charsWritten, replaceInvalidSequences: false, isFinalBlock: true);
+
+            Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "Utf8Spans shouldn't contain ill-formed UTF-8 data.");
+
+            if (status != OperationStatus.Done)
+            {
+                charsWritten = -1; // "destination too small"
+            }
+
+            return charsWritten;
+        }
+
+        /// <summary>
+        /// Returns a new <see cref="Utf8String"/> instance which represents this <see cref="Utf8Span"/> instance
+        /// converted to lowercase using <paramref name="culture"/>.
+        /// </summary>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. Note that the returned
+        /// <see cref="Utf8String"/> instance may be longer or shorter (in terms of UTF-8 byte count) than the
+        /// input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public Utf8String ToLower(CultureInfo culture)
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            if (culture is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
+            }
+
+            return new Utf8String(this.ToString().ToLower(culture));
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> to lowercase using <paramref name="culture"/>, writing the
+        /// UTF-8 result to the buffer <paramref name="destination"/>.
+        /// </summary>
+        /// <returns>
+        /// The number of bytes written to <paramref name="destination"/>, or -1 if <paramref name="destination"/>
+        /// is not large enough to hold the result of the case conversion operation.
+        /// </returns>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. Note that the required
+        /// length of <paramref name="destination"/> may be longer or shorter (in terms of UTF-8 byte count)
+        /// than the input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public int ToLower(Span<byte> destination, CultureInfo culture)
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            if (culture is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
+            }
+
+            ReadOnlySpan<char> asLower = this.ToString().ToLower(culture);
+            OperationStatus status = Utf8.FromUtf16(asLower, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true);
+
+            Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToLower shouldn't have produced malformed Unicode string.");
+
+            if (status != OperationStatus.Done)
+            {
+                bytesWritten = -1; // "destination too small"
+            }
+
+            return bytesWritten;
+        }
+
+        /// <summary>
+        /// Returns a new <see cref="Utf8String"/> instance which represents this <see cref="Utf8Span"/> instance
+        /// converted to lowercase using the invariant culture.
+        /// </summary>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. For more information on the
+        /// invariant culture, see the <see cref="CultureInfo.InvariantCulture"/> property. Note that the returned
+        /// <see cref="Utf8String"/> instance may be longer or shorter (in terms of UTF-8 byte count) than the
+        /// input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public Utf8String ToLowerInvariant()
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            return new Utf8String(this.ToString().ToLowerInvariant());
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> to lowercase using the invariant culture, writing the
+        /// UTF-8 result to the buffer <paramref name="destination"/>.
+        /// </summary>
+        /// <returns>
+        /// The number of bytes written to <paramref name="destination"/>, or -1 if <paramref name="destination"/>
+        /// is not large enough to hold the result of the case conversion operation.
+        /// </returns>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. For more information on the
+        /// invariant culture, see the <see cref="CultureInfo.InvariantCulture"/> property. Note that the required
+        /// length of <paramref name="destination"/> may be longer or shorter (in terms of UTF-8 byte count)
+        /// than the input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public int ToLowerInvariant(Span<byte> destination)
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            ReadOnlySpan<char> asLowerInvariant = this.ToString().ToLowerInvariant();
+            OperationStatus status = Utf8.FromUtf16(asLowerInvariant, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true);
+
+            Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToLowerInvariant shouldn't have produced malformed Unicode string.");
+
+            if (status != OperationStatus.Done)
+            {
+                bytesWritten = -1; // "destination too small"
+            }
+
+            return bytesWritten;
+        }
+
+        /// <summary>
+        /// Returns a new <see cref="Utf8String"/> instance which represents this <see cref="Utf8Span"/> instance
+        /// converted to uppercase using <paramref name="culture"/>.
+        /// </summary>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. Note that the returned
+        /// <see cref="Utf8String"/> instance may be longer or shorter (in terms of UTF-8 byte count) than the
+        /// input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public Utf8String ToUpper(CultureInfo culture)
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            if (culture is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
+            }
+
+            return new Utf8String(this.ToString().ToUpper(culture));
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> to uppercase using <paramref name="culture"/>, writing the
+        /// UTF-8 result to the buffer <paramref name="destination"/>.
+        /// </summary>
+        /// <returns>
+        /// The number of bytes written to <paramref name="destination"/>, or -1 if <paramref name="destination"/>
+        /// is not large enough to hold the result of the case conversion operation.
+        /// </returns>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. Note that the required
+        /// length of <paramref name="destination"/> may be longer or shorter (in terms of UTF-8 byte count)
+        /// than the input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public int ToUpper(Span<byte> destination, CultureInfo culture)
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            if (culture is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
+            }
+
+            ReadOnlySpan<char> asUpper = this.ToString().ToUpper(culture);
+            OperationStatus status = Utf8.FromUtf16(asUpper, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true);
+
+            Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToUpper shouldn't have produced malformed Unicode string.");
+
+            if (status != OperationStatus.Done)
+            {
+                bytesWritten = -1; // "destination too small"
+            }
+
+            return bytesWritten;
+        }
+
+        /// <summary>
+        /// Returns a new <see cref="Utf8String"/> instance which represents this <see cref="Utf8Span"/> instance
+        /// converted to uppercase using the invariant culture.
+        /// </summary>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. For more information on the
+        /// invariant culture, see the <see cref="CultureInfo.InvariantCulture"/> property. Note that the returned
+        /// <see cref="Utf8String"/> instance may be longer or shorter (in terms of UTF-8 byte count) than the
+        /// input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public Utf8String ToUpperInvariant()
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            return new Utf8String(this.ToString().ToUpperInvariant());
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> to uppercase using the invariant culture, writing the
+        /// UTF-8 result to the buffer <paramref name="destination"/>.
+        /// </summary>
+        /// <returns>
+        /// The number of bytes written to <paramref name="destination"/>, or -1 if <paramref name="destination"/>
+        /// is not large enough to hold the result of the case conversion operation.
+        /// </returns>
+        /// <remarks>
+        /// The original <see cref="Utf8Span"/> is left unchanged by this operation. For more information on the
+        /// invariant culture, see the <see cref="CultureInfo.InvariantCulture"/> property. Note that the required
+        /// length of <paramref name="destination"/> may be longer or shorter (in terms of UTF-8 byte count)
+        /// than the input <see cref="Utf8Span"/>.
+        /// </remarks>
+        public int ToUpperInvariant(Span<byte> destination)
+        {
+            // TODO_UTF8STRING: Avoid intermediate allocations.
+
+            ReadOnlySpan<char> asUpperInvariant = this.ToString().ToUpperInvariant();
+            OperationStatus status = Utf8.FromUtf16(asUpperInvariant, destination, out int _, out int bytesWritten, replaceInvalidSequences: false, isFinalBlock: true);
+
+            Debug.Assert(status == OperationStatus.Done || status == OperationStatus.DestinationTooSmall, "ToUpperInvariant shouldn't have produced malformed Unicode string.");
+
+            if (status != OperationStatus.Done)
+            {
+                bytesWritten = -1; // "destination too small"
+            }
+
+            return bytesWritten;
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs

new file mode 100644 (file)

index 0000000..6fba6ed
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs
@@ -0,0 +1,137 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Diagnostics;
+
+namespace System.Text
+{
+    public readonly ref partial struct Utf8Span
+    {
+        /// <summary>
+        /// Gets a <see cref="CharEnumerable"/> wrapping this <see cref="Utf8Span"/>, which can be enumerated
+        /// to obtain the contents as UTF-16 code units.
+        /// </summary>
+        public CharEnumerable Chars
+        {
+            get { return new CharEnumerable(this); }
+        }
+        /// <summary>
+        /// Gets a <see cref="RuneEnumerable"/> wrapping this <see cref="Utf8Span"/>, which can be enumerated
+        /// to obtain the contents as <see cref="Rune"/> values.
+        /// </summary>
+        public RuneEnumerable Runes
+        {
+            get { return new RuneEnumerable(this); }
+        }
+
+        /// <summary>
+        /// Exposes the contents of a <see cref="Utf8Span"/> as a sequence of UTF-16 code units.
+        /// </summary>
+        public readonly ref struct CharEnumerable
+        {
+            private readonly Utf8Span _span;
+
+            internal CharEnumerable(Utf8Span span)
+            {
+                _span = span;
+            }
+
+            /// <summary>
+            /// Creates an enumerator positioned before the first code unit of the wrapped span.
+            /// </summary>
+            public Enumerator GetEnumerator()
+            {
+                return new Enumerator(_span);
+            }
+
+            public ref struct Enumerator
+            {
+                // The low 16 bits hold the code unit reported by 'Current'. Right after a supplementary
+                // plane scalar has been decoded, the high 16 bits additionally hold the trailing surrogate
+                // which is still waiting to be returned.
+                private uint _currentCharPair;
+                private ReadOnlySpan<byte> _remainingUtf8Bytes;
+
+                internal Enumerator(Utf8Span span)
+                {
+                    _remainingUtf8Bytes = span.Bytes;
+                    _currentCharPair = default;
+                }
+
+                /// <summary>
+                /// Gets the UTF-16 code unit produced by the most recent call to <see cref="MoveNext"/>.
+                /// </summary>
+                public char Current
+                {
+                    get { return (char)_currentCharPair; }
+                }
+
+                /// <summary>
+                /// Advances to the next UTF-16 code unit.
+                /// </summary>
+                /// <returns>
+                /// <see langword="true"/> if a code unit is available through <see cref="Current"/>;
+                /// <see langword="false"/> once the underlying UTF-8 data has been exhausted.
+                /// </returns>
+                public bool MoveNext()
+                {
+                    // We don't need to worry about tearing since this enumerator is a ref struct.
+
+                    if (_currentCharPair > char.MaxValue)
+                    {
+                        // A trailing surrogate is still pending in the upper 16 bits from the previous call.
+                        // Discard the leading surrogate we already returned so the trailing one becomes current.
+
+                        _currentCharPair >>= 16;
+                        return true;
+                    }
+
+                    if (_remainingUtf8Bytes.IsEmpty)
+                    {
+                        return false;
+                    }
+
+                    // TODO_UTF8STRING: Since we assume Utf8Span instances are well-formed, we may instead
+                    // call an optimized version of the "decode" routine below which skips well-formedness checks.
+
+                    OperationStatus decodeStatus = Rune.DecodeFromUtf8(_remainingUtf8Bytes, out Rune decodedRune, out int consumedByteCount);
+                    Debug.Assert(decodeStatus == OperationStatus.Done, "Somebody fed us invalid data?");
+
+                    uint scalarValue = (uint)decodedRune.Value;
+
+                    if (!decodedRune.IsBmp)
+                    {
+                        // Uncommon case - supplementary plane (astral) scalar value.
+                        // Both UTF-16 code units are packed into one 32-bit value: the leading surrogate
+                        // goes into the low 16 bits (returned now) and the trailing surrogate goes into
+                        // the high 16 bits (returned by the next call).
+
+                        UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(scalarValue, out char leadingCodeUnit, out char trailingCodeUnit);
+                        _currentCharPair = ((uint)trailingCodeUnit << 16) + (uint)leadingCodeUnit;
+                    }
+                    else
+                    {
+                        // Common case - BMP scalar value, which is its own single UTF-16 code unit.
+
+                        _currentCharPair = scalarValue;
+                    }
+
+                    // TODO_UTF8STRING: We can consider unsafe slicing below if we wish since we know we're
+                    // not going to overrun the end of the span.
+
+                    _remainingUtf8Bytes = _remainingUtf8Bytes.Slice(consumedByteCount);
+                    return true;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Exposes the contents of a <see cref="Utf8Span"/> as a sequence of <see cref="Rune"/> values.
+        /// </summary>
+        public readonly ref struct RuneEnumerable
+        {
+            private readonly Utf8Span _span;
+
+            internal RuneEnumerable(Utf8Span span)
+            {
+                _span = span;
+            }
+
+            /// <summary>
+            /// Creates an enumerator positioned before the first <see cref="Rune"/> of the wrapped span.
+            /// </summary>
+            public Enumerator GetEnumerator()
+            {
+                return new Enumerator(_span);
+            }
+
+            public ref struct Enumerator
+            {
+                private Rune _currentRune;
+                private ReadOnlySpan<byte> _remainingUtf8Bytes;
+
+                internal Enumerator(Utf8Span span)
+                {
+                    _remainingUtf8Bytes = span.Bytes;
+                    _currentRune = default;
+                }
+
+                /// <summary>
+                /// Gets the <see cref="Rune"/> produced by the most recent call to <see cref="MoveNext"/>.
+                /// </summary>
+                public Rune Current
+                {
+                    get { return _currentRune; }
+                }
+
+                /// <summary>
+                /// Decodes the next <see cref="Rune"/> from the remaining UTF-8 data.
+                /// </summary>
+                /// <returns>
+                /// <see langword="true"/> if a value is available through <see cref="Current"/>;
+                /// <see langword="false"/> once the underlying UTF-8 data has been exhausted.
+                /// </returns>
+                public bool MoveNext()
+                {
+                    // We don't need to worry about tearing since this enumerator is a ref struct.
+
+                    if (_remainingUtf8Bytes.IsEmpty)
+                    {
+                        return false;
+                    }
+
+                    // TODO_UTF8STRING: Since we assume Utf8Span instances are well-formed, we may instead
+                    // call an optimized version of the "decode" routine below which skips well-formedness checks.
+
+                    OperationStatus decodeStatus = Rune.DecodeFromUtf8(_remainingUtf8Bytes, out Rune decodedRune, out int consumedByteCount);
+                    Debug.Assert(decodeStatus == OperationStatus.Done, "Somebody fed us invalid data?");
+
+                    _currentRune = decodedRune;
+
+                    // TODO_UTF8STRING: We can consider unsafe slicing below if we wish since we know we're
+                    // not going to overrun the end of the span.
+
+                    _remainingUtf8Bytes = _remainingUtf8Bytes.Slice(consumedByteCount);
+                    return true;
+                }
+            }
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs

new file mode 100644 (file)

index 0000000..ff177ae
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs
@@ -0,0 +1,536 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.ComponentModel;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Text.Unicode;
+
+namespace System.Text
+{
+    public readonly ref partial struct Utf8Span
+    {
+        /// <summary>
+        /// Validates <paramref name="options"/>, delegating to <see cref="CheckSplitOptions_Throw"/> when its
+        /// value (compared as an unsigned integer) exceeds the combination of
+        /// <see cref="Utf8StringSplitOptions.RemoveEmptyEntries"/> and <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+        /// </summary>
+        [StackTraceHidden]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void CheckSplitOptions(Utf8StringSplitOptions options)
+        {
+            const Utf8StringSplitOptions AllKnownOptions = Utf8StringSplitOptions.RemoveEmptyEntries | Utf8StringSplitOptions.TrimEntries;
+
+            // The throw lives in a separate helper so that this inlined check stays small.
+            if ((uint)options > (uint)AllKnownOptions)
+            {
+                CheckSplitOptions_Throw(options);
+            }
+        }
+
+        /// <summary>
+        /// Throws the <see cref="ArgumentOutOfRangeException"/> which reports an illegal
+        /// <see cref="Utf8StringSplitOptions"/> value.
+        /// </summary>
+        [StackTraceHidden]
+        private static void CheckSplitOptions_Throw(Utf8StringSplitOptions options)
+        {
+            string illegalValueMessage = SR.Format(SR.Arg_EnumIllegalVal, (int)options);
+
+            throw new ArgumentOutOfRangeException(nameof(options), illegalValueMessage);
+        }
+
+        /// <summary>
+        /// Creates a <see cref="SplitResult"/> which splits this <see cref="Utf8Span"/> around occurrences
+        /// of <paramref name="separator"/>, honoring the supplied <paramref name="options"/>.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="separator"/> cannot be represented as a <see cref="Rune"/> (it is a UTF-16 surrogate),
+        /// or <paramref name="options"/> is not a legal value.
+        /// </exception>
+        public SplitResult Split(char separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None)
+        {
+            // The separator is validated before the options, so a bad separator is reported first.
+            if (Rune.TryCreate(separator, out Rune separatorRune))
+            {
+                CheckSplitOptions(options);
+
+                return new SplitResult(this, separatorRune, options);
+            }
+
+            throw new ArgumentOutOfRangeException(
+                message: SR.ArgumentOutOfRange_Utf16SurrogatesDisallowed,
+                paramName: nameof(separator));
+        }
+
+        /// <summary>
+        /// Creates a <see cref="SplitResult"/> which splits this <see cref="Utf8Span"/> around occurrences
+        /// of <paramref name="separator"/>, honoring the supplied <paramref name="options"/>.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="options"/> is not a legal value.
+        /// </exception>
+        public SplitResult Split(Rune separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None)
+        {
+            CheckSplitOptions(options);
+
+            SplitResult result = new SplitResult(this, separator, options);
+            return result;
+        }
+
+        /// <summary>
+        /// Creates a <see cref="SplitResult"/> which splits this <see cref="Utf8Span"/> around occurrences
+        /// of <paramref name="separator"/>, honoring the supplied <paramref name="options"/>.
+        /// </summary>
+        /// <exception cref="ArgumentException">
+        /// <paramref name="separator"/> is empty.
+        /// </exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="options"/> is not a legal value.
+        /// </exception>
+        public SplitResult Split(Utf8Span separator, Utf8StringSplitOptions options = Utf8StringSplitOptions.None)
+        {
+            // The separator is validated before the options, so an empty separator is reported first.
+            if (!separator.IsEmpty)
+            {
+                CheckSplitOptions(options);
+
+                return new SplitResult(this, separator, options);
+            }
+
+            throw new ArgumentException(
+                message: SR.Argument_CannotBeEmptySpan,
+                paramName: nameof(separator));
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for <paramref name="separator"/> and returns the portions of the
+        /// span which lie before and after the match. When <paramref name="separator"/> does not occur within
+        /// this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search is ordinal.
+        /// </remarks>
+        public SplitOnResult SplitOn(char separator)
+        {
+            if (TryFind(separator, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for <paramref name="separator"/> and returns the portions of the
+        /// span which lie before and after the match. When <paramref name="separator"/> does not occur within
+        /// this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search honors the supplied <paramref name="comparisonType"/>.
+        /// </remarks>
+        public SplitOnResult SplitOn(char separator, StringComparison comparisonType)
+        {
+            if (TryFind(separator, comparisonType, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for <paramref name="separator"/> and returns the portions of the
+        /// span which lie before and after the match. When <paramref name="separator"/> does not occur within
+        /// this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search is ordinal.
+        /// </remarks>
+        public SplitOnResult SplitOn(Rune separator)
+        {
+            if (TryFind(separator, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for <paramref name="separator"/> and returns the portions of the
+        /// span which lie before and after the match. When <paramref name="separator"/> does not occur within
+        /// this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search honors the supplied <paramref name="comparisonType"/>.
+        /// </remarks>
+        public SplitOnResult SplitOn(Rune separator, StringComparison comparisonType)
+        {
+            if (TryFind(separator, comparisonType, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for <paramref name="separator"/> and returns the portions of the
+        /// span which lie before and after the match. When <paramref name="separator"/> does not occur within
+        /// this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search is ordinal.
+        /// </remarks>
+        public SplitOnResult SplitOn(Utf8Span separator)
+        {
+            if (TryFind(separator, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Locates <paramref name="separator"/> within this <see cref="Utf8Span"/> instance, creating <see cref="Utf8Span"/>
+        /// instances which represent the data on either side of the separator. If <paramref name="separator"/> is not found
+        /// within this <see cref="Utf8Span"/> instance, returns the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public SplitOnResult SplitOn(Utf8Span separator, StringComparison comparisonType)
+        {
+            // Uses TryFind (not TryFindLast), exactly like the other SplitOn overloads.
+            return TryFind(separator, comparisonType, out Range range) ? new SplitOnResult(this, range) : new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for the last occurrence of <paramref name="separator"/> and returns
+        /// the portions of the span which lie before and after that match. When <paramref name="separator"/> does
+        /// not occur within this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search is ordinal.
+        /// </remarks>
+        public SplitOnResult SplitOnLast(char separator)
+        {
+            if (TryFindLast(separator, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for the last occurrence of <paramref name="separator"/> and returns
+        /// the portions of the span which lie before and after that match. When <paramref name="separator"/> does
+        /// not occur within this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search honors the supplied <paramref name="comparisonType"/>.
+        /// </remarks>
+        public SplitOnResult SplitOnLast(char separator, StringComparison comparisonType)
+        {
+            if (TryFindLast(separator, comparisonType, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for the last occurrence of <paramref name="separator"/> and returns
+        /// the portions of the span which lie before and after that match. When <paramref name="separator"/> does
+        /// not occur within this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search is ordinal.
+        /// </remarks>
+        public SplitOnResult SplitOnLast(Rune separator)
+        {
+            if (TryFindLast(separator, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for the last occurrence of <paramref name="separator"/> and returns
+        /// the portions of the span which lie before and after that match. When <paramref name="separator"/> does
+        /// not occur within this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search honors the supplied <paramref name="comparisonType"/>.
+        /// </remarks>
+        public SplitOnResult SplitOnLast(Rune separator, StringComparison comparisonType)
+        {
+            if (TryFindLast(separator, comparisonType, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for the last occurrence of <paramref name="separator"/> and returns
+        /// the portions of the span which lie before and after that match. When <paramref name="separator"/> does
+        /// not occur within this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search is ordinal.
+        /// </remarks>
+        public SplitOnResult SplitOnLast(Utf8Span separator)
+        {
+            if (TryFindLast(separator, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Searches this <see cref="Utf8Span"/> for the last occurrence of <paramref name="separator"/> and returns
+        /// the portions of the span which lie before and after that match. When <paramref name="separator"/> does
+        /// not occur within this <see cref="Utf8Span"/>, the result is the tuple "(this, Empty)".
+        /// </summary>
+        /// <remarks>
+        /// The search honors the supplied <paramref name="comparisonType"/>.
+        /// </remarks>
+        public SplitOnResult SplitOnLast(Utf8Span separator, StringComparison comparisonType)
+        {
+            if (TryFindLast(separator, comparisonType, out Range matchRange))
+            {
+                return new SplitOnResult(this, matchRange);
+            }
+
+            return new SplitOnResult(this);
+        }
+
+        /// <summary>
+        /// Removes whitespace from both the start and the end of this <see cref="Utf8Span"/>
+        /// and returns the resulting slice as a new <see cref="Utf8Span"/>.
+        /// </summary>
+        public Utf8Span Trim()
+        {
+            return TrimHelper(TrimType.Both);
+        }
+
+        /// <summary>
+        /// Removes whitespace from the end only of this <see cref="Utf8Span"/>
+        /// and returns the resulting slice as a new <see cref="Utf8Span"/>.
+        /// </summary>
+        public Utf8Span TrimEnd()
+        {
+            return TrimHelper(TrimType.Tail);
+        }
+
+        /// <summary>
+        /// Shared implementation of the trim operations: slices whitespace off the head and / or the tail
+        /// of this span, depending on which flags are set in <paramref name="trimType"/>.
+        /// </summary>
+        internal Utf8Span TrimHelper(TrimType trimType)
+        {
+            bool shouldTrimHead = (trimType & TrimType.Head) != 0;
+            bool shouldTrimTail = (trimType & TrimType.Tail) != 0;
+
+            ReadOnlySpan<byte> trimmedBytes = Bytes;
+
+            if (shouldTrimHead)
+            {
+                int firstKeptIndex = Utf8Utility.GetIndexOfFirstNonWhiteSpaceChar(trimmedBytes);
+                Debug.Assert((uint)firstKeptIndex <= (uint)trimmedBytes.Length);
+
+                // TODO_UTF8STRING: Can use an unsafe slicing routine below if we need a perf boost.
+
+                trimmedBytes = trimmedBytes.Slice(firstKeptIndex);
+            }
+
+            if (shouldTrimTail)
+            {
+                // Computed against the (possibly already head-trimmed) span.
+                int trailingWhiteSpaceStart = Utf8Utility.GetIndexOfTrailingWhiteSpaceSequence(trimmedBytes);
+                Debug.Assert((uint)trailingWhiteSpaceStart <= (uint)trimmedBytes.Length);
+
+                // TODO_UTF8STRING: Can use an unsafe slicing routine below if we need a perf boost.
+
+                trimmedBytes = trimmedBytes.Slice(0, trailingWhiteSpaceStart);
+            }
+
+            return UnsafeCreateWithoutValidation(trimmedBytes);
+        }
+
+        /// <summary>
+        /// Removes whitespace from the start only of this <see cref="Utf8Span"/>
+        /// and returns the resulting slice as a new <see cref="Utf8Span"/>.
+        /// </summary>
+        public Utf8Span TrimStart()
+        {
+            return TrimHelper(TrimType.Head);
+        }
+        [StructLayout(LayoutKind.Auto)]
+        public readonly ref struct SplitResult
+        {
+            private readonly State _state;
+
+            // Creates a split operation over 'source' whose separator is a single scalar value.
+            internal SplitResult(Utf8Span source, Rune searchRune, Utf8StringSplitOptions splitOptions)
+            {
+                State initialState = default;
+                initialState.RemainingSearchSpace = source;
+                initialState.SearchRune = searchRune.Value;
+                initialState.SearchTerm = default; // unused when a search rune is supplied
+                initialState.SplitOptions = splitOptions;
+
+                _state = initialState;
+            }
+
+            // Creates a split operation over 'source' whose separator is a UTF-8 search term.
+            internal SplitResult(Utf8Span source, Utf8Span searchTerm, Utf8StringSplitOptions splitOptions)
+            {
+                State initialState = default;
+                initialState.RemainingSearchSpace = source;
+                initialState.SearchRune = -1; // sentinel: search by 'SearchTerm' rather than by rune
+                initialState.SearchTerm = searchTerm;
+                initialState.SplitOptions = splitOptions;
+
+                _state = initialState;
+            }
+
+            /// <summary>
+            /// Deconstructs this split into one leading item plus the remainder of the search space.
+            /// The remainder is trimmed when the split was requested with
+            /// <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span item1, out Utf8Span item2)
+            {
+                _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span rest);
+                TrimIfNeeded(ref rest);
+
+                item2 = rest;
+            }
+
+            /// <summary>
+            /// Deconstructs this split into two leading items plus the remainder of the search space.
+            /// The remainder is trimmed when the split was requested with
+            /// <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3)
+            {
+                _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span afterItem1);
+                _state.DeconstructHelper(in afterItem1, out item2, out Utf8Span rest);
+                TrimIfNeeded(ref rest);
+
+                item3 = rest;
+            }
+
+            /// <summary>
+            /// Deconstructs this split into three leading items plus the remainder of the search space.
+            /// The remainder is trimmed when the split was requested with
+            /// <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4)
+            {
+                _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span afterItem1);
+                _state.DeconstructHelper(in afterItem1, out item2, out Utf8Span afterItem2);
+                _state.DeconstructHelper(in afterItem2, out item3, out Utf8Span rest);
+                TrimIfNeeded(ref rest);
+
+                item4 = rest;
+            }
+
+            /// <summary>
+            /// Deconstructs this split into four leading items plus the remainder of the search space.
+            /// The remainder is trimmed when the split was requested with
+            /// <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5)
+            {
+                _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span afterItem1);
+                _state.DeconstructHelper(in afterItem1, out item2, out Utf8Span afterItem2);
+                _state.DeconstructHelper(in afterItem2, out item3, out Utf8Span afterItem3);
+                _state.DeconstructHelper(in afterItem3, out item4, out Utf8Span rest);
+                TrimIfNeeded(ref rest);
+
+                item5 = rest;
+            }
+
+            /// <summary>
+            /// Deconstructs this split into five leading items plus the remainder of the search space.
+            /// The remainder is trimmed when the split was requested with
+            /// <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5, out Utf8Span item6)
+            {
+                _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span afterItem1);
+                _state.DeconstructHelper(in afterItem1, out item2, out Utf8Span afterItem2);
+                _state.DeconstructHelper(in afterItem2, out item3, out Utf8Span afterItem3);
+                _state.DeconstructHelper(in afterItem3, out item4, out Utf8Span afterItem4);
+                _state.DeconstructHelper(in afterItem4, out item5, out Utf8Span rest);
+                TrimIfNeeded(ref rest);
+
+                item6 = rest;
+            }
+
+            /// <summary>
+            /// Deconstructs this split into six leading items plus the remainder of the search space.
+            /// The remainder is trimmed when the split was requested with
+            /// <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5, out Utf8Span item6, out Utf8Span item7)
+            {
+                _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span afterItem1);
+                _state.DeconstructHelper(in afterItem1, out item2, out Utf8Span afterItem2);
+                _state.DeconstructHelper(in afterItem2, out item3, out Utf8Span afterItem3);
+                _state.DeconstructHelper(in afterItem3, out item4, out Utf8Span afterItem4);
+                _state.DeconstructHelper(in afterItem4, out item5, out Utf8Span afterItem5);
+                _state.DeconstructHelper(in afterItem5, out item6, out Utf8Span rest);
+                TrimIfNeeded(ref rest);
+
+                item7 = rest;
+            }
+
+            /// <summary>
+            /// Deconstructs this split into seven leading items plus the remainder of the search space.
+            /// The remainder is trimmed when the split was requested with
+            /// <see cref="Utf8StringSplitOptions.TrimEntries"/>.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span item1, out Utf8Span item2, out Utf8Span item3, out Utf8Span item4, out Utf8Span item5, out Utf8Span item6, out Utf8Span item7, out Utf8Span item8)
+            {
+                _state.DeconstructHelper(in _state.RemainingSearchSpace, out item1, out Utf8Span afterItem1);
+                _state.DeconstructHelper(in afterItem1, out item2, out Utf8Span afterItem2);
+                _state.DeconstructHelper(in afterItem2, out item3, out Utf8Span afterItem3);
+                _state.DeconstructHelper(in afterItem3, out item4, out Utf8Span afterItem4);
+                _state.DeconstructHelper(in afterItem4, out item5, out Utf8Span afterItem5);
+                _state.DeconstructHelper(in afterItem5, out item6, out Utf8Span afterItem6);
+                _state.DeconstructHelper(in afterItem6, out item7, out Utf8Span rest);
+                TrimIfNeeded(ref rest);
+
+                item8 = rest;
+            }
+
+            /// <summary>
+            /// Creates an enumerator which yields the individual segments of this split operation.
+            /// </summary>
+            public Enumerator GetEnumerator()
+            {
+                return new Enumerator(this);
+            }
+
+            // Replaces 'span' with its trimmed form, but only when the split was created with TrimEntries.
+            private void TrimIfNeeded(ref Utf8Span span)
+            {
+                bool trimRequested = (_state.SplitOptions & Utf8StringSplitOptions.TrimEntries) != 0;
+
+                if (!trimRequested)
+                {
+                    return;
+                }
+
+                span = span.Trim();
+            }
+
+            /// <summary>
+            /// Enumerates the segments produced by a <see cref="SplitResult"/>.
+            /// </summary>
+            [StructLayout(LayoutKind.Auto)]
+            public ref struct Enumerator
+            {
+                // Marker bit OR'ed into the enumerator's private copy of the split options once the final
+                // segment has been yielded.
+                private const Utf8StringSplitOptions HaltEnumeration = (Utf8StringSplitOptions)int.MinValue;
+
+                private Utf8Span _current;
+                private State _state;
+
+                internal Enumerator(SplitResult result)
+                {
+                    _state = result._state; // copy by value; mutated as the enumeration advances
+                    _current = default;
+                }
+
+                /// <summary>
+                /// Gets the segment produced by the most recent call to <see cref="MoveNext"/>.
+                /// </summary>
+                public Utf8Span Current
+                {
+                    get { return _current; }
+                }
+
+                /// <summary>
+                /// Advances to the next segment of the split.
+                /// </summary>
+                /// <returns>
+                /// <see langword="true"/> if a segment is available through <see cref="Current"/>;
+                /// <see langword="false"/> once enumeration has completed.
+                /// </returns>
+                public bool MoveNext()
+                {
+                    // Happy path: the search term was found, so the helper has already stored the
+                    // (before, after) pair into '_current' and the remaining search space.
+
+                    if (_state.DeconstructHelper(in _state.RemainingSearchSpace, out _current, out _state.RemainingSearchSpace))
+                    {
+                        return true;
+                    }
+
+                    // No match: '_current' now holds whatever followed the final occurrence of the search
+                    // term. It is yielded at most once, and not at all if it is empty while empty entries
+                    // are being removed.
+
+                    bool suppressEmptyFinalEntry = _current.IsEmpty
+                        && (_state.SplitOptions & Utf8StringSplitOptions.RemoveEmptyEntries) != 0;
+                    bool alreadyHalted = (_state.SplitOptions & HaltEnumeration) != 0;
+
+                    if (suppressEmptyFinalEntry || alreadyHalted)
+                    {
+                        return false;
+                    }
+
+                    _state.SplitOptions |= HaltEnumeration; // prevents yielding <empty> forever at end of split
+
+                    return true;
+                }
+            }
+
+            // Bookkeeping shared by SplitResult and its enumerator: what is left to search, what to
+            // search for, and which split options apply.
+            [StructLayout(LayoutKind.Auto)]
+            private ref struct State // fully mutable
+            {
+                internal Utf8Span RemainingSearchSpace;
+                internal int SearchRune; // -1 if not specified, takes less space than "Rune?"
+                internal Utf8Span SearchTerm;
+                internal Utf8StringSplitOptions SplitOptions;
+
+                // Splits 'source' around the search rune / term, placing the data before the match into
+                // 'firstItem' and the data after it into 'remainder'.
+                // Returns 'true' if a match was found, 'false' otherwise.
+                internal readonly bool DeconstructHelper(in Utf8Span source, out Utf8Span firstItem, out Utf8Span remainder)
+                {
+                    // n.b. Our callers might pass the same reference for 'source' and 'remainder'.
+                    // Every read of the span being searched therefore happens before 'remainder' is written.
+
+                    bool trimEntries = (SplitOptions & Utf8StringSplitOptions.TrimEntries) != 0;
+                    bool removeEmptyEntries = (SplitOptions & Utf8StringSplitOptions.RemoveEmptyEntries) != 0;
+
+                    ref readonly Utf8Span currentSearchSpace = ref source;
+
+                    while (true)
+                    {
+                        if (currentSearchSpace.IsEmpty)
+                        {
+                            // Nothing left to search.
+
+                            firstItem = currentSearchSpace;
+                            remainder = default;
+                            return false;
+                        }
+
+                        Range matchRange;
+                        bool matchFound;
+
+                        if (SearchRune < 0)
+                        {
+                            matchFound = currentSearchSpace.TryFind(SearchTerm, out matchRange);
+                        }
+                        else
+                        {
+                            matchFound = currentSearchSpace.TryFind(Rune.UnsafeCreate((uint)SearchRune), out matchRange);
+                        }
+
+                        if (!matchFound)
+                        {
+                            // No match: the entire search space (trimmed if requested) becomes 'firstItem'.
+
+                            firstItem = trimEntries ? currentSearchSpace.Trim() : currentSearchSpace;
+                            remainder = default;
+                            return false;
+                        }
+
+                        // A match was found: split the search space across 'firstItem' and 'remainder'.
+
+                        firstItem = currentSearchSpace[..matchRange.Start]; // TODO_UTF8STRING: Could use unsafe slicing as optimization
+                        remainder = currentSearchSpace[matchRange.End..]; // TODO_UTF8STRING: Could use unsafe slicing as optimization
+
+                        if (trimEntries)
+                        {
+                            firstItem = firstItem.Trim();
+                        }
+
+                        if (!(removeEmptyEntries && firstItem.IsEmpty))
+                        {
+                            return true;
+                        }
+
+                        // Empty entries are being removed and this one is empty: search again, this time
+                        // through what remains after the match.
+
+                        currentSearchSpace = ref remainder;
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// The result of splitting a <see cref="Utf8Span"/> around a single match: the data before the
+        /// match and the data after it.
+        /// </summary>
+        [StructLayout(LayoutKind.Auto)]
+        public readonly ref struct SplitOnResult
+        {
+            // No-match case: everything is "before" and nothing is "after".
+            internal SplitOnResult(Utf8Span originalSearchSpace)
+            {
+                After = Empty;
+                Before = originalSearchSpace;
+            }
+
+            // Match case: 'searchTermMatchRange' identifies where the search term sits in the search space.
+            internal SplitOnResult(Utf8Span originalSearchSpace, Range searchTermMatchRange)
+            {
+                (int matchStart, int matchLength) = searchTermMatchRange.GetOffsetAndLength(originalSearchSpace.Length);
+                int matchEnd = matchStart + matchLength;
+
+                // TODO_UTF8STRING: The below indexer performs correctness checks. We can skip these checks (and even the
+                // bounds checks more generally) since we know the inputs are all valid and the containing struct is not
+                // subject to tearing.
+
+                Before = originalSearchSpace[..matchStart];
+                After = originalSearchSpace[matchEnd..];
+            }
+
+            /// <summary>
+            /// Gets the data following the match, or an empty span when there was no match.
+            /// </summary>
+            public Utf8Span After { get; }
+
+            /// <summary>
+            /// Gets the data preceding the match, or the entire original span when there was no match.
+            /// </summary>
+            public Utf8Span Before { get; }
+
+            /// <summary>
+            /// Deconstructs this result into its <see cref="Before"/> and <see cref="After"/> parts.
+            /// </summary>
+            [EditorBrowsable(EditorBrowsableState.Never)]
+            public void Deconstruct(out Utf8Span before, out Utf8Span after)
+            {
+                after = After;
+                before = Before;
+            }
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs

new file mode 100644 (file)

index 0000000..6be6e21
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs
@@ -0,0 +1,505 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Globalization;
+using System.Text.Unicode;
+
+namespace System.Text
+{
+    public readonly ref partial struct Utf8Span
+    {
+        /// <summary>
+        /// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// An ordinal search is performed.
+        /// </remarks>
+        public bool TryFind(char value, out Range range)
+        {
+            // A char that cannot be turned into a Rune is a surrogate, and surrogates can't
+            // exist in well-formed UTF-8 data, so report "not found" without searching.
+
+            if (!Rune.TryCreate(value, out Rune scalar))
+            {
+                range = default;
+                return false;
+            }
+
+            return TryFind(scalar, out range);
+        }
+
+        /// <summary>
+        /// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public bool TryFind(char value, StringComparison comparisonType, out Range range)
+        {
+            if (!Rune.TryCreate(value, out Rune scalar))
+            {
+                // Surrogate chars can't exist in well-formed UTF-8 data, so there is nothing to
+                // search for - but the comparison type is still validated before bailing out.
+
+                string.CheckStringComparison(comparisonType);
+
+                range = default;
+                return false;
+            }
+
+            return TryFind(scalar, comparisonType, out range);
+        }
+
+        /// <summary>
+        /// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// An ordinal search is performed.
+        /// </remarks>
+        public bool TryFind(Rune value, out Range range)
+        {
+            if (!value.IsAscii)
+            {
+                // Multi-byte scalar: encode it into a scratch buffer and search for that byte sequence.
+                // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we
+                // know Rune instances are well-formed and slicing is safe.
+
+                Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+                int encodedLength = value.EncodeToUtf8(scratch);
+
+                return TryFind(UnsafeCreateWithoutValidation(scratch.Slice(0, encodedLength)), out range);
+            }
+
+            // ASCII scalar: a simple single byte search is sufficient.
+
+            int matchIndex = Bytes.IndexOf((byte)value.Value);
+            if (matchIndex >= 0)
+            {
+                range = matchIndex..(matchIndex + 1);
+                return true;
+            }
+
+            range = default;
+            return false;
+        }
+
+        /// <summary>
+        /// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public bool TryFind(Rune value, StringComparison comparisonType, out Range range)
+        {
+            if (comparisonType != StringComparison.Ordinal)
+            {
+                // Not an ordinal comparison: encode the scalar and defer to the Utf8Span overload.
+                // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we
+                // know Rune instances are well-formed and slicing is safe.
+
+                Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+                int encodedLength = value.EncodeToUtf8(scratch);
+
+                return TryFind(UnsafeCreateWithoutValidation(scratch.Slice(0, encodedLength)), comparisonType, out range);
+            }
+
+            // Ordinal comparison: use the dedicated ordinal overload.
+
+            return TryFind(value, out range);
+        }
+
+        /// <summary>
+        /// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// An ordinal search is performed.
+        /// </remarks>
+        public bool TryFind(Utf8Span value, out Range range)
+        {
+            ReadOnlySpan<byte> needle = value.Bytes;
+
+            // A one-byte search term uses the single-element search; every other length
+            // goes through the sequence search.
+
+            int matchIndex = (needle.Length == 1)
+                ? this.Bytes.IndexOf(needle[0])
+                : this.Bytes.IndexOf(needle);
+
+            if (matchIndex >= 0)
+            {
+                range = matchIndex..(matchIndex + needle.Length);
+                return true;
+            }
+
+            range = default;
+            return false;
+        }
+
+        /// <summary>
+        /// Attempts to locate the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public bool TryFind(Utf8Span value, StringComparison comparisonType, out Range range)
+        {
+            // Forward to the shared implementation, asking for the first match.
+            return TryFind(value, comparisonType, out range, fromBeginning: true);
+        }
+
+        /// <summary>
+        /// Shared implementation behind the comparison-aware <c>TryFind</c> and <c>TryFindLast</c> overloads.
+        /// Searches this instance for <paramref name="value"/> using <paramref name="comparisonType"/>,
+        /// looking for the first match when <paramref name="fromBeginning"/> is <see langword="true"/>
+        /// and for the last match otherwise.
+        /// </summary>
+        /// <param name="value">The text to search for.</param>
+        /// <param name="comparisonType">The comparison to use. It is validated before any other work is done.</param>
+        /// <param name="range">
+        /// On success, the range (in UTF-8 code units) of the match within this instance;
+        /// otherwise <see langword="default"/>.
+        /// </param>
+        /// <param name="fromBeginning">
+        /// <see langword="true"/> to search from the start; <see langword="false"/> to search from the end.
+        /// </param>
+        /// <returns><see langword="true"/> if a match was found; otherwise <see langword="false"/>.</returns>
+        private unsafe bool TryFind(Utf8Span value, StringComparison comparisonType, out Range range, bool fromBeginning)
+        {
+            string.CheckStringComparison(comparisonType);
+
+            if (value.IsEmpty)
+            {
+                // sourceString.IndexOf/LastIndexOf(term, comparer) should return the minimum/maximum value index
+                // for which the expression "sourceString.Substring(index).StartsWith(term, comparer)" is true.
+                // The range we return to the caller should reflect this so that they can pull out the correct index.
+
+                if (fromBeginning)
+                {
+                    range = Index.Start..Index.Start;
+                }
+                else
+                {
+                    range = Index.End..Index.End;
+                }
+                return true;
+            }
+
+            if (this.IsEmpty)
+            {
+                // A non-empty search term can never be found in an empty search space.
+
+                range = default;
+                return false;
+            }
+
+            CompareInfo compareInfo = default!; // will be overwritten if it matters
+            CompareOptions compareOptions = string.GetCaseCompareOfComparisonCulture(comparisonType);
+
+            if (GlobalizationMode.Invariant)
+            {
+                // In the Invariant globalization mode, all comparisons are normalized to Ordinal or OrdinalIgnoreCase,
+                // and even in "ignore case" we only map [a-z] <-> [A-Z]. All other code points remain unmapped.
+
+                // TODO_UTF8STRING: We should take advantage of the property described above to avoid the UTF-16
+                // transcoding step entirely.
+
+                // No case folding requested means this is a plain ordinal search. Only the ignore-case
+                // request may fall through to the transcoding path below, which asserts exactly that.
+
+                if (compareOptions == CompareOptions.None)
+                {
+                    return (fromBeginning)
+                        ? TryFind(value, out range)
+                        : TryFindLast(value, out range); // call the ordinal search routine
+                }
+            }
+            else
+            {
+                switch (comparisonType)
+                {
+                    case StringComparison.Ordinal:
+                        return (fromBeginning)
+                          ? TryFind(value, out range)
+                          : TryFindLast(value, out range);
+
+                    case StringComparison.OrdinalIgnoreCase:
+                        // TODO_UTF8STRING: Can probably optimize this case.
+                        compareInfo = CompareInfo.Invariant;
+                        break;
+
+                    case StringComparison.CurrentCulture:
+                    case StringComparison.CurrentCultureIgnoreCase:
+                        compareInfo = CultureInfo.CurrentCulture.CompareInfo;
+                        break;
+
+                    default:
+                        Debug.Assert(comparisonType == StringComparison.InvariantCulture || comparisonType == StringComparison.InvariantCultureIgnoreCase);
+                        compareInfo = CompareInfo.Invariant;
+                        break;
+                }
+            }
+
+            // TODO_UTF8STRING: Remove allocations below, and try to avoid the transcoding step if possible.
+
+            string thisTranscodedToUtf16 = this.ToStringNoReplacement();
+            string otherTranscodedToUtf16 = value.ToStringNoReplacement();
+
+            int idx, matchLength;
+
+            if (GlobalizationMode.Invariant)
+            {
+                // If we got here, it meant we're doing an OrdinalIgnoreCase comparison.
+
+                Debug.Assert(compareOptions == CompareOptions.IgnoreCase);
+
+                idx = CompareInfo.InvariantIndexOf(thisTranscodedToUtf16, otherTranscodedToUtf16, ignoreCase: true, fromBeginning);
+                matchLength = otherTranscodedToUtf16.Length; // If there was a match, it involved only simple case folding.
+            }
+            else
+            {
+                idx = compareInfo.IndexOf(thisTranscodedToUtf16, otherTranscodedToUtf16, 0, thisTranscodedToUtf16.Length, compareOptions, &matchLength, fromBeginning);
+            }
+
+            if (idx < 0)
+            {
+                // No match found. Bail out now.
+
+                range = default;
+                return false;
+            }
+
+            // If we reached this point, we found a match. The 'idx' local is the index in the source
+            // string (indexed by UTF-16 code units) where the match was found, and the 'matchLength'
+            // local is the number of chars in the source string which constitute the match. This length
+            // can be different than the length of the search string, as non-ordinal IndexOf operations
+            // follow Unicode full case folding semantics and might also normalize characters like
+            // digraphs.
+
+            fixed (char* pThisTranscodedToUtf16 = &thisTranscodedToUtf16.GetRawStringData())
+            {
+                // First, we need to convert the UTF-16 'idx' to its UTF-8 equivalent.
+
+                char* pStoppedCounting = Utf16Utility.GetPointerToFirstInvalidChar(pThisTranscodedToUtf16, idx, out long utf8CodeUnitCountAdjustment, out _);
+                Debug.Assert(pStoppedCounting == pThisTranscodedToUtf16 + idx, "We shouldn't have generated an ill-formed UTF-16 temp string.");
+                Debug.Assert((ulong)(idx + utf8CodeUnitCountAdjustment) <= (uint)this.Bytes.Length, "Start index should be within the source UTF-8 data.");
+
+                // Normally when we produce a UTF-8 code unit count from a UTF-16 source we
+                // need to perform 64-bit arithmetic so we don't overflow. But in this case
+                // we know the true original source was UTF-8, so its length is known already
+                // to fit into a signed 32-bit integer. So we'll perform an unchecked cast.
+
+                int utf8StartIdx = idx + (int)utf8CodeUnitCountAdjustment;
+
+                // Now we need to convert the UTF-16 'matchLength' to its UTF-8 equivalent.
+
+                pStoppedCounting = Utf16Utility.GetPointerToFirstInvalidChar(pThisTranscodedToUtf16 + idx, matchLength, out utf8CodeUnitCountAdjustment, out _);
+                Debug.Assert(pStoppedCounting == pThisTranscodedToUtf16 + idx + matchLength, "We shouldn't have generated an ill-formed UTF-16 temp string.");
+                Debug.Assert((ulong)(utf8StartIdx + matchLength + utf8CodeUnitCountAdjustment) <= (uint)this.Bytes.Length, "End index should be within the source UTF-8 data.");
+
+                int utf8EndIdx = utf8StartIdx + matchLength + (int)utf8CodeUnitCountAdjustment;
+
+                // Some quick sanity checks on the return value before we return.
+
+                Debug.Assert(0 <= utf8StartIdx);
+                Debug.Assert(utf8StartIdx <= utf8EndIdx);
+                Debug.Assert(utf8EndIdx <= this.Bytes.Length);
+
+                range = utf8StartIdx..utf8EndIdx;
+                return true;
+            }
+        }
+
+        /// <summary>
+        /// Attempts to locate the last occurrence of the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// An ordinal search is performed.
+        /// </remarks>
+        public bool TryFindLast(char value, out Range range)
+        {
+            // A char that cannot be turned into a Rune is a surrogate, and surrogates can't
+            // exist in well-formed UTF-8 data, so report "not found" without searching.
+
+            if (!Rune.TryCreate(value, out Rune scalar))
+            {
+                range = default;
+                return false;
+            }
+
+            return TryFindLast(scalar, out range);
+        }
+
+        /// <summary>
+        /// Attempts to locate the last occurrence of the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public bool TryFindLast(char value, StringComparison comparisonType, out Range range)
+        {
+            if (!Rune.TryCreate(value, out Rune scalar))
+            {
+                // Surrogate chars can't exist in well-formed UTF-8 data, so there is nothing to
+                // search for - but the comparison type is still validated before bailing out.
+
+                string.CheckStringComparison(comparisonType);
+
+                range = default;
+                return false;
+            }
+
+            return TryFindLast(scalar, comparisonType, out range);
+        }
+
+        /// <summary>
+        /// Attempts to locate the last occurrence of the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// An ordinal search is performed.
+        /// </remarks>
+        public bool TryFindLast(Rune value, out Range range)
+        {
+            if (!value.IsAscii)
+            {
+                // Multi-byte scalar: encode it into a scratch buffer and search for that byte sequence.
+                // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we
+                // know Rune instances are well-formed and slicing is safe.
+
+                Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+                int encodedLength = value.EncodeToUtf8(scratch);
+
+                return TryFindLast(UnsafeCreateWithoutValidation(scratch.Slice(0, encodedLength)), out range);
+            }
+
+            // ASCII scalar: a simple single byte search is sufficient.
+
+            int matchIndex = Bytes.LastIndexOf((byte)value.Value);
+            if (matchIndex >= 0)
+            {
+                range = matchIndex..(matchIndex + 1);
+                return true;
+            }
+
+            range = default;
+            return false;
+        }
+
+        /// <summary>
+        /// Attempts to locate the last occurrence of the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public bool TryFindLast(Rune value, StringComparison comparisonType, out Range range)
+        {
+            if (comparisonType != StringComparison.Ordinal)
+            {
+                // Not an ordinal comparison: encode the scalar and defer to the Utf8Span overload.
+                // TODO_UTF8STRING: As an optimization, we could use unsafe APIs below since we
+                // know Rune instances are well-formed and slicing is safe.
+
+                Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+                int encodedLength = value.EncodeToUtf8(scratch);
+
+                return TryFindLast(UnsafeCreateWithoutValidation(scratch.Slice(0, encodedLength)), comparisonType, out range);
+            }
+
+            // Ordinal comparison: use the dedicated ordinal overload.
+
+            return TryFindLast(value, out range);
+        }
+
+        /// <summary>
+        /// Attempts to locate the last occurrence of the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// An ordinal search is performed.
+        /// </remarks>
+        public bool TryFindLast(Utf8Span value, out Range range)
+        {
+            ReadOnlySpan<byte> needle = value.Bytes;
+            int matchIndex;
+
+            switch (needle.Length)
+            {
+                case 0:
+                    matchIndex = this.Length; // the last empty substring always occurs at the end of the buffer
+                    break;
+
+                case 1:
+                    matchIndex = this.Bytes.LastIndexOf(needle[0]); // one-byte term: single-element search
+                    break;
+
+                default:
+                    matchIndex = this.Bytes.LastIndexOf(needle); // longer term: sequence search
+                    break;
+            }
+
+            if (matchIndex >= 0)
+            {
+                range = matchIndex..(matchIndex + needle.Length);
+                return true;
+            }
+
+            range = default;
+            return false;
+        }
+
+        /// <summary>
+        /// Attempts to locate the last occurrence of the target <paramref name="value"/> within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is found, returns <see langword="true"/> and sets <paramref name="range"/> to
+        /// the location where <paramref name="value"/> occurs within this <see cref="Utf8Span"/> instance.
+        /// If <paramref name="value"/> is not found, returns <see langword="false"/> and sets <paramref name="range"/>
+        /// to <see langword="default"/>.
+        /// </summary>
+        /// <remarks>
+        /// The search is performed using the specified <paramref name="comparisonType"/>.
+        /// </remarks>
+        public bool TryFindLast(Utf8Span value, StringComparison comparisonType, out Range range)
+        {
+            // Forward to the shared implementation, asking for the last match.
+            return TryFind(value, comparisonType, out range, fromBeginning: false);
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs

new file mode 100644 (file)

index 0000000..63a7770
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs
@@ -0,0 +1,288 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.ComponentModel;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Text.Unicode;
+using Internal.Runtime.CompilerServices;
+
+#pragma warning disable 0809  //warning CS0809: Obsolete member 'Utf8Span.Equals(object)' overrides non-obsolete member 'object.Equals(object)'
+
+#pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif
+
+namespace System.Text
+{
+    [StructLayout(LayoutKind.Auto)]
+    public readonly ref partial struct Utf8Span
+    {
+        /// <summary>
+        /// Creates a <see cref="Utf8Span"/> from an existing <see cref="Utf8String"/> instance.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public Utf8Span(Utf8String? value)
+        {
+            // 'value' is nullable and is handed straight to Utf8Extensions.AsBytes; no null check is done here.
+            Bytes = Utf8Extensions.AsBytes(value);
+        }
+
+        /// <summary>
+        /// Ctor for internal use only. Caller _must_ validate both invariants hold:
+        /// (a) the buffer represents well-formed UTF-8 data, and
+        /// (b) the buffer is immutable.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private Utf8Span(ReadOnlySpan<byte> rawData)
+        {
+            // In debug builds, we want to ensure that the callers really did validate
+            // the buffer for well-formedness. The entire line below is removed when
+            // compiling release builds.
+
+            Debug.Assert(Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(rawData, out _) == -1);
+
+            // The span is stored as-is; no copy of the data is made here.
+            Bytes = rawData;
+        }
+
+        // The bytes backing this instance, exactly as stored by the constructors.
+        public ReadOnlySpan<byte> Bytes { get; }
+
+        // The default instance.
+        public static Utf8Span Empty => default;
+
+        // True when the backing span contains no bytes.
+        public bool IsEmpty => Bytes.IsEmpty;
+
+        // Length of the backing span, measured in bytes (not chars or scalars).
+        internal int Length => Bytes.Length;
+
+        public Utf8Span this[Range range]
+        {
+            get
+            {
+                (int start, int count) = range.GetOffsetAndLength(Length);
+
+                // A slice may neither begin nor end in the middle of a multi-byte subsequence.
+                // Reminder: Unlike Utf8String, we can't safely dereference past the end of the span,
+                // so each byte is only inspected after confirming it lies inside the buffer.
+
+                ref byte sliceStart = ref DangerousGetMutableReference(start);
+                if (count > 0 && Utf8Utility.IsUtf8ContinuationByte(sliceStart))
+                {
+                    Utf8String.ThrowImproperStringSplit();
+                }
+
+                int end = start + count;
+                if (end < Length && Utf8Utility.IsUtf8ContinuationByte(DangerousGetMutableReference(end)))
+                {
+                    Utf8String.ThrowImproperStringSplit();
+                }
+
+                ReadOnlySpan<byte> slicedBytes = new ReadOnlySpan<byte>(ref sliceStart, count);
+                return UnsafeCreateWithoutValidation(slicedBytes);
+            }
+        }
+
+        /// <summary>
+        /// Returns a <em>mutable</em> reference to the first byte of this <see cref="Utf8Span"/>
+        /// (or, if this <see cref="Utf8Span"/> is empty, to where the first byte would be).
+        /// </summary>
+        /// <returns></returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal ref byte DangerousGetMutableReference()
+        {
+            // MemoryMarshal.GetReference hands back a writable ref even though 'Bytes' is a read-only span.
+            return ref MemoryMarshal.GetReference(Bytes);
+        }
+
+        /// <summary>
+        /// Returns a <em>mutable</em> reference to the element at index <paramref name="index"/>
+        /// of this <see cref="Utf8Span"/> instance. The index is not bounds-checked.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal ref byte DangerousGetMutableReference(int index)
+        {
+            // Only negativity is asserted here; the callee asserts the upper bound.
+            Debug.Assert(index >= 0, "Caller should've performed bounds checking.");
+            // The unsigned cast routes this call to the nuint overload instead of back to this one.
+            return ref DangerousGetMutableReference((uint)index);
+        }
+
+        /// <summary>
+        /// Returns a <em>mutable</em> reference to the element at index <paramref name="index"/>
+        /// of this <see cref="Utf8Span"/> instance. The index is not bounds-checked.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal ref byte DangerousGetMutableReference(nuint index)
+        {
+            // Allow retrieving references to just past the end of the span (but shouldn't dereference this).
+            // That is why the assertion below uses '<=' rather than '<'.
+
+            Debug.Assert(index <= (uint)Length, "Caller should've performed bounds checking.");
+            return ref Unsafe.AddByteOffset(ref DangerousGetMutableReference(), index);
+        }
+
+        public bool IsEmptyOrWhiteSpace()
+        {
+            // True when the first non-whitespace index reported by the helper equals the total length.
+            return Utf8Utility.GetIndexOfFirstNonWhiteSpaceChar(Bytes) == Length;
+        }
+
+        /// <summary>
+        /// This method is not supported as spans cannot be boxed. To compare two spans, use operator==.
+        /// </summary>
+        /// <exception cref="System.NotSupportedException">
+        /// Always thrown by this method.
+        /// </exception>
+        [Obsolete("Equals(object) on Utf8Span will always throw an exception. Use Equals(Utf8Span) or operator == instead.")]
+        [EditorBrowsable(EditorBrowsableState.Never)]
+        public override bool Equals(object? obj)
+        {
+            // Unconditionally throws; 'obj' is never inspected.
+            throw new NotSupportedException(SR.Utf8Span_CannotCallEqualsObject);
+        }
+
+        // Instance form of the static two-argument Equals; compares this instance with 'other'.
+        public bool Equals(Utf8Span other) => Equals(this, other);
+
+        // Instance form of the static three-argument Equals, using the supplied comparison.
+        public bool Equals(Utf8Span other, StringComparison comparison) => Equals(this, other, comparison);
+
+        // Byte-for-byte equality of the two backing spans.
+        public static bool Equals(Utf8Span left, Utf8Span right) => left.Bytes.SequenceEqual(right.Bytes);
+
+        public static bool Equals(Utf8Span left, Utf8Span right, StringComparison comparison)
+        {
+            // TODO_UTF8STRING: This perf can be improved, including removing
+            // the virtual dispatch by putting the switch directly in this method.
+
+            // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted.
+
+            // The comparer is resolved first, then both operands are converted to strings and compared.
+
+            StringComparer comparer = StringComparer.FromComparison(comparison);
+            return comparer.Equals(left.ToString(), right.ToString());
+        }
+
+        public override int GetHashCode()
+        {
+            // TODO_UTF8STRING: Consider whether this should use a different seed than String.GetHashCode.
+            // This method should only be called to calculate the hash code over spans that represent
+            // UTF-8 textual data, not over arbitrary binary sequences.
+
+            // The 64-bit default seed is split into its low and high 32-bit halves for the hash routine.
+
+            ulong marvinSeed = Marvin.DefaultSeed;
+            uint seedLow = (uint)marvinSeed;
+            uint seedHigh = (uint)(marvinSeed >> 32);
+
+            return Marvin.ComputeHash32(ref MemoryMarshal.GetReference(Bytes), (uint)Length /* in bytes */, seedLow, seedHigh);
+        }
+
+        public int GetHashCode(StringComparison comparison)
+        {
+            // TODO_UTF8STRING: This perf can be improved, including removing
+            // the virtual dispatch by putting the switch directly in this method.
+
+            // TODO_UTF8STRING: To avoid allocations, use Utf8StringComparer instead of StringComparer once it's submitted.
+
+            // The comparer is resolved first, then this instance is converted to a string and hashed.
+
+            StringComparer comparer = StringComparer.FromComparison(comparison);
+            return comparer.GetHashCode(this.ToString());
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> if this UTF-8 text consists of all-ASCII data,
+        /// <see langword="false"/> if there is any non-ASCII data within this UTF-8 text.
+        /// </summary>
+        /// <remarks>
+        /// ASCII text is defined as text consisting only of scalar values in the range [ U+0000..U+007F ].
+        /// The runtime of this method is O(n).
+        /// </remarks>
+        public bool IsAscii()
+        {
+            // TODO_UTF8STRING: Use an API that takes 'ref byte' instead of a 'byte*' as a parameter.
+
+            uint byteCount = (uint)Length;
+
+            unsafe
+            {
+                fixed (byte* pBytes = &MemoryMarshal.GetReference(Bytes))
+                {
+                    // The data is all-ASCII when the helper's reported index equals the byte count.
+                    return ASCIIUtility.GetIndexOfFirstNonAsciiByte(pBytes, byteCount) == byteCount;
+                }
+            }
+        }
+
+        public bool IsNormalized(NormalizationForm normalizationForm = NormalizationForm.FormC)
+        {
+            // TODO_UTF8STRING: Avoid allocations in this code path (ToString() creates a temporary string).
+            string utf16Text = ToString();
+            return utf16Text.IsNormalized(normalizationForm);
+        }
+
+        /// <summary>
+        /// Gets an immutable reference that can be used in a <see langword="fixed"/> statement. Unlike
+        /// <see cref="Utf8String"/>, the resulting reference is not guaranteed to be null-terminated.
+        /// </summary>
+        /// <remarks>
+        /// If this <see cref="Utf8Span"/> instance is empty, returns a null reference. Dereferencing
+        /// such a reference will result in a <see cref="NullReferenceException"/> being generated.
+        /// </remarks>
+        [EditorBrowsable(EditorBrowsableState.Never)]
+        public ref readonly byte GetPinnableReference()
+        {
+            // This returns a null reference if the underlying span is empty. Unlike Utf8String,
+            // these buffers are not guaranteed to be null-terminated, so it's not always
+            // safe or meaningful to dereference the element just past the end of the buffer.
+
+            return ref Bytes.GetPinnableReference();
+        }
+
+        public override string ToString()
+        {
+            // TODO_UTF8STRING: Since we know the underlying data is immutable, well-formed UTF-8,
+            // we can perform transcoding using an optimized code path that skips all safety checks.
+            ReadOnlySpan<byte> utf8Bytes = Bytes;
+            return Encoding.UTF8.GetString(utf8Bytes);
+        }
+
+        /// <summary>
+        /// Converts this <see cref="Utf8Span"/> instance to a <see cref="string"/> without replacing invalid data.
+        /// </summary>
+        /// <remarks>
+        /// This routine throws <see cref="InvalidOperationException"/> if the underlying instance
+        /// contains invalid UTF-8 data.
+        /// </remarks>
+        internal unsafe string ToStringNoReplacement()
+        {
+            // TODO_UTF8STRING: Optimize the call below, potentially by avoiding the two-pass.
+            // Pass 1 validates the bytes and computes the UTF-16 length; pass 2 (in string.Create) transcodes.
+            fixed (byte* pData = &MemoryMarshal.GetReference(Bytes))
+            {
+                byte* pFirstInvalidByte = Utf8Utility.GetPointerToFirstInvalidByte(pData, Length, out int utf16CodeUnitCountAdjustment, out _);
+                if (pFirstInvalidByte != pData + (uint)Length)
+                {
+                    // Saw bad UTF-8 data.
+                    // TODO_UTF8STRING: Throw a better exception below?
+
+                    ThrowHelper.ThrowInvalidOperationException();
+                }
+
+                int utf16CharCount = Length + utf16CodeUnitCountAdjustment;
+                Debug.Assert(utf16CharCount <= Length && utf16CharCount >= 0);
+
+                // TODO_UTF8STRING: Can we call string.FastAllocate directly?
+                // Pointers can't be generic type arguments, so pData travels as IntPtr; it stays pinned for the whole call.
+                return string.Create(utf16CharCount, (pbData: (IntPtr)pData, cbData: Length), (chars, state) =>
+                {
+                    OperationStatus status = Utf8.ToUtf16(new ReadOnlySpan<byte>((byte*)state.pbData, state.cbData), chars, out _, out _, replaceInvalidSequences: false);
+                    Debug.Assert(status == OperationStatus.Done, "Did somebody mutate the buffer underlying this Utf8Span unexpectedly?");
+                });
+            }
+        }
+
+        public Utf8String ToUtf8String()
+        {
+            // TODO_UTF8STRING: Since we know the underlying data is immutable, well-formed UTF-8, no transcoding
+            // is needed here; we could copy the bytes using an optimized code path that skips all safety checks.
+            ReadOnlySpan<byte> utf8Bytes = Bytes;
+            return new Utf8String(utf8Bytes);
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Utf8Span"/> that wraps the provided <paramref name="buffer"/> directly,
+        /// skipping validation of the input data.
+        /// </summary>
+        /// <remarks>
+        /// Callers must uphold the following two invariants:
+        ///
+        /// (a) <paramref name="buffer"/> consists only of well-formed UTF-8 data and does
+        ///     not contain invalid or incomplete UTF-8 subsequences; and
+        /// (b) the contents of <paramref name="buffer"/> will not change for the duration
+        ///     of the returned <see cref="Utf8Span"/>'s existence.
+        ///
+        /// If these invariants are not maintained, the runtime may exhibit undefined behavior.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Utf8Span UnsafeCreateWithoutValidation(ReadOnlySpan<byte> buffer)
+        {
+            return new Utf8Span(buffer);
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs

index e34149f..85ee986 100644 (file)
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs
@@ -4,6 +4,8 @@
  
  using System.Runtime.CompilerServices;
  using System.Runtime.InteropServices;
+using System.Text;
+using System.Text.Unicode;
  using Internal.Runtime.CompilerServices;
  
  namespace System
@@ -26,7 +28,7 @@ namespace System
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static ReadOnlySpan<byte> AsBytes(this Utf8String? text)
          {
-            if (text == null)
+            if (text is null)
                  return default;
  
              return new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(), text.Length);
@@ -44,7 +46,7 @@ namespace System
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static ReadOnlySpan<byte> AsBytes(this Utf8String? text, int start)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -70,7 +72,7 @@ namespace System
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static ReadOnlySpan<byte> AsBytes(this Utf8String? text, int start, int length)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0 || length != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -90,21 +92,21 @@ namespace System
          }
  
          /// <summary>
-        /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>.
+        /// Creates a new <see cref="Utf8Span"/> over the target <see cref="Utf8String"/>.
          /// </summary>
          /// <param name="text">The target <see cref="Utf8String"/>.</param>
          /// <remarks>Returns default when <paramref name="text"/> is null.</remarks>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static ReadOnlySpan<Char8> AsSpan(this Utf8String? text)
+        public static Utf8Span AsSpan(this Utf8String? text)
          {
-            if (text == null)
+            if (text is null)
                  return default;
  
-            return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference()), text.Length);
+            return new Utf8Span(text);
          }
  
          /// <summary>
-        /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>.
+        /// Creates a new <see cref="Utf8Span"/> over the portion of the target <see cref="Utf8String"/>.
          /// </summary>
          /// <param name="text">The target <see cref="Utf8String"/>.</param>
          /// <param name="start">The index at which to begin this slice.</param>
@@ -112,10 +114,13 @@ namespace System
          /// <exception cref="System.ArgumentOutOfRangeException">
          /// Thrown when the specified <paramref name="start"/> index is not in range (&lt;0 or &gt;text.Length).
          /// </exception>
+        /// <exception cref="InvalidOperationException">
+        /// Thrown if the resulting span would split a multi-byte UTF-8 subsequence.
+        /// </exception>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static ReadOnlySpan<Char8> AsSpan(this Utf8String? text, int start)
+        public static Utf8Span AsSpan(this Utf8String? text, int start)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -125,11 +130,20 @@ namespace System
              if ((uint)start > (uint)text.Length)
                  ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
  
-            return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference(start)), text.Length - start);
+            // It's always safe for us to read just past the end of the string (since there's a null terminator),
+            // so we don't need to perform any additional bounds checking. We only need to check that we're not
+            // splitting in the middle of a multi-byte UTF-8 subsequence.
+
+            if (Utf8Utility.IsUtf8ContinuationByte(text.DangerousGetMutableReference(start)))
+            {
+                Utf8String.ThrowImproperStringSplit();
+            }
+
+            return Utf8Span.UnsafeCreateWithoutValidation(new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(start), text.Length - start));
          }
  
          /// <summary>
-        /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>.
+        /// Creates a new <see cref="Utf8Span"/> over the portion of the target <see cref="Utf8String"/>.
          /// </summary>
          /// <param name="text">The target <see cref="Utf8String"/>.</param>
          /// <param name="start">The index at which to begin this slice.</param>
@@ -138,10 +152,13 @@ namespace System
          /// <exception cref="System.ArgumentOutOfRangeException">
          /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range.
          /// </exception>
+        /// <exception cref="InvalidOperationException">
+        /// Thrown if the resulting span would split a multi-byte UTF-8 subsequence.
+        /// </exception>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static ReadOnlySpan<Char8> AsSpan(this Utf8String? text, int start, int length)
+        public static Utf8Span AsSpan(this Utf8String? text, int start, int length)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0 || length != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -157,7 +174,17 @@ namespace System
                  ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
  #endif
  
-            return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference(start)), length);
+            // It's always safe for us to read just past the end of the string (since there's a null terminator),
+            // so we don't need to perform any additional bounds checking. We only need to check that we're not
+            // splitting in the middle of a multi-byte UTF-8 subsequence.
+
+            if (Utf8Utility.IsUtf8ContinuationByte(text.DangerousGetMutableReference(start))
+                || Utf8Utility.IsUtf8ContinuationByte(text.DangerousGetMutableReference(start + length)))
+            {
+                Utf8String.ThrowImproperStringSplit();
+            }
+
+            return Utf8Span.UnsafeCreateWithoutValidation(new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(start), length));
          }
  
          /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary>
@@ -165,7 +192,7 @@ namespace System
          /// <remarks>Returns default when <paramref name="text"/> is null.</remarks>
          public static ReadOnlyMemory<Char8> AsMemory(this Utf8String? text)
          {
-            if (text == null)
+            if (text is null)
                  return default;
  
              return new ReadOnlyMemory<Char8>(text, 0, text.Length);
@@ -180,7 +207,7 @@ namespace System
          /// </exception>
          public static ReadOnlyMemory<Char8> AsMemory(this Utf8String? text, int start)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -198,7 +225,7 @@ namespace System
          /// <param name="startIndex">The index at which to begin this slice.</param>
          public static ReadOnlyMemory<Char8> AsMemory(this Utf8String? text, Index startIndex)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (!startIndex.Equals(Index.Start))
                      ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text);
@@ -223,7 +250,7 @@ namespace System
          /// </exception>
          public static ReadOnlyMemory<Char8> AsMemory(this Utf8String? text, int start, int length)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0 || length != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -247,7 +274,7 @@ namespace System
          /// <param name="range">The range used to indicate the start and length of the sliced string.</param>
          public static ReadOnlyMemory<Char8> AsMemory(this Utf8String? text, Range range)
          {
-            if (text == null)
+            if (text is null)
              {
                  Index startIndex = range.Start;
                  Index endIndex = range.End;
@@ -267,7 +294,7 @@ namespace System
          /// <remarks>Returns default when <paramref name="text"/> is null.</remarks>
          public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String? text)
          {
-            if (text == null)
+            if (text is null)
                  return default;
  
              return new ReadOnlyMemory<byte>(text, 0, text.Length);
@@ -282,7 +309,7 @@ namespace System
          /// </exception>
          public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String? text, int start)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -300,7 +327,7 @@ namespace System
          /// <param name="startIndex">The index at which to begin this slice.</param>
          public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String? text, Index startIndex)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (!startIndex.Equals(Index.Start))
                      ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text);
@@ -325,7 +352,7 @@ namespace System
          /// </exception>
          public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String? text, int start, int length)
          {
-            if (text == null)
+            if (text is null)
              {
                  if (start != 0 || length != 0)
                      ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
@@ -349,7 +376,7 @@ namespace System
          /// <param name="range">The range used to indicate the start and length of the sliced string.</param>
          public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String? text, Range range)
          {
-            if (text == null)
+            if (text is null)
              {
                  Index startIndex = range.Start;
                  Index endIndex = range.End;
@@ -363,5 +390,10 @@ namespace System
              (int start, int length) = range.GetOffsetAndLength(text.Length);
              return new ReadOnlyMemory<byte>(text, start, length);
          }
+
+        /// <summary>
+        /// Creates a new <see cref="Utf8String"/> representation of this <see cref="Rune"/>.
+        /// </summary>
+        public static Utf8String ToUtf8String(this Rune rune) => Utf8String.CreateFromRune(rune);
      }
  }
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs

new file mode 100644 (file)

index 0000000..1a20f36
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs
@@ -0,0 +1,142 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Diagnostics;
+using System.Text;
+
+namespace System
+{
+    public sealed partial class Utf8String
+    {
+        /*
+         * COMPARISON OF UTF-8 AGAINST UTF-16
+         */
+
+        /// <summary>
+        /// Returns a value stating whether <paramref name="utf8Text"/> and <paramref name="utf16Text"/>
+        /// represent the same data. An ordinal comparison is performed scalar-by-scalar.
+        /// </summary>
+        /// <remarks>
+        /// This method returns <see langword="true"/> if both <paramref name="utf8Text"/> and
+        /// <paramref name="utf16Text"/> are null, or if both are empty. This method returns <see langword="false"/>
+        /// if either input contains an ill-formed subsequence. Otherwise, this method returns <see langword="true"/>
+        /// if and only if both arguments decode to the same Unicode scalar value sequence.
+        /// </remarks>
+        public static bool AreEquivalent(Utf8String? utf8Text, string? utf16Text)
+        {
+            if (ReferenceEquals(utf8Text, utf16Text))
+            {
+                return true; // both are null
+            }
+
+            if (utf8Text is null || utf16Text is null)
+            {
+                return false; // null is never equivalent to non-null
+            }
+
+            if (utf8Text.Length == 0 && utf16Text.Length == 0)
+            {
+                return true; // empty is equivalent to empty
+            }
+
+            // Short-circuit: are the texts of sufficiently different lengths that
+            // they could never be equivalent? This check allows us to skip the
+            // normal decoding walk, which is O(n).
+            //
+            // The maximum length of a 'System.String' is around 1 billion elements,
+            // so we can perform the multiplication within an unsigned 32-bit domain.
+
+            Debug.Assert((ulong)utf16Text.Length * MAX_UTF8_BYTES_PER_UTF16_CHAR <= uint.MaxValue, "Did somebody change the max. allowed string length?");
+
+            if (utf8Text.Length < utf16Text.Length
+                || ((uint)utf16Text.Length * MAX_UTF8_BYTES_PER_UTF16_CHAR < (uint)utf8Text.Length))
+            {
+                return false;
+            }
+
+            return AreEquivalentOrdinalSkipShortCircuitingChecks(utf8Text.AsBytes(), utf16Text);
+        }
+
+        /// <summary>
+        /// Returns a value stating whether <paramref name="utf8Text"/> and <paramref name="utf16Text"/>
+        /// represent the same data. An ordinal comparison is performed scalar-by-scalar.
+        /// </summary>
+        /// <remarks>
+        /// This method returns <see langword="true"/> if both <paramref name="utf8Text"/> and
+        /// <paramref name="utf16Text"/> are empty. This method returns <see langword="false"/>
+        /// if either input contains an ill-formed subsequence. Otherwise, this method returns <see langword="true"/>
+        /// if and only if both arguments decode to the same Unicode scalar value sequence.
+        /// </remarks>
+        public static bool AreEquivalent(Utf8Span utf8Text, ReadOnlySpan<char> utf16Text) => AreEquivalent(utf8Text.Bytes, utf16Text);
+
+        /// <summary>
+        /// Returns a value stating whether <paramref name="utf8Text"/> and <paramref name="utf16Text"/>
+        /// represent the same data. An ordinal comparison is performed scalar-by-scalar.
+        /// </summary>
+        /// <remarks>
+        /// This method returns <see langword="true"/> if both <paramref name="utf8Text"/> and
+        /// <paramref name="utf16Text"/> are empty. This method returns <see langword="false"/>
+        /// if either input contains an ill-formed subsequence. Otherwise, this method returns <see langword="true"/>
+        /// if and only if both arguments decode to the same Unicode scalar value sequence.
+        /// </remarks>
+        public static bool AreEquivalent(ReadOnlySpan<byte> utf8Text, ReadOnlySpan<char> utf16Text)
+        {
+            if (utf8Text.Length == 0 && utf16Text.Length == 0)
+            {
+                // Don't use IsEmpty for this check; JIT can optimize "Length == 0" better
+                // for this particular scenario.
+
+                return true;
+            }
+
+            // Same check as the (Utf8String, string) overload. The primary difference is that
+            // since spans can be up to 2 billion elements in length, we need to perform
+            // the multiplication step in the unsigned 64-bit domain to avoid integer overflow.
+
+            if (utf8Text.Length < utf16Text.Length
+                || ((ulong)(uint)utf16Text.Length * MAX_UTF8_BYTES_PER_UTF16_CHAR < (uint)utf8Text.Length))
+            {
+                return false;
+            }
+
+            return AreEquivalentOrdinalSkipShortCircuitingChecks(utf8Text, utf16Text);
+        }
+
+        private static bool AreEquivalentOrdinalSkipShortCircuitingChecks(ReadOnlySpan<byte> utf8Text, ReadOnlySpan<char> utf16Text)
+        {
+            while (!utf16Text.IsEmpty)
+            {
+                // If the next UTF-16 subsequence is malformed or incomplete, or if the next
+                // UTF-8 subsequence is malformed or incomplete, or if they don't decode to
+                // the exact same Unicode scalar value, fail.
+                //
+                // The Rune.DecodeFrom* APIs handle empty inputs just fine and return "NeedMoreData".
+
+                // TODO_UTF8STRING: If we assume Utf8String contains well-formed UTF-8, we could
+                // create a version of this method that calls a faster implementation of DecodeFromUtf8.
+                // We'd need to be careful not to call that optimized routine if the user passed
+                // us a normal ROS<byte> that didn't originate from a Utf8String or similar.
+
+                if (Rune.DecodeFromUtf16(utf16Text, out Rune scalarFromUtf16, out int charsConsumedJustNow) != OperationStatus.Done
+                    || Rune.DecodeFromUtf8(utf8Text, out Rune scalarFromUtf8, out int bytesConsumedJustNow) != OperationStatus.Done
+                    || scalarFromUtf16 != scalarFromUtf8)
+                {
+                    return false;
+                }
+
+                // TODO_UTF8STRING: As an optimization, we could perform unsafe slices below.
+
+                utf16Text = utf16Text.Slice(charsConsumedJustNow);
+                utf8Text = utf8Text.Slice(bytesConsumedJustNow);
+            }
+
+            // We decoded the entire UTF-16 input, and so far it has matched the decoded form
+            // of the UTF-8 input. Now just make sure we've also decoded the entirety of the
+            // UTF-8 data, otherwise the input strings aren't equivalent.
+
+            return utf8Text.IsEmpty;
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs

index 4b678b6..bbaecf0 100644 (file)
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs
@@ -2,6 +2,7 @@
  // The .NET Foundation licenses this file to you under the MIT license.
  // See the LICENSE file in the project root for more information.
  
+using System.Diagnostics;
  using System.Runtime.CompilerServices;
  using System.Runtime.InteropServices;
  using System.Text;
@@ -169,6 +170,16 @@ namespace System
  #endif
          private Utf8String Ctor(string? value) => Ctor(value.AsSpan());
  
+        internal static Utf8String CreateFromRune(Rune value)
+        {
+            // Allocate exactly as many bytes as the rune's UTF-8 encoding needs, then encode in place.
+            Utf8String result = FastAllocate(value.Utf8SequenceLength);
+            Span<byte> destination = new Span<byte>(ref result.DangerousGetMutableReference(), result.Length);
+            int written = value.EncodeToUtf8(destination);
+            Debug.Assert(written == value.Utf8SequenceLength);
+            return result;
+        }
+
          /*
           * HELPER METHODS
           */
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs

index 4cb7816..0767abe 100644 (file)
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs
@@ -6,6 +6,7 @@ using System.ComponentModel;
  using System.Diagnostics;
  using System.Diagnostics.CodeAnalysis;
  using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
  using System.Text;
  using Internal.Runtime.CompilerServices;
  
@@ -19,6 +20,9 @@ namespace System
          IEquatable<Utf8String>
  #nullable restore
      {
+        // A BMP char needs at most 3 UTF-8 bytes; beyond U+FFFF it's 4 UTF-8 bytes per 2 UTF-16 chars (2:1 ratio)
+        private const int MAX_UTF8_BYTES_PER_UTF16_CHAR = 3;
+
          /*
           * STATIC FIELDS
           */
@@ -55,7 +59,12 @@ namespace System
          /// <summary>
          /// Projects a <see cref="Utf8String"/> instance as a <see cref="ReadOnlySpan{Char8}"/>.
          /// </summary>
-        public static implicit operator ReadOnlySpan<Char8>(Utf8String? value) => value.AsSpan();
+        public static implicit operator ReadOnlySpan<Char8>(Utf8String? value) => MemoryMarshal.Cast<byte, Char8>(value.AsSpan().Bytes);
+
+        /// <summary>
+        /// Projects a <see cref="Utf8String"/> instance as a <see cref="Utf8Span"/>.
+        /// </summary>
+        public static implicit operator Utf8Span(Utf8String? value) => new Utf8Span(value);
  
          /*
           * INSTANCE PROPERTIES
@@ -233,5 +242,12 @@ namespace System
  
              return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref DangerousGetMutableReference(), Length));
          }
+
+        [StackTraceHidden]
+        internal static void ThrowImproperStringSplit()
+        {
+            // Shared throw helper for slices that would begin or end inside a multi-byte UTF-8 subsequence.
+            throw new InvalidOperationException(SR.Utf8String_CannotSplitMultibyteSubsequence);
+        }
      }
  }
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs

new file mode 100644 (file)

index 0000000..29a00a2
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs
@@ -0,0 +1,17 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System
+{
+    // TODO_UTF8STRING: This should be removed and we should use regular StringSplitOptions
+    // once a 'TrimEntries' flag gets added to the type.
+
+    [Flags] // each option occupies its own bit, so options can be OR-ed together
+    public enum Utf8StringSplitOptions
+    {
+        None = 0,
+        RemoveEmptyEntries = 1 << 0,
+        TrimEntries = 1 << 1
+    }
+}
diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems

index 5e8a46b..3745146 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
+++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
@@ -814,6 +814,7 @@
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.Debug.cs" Condition="'$(Configuration)' == 'Debug'" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringRuneEnumerator.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\TrimType.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" />
@@ -828,6 +829,7 @@
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Helpers.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Transcoding.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Validation.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.WhiteSpace.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
      <Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs

index eecfc26..3b8350f 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs
@@ -1013,7 +1013,7 @@ namespace System.Globalization
          /// The following IndexOf overload is mainly used by String.Replace. This overload assumes the parameters are already validated
          /// and the caller is passing a valid matchLengthPtr pointer.
          /// </summary>
-        internal unsafe int IndexOf(string source, string value, int startIndex, int count, CompareOptions options, int* matchLengthPtr)
+        internal unsafe int IndexOf(string source, string value, int startIndex, int count, CompareOptions options, int* matchLengthPtr, bool fromBeginning = true)
          {
              Debug.Assert(source != null);
              Debug.Assert(value != null);
@@ -1036,7 +1036,16 @@ namespace System.Globalization
  
              if (options == CompareOptions.OrdinalIgnoreCase)
              {
-                int res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase: true);
+                int res;
+                if (fromBeginning)
+                {
+                    res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase: true);
+                }
+                else
+                {
+                    res = LastIndexOfOrdinal(source, value, startIndex, count, ignoreCase: true);
+                }
+
                  if (res >= 0 && matchLengthPtr != null)
                  {
                      *matchLengthPtr = value.Length;
@@ -1046,7 +1055,18 @@ namespace System.Globalization
  
              if (GlobalizationMode.Invariant)
              {
-                int res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase: (options & (CompareOptions.IgnoreCase | CompareOptions.OrdinalIgnoreCase)) != 0);
+                bool ignoreCase = (options & (CompareOptions.IgnoreCase | CompareOptions.OrdinalIgnoreCase)) != 0;
+                int res;
+
+                if (fromBeginning)
+                {
+                    res = IndexOfOrdinal(source, value, startIndex, count, ignoreCase);
+                }
+                else
+                {
+                    res = LastIndexOfOrdinal(source, value, startIndex, count, ignoreCase);
+                }
+
                  if (res >= 0 && matchLengthPtr != null)
                  {
                      *matchLengthPtr = value.Length;
@@ -1056,11 +1076,24 @@ namespace System.Globalization
  
              if (options == CompareOptions.Ordinal)
              {
-                int retValue = SpanHelpers.IndexOf(
-                    ref Unsafe.Add(ref source.GetRawStringData(), startIndex),
-                    count,
-                    ref value.GetRawStringData(),
-                    value.Length);
+                int retValue;
+
+                if (fromBeginning)
+                {
+                    retValue = SpanHelpers.IndexOf(
+                        ref Unsafe.Add(ref source.GetRawStringData(), startIndex),
+                        count,
+                        ref value.GetRawStringData(),
+                        value.Length);
+                }
+                else
+                {
+                    retValue = SpanHelpers.LastIndexOf(
+                        ref Unsafe.Add(ref source.GetRawStringData(), startIndex),
+                        count,
+                        ref value.GetRawStringData(),
+                        value.Length);
+                }
  
                  if (retValue >= 0)
                  {
@@ -1075,7 +1108,15 @@ namespace System.Globalization
              }
              else
              {
-                return IndexOfCore(source, value, startIndex, count, options, matchLengthPtr);
+                if (fromBeginning)
+                {
+                    // Call the string-based overload, as it special-cases IsFastSort as a perf optimization.
+                    return IndexOfCore(source, value, startIndex, count, options, matchLengthPtr);
+                }
+                else
+                {
+                    return IndexOfCore(source.AsSpan(startIndex, count), value, options, matchLengthPtr, fromBeginning: false);
+                }
              }
          }
  
diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs

index 0c79ce3..4b190ee 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs
@@ -1784,7 +1784,7 @@ namespace System
              int start = 0;
  
              // Trim specified characters.
-            if (trimType != TrimType.Tail)
+            if ((trimType & TrimType.Head) != 0)
              {
                  for (start = 0; start < Length; start++)
                  {
@@ -1795,7 +1795,7 @@ namespace System
                  }
              }
  
-            if (trimType != TrimType.Head)
+            if ((trimType & TrimType.Tail) != 0)
              {
                  for (end = Length - 1; end >= start; end--)
                  {
@@ -1820,7 +1820,7 @@ namespace System
              int start = 0;
  
              // Trim specified characters.
-            if (trimType != TrimType.Tail)
+            if ((trimType & TrimType.Head) != 0)
              {
                  for (start = 0; start < Length; start++)
                  {
@@ -1841,7 +1841,7 @@ namespace System
                  }
              }
  
-            if (trimType != TrimType.Head)
+            if ((trimType & TrimType.Tail) != 0)
              {
                  for (end = Length - 1; end >= start; end--)
                  {
@@ -1873,12 +1873,5 @@ namespace System
                  len == 0 ? string.Empty :
                  InternalSubString(start, len);
          }
-
-        private enum TrimType
-        {
-            Head = 0,
-            Tail = 1,
-            Both = 2
-        }
      }
  }
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs b/src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs

new file mode 100644 (file)

index 0000000..db75688
--- /dev/null
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs
@@ -0,0 +1,30 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+
+namespace System.Text
+{
+    /// <summary>
+    /// Specifies which portions of the string should be trimmed in a trimming operation.
+    /// </summary>
+    [Flags]
+    internal enum TrimType
+    {
+        /// <summary>
+        /// Trim from the beginning of the string.
+        /// </summary>
+        Head = 1 << 0,
+
+        /// <summary>
+        /// Trim from the end of the string.
+        /// </summary>
+        Tail = 1 << 1,
+
+        /// <summary>
+        /// Trim from both the beginning and the end of the string.
+        /// </summary>
+        Both = Head | Tail
+    }
+}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs

index 46eb8b5..693c6f4 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs
@@ -431,7 +431,7 @@ namespace System.Text.Unicode
          /// i.e., has binary representation 10xxxxxx, where x is any bit.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsUtf8ContinuationByte(in byte value)
+        internal static bool IsUtf8ContinuationByte(in byte value)
          {
              // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
              // directly rather than bounce a temporary through a register. That is, we want the JIT to be
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs

new file mode 100644 (file)

index 0000000..968144a
--- /dev/null
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs
@@ -0,0 +1,139 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Internal.Runtime.CompilerServices;
+
+#pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif
+
+namespace System.Text.Unicode
+{
+    internal static partial class Utf8Utility
+    {
+        /// <summary>
+        /// Returns the index in <paramref name="utf8Data"/> where the first non-whitespace character
+        /// appears, or the input length if the data contains only whitespace characters.
+        /// </summary>
+        public static int GetIndexOfFirstNonWhiteSpaceChar(ReadOnlySpan<byte> utf8Data)
+        {
+            return (int)GetIndexOfFirstNonWhiteSpaceChar(ref MemoryMarshal.GetReference(utf8Data), (uint)utf8Data.Length);
+        }
+
+        private static nuint GetIndexOfFirstNonWhiteSpaceChar(ref byte utf8Data, nuint length)
+        {
+            // This method is optimized for the case where the input data is ASCII, and if the
+            // data does need to be trimmed it's likely that only a relatively small number of
+            // bytes will be trimmed.
+
+            nuint i = 0;
+
+            while (i < length)
+            {
+                // Very quick check: see if the byte is in the range [ 21 .. 7F ].
+                // If so, we can skip the more expensive logic later in this method.
+
+                if ((sbyte)Unsafe.AddByteOffset(ref utf8Data, i) > (sbyte)0x20)
+                {
+                    break;
+                }
+
+                uint possibleAsciiByte = Unsafe.AddByteOffset(ref utf8Data, i);
+                if (UnicodeUtility.IsAsciiCodePoint(possibleAsciiByte))
+                {
+                    // The quick check above failed but the byte is ASCII, so it must be
+                    // in the range [ 00 .. 20 ]. Delegate to Rune's inlined method
+                    // implementation to decide whether it is whitespace.
+
+                    if (Rune.IsWhiteSpace(Rune.UnsafeCreate(possibleAsciiByte)))
+                    {
+                        i++;
+                        continue;
+                    }
+                }
+                else
+                {
+                    // Not ASCII data. Go back to the slower "decode the entire scalar"
+                    // code path, then compare it against our Unicode tables.
+
+                    Rune.DecodeFromUtf8(new ReadOnlySpan<byte>(ref utf8Data, (int)length).Slice((int)i), out Rune decodedRune, out int bytesConsumed);
+                    if (Rune.IsWhiteSpace(decodedRune))
+                    {
+                        i += (uint)bytesConsumed;
+                        continue;
+                    }
+                }
+
+                break; // If we got here, we saw a non-whitespace subsequence.
+            }
+
+            return i;
+        }
+
+        /// <summary>
+        /// Returns the index in <paramref name="utf8Data"/> where the trailing whitespace sequence
+        /// begins, or 0 if the data contains only whitespace characters, or the span length if the
+        /// data does not end with any whitespace characters.
+        /// </summary>
+        public static int GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan<byte> utf8Data)
+        {
+            return (int)GetIndexOfTrailingWhiteSpaceSequence(ref MemoryMarshal.GetReference(utf8Data), (uint)utf8Data.Length);
+        }
+
+        private static nuint GetIndexOfTrailingWhiteSpaceSequence(ref byte utf8Data, nuint length)
+        {
+            // This method is optimized for the case where the input data is ASCII, and if the
+            // data does need to be trimmed it's likely that only a relatively small number of
+            // bytes will be trimmed.
+
+            while (length > 0)
+            {
+                // Very quick check: see if the byte is in the range [ 21 .. 7F ].
+                // If so, we can skip the more expensive logic later in this method.
+
+                if ((sbyte)Unsafe.Add(ref Unsafe.AddByteOffset(ref utf8Data, length), -1) > (sbyte)0x20)
+                {
+                    break;
+                }
+
+                uint possibleAsciiByte = Unsafe.Add(ref Unsafe.AddByteOffset(ref utf8Data, length), -1);
+                if (UnicodeUtility.IsAsciiCodePoint(possibleAsciiByte))
+                {
+                    // The quick check above failed but the byte is ASCII, so it must be
+                    // in the range [ 00 .. 20 ]. Delegate to Rune's inlined method
+                    // implementation to decide whether it is whitespace.
+
+                    if (Rune.IsWhiteSpace(Rune.UnsafeCreate(possibleAsciiByte)))
+                    {
+                        length--;
+                        continue;
+                    }
+                }
+                else
+                {
+                    // Not ASCII data. Go back to the slower "decode the entire scalar"
+                    // code path, then compare it against our Unicode tables.
+
+                    Rune.DecodeLastFromUtf8(new ReadOnlySpan<byte>(ref utf8Data, (int)length), out Rune decodedRune, out int bytesConsumed);
+                    if (Rune.IsWhiteSpace(decodedRune))
+                    {
+                        length -= (uint)bytesConsumed;
+                        continue;
+                    }
+                }
+
+                break; // If we got here, we saw a non-whitespace subsequence.
+            }
+
+            return length;
+        }
+    }
+}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs

index 50a3e3f..01aa0a9 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs
@@ -4,7 +4,6 @@
  
  using System.Buffers;
  using System.Diagnostics;
-using System.Diagnostics.CodeAnalysis;
  using System.IO;
  using System.Runtime.CompilerServices;
  using System.Runtime.InteropServices;
@@ -46,15 +45,28 @@ namespace System.Text.Unicode
  
  #if FEATURE_UTF8STRING
          /// <summary>
+        /// Returns a value stating whether <paramref name="utf8Data"/> contains only well-formed UTF-8 data.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe bool IsWellFormedUtf8(ReadOnlySpan<byte> utf8Data)
+        {
+            fixed (byte* pUtf8Data = &MemoryMarshal.GetReference(utf8Data))
+            {
+                // The return value here will point to the end of the span if the data is well-formed.
+                byte* pFirstInvalidByte = GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int _, out _);
+                return (pFirstInvalidByte == (pUtf8Data + (uint)utf8Data.Length));
+            }
+        }
+
+        /// <summary>
          /// Returns <paramref name="value"/> if it is null or contains only well-formed UTF-8 data;
          /// otherwises allocates a new <see cref="Utf8String"/> instance containing the same data as
          /// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced
-        /// with U+FFD.
+        /// with U+FFFD.
          /// </summary>
-        [return: NotNullIfNotNull("value")]
-        public static Utf8String? ValidateAndFixupUtf8String(Utf8String? value)
+        public static Utf8String ValidateAndFixupUtf8String(Utf8String value)
          {
-            if (Utf8String.IsNullOrEmpty(value))
+            if (value.Length == 0)
              {
                  return value;
              }
author	Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
	Thu, 26 Sep 2019 00:36:44 +0000 (17:36 -0700)
committer	GitHub <noreply@github.com>
	Thu, 26 Sep 2019 00:36:44 +0000 (17:36 -0700)
src/coreclr/src/System.Private.CoreLib/Resources/Strings.resx		patch \| blob \| history
src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj		patch \| blob \| history
src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Comparison.cs	[new file with mode: 0644]	patch \| blob
src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Conversion.cs	[new file with mode: 0644]	patch \| blob
src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Enumeration.cs	[new file with mode: 0644]	patch \| blob
src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Manipulation.cs	[new file with mode: 0644]	patch \| blob
src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.Searching.cs	[new file with mode: 0644]	patch \| blob
src/coreclr/src/System.Private.CoreLib/src/System/Text/Utf8Span.cs	[new file with mode: 0644]	patch \| blob
src/coreclr/src/System.Private.CoreLib/src/System/Utf8Extensions.cs		patch \| blob \| history
src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Comparison.cs	[new file with mode: 0644]	patch \| blob
src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs		patch \| blob \| history
src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs		patch \| blob \| history
src/coreclr/src/System.Private.CoreLib/src/System/Utf8StringSplitOptions.cs	[new file with mode: 0644]	patch \| blob
src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/Text/TrimType.cs	[new file with mode: 0644]	patch \| blob
src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.WhiteSpace.cs	[new file with mode: 0644]	patch \| blob
src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs		patch \| blob \| history