Add Utf8String skeleton (#23209)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Tue, 19 Mar 2019 05:58:32 +0000 (22:58 -0700)
committer GitHub <noreply@github.com>
Tue, 19 Mar 2019 05:58:32 +0000 (22:58 -0700)
Utf8String is an experimental type that is string-like (heap-allocated, immutable, variable-length, null-terminated) but whose inner representation is UTF-8, not UTF-16.

This is a skeleton implementation of the basic API shape. The ecosystem of APIs has not yet been built around it. All Utf8String-related code is currently surrounded by ifdefs to allow easy identification and removal from release branches.

43 files changed:
clr.defines.targets
clrdefinitions.cmake
src/System.Private.CoreLib/System.Private.CoreLib.csproj
src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
src/System.Private.CoreLib/shared/System/Memory.cs
src/System.Private.CoreLib/shared/System/ReadOnlyMemory.cs
src/System.Private.CoreLib/shared/System/ReadOnlySpan.Fast.cs
src/System.Private.CoreLib/shared/System/Runtime/InteropServices/MemoryMarshal.cs
src/System.Private.CoreLib/shared/System/Span.Fast.cs
src/System.Private.CoreLib/shared/System/String.cs
src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs [new file with mode: 0644]
src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
src/System.Private.CoreLib/src/System/Char8.cs [new file with mode: 0644]
src/System.Private.CoreLib/src/System/Utf8Extensions.cs [new file with mode: 0644]
src/System.Private.CoreLib/src/System/Utf8String.Construction.cs [new file with mode: 0644]
src/System.Private.CoreLib/src/System/Utf8String.Manipulation.cs [new file with mode: 0644]
src/System.Private.CoreLib/src/System/Utf8String.Searching.cs [new file with mode: 0644]
src/System.Private.CoreLib/src/System/Utf8String.cs [new file with mode: 0644]
src/classlibnative/bcltype/objectnative.cpp
src/inc/dacvars.h
src/strongname/api/common.h
src/vm/appdomain.cpp
src/vm/classnames.h
src/vm/common.h
src/vm/ecall.cpp
src/vm/ecall.h
src/vm/ecalllist.h
src/vm/gchelpers.cpp
src/vm/gchelpers.h
src/vm/jithelpers.cpp
src/vm/jitinterface.cpp
src/vm/jitinterface.h
src/vm/jitinterfacegen.cpp
src/vm/marshalnative.cpp
src/vm/metasig.h
src/vm/methodtable.h
src/vm/methodtablebuilder.cpp
src/vm/mscorlib.h
src/vm/object.h
src/vm/object.inl
src/vm/reflectioninvocation.cpp
src/vm/vars.cpp
src/vm/vars.hpp

index 3fa0417..e2f1058 100644 (file)
@@ -1,6 +1,7 @@
 <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
     <!-- Features we're currently flighting, but don't intend to ship in officially supported releases -->
     <PropertyGroup Condition="'$(IsPrerelease)' == 'true'">
+        <FeatureUtf8String>true</FeatureUtf8String>
         <!-- FeatureXXX>true</FeatureXXX -->
     </PropertyGroup>
 
index 9e22da2..a25d19d 100644 (file)
@@ -6,6 +6,7 @@ set(PRERELEASE 1)
 
 # Features we're currently flighting, but don't intend to ship in officially supported releases
 if (PRERELEASE)
+  add_definitions(-DFEATURE_UTF8STRING=1)  
   # add_definitions(-DFEATURE_XXX=1)
 endif (PRERELEASE)
 
index 6e73aef..0cf6733 100644 (file)
   <!-- CLR Features -->
   <Import Project="$(MSBuildThisFileDirectory)..\..\clr.coreclr.props" />
   <Import Project="$(MSBuildThisFileDirectory)..\..\clr.defines.targets" />
+  <!-- Experimental features -->
+  <PropertyGroup Condition="'$(FeatureUtf8String)' == 'true'">
+    <DefineConstants>$(DefineConstants);FEATURE_UTF8STRING</DefineConstants>
+  </PropertyGroup>
   <!-- Sources -->
   <ItemGroup>
     <Compile Include="$(BclSourcesRoot)\Internal\Console.cs" />
     <Compile Include="shared\Interop\Windows\Ole32\Interop.CoTaskMemAlloc.cs" />
     <Compile Include="shared\Interop\Windows\OleAut32\Interop.SysAllocStringByteLen.cs" />
   </ItemGroup>
+  <ItemGroup Condition="'$(FeatureUtf8String)' == 'true'">
+    <Compile Include="$(BclSourcesRoot)\System\Char8.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Utf8Extensions.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Utf8String.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Utf8String.Construction.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Utf8String.Manipulation.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Utf8String.Searching.cs" />
+  </ItemGroup>
   <ItemGroup>
     <Compile Include="$(BclSourcesRoot)\System\Diagnostics\Eventing\XplatEventLogger.cs" Condition="'$(FeatureXplatEventSource)' == 'true'" />
     <Compile Include="$(IntermediateOutputPath)..\Eventing\NativeRuntimeEventSource.cs" Condition="'$(FeaturePerfTracing)' == 'true'"/>
index 85ba8b8..b1c9da0 100644 (file)
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
index ba31a6a..2074404 100644 (file)
@@ -6,6 +6,7 @@ using System.Buffers;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Text;
 using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
 using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
 
@@ -164,7 +165,13 @@ namespace System
             // No validation performed in release builds; caller must provide any necessary validation.
 
             // 'obj is T[]' below also handles things like int[] <-> uint[] being convertible
-            Debug.Assert((obj == null) || (typeof(T) == typeof(char) && obj is string) || (obj is T[]) || (obj is MemoryManager<T>));
+            Debug.Assert((obj == null)
+                || (typeof(T) == typeof(char) && obj is string)
+#if FEATURE_UTF8STRING
+                || ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj is Utf8String)
+#endif // FEATURE_UTF8STRING
+                || (obj is T[])
+                || (obj is MemoryManager<T>));
 
             _object = obj;
             _index = start;
@@ -212,6 +219,14 @@ namespace System
             {
                 return (_object is string str) ? str.Substring(_index, _length) : Span.ToString();
             }
+#if FEATURE_UTF8STRING
+            else if (typeof(T) == typeof(Char8))
+            {
+                // TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
+                Span<T> span = Span;
+                return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)), span.Length));
+            }
+#endif // FEATURE_UTF8STRING
             return string.Format("System.Memory<{0}>[{1}]", typeof(T).Name, _length);
         }
 
@@ -317,6 +332,13 @@ namespace System
                         refToReturn = ref Unsafe.As<char, T>(ref Unsafe.As<string>(tmpObject).GetRawStringData());
                         lengthOfUnderlyingSpan = Unsafe.As<string>(tmpObject).Length;
                     }
+#if FEATURE_UTF8STRING
+                    else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject.GetType() == typeof(Utf8String))
+                    {
+                        refToReturn = ref Unsafe.As<byte, T>(ref Unsafe.As<Utf8String>(tmpObject).DangerousGetMutableReference());
+                        lengthOfUnderlyingSpan = Unsafe.As<Utf8String>(tmpObject).Length;
+                    }
+#endif // FEATURE_UTF8STRING
                     else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
                     {
                         // We know the object is not null, it's not a string, and it is variable-length. The only
@@ -427,6 +449,14 @@ namespace System
                     ref char stringData = ref Unsafe.Add(ref s.GetRawStringData(), _index);
                     return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
                 }
+#if FEATURE_UTF8STRING
+                else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject is Utf8String utf8String)
+                {
+                    GCHandle handle = GCHandle.Alloc(tmpObject, GCHandleType.Pinned);
+                    ref byte stringData = ref utf8String.DangerousGetMutableReference(_index);
+                    return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
+                }
+#endif // FEATURE_UTF8STRING
                 else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
                 {
                     // 'tmpObject is T[]' below also handles things like int[] <-> uint[] being convertible
index 6c59843..bf90f04 100644 (file)
@@ -6,6 +6,7 @@ using System.Buffers;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Text;
 using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
 using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
 
@@ -99,7 +100,13 @@ namespace System
             // No validation performed in release builds; caller must provide any necessary validation.
 
             // 'obj is T[]' below also handles things like int[] <-> uint[] being convertible
-            Debug.Assert((obj == null) || (typeof(T) == typeof(char) && obj is string) || (obj is T[]) || (obj is MemoryManager<T>));
+            Debug.Assert((obj == null)
+                || (typeof(T) == typeof(char) && obj is string)
+#if FEATURE_UTF8STRING
+                || ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj is Utf8String)
+#endif // FEATURE_UTF8STRING
+                || (obj is T[])
+                || (obj is MemoryManager<T>));
 
             _object = obj;
             _index = start;
@@ -141,6 +148,14 @@ namespace System
             {
                 return (_object is string str) ? str.Substring(_index, _length) : Span.ToString();
             }
+#if FEATURE_UTF8STRING
+            else if (typeof(T) == typeof(Char8))
+            {
+                // TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
+                ReadOnlySpan<T> span = Span;
+                return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)), span.Length));
+            }
+#endif // FEATURE_UTF8STRING
             return string.Format("System.ReadOnlyMemory<{0}>[{1}]", typeof(T).Name, _length);
         }
 
@@ -239,6 +254,13 @@ namespace System
                         refToReturn = ref Unsafe.As<char, T>(ref Unsafe.As<string>(tmpObject).GetRawStringData());
                         lengthOfUnderlyingSpan = Unsafe.As<string>(tmpObject).Length;
                     }
+#if FEATURE_UTF8STRING
+                    else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject.GetType() == typeof(Utf8String))
+                    {
+                        refToReturn = ref Unsafe.As<byte, T>(ref Unsafe.As<Utf8String>(tmpObject).DangerousGetMutableReference());
+                        lengthOfUnderlyingSpan = Unsafe.As<Utf8String>(tmpObject).Length;
+                    }
+#endif // FEATURE_UTF8STRING
                     else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
                     {
                         // We know the object is not null, it's not a string, and it is variable-length. The only
@@ -342,6 +364,14 @@ namespace System
                     ref char stringData = ref Unsafe.Add(ref s.GetRawStringData(), _index);
                     return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
                 }
+#if FEATURE_UTF8STRING
+                else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject is Utf8String utf8String)
+                {
+                    GCHandle handle = GCHandle.Alloc(tmpObject, GCHandleType.Pinned);
+                    ref byte stringData = ref utf8String.DangerousGetMutableReference(_index);
+                    return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
+                }
+#endif // FEATURE_UTF8STRING
                 else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
                 {
                     // 'tmpObject is T[]' below also handles things like int[] <-> uint[] being convertible
index eb3fd14..00337a5 100644 (file)
@@ -5,6 +5,7 @@
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Versioning;
+using System.Text;
 using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
 using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
 using Internal.Runtime.CompilerServices;
@@ -240,12 +241,15 @@ namespace System
         {
             if (typeof(T) == typeof(char))
             {
-                unsafe
-                {
-                    fixed (char* src = &Unsafe.As<T, char>(ref _pointer.Value))
-                        return new string(src, 0, _length);
-                }
+                return new string(new ReadOnlySpan<char>(ref Unsafe.As<T, char>(ref _pointer.Value), _length));
             }
+#if FEATURE_UTF8STRING
+            else if (typeof(T) == typeof(Char8))
+            {
+                // TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
+                return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref _pointer.Value), _length));
+            }
+#endif // FEATURE_UTF8STRING
             return string.Format("System.ReadOnlySpan<{0}>[{1}]", typeof(T).Name, _length);
         }
 
index b1f5507..225f434 100644 (file)
@@ -28,7 +28,12 @@ namespace System.Runtime.InteropServices
             // As an optimization, we skip the "is string?" check below if typeof(T) is not char,
             // as Memory<T> / ROM<T> can't possibly contain a string instance in this case.
 
-            if (obj != null && (typeof(T) != typeof(char) || obj.GetType() != typeof(string)))
+            if (obj != null && !(
+                (typeof(T) == typeof(char) && obj.GetType() == typeof(string))
+#if FEATURE_UTF8STRING
+                || ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj.GetType() == typeof(Utf8String))
+#endif // FEATURE_UTF8STRING
+                ))
             {
                 if (RuntimeHelpers.ObjectHasComponentSize(obj))
                 {
index 66de4fe..adc1f39 100644 (file)
@@ -5,6 +5,7 @@
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Versioning;
+using System.Text;
 using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
 using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
 using Internal.Runtime.CompilerServices;
@@ -319,12 +320,15 @@ namespace System
         {
             if (typeof(T) == typeof(char))
             {
-                unsafe
-                {
-                    fixed (char* src = &Unsafe.As<T, char>(ref _pointer.Value))
-                        return new string(src, 0, _length);
-                }
+                return new string(new ReadOnlySpan<char>(ref Unsafe.As<T, char>(ref _pointer.Value), _length));
+            }
+#if FEATURE_UTF8STRING
+            else if (typeof(T) == typeof(Char8))
+            {
+                // TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
+                return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref _pointer.Value), _length));
             }
+#endif // FEATURE_UTF8STRING
             return string.Format("System.Span<{0}>[{1}]", typeof(T).Name, _length);
         }
 
index 49afbc8..10f7522 100644 (file)
@@ -24,9 +24,13 @@ namespace System
     [System.Runtime.CompilerServices.TypeForwardedFrom("mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089")]
     public sealed partial class String : IComparable, IEnumerable, IConvertible, IEnumerable<char>, IComparable<string>, IEquatable<string>, ICloneable
     {
-        // String constructors
-        // These are special. The implementation methods for these have a different signature from the
-        // declared constructors.
+        /*
+         * CONSTRUCTORS
+         *
+         * Defining a new constructor for string-like types (like String) requires changes both
+         * to the managed code below and to the native VM code. See the comment at the top of
+         * src/vm/ecall.cpp for instructions on how to add new overloads.
+         */
 
         [MethodImplAttribute(MethodImplOptions.InternalCall)]
         public extern String(char[] value);
@@ -335,8 +339,7 @@ namespace System
                 return Empty;
 
             string result = FastAllocateString(value.Length);
-            fixed (char* dest = &result._firstChar, src = &MemoryMarshal.GetReference(value))
-                wstrcpy(dest, src, value.Length);
+            Buffer.Memmove(ref result._firstChar, ref MemoryMarshal.GetReference(value), (uint)value.Length);
             return result;
         }
 
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs
new file mode 100644 (file)
index 0000000..6ee9ca0
--- /dev/null
@@ -0,0 +1,106 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Diagnostics;
+using System.IO;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.Unicode
+{
+    internal static class Utf8Utility
+    {
+        /// <summary>
+        /// The maximum number of bytes that can result from UTF-8 transcoding
+        /// any Unicode scalar value.
+        /// </summary>
+        internal const int MaxBytesPerScalar = 4;
+
+        /// <summary>
+        /// The UTF-8 representation of <see cref="UnicodeUtility.ReplacementChar"/> (U+FFFD).
+        /// </summary>
+        private static ReadOnlySpan<byte> ReplacementCharSequence => new byte[] { 0xEF, 0xBF, 0xBD };
+
+        /// <summary>
+        /// Returns the byte index in <paramref name="utf8Data"/> where the first invalid UTF-8 sequence begins,
+        /// or -1 if the buffer contains no invalid sequences. Also sets <paramref name="isAscii"/> to indicate
+        /// whether all data observed (up to the first invalid sequence or the end of the buffer, whichever
+        /// comes first) is ASCII.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
+        {
+            // TODO_UTF8STRING: Replace this with the faster drop-in replacement when it's available (coreclr #21948).
+
+            bool tempIsAscii = true;
+            int originalDataLength = utf8Data.Length;
+
+            while (!utf8Data.IsEmpty)
+            {
+                if (Rune.DecodeFromUtf8(utf8Data, out Rune result, out int bytesConsumed) != OperationStatus.Done)
+                {
+                    break;
+                }
+
+                tempIsAscii &= result.IsAscii;
+                utf8Data = utf8Data.Slice(bytesConsumed);
+            }
+
+            isAscii = tempIsAscii;
+            return (utf8Data.IsEmpty) ? -1 : (originalDataLength - utf8Data.Length);
+        }
+
+#if FEATURE_UTF8STRING
+        /// <summary>
+        /// Returns <paramref name="value"/> if it is null, empty, or contains only well-formed UTF-8 data;
+        /// otherwise allocates a new <see cref="Utf8String"/> instance containing the same data as
+        /// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced
+        /// with U+FFFD.
+        /// </summary>
+        public static Utf8String ValidateAndFixupUtf8String(Utf8String value)
+        {
+            if (Utf8String.IsNullOrEmpty(value))
+            {
+                return value;
+            }
+
+            ReadOnlySpan<byte> valueAsBytes = value.AsBytes();
+
+            int idxOfFirstInvalidData = GetIndexOfFirstInvalidUtf8Sequence(valueAsBytes, out _);
+            if (idxOfFirstInvalidData < 0)
+            {
+                return value;
+            }
+
+            // TODO_UTF8STRING: Replace this with the faster implementation once it's available.
+            // (The faster implementation is in the dev/utf8string_bak branch currently.)
+
+            MemoryStream memStream = new MemoryStream();
+            memStream.Write(valueAsBytes.Slice(0, idxOfFirstInvalidData));
+
+            valueAsBytes = valueAsBytes.Slice(idxOfFirstInvalidData);
+            do
+            {
+                if (Rune.DecodeFromUtf8(valueAsBytes, out _, out int bytesConsumed) == OperationStatus.Done)
+                {
+                    // Valid scalar value - copy its bytes as-is to the MemoryStream
+                    memStream.Write(valueAsBytes.Slice(0, bytesConsumed));
+                }
+                else
+                {
+                    // Invalid sequence - write the UTF-8 encoding of U+FFFD to the MemoryStream instead
+                    memStream.Write(ReplacementCharSequence);
+                }
+
+                valueAsBytes = valueAsBytes.Slice(bytesConsumed);
+            } while (!valueAsBytes.IsEmpty);
+
+            bool success = memStream.TryGetBuffer(out ArraySegment<byte> memStreamBuffer);
+            Debug.Assert(success, "Couldn't get underlying MemoryStream buffer.");
+
+            return Utf8String.DangerousCreateWithoutValidation(memStreamBuffer, assumeWellFormed: true);
+        }
+#endif // FEATURE_UTF8STRING
+    }
+}
index 3aad296..065c938 100644 (file)
@@ -11,7 +11,7 @@ namespace System.Text
         /// <summary>
         /// The Unicode replacement character U+FFFD.
         /// </summary>
-        public const uint ReplacementChar = 0xFFFDU;
+        public const uint ReplacementChar = 0xFFFD;
 
         /// <summary>
         /// Returns the Unicode plane (0 through 16, inclusive) which contains this code point.
diff --git a/src/System.Private.CoreLib/src/System/Char8.cs b/src/System.Private.CoreLib/src/System/Char8.cs
new file mode 100644 (file)
index 0000000..7a71e2f
--- /dev/null
@@ -0,0 +1,69 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System
+{
+    /// <summary>
+    /// Represents a UTF-8 code unit, the elemental type of <see cref="Utf8String"/>.
+    /// </summary>
+    public readonly struct Char8 : IComparable<Char8>, IEquatable<Char8>
+    {
+        private readonly byte _value;
+
+        private Char8(byte value)
+        {
+            _value = value;
+        }
+
+        public static bool operator ==(Char8 left, Char8 right) => left._value == right._value;
+        public static bool operator !=(Char8 left, Char8 right) => left._value != right._value;
+        public static bool operator <(Char8 left, Char8 right) => left._value < right._value;
+        public static bool operator <=(Char8 left, Char8 right) => left._value <= right._value;
+        public static bool operator >(Char8 left, Char8 right) => left._value > right._value;
+        public static bool operator >=(Char8 left, Char8 right) => left._value >= right._value;
+
+        // Operators from Char8 to <other primitives>
+        // TODO: Once C# gets support for checked operators, we should add those here.
+
+        public static implicit operator byte(Char8 value) => value._value;
+        [CLSCompliant(false)]
+        public static explicit operator sbyte(Char8 value) => (sbyte)value._value; // explicit because the conversion can overflow
+        public static explicit operator char(Char8 value) => (char)value._value; // explicit because we don't want to encourage char conversion
+        public static implicit operator short(Char8 value) => value._value;
+        [CLSCompliant(false)]
+        public static implicit operator ushort(Char8 value) => value._value;
+        public static implicit operator int(Char8 value) => value._value;
+        [CLSCompliant(false)]
+        public static implicit operator uint(Char8 value) => value._value;
+        public static implicit operator long(Char8 value) => value._value;
+        [CLSCompliant(false)]
+        public static implicit operator ulong(Char8 value) => value._value;
+
+        // Operators from <other primitives> to Char8; most are explicit because narrowing conversions could be lossy
+        // TODO: Once C# gets support for checked operators, we should add those here.
+
+        public static implicit operator Char8(byte value) => new Char8(value);
+        [CLSCompliant(false)]
+        public static explicit operator Char8(sbyte value) => new Char8((byte)value);
+        public static explicit operator Char8(char value) => new Char8((byte)value);
+        public static explicit operator Char8(short value) => new Char8((byte)value);
+        [CLSCompliant(false)]
+        public static explicit operator Char8(ushort value) => new Char8((byte)value);
+        public static explicit operator Char8(int value) => new Char8((byte)value);
+        [CLSCompliant(false)]
+        public static explicit operator Char8(uint value) => new Char8((byte)value);
+        public static explicit operator Char8(long value) => new Char8((byte)value);
+        [CLSCompliant(false)]
+        public static explicit operator Char8(ulong value) => new Char8((byte)value);
+
+        public int CompareTo(Char8 other) => this._value.CompareTo(other._value);
+
+        public override bool Equals(object obj) => (obj is Char8 other) && (this == other);
+        public bool Equals(Char8 other) => this == other;
+
+        public override int GetHashCode() => _value;
+
+        public override string ToString() => _value.ToString("X2");
+    }
+}
diff --git a/src/System.Private.CoreLib/src/System/Utf8Extensions.cs b/src/System.Private.CoreLib/src/System/Utf8Extensions.cs
new file mode 100644 (file)
index 0000000..9fa2a54
--- /dev/null
@@ -0,0 +1,367 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Internal.Runtime.CompilerServices;
+
+namespace System
+{
+    public static class Utf8Extensions
+    {
+        /// <summary>
+        /// Reinterprets a span of <see cref="Char8"/> elements as a <see cref="ReadOnlySpan{Byte}"/>.
+        /// </summary>
+        /// <param name="text">The span to reinterpret.</param>
+        /// <returns>The result of casting <paramref name="text"/> through <see cref="MemoryMarshal"/>.</returns>
+        public static ReadOnlySpan<byte> AsBytes(this ReadOnlySpan<Char8> text)
+            => MemoryMarshal.Cast<Char8, byte>(text);
+
+        /// <summary>
+        /// Creates a read-only span of bytes covering the whole of the target <see cref="Utf8String"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>; may be null.</param>
+        /// <returns>
+        /// A span of <c>text.Length</c> bytes, or default when <paramref name="text"/> is null.
+        /// </returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ReadOnlySpan<byte> AsBytes(this Utf8String text)
+        {
+            return (text == null)
+                ? default
+                : new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(), text.Length);
+        }
+
+        /// <summary>
+        /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/> that begins
+        /// at <paramref name="start"/> and runs to the end of the string.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and <paramref name="start"/> is 0.
+        /// A null <paramref name="text"/> never produces an <see cref="System.ArgumentNullException"/>.
+        /// </remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index is not in range (&lt;0 or &gt;text.Length),
+        /// or when <paramref name="text"/> is null and <paramref name="start"/> is not 0.
+        /// </exception>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ReadOnlySpan<byte> AsBytes(this Utf8String text, int start)
+        {
+            if (text == null)
+            {
+                // A null string is treated like an empty one: only offset 0 is acceptable.
+                if (start != 0)
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+                return default;
+            }
+
+            // The unsigned comparison rejects negative values as well as values past the end.
+            if ((uint)start > (uint)text.Length)
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+
+            return new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(start), text.Length - start);
+        }
+
+        /// <summary>
+        /// Creates a read-only span of bytes over <paramref name="length"/> bytes of the target
+        /// <see cref="Utf8String"/>, beginning at <paramref name="start"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <param name="length">The number of bytes in the slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and both <paramref name="start"/> and
+        /// <paramref name="length"/> are 0.
+        /// </remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range.
+        /// </exception>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ReadOnlySpan<byte> AsBytes(this Utf8String text, int start, int length)
+        {
+            if (text == null)
+            {
+                bool requestsEmptySlice = (start == 0) && (length == 0);
+                if (!requestsEmptySlice)
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+
+                return default;
+            }
+
+            int textLength = text.Length;
+
+#if BIT64
+            // See comment in Span<T>.Slice for how this works: after zero-extending to 64 bits a single
+            // comparison covers negative inputs and out-of-range sums alike.
+            ulong requestedEnd = (ulong)(uint)start + (ulong)(uint)length;
+            if (requestedEnd > (ulong)(uint)textLength)
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#else
+            if ((uint)start > (uint)textLength || (uint)length > (uint)(textLength - start))
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#endif
+
+            return new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(start), length);
+        }
+
+        /// <summary>
+        /// Creates a read-only span of <see cref="Char8"/> covering the whole of the target <see cref="Utf8String"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>; may be null.</param>
+        /// <returns>
+        /// A span of <c>text.Length</c> elements, or default when <paramref name="text"/> is null.
+        /// </returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ReadOnlySpan<Char8> AsSpan(this Utf8String text)
+        {
+            return (text == null)
+                ? default
+                : new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference()), text.Length);
+        }
+
+        /// <summary>
+        /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/> that begins
+        /// at <paramref name="start"/> and runs to the end of the string.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and <paramref name="start"/> is 0.
+        /// A null <paramref name="text"/> never produces an <see cref="System.ArgumentNullException"/>.
+        /// </remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index is not in range (&lt;0 or &gt;text.Length),
+        /// or when <paramref name="text"/> is null and <paramref name="start"/> is not 0.
+        /// </exception>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ReadOnlySpan<Char8> AsSpan(this Utf8String text, int start)
+        {
+            if (text == null)
+            {
+                // A null string is treated like an empty one: only offset 0 is acceptable.
+                if (start != 0)
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+                return default;
+            }
+
+            // The unsigned comparison rejects negative values as well as values past the end.
+            if ((uint)start > (uint)text.Length)
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+
+            return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference(start)), text.Length - start);
+        }
+
+        /// <summary>
+        /// Creates a read-only span of <see cref="Char8"/> over <paramref name="length"/> elements of the
+        /// target <see cref="Utf8String"/>, beginning at <paramref name="start"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <param name="length">The number of elements in the slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and both <paramref name="start"/> and
+        /// <paramref name="length"/> are 0.
+        /// </remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range.
+        /// </exception>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ReadOnlySpan<Char8> AsSpan(this Utf8String text, int start, int length)
+        {
+            if (text == null)
+            {
+                bool requestsEmptySlice = (start == 0) && (length == 0);
+                if (!requestsEmptySlice)
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+
+                return default;
+            }
+
+            int textLength = text.Length;
+
+#if BIT64
+            // See comment in Span<T>.Slice for how this works: after zero-extending to 64 bits a single
+            // comparison covers negative inputs and out-of-range sums alike.
+            ulong requestedEnd = (ulong)(uint)start + (ulong)(uint)length;
+            if (requestedEnd > (ulong)(uint)textLength)
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#else
+            if ((uint)start > (uint)textLength || (uint)length > (uint)(textLength - start))
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#endif
+
+            ref byte firstByte = ref text.DangerousGetMutableReference(start);
+            return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref firstByte), length);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of <see cref="Char8"/> covering the whole of the
+        /// target <see cref="Utf8String"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>; may be null.</param>
+        /// <returns>
+        /// Memory of <c>text.Length</c> elements starting at offset 0, or default when <paramref name="text"/> is null.
+        /// </returns>
+        public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text)
+        {
+            return (text == null)
+                ? default
+                : new ReadOnlyMemory<Char8>(text, 0, text.Length);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of <see cref="Char8"/> over the tail of the target
+        /// <see cref="Utf8String"/> that begins at <paramref name="start"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <remarks>Returns default when <paramref name="text"/> is null and <paramref name="start"/> is 0.</remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index is not in range (&lt;0 or &gt;text.Length),
+        /// or when <paramref name="text"/> is null and <paramref name="start"/> is not 0.
+        /// </exception>
+        public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, int start)
+        {
+            if (text == null)
+            {
+                // Only the empty slice at offset 0 can be taken from a null string.
+                if (start != 0)
+                {
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+                }
+
+                return default;
+            }
+
+            int textLength = text.Length;
+            if ((uint)start > (uint)textLength)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+            }
+
+            int remaining = textLength - start;
+            return new ReadOnlyMemory<Char8>(text, start, remaining);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of <see cref="Char8"/> over the tail of the target
+        /// <see cref="Utf8String"/> that begins at <paramref name="startIndex"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="startIndex">The index at which to begin this slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and <paramref name="startIndex"/> equals
+        /// <see cref="Index.Start"/>.
+        /// </remarks>
+        /// <exception cref="System.ArgumentNullException">
+        /// Thrown when <paramref name="text"/> is null and <paramref name="startIndex"/> is anything other
+        /// than <see cref="Index.Start"/>.
+        /// </exception>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the offset computed from <paramref name="startIndex"/> is not in range (&lt;0 or &gt;text.Length).
+        /// </exception>
+        public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, Index startIndex)
+        {
+            if (text == null)
+            {
+                bool isStartOfString = startIndex.Equals(Index.Start);
+                if (!isStartOfString)
+                {
+                    ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text);
+                }
+
+                return default;
+            }
+
+            int textLength = text.Length;
+            int offset = startIndex.GetOffset(textLength);
+            if ((uint)offset > (uint)textLength)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException();
+            }
+
+            return new ReadOnlyMemory<Char8>(text, offset, textLength - offset);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of <see cref="Char8"/> over <paramref name="length"/>
+        /// elements of the target <see cref="Utf8String"/>, beginning at <paramref name="start"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <param name="length">The number of elements in the slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and both <paramref name="start"/> and
+        /// <paramref name="length"/> are 0.
+        /// </remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range.
+        /// </exception>
+        public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, int start, int length)
+        {
+            if (text == null)
+            {
+                bool requestsEmptySlice = (start == 0) && (length == 0);
+                if (!requestsEmptySlice)
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+
+                return default;
+            }
+
+            int textLength = text.Length;
+
+#if BIT64
+            // See comment in Span<T>.Slice for how this works: after zero-extending to 64 bits a single
+            // comparison covers negative inputs and out-of-range sums alike.
+            ulong requestedEnd = (ulong)(uint)start + (ulong)(uint)length;
+            if (requestedEnd > (ulong)(uint)textLength)
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#else
+            if ((uint)start > (uint)textLength || (uint)length > (uint)(textLength - start))
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#endif
+
+            return new ReadOnlyMemory<Char8>(text, start, length);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of <see cref="Char8"/> over the part of the target
+        /// <see cref="Utf8String"/> selected by <paramref name="range"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="range">The range used to indicate the start and length of the sliced string.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and both ends of <paramref name="range"/>
+        /// equal <see cref="Index.Start"/>.
+        /// </remarks>
+        /// <exception cref="System.ArgumentNullException">
+        /// Thrown when <paramref name="text"/> is null and either end of <paramref name="range"/> is
+        /// anything other than <see cref="Index.Start"/>.
+        /// </exception>
+        public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, Range range)
+        {
+            if (text == null)
+            {
+                Index rangeStart = range.Start;
+                Index rangeEnd = range.End;
+
+                bool isEmptyRangeAtStart = rangeStart.Equals(Index.Start) && rangeEnd.Equals(Index.Start);
+                if (!isEmptyRangeAtStart)
+                {
+                    ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text);
+                }
+
+                return default;
+            }
+
+            // The range itself resolves (and validates) its bounds against the string length.
+            (int offset, int count) = range.GetOffsetAndLength(text.Length);
+            return new ReadOnlyMemory<Char8>(text, offset, count);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of bytes covering the whole of the target
+        /// <see cref="Utf8String"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>; may be null.</param>
+        /// <returns>
+        /// Memory of <c>text.Length</c> bytes starting at offset 0, or default when <paramref name="text"/> is null.
+        /// </returns>
+        public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text)
+        {
+            return (text == null)
+                ? default
+                : new ReadOnlyMemory<byte>(text, 0, text.Length);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of bytes over the tail of the target
+        /// <see cref="Utf8String"/> that begins at <paramref name="start"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <remarks>Returns default when <paramref name="text"/> is null and <paramref name="start"/> is 0.</remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index is not in range (&lt;0 or &gt;text.Length),
+        /// or when <paramref name="text"/> is null and <paramref name="start"/> is not 0.
+        /// </exception>
+        public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, int start)
+        {
+            if (text == null)
+            {
+                // Only the empty slice at offset 0 can be taken from a null string.
+                if (start != 0)
+                {
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+                }
+
+                return default;
+            }
+
+            int textLength = text.Length;
+            if ((uint)start > (uint)textLength)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+            }
+
+            int remaining = textLength - start;
+            return new ReadOnlyMemory<byte>(text, start, remaining);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of bytes over the tail of the target
+        /// <see cref="Utf8String"/> that begins at <paramref name="startIndex"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="startIndex">The index at which to begin this slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and <paramref name="startIndex"/> equals
+        /// <see cref="Index.Start"/>.
+        /// </remarks>
+        /// <exception cref="System.ArgumentNullException">
+        /// Thrown when <paramref name="text"/> is null and <paramref name="startIndex"/> is anything other
+        /// than <see cref="Index.Start"/>.
+        /// </exception>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the offset computed from <paramref name="startIndex"/> is not in range (&lt;0 or &gt;text.Length).
+        /// </exception>
+        public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, Index startIndex)
+        {
+            if (text == null)
+            {
+                bool isStartOfString = startIndex.Equals(Index.Start);
+                if (!isStartOfString)
+                {
+                    ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text);
+                }
+
+                return default;
+            }
+
+            int textLength = text.Length;
+            int offset = startIndex.GetOffset(textLength);
+            if ((uint)offset > (uint)textLength)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException();
+            }
+
+            return new ReadOnlyMemory<byte>(text, offset, textLength - offset);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of bytes over <paramref name="length"/> bytes of the
+        /// target <see cref="Utf8String"/>, beginning at <paramref name="start"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="start">The index at which to begin this slice.</param>
+        /// <param name="length">The number of bytes in the slice.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and both <paramref name="start"/> and
+        /// <paramref name="length"/> are 0.
+        /// </remarks>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range.
+        /// </exception>
+        public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, int start, int length)
+        {
+            if (text == null)
+            {
+                bool requestsEmptySlice = (start == 0) && (length == 0);
+                if (!requestsEmptySlice)
+                    ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+
+                return default;
+            }
+
+            int textLength = text.Length;
+
+#if BIT64
+            // See comment in Span<T>.Slice for how this works: after zero-extending to 64 bits a single
+            // comparison covers negative inputs and out-of-range sums alike.
+            ulong requestedEnd = (ulong)(uint)start + (ulong)(uint)length;
+            if (requestedEnd > (ulong)(uint)textLength)
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#else
+            if ((uint)start > (uint)textLength || (uint)length > (uint)(textLength - start))
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start);
+#endif
+
+            return new ReadOnlyMemory<byte>(text, start, length);
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="ReadOnlyMemory{T}"/> of bytes over the part of the target
+        /// <see cref="Utf8String"/> selected by <paramref name="range"/>.
+        /// </summary>
+        /// <param name="text">The target <see cref="Utf8String"/>.</param>
+        /// <param name="range">The range used to indicate the start and length of the sliced string.</param>
+        /// <remarks>
+        /// Returns default when <paramref name="text"/> is null and both ends of <paramref name="range"/>
+        /// equal <see cref="Index.Start"/>.
+        /// </remarks>
+        /// <exception cref="System.ArgumentNullException">
+        /// Thrown when <paramref name="text"/> is null and either end of <paramref name="range"/> is
+        /// anything other than <see cref="Index.Start"/>.
+        /// </exception>
+        public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, Range range)
+        {
+            if (text == null)
+            {
+                Index rangeStart = range.Start;
+                Index rangeEnd = range.End;
+
+                bool isEmptyRangeAtStart = rangeStart.Equals(Index.Start) && rangeEnd.Equals(Index.Start);
+                if (!isEmptyRangeAtStart)
+                {
+                    ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text);
+                }
+
+                return default;
+            }
+
+            // The range itself resolves (and validates) its bounds against the string length.
+            (int offset, int count) = range.GetOffsetAndLength(text.Length);
+            return new ReadOnlyMemory<byte>(text, offset, count);
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs b/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs
new file mode 100644 (file)
index 0000000..9ecd44f
--- /dev/null
@@ -0,0 +1,223 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Text;
+using System.Text.Unicode;
+
+namespace System
+{
+    public sealed partial class Utf8String
+    {
+        /*
+         * CONSTRUCTORS
+         *
+         * Defining a new constructor for string-like types (like Utf8String) requires changes both
+         * to the managed code below and to the native VM code. See the comment at the top of
+         * src/vm/ecall.cpp for instructions on how to add new overloads.
+         *
+         * The default behavior of each ctor is to validate the input, replacing invalid sequences with the
+         * Unicode replacement character U+FFFD. The resulting Utf8String instance will be well-formed but
+         * might not have full fidelity with the input data. This behavior can be controlled by calling
+         * any of the Create instances and specifying a different action.
+         */
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance from existing UTF-8 data.
+        /// </summary>
+        /// <param name="value">The UTF-8 bytes to copy into the new instance.</param>
+        /// <remarks>
+        /// The UTF-8 data in <paramref name="value"/> is validated for well-formedness upon construction.
+        /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>.
+        /// An empty <paramref name="value"/> yields <see cref="Empty"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        public extern Utf8String(ReadOnlySpan<byte> value);
+
+#if PROJECTN
+        [DependencyReductionRoot]
+#endif
+#if !CORECLR
+        static
+#endif
+        private Utf8String Ctor(ReadOnlySpan<byte> value)
+        {
+            int byteCount = value.Length;
+            if (byteCount == 0)
+            {
+                // Nothing to copy, so no allocation is needed.
+                return Empty;
+            }
+
+            // Copy the raw bytes first, then let the validation helper produce the instance to return.
+            Utf8String copy = FastAllocate(byteCount);
+            Buffer.Memmove(ref copy.DangerousGetMutableReference(), ref MemoryMarshal.GetReference(value), (uint)byteCount);
+            return Utf8Utility.ValidateAndFixupUtf8String(copy);
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance from a segment of an array of existing UTF-8 data.
+        /// </summary>
+        /// <param name="value">The array holding the UTF-8 bytes.</param>
+        /// <param name="startIndex">The index in <paramref name="value"/> of the first byte to use.</param>
+        /// <param name="length">The number of bytes to use.</param>
+        /// <remarks>
+        /// The selected UTF-8 data is validated for well-formedness upon construction.
+        /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        public extern Utf8String(byte[] value, int startIndex, int length);
+
+#if PROJECTN
+        [DependencyReductionRoot]
+#endif
+#if !CORECLR
+        static
+#endif
+        private Utf8String Ctor(byte[] value, int startIndex, int length)
+        {
+            // Building the span performs the array/segment argument checks; the span overload does the rest.
+            ReadOnlySpan<byte> segment = new ReadOnlySpan<byte>(value, startIndex, length);
+            return Ctor(segment);
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance from existing null-terminated UTF-8 data.
+        /// </summary>
+        /// <param name="value">Pointer to the first byte of the data; a null pointer yields <see cref="Empty"/>.</param>
+        /// <remarks>
+        /// The UTF-8 data in <paramref name="value"/> is validated for well-formedness upon construction.
+        /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        [CLSCompliant(false)]
+        public unsafe extern Utf8String(byte* value);
+
+#if PROJECTN
+        [DependencyReductionRoot]
+#endif
+#if !CORECLR
+        static
+#endif
+        private unsafe Utf8String Ctor(byte* value)
+        {
+            if (value != null)
+            {
+                // The data length is whatever strlen reports for the pointer.
+                int byteCount = string.strlen(value);
+                return Ctor(new ReadOnlySpan<byte>(value, byteCount));
+            }
+
+            return Empty;
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance from existing UTF-16 data.
+        /// </summary>
+        /// <param name="value">The UTF-16 data to transcode into the new instance.</param>
+        /// <remarks>
+        /// The UTF-16 data in <paramref name="value"/> is validated for well-formedness upon construction.
+        /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>.
+        /// An empty <paramref name="value"/> yields <see cref="Empty"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        public extern Utf8String(ReadOnlySpan<char> value);
+
+#if PROJECTN
+        [DependencyReductionRoot]
+#endif
+#if !CORECLR
+        static
+#endif
+        private Utf8String Ctor(ReadOnlySpan<char> value)
+        {
+            if (value.Length == 0)
+            {
+                return Empty;
+            }
+
+            // TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
+
+            // Two passes over the input: one to size the allocation, one to transcode into it.
+            Encoding utf8 = Encoding.UTF8;
+            int byteCount = utf8.GetByteCount(value);
+
+            Utf8String result = FastAllocate(byteCount);
+            Span<byte> destination = new Span<byte>(ref result.DangerousGetMutableReference(), result.Length);
+            utf8.GetBytes(value, destination);
+            return result;
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance from a segment of an array of existing UTF-16 data.
+        /// </summary>
+        /// <param name="value">The array holding the UTF-16 data.</param>
+        /// <param name="startIndex">The index in <paramref name="value"/> of the first char to use.</param>
+        /// <param name="length">The number of chars to use.</param>
+        /// <remarks>
+        /// The selected UTF-16 data is validated for well-formedness upon construction.
+        /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        public extern Utf8String(char[] value, int startIndex, int length);
+
+#if PROJECTN
+        [DependencyReductionRoot]
+#endif
+#if !CORECLR
+        static
+#endif
+        private Utf8String Ctor(char[] value, int startIndex, int length)
+        {
+            // Building the span performs the array/segment argument checks; the span overload does the rest.
+            ReadOnlySpan<char> segment = new ReadOnlySpan<char>(value, startIndex, length);
+            return Ctor(segment);
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance from existing null-terminated UTF-16 data.
+        /// </summary>
+        /// <param name="value">Pointer to the first char of the data; a null pointer yields <see cref="Empty"/>.</param>
+        /// <remarks>
+        /// The UTF-16 data in <paramref name="value"/> is validated for well-formedness upon construction.
+        /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        [CLSCompliant(false)]
+        public unsafe extern Utf8String(char* value);
+
+#if PROJECTN
+        [DependencyReductionRoot]
+#endif
+#if !CORECLR
+        static
+#endif
+        private unsafe Utf8String Ctor(char* value)
+        {
+            if (value != null)
+            {
+                // The data length is whatever wcslen reports for the pointer.
+                int charCount = string.wcslen(value);
+                return Ctor(new ReadOnlySpan<char>(value, charCount));
+            }
+
+            return Empty;
+        }
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance from the UTF-16 data of an existing string.
+        /// </summary>
+        /// <param name="value">The string whose contents are transcoded into the new instance.</param>
+        /// <remarks>
+        /// The UTF-16 data in <paramref name="value"/> is validated for well-formedness upon construction.
+        /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        public extern Utf8String(string value);
+
+#if PROJECTN
+        [DependencyReductionRoot]
+#endif
+#if !CORECLR
+        static
+#endif
+        private Utf8String Ctor(string value)
+        {
+            // Delegates to the ReadOnlySpan<char> overload over the string's contents.
+            ReadOnlySpan<char> chars = value.AsSpan();
+            return Ctor(chars);
+        }
+
+        /*
+         * HELPER METHODS
+         */
+
+        /// <summary>
+        /// Creates a <see cref="Utf8String"/> instance by copying existing data verbatim, bypassing validation.
+        /// </summary>
+        /// <param name="utf8Data">The bytes to copy; an empty span yields <see cref="Empty"/>.</param>
+        /// <param name="assumeWellFormed">Flag describing the data; not read by this implementation.</param>
+        /// <param name="assumeAscii">Flag describing the data; not read by this implementation.</param>
+        /// <remarks>
+        /// The flag parameters are intended to let the caller dictate attributes of the data, but the
+        /// method body does not consult them.
+        /// </remarks>
+        internal static Utf8String DangerousCreateWithoutValidation(ReadOnlySpan<byte> utf8Data, bool assumeWellFormed = false, bool assumeAscii = false)
+        {
+            int byteCount = utf8Data.Length;
+            if (byteCount == 0)
+            {
+                return Empty;
+            }
+
+            Utf8String result = FastAllocate(byteCount);
+            Span<byte> destination = new Span<byte>(ref result.DangerousGetMutableReference(), result.Length);
+            utf8Data.CopyTo(destination);
+            return result;
+        }
+
+        /// <summary>
+        /// Creates a new zero-initialized instance of the specified <paramref name="length"/>. Actual storage
+        /// allocated is "length + 1" bytes because instances are null-terminated.
+        /// </summary>
+        /// <remarks>
+        /// Declared as an InternalCall with no managed body; its implementation checks the input argument for overflow.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        private static extern Utf8String FastAllocate(int length);
+    }
+}
diff --git a/src/System.Private.CoreLib/src/System/Utf8String.Manipulation.cs b/src/System.Private.CoreLib/src/System/Utf8String.Manipulation.cs
new file mode 100644 (file)
index 0000000..6e52099
--- /dev/null
@@ -0,0 +1,109 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Text;
+
+namespace System
+{
+    public sealed partial class Utf8String
+    {
+        /// <summary>
+        /// Substrings this <see cref="Utf8String"/> without bounds checking.
+        /// </summary>
+        /// <param name="startIndex">Offset of the first byte to copy.</param>
+        /// <param name="length">Number of bytes to copy; must be neither 0 nor the full length.</param>
+        /// <returns>A newly allocated instance holding the copied bytes.</returns>
+        /// <remarks>
+        /// The range is only asserted (debug builds); callers must validate it beforehand and must handle
+        /// the boundary lengths themselves instead of calling this method.
+        /// </remarks>
+        private Utf8String InternalSubstring(int startIndex, int length)
+        {
+            Debug.Assert(startIndex >= 0, "StartIndex cannot be negative.");
+            Debug.Assert(startIndex <= this.Length, "StartIndex cannot point beyond the end of the string (except to the null terminator).");
+            Debug.Assert(length >= 0, "Length cannot be negative.");
+            Debug.Assert(startIndex + length <= this.Length, "StartIndex and Length cannot point beyond the end of the string.");
+
+            // Deliberately no assertion that startIndex != 0: Substring(0, n) with 0 < n < Length is a
+            // legitimate call that reaches this method. startIndex == Length would force length == 0,
+            // which the assertion below already rejects.
+            Debug.Assert(length != 0 && length != this.Length, "Caller should handle Length boundary conditions.");
+
+            Utf8String newString = FastAllocate(length);
+            Buffer.Memmove(ref newString.DangerousGetMutableReference(), ref this.DangerousGetMutableReference(startIndex), (uint)length);
+            return newString;
+        }
+
+        /// <summary>
+        /// Returns the substring that begins at <paramref name="startIndex"/> and runs to the end of this instance.
+        /// </summary>
+        /// <param name="startIndex">The index, resolved against <see cref="Length"/>, at which the substring begins.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public Utf8String Substring(Index startIndex)
+        {
+            // Resolve the Index to a plain offset and let the int overload validate and slice.
+            return Substring(startIndex.GetOffset(Length));
+        }
+
+        /// <summary>
+        /// Returns the substring that begins at <paramref name="startIndex"/> and runs to the end of this instance.
+        /// </summary>
+        /// <param name="startIndex">The offset at which the substring begins.</param>
+        /// <returns>
+        /// This instance when <paramref name="startIndex"/> is 0, <see cref="Empty"/> when it equals
+        /// <see cref="Length"/>, otherwise a new instance.
+        /// </returns>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when <paramref name="startIndex"/> is not in range (&lt;0 or &gt;Length).
+        /// </exception>
+        public Utf8String Substring(int startIndex)
+        {
+            int totalLength = this.Length;
+            if ((uint)startIndex > (uint)totalLength)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startIndex);
+            }
+
+            // Instances are immutable, so the two boundary cases can reuse existing objects
+            // instead of allocating a copy.
+
+            if (startIndex == 0)
+            {
+                return this;
+            }
+
+            int remaining = totalLength - startIndex;
+            return (remaining == 0) ? Empty : InternalSubstring(startIndex, remaining);
+        }
+
+        /// <summary>
+        /// Returns the substring of <paramref name="length"/> bytes that begins at <paramref name="startIndex"/>.
+        /// </summary>
+        /// <param name="startIndex">The offset at which the substring begins.</param>
+        /// <param name="length">The number of bytes in the substring.</param>
+        /// <returns>
+        /// <see cref="Empty"/> when <paramref name="length"/> is 0, this instance when
+        /// <paramref name="length"/> equals <see cref="Length"/>, otherwise a new instance.
+        /// </returns>
+        /// <exception cref="System.ArgumentOutOfRangeException">
+        /// Thrown when <paramref name="startIndex"/> or <paramref name="length"/> is not in range.
+        /// </exception>
+        public Utf8String Substring(int startIndex, int length)
+        {
+            ValidateStartIndexAndLength(startIndex, length);
+
+            // Instances are immutable, so a zero-length or full-length request can reuse an
+            // existing object instead of allocating a copy.
+
+            if (length == 0)
+            {
+                return Empty;
+            }
+
+            return (length == this.Length)
+                ? this
+                : InternalSubstring(startIndex, length);
+        }
+
+        /// <summary>
+        /// Returns the substring selected by <paramref name="range"/>.
+        /// </summary>
+        /// <param name="range">The range, resolved against <see cref="Length"/>, that selects the substring.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public Utf8String Substring(Range range)
+        {
+            // Resolve the range to (offset, count) and let the int overload validate and slice.
+            (int offset, int count) = range.GetOffsetAndLength(Length);
+            return Substring(offset, count);
+        }
+
+        /// <summary>
+        /// Checks that <paramref name="startIndex"/> and <paramref name="length"/> describe a range inside
+        /// this instance, delegating to <see cref="ValidateStartIndexAndLength_Throw"/> when they do not.
+        /// </summary>
+        [StackTraceHidden]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ValidateStartIndexAndLength(int startIndex, int length)
+        {
+            int currentLength = this.Length;
+
+#if BIT64
+            // See comment in Span<T>.Slice for how this works: after zero-extending to 64 bits a single
+            // comparison covers negative inputs and out-of-range sums alike.
+            ulong requestedEnd = (ulong)(uint)startIndex + (ulong)(uint)length;
+            if (requestedEnd > (ulong)(uint)currentLength)
+                ValidateStartIndexAndLength_Throw(startIndex, length);
+#else
+            if ((uint)startIndex > (uint)currentLength || (uint)length > (uint)(currentLength - startIndex))
+                ValidateStartIndexAndLength_Throw(startIndex, length);
+#endif
+        }
+
+        /// <summary>
+        /// Throws the <see cref="ArgumentOutOfRangeException"/> for a failed range check, naming
+        /// <paramref name="startIndex"/> when it alone is out of range and <paramref name="length"/> otherwise.
+        /// </summary>
+        [StackTraceHidden]
+        private void ValidateStartIndexAndLength_Throw(int startIndex, int length)
+        {
+            bool startIsOutOfRange = (uint)startIndex > (uint)this.Length;
+            string offendingParameter = startIsOutOfRange ? nameof(startIndex) : nameof(length);
+
+            throw new ArgumentOutOfRangeException(paramName: offendingParameter);
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/src/System/Utf8String.Searching.cs b/src/System.Private.CoreLib/src/System/Utf8String.Searching.cs
new file mode 100644 (file)
index 0000000..0373cdd
--- /dev/null
@@ -0,0 +1,93 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.InteropServices;
+using System.Text;
+using System.Text.Unicode;
+
+namespace System
+{
+    public sealed partial class Utf8String
+    {
+        // Ordinal search. A char for which Rune.TryCreate fails is never reported as contained.
+        public bool Contains(char value)
+        {
+            if (!Rune.TryCreate(value, out Rune scalar))
+            {
+                return false;
+            }
+
+            return Contains(scalar);
+        }
+
+        // Ordinal search
+        public bool Contains(Rune value)
+        {
+            // TODO_UTF8STRING: Split this into two methods: one for a single-byte (ASCII)
+            // search value and one for a multi-byte (non-ASCII) search value.
+
+            // Encode the scalar into a scratch buffer, then look for that byte sequence.
+            Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+            int encodedLength = value.EncodeToUtf8(scratch);
+
+            int foundAt = SpanHelpers.IndexOf(
+                ref DangerousGetMutableReference(), Length,
+                ref MemoryMarshal.GetReference(scratch), encodedLength);
+
+            return foundAt >= 0;
+        }
+
+        // Ordinal search. A char for which Rune.TryCreate fails never matches.
+        public bool EndsWith(char value)
+        {
+            if (!Rune.TryCreate(value, out Rune scalar))
+            {
+                return false;
+            }
+
+            return EndsWith(scalar);
+        }
+
+        // Ordinal search
+        public bool EndsWith(Rune value)
+        {
+            // TODO_UTF8STRING: Split this into two methods: one for a single-byte (ASCII)
+            // search value and one for a multi-byte (non-ASCII) search value.
+
+            // Encode the scalar, then compare its bytes against the tail of this instance.
+            Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+            int encodedLength = value.EncodeToUtf8(scratch);
+            ReadOnlySpan<byte> suffix = scratch.Slice(0, encodedLength);
+
+            return this.AsBytes().EndsWith(suffix);
+        }
+
+        // Ordinal search. Returns -1 when Rune.TryCreate fails for the supplied char.
+        public int IndexOf(char value)
+        {
+            if (Rune.TryCreate(value, out Rune scalar))
+            {
+                return IndexOf(scalar);
+            }
+
+            return -1;
+        }
+
+        // Ordinal search. Returns the result of SpanHelpers.IndexOf for the UTF-8 encoding of the value.
+        public int IndexOf(Rune value)
+        {
+            // TODO_UTF8STRING: Split this into two methods: one for a single-byte (ASCII)
+            // search value and one for a multi-byte (non-ASCII) search value.
+
+            Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+            int encodedLength = value.EncodeToUtf8(scratch);
+
+            ref byte haystack = ref DangerousGetMutableReference();
+            ref byte needle = ref MemoryMarshal.GetReference(scratch);
+
+            return SpanHelpers.IndexOf(ref haystack, Length, ref needle, encodedLength);
+        }
+
+        // Ordinal search. A char for which Rune.TryCreate fails never matches.
+        public bool StartsWith(char value)
+        {
+            if (!Rune.TryCreate(value, out Rune scalar))
+            {
+                return false;
+            }
+
+            return StartsWith(scalar);
+        }
+
+        // Ordinal search
+        public bool StartsWith(Rune value)
+        {
+            // TODO_UTF8STRING: Split this into two methods: one for a single-byte (ASCII)
+            // search value and one for a multi-byte (non-ASCII) search value.
+
+            // Encode the scalar, then compare its bytes against the head of this instance.
+            Span<byte> scratch = stackalloc byte[Utf8Utility.MaxBytesPerScalar];
+            int encodedLength = value.EncodeToUtf8(scratch);
+            ReadOnlySpan<byte> prefix = scratch.Slice(0, encodedLength);
+
+            return this.AsBytes().StartsWith(prefix);
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/src/System/Utf8String.cs b/src/System.Private.CoreLib/src/System/Utf8String.cs
new file mode 100644 (file)
index 0000000..1a4357a
--- /dev/null
@@ -0,0 +1,252 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.ComponentModel;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Text;
+using Internal.Runtime.CompilerServices;
+
+namespace System
+{
+    /// <summary>
+    /// Represents an immutable string of UTF-8 code units.
+    /// </summary>
+    public sealed partial class Utf8String : IEquatable<Utf8String>
+    {
+        /*
+         * STATIC FIELDS
+         */
+
+        public static readonly Utf8String Empty = FastAllocate(0); // shared zero-length instance, created once via FastAllocate(0)
+
+        /*
+         * INSTANCE FIELDS
+         * Do not reorder these fields. They must match the layout of Utf8StringObject in object.h.
+         */
+
+        private readonly int _length; // value reported by the Length property
+        private readonly byte _firstByte; // storage whose address GetPinnableReference and DangerousGetMutableReference hand out
+
+        /*
+         * OPERATORS
+         */
+
+        /// <summary>
+        /// Determines whether two <see cref="Utf8String"/> instances are equal under a
+        /// <see cref="StringComparison.Ordinal"/> comparison.
+        /// </summary>
+        public static bool operator ==(Utf8String left, Utf8String right)
+        {
+            return Equals(left, right);
+        }
+
+        /// <summary>
+        /// Determines whether two <see cref="Utf8String"/> instances differ under a
+        /// <see cref="StringComparison.Ordinal"/> comparison.
+        /// </summary>
+        public static bool operator !=(Utf8String left, Utf8String right)
+        {
+            bool areEqual = Equals(left, right);
+            return !areEqual;
+        }
+
+        /// <summary>
+        /// Exposes the contents of a <see cref="Utf8String"/> as a <see cref="ReadOnlySpan{Byte}"/>
+        /// (the result of calling <c>AsBytes</c> on <paramref name="value"/>).
+        /// </summary>
+        public static explicit operator ReadOnlySpan<byte>(Utf8String value)
+        {
+            return value.AsBytes();
+        }
+
+        /// <summary>
+        /// Exposes the contents of a <see cref="Utf8String"/> as a <see cref="ReadOnlySpan{Char8}"/>
+        /// (the result of calling <c>AsSpan</c> on <paramref name="value"/>).
+        /// </summary>
+        public static implicit operator ReadOnlySpan<Char8>(Utf8String value)
+        {
+            return value.AsSpan();
+        }
+
+        /*
+         * INSTANCE PROPERTIES
+         */
+
+        /// <summary>
+        /// Gets the number of UTF-8 code units in this instance.
+        /// </summary>
+        public int Length
+        {
+            get { return _length; }
+        }
+
+        /*
+         * INSTANCE INDEXERS
+         */
+
+        /// <summary>
+        /// Gets the <see cref="Char8"/> located at <paramref name="index"/>.
+        /// </summary>
+        public Char8 this[int index]
+        {
+            get
+            {
+                // One unsigned comparison rejects negative indices and anything at or past
+                // Length. Just like String, the null terminator itself is not indexable.
+                if ((uint)index >= (uint)Length)
+                {
+                    ThrowHelper.ThrowArgumentOutOfRange_IndexException();
+                }
+
+                ref byte firstByte = ref DangerousGetMutableReference();
+                return Unsafe.Add(ref firstByte, index);
+            }
+        }
+
+        /// <summary>
+        /// Gets the <see cref="Char8"/> located at <paramref name="index"/>, which is resolved
+        /// against <see cref="Length"/> before the lookup.
+        /// </summary>
+        public Char8 this[Index index]
+        {
+            // Bounds checking is delegated to the int indexer, which (just like String)
+            // does not allow indexing into the null terminator itself.
+            get => this[index.GetOffset(Length)];
+        }
+
+        /// <summary>
+        /// Gets the substring of this <see cref="Utf8String"/> selected by <paramref name="range"/>.
+        /// </summary>
+        public Utf8String this[Range range]
+        {
+            get { return Substring(range); }
+        }
+
+        /*
+         * METHODS
+         */
+
+        /// <summary>
+        /// Returns a <em>mutable</em> reference to the first byte of this <see cref="Utf8String"/>
+        /// (or to the null terminator if the string is empty).
+        /// </summary>
+        internal ref byte DangerousGetMutableReference()
+        {
+            // Strips the read-only qualifier from the field reference.
+            return ref Unsafe.AsRef(in _firstByte);
+        }
+
+        /// <summary>
+        /// Returns a <em>mutable</em> reference to the byte at <paramref name="index"/> within this
+        /// <see cref="Utf8String"/> instance. No bounds check is performed outside of debug assertions.
+        /// </summary>
+        internal ref byte DangerousGetMutableReference(int index)
+        {
+            // index == Length is permitted so callers can obtain a reference to the null terminator.
+            Debug.Assert((uint)index <= (uint)Length, "Caller should've performed bounds checking.");
+
+            ref byte origin = ref DangerousGetMutableReference();
+            return ref Unsafe.Add(ref origin, index);
+        }
+
+        /// <summary>
+        /// Compares this instance with <paramref name="obj"/> using a <see cref="StringComparison.Ordinal"/>
+        /// comparer. Anything that is not a <see cref="Utf8String"/> compares unequal.
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            if (obj is Utf8String other)
+            {
+                return this.Equals(other);
+            }
+
+            return false;
+        }
+
+        /// <summary>
+        /// Compares this instance with <paramref name="value"/> using a <see cref="StringComparison.Ordinal"/> comparer.
+        /// </summary>
+        public bool Equals(Utf8String value)
+        {
+            // Cheapest check first: an instance always equals itself.
+            if (ReferenceEquals(this, value))
+            {
+                return true;
+            }
+
+            // A null argument or a length mismatch settles the answer without touching the data.
+            if (value is null || this.Length != value.Length)
+            {
+                return false;
+            }
+
+            // Same length: fall back to a bitwise comparison of the contents.
+            return SpanHelpers.SequenceEqual(
+                ref this.DangerousGetMutableReference(),
+                ref value.DangerousGetMutableReference(),
+                (uint)Length);
+        }
+
+        /// <summary>
+        /// Compares <paramref name="left"/> and <paramref name="right"/> using a
+        /// <see cref="StringComparison.Ordinal"/> comparer. Two null references compare equal.
+        /// </summary>
+        public static bool Equals(Utf8String left, Utf8String right)
+        {
+            // Cheapest check first: identical references (including null/null) are equal.
+            if (ReferenceEquals(left, right))
+            {
+                return true;
+            }
+
+            // Exactly one null operand, or differing lengths, means the values differ.
+            if (left is null || right is null || left.Length != right.Length)
+            {
+                return false;
+            }
+
+            // Same length: fall back to a bitwise comparison of the contents.
+            return SpanHelpers.SequenceEqual(
+                ref left.DangerousGetMutableReference(),
+                ref right.DangerousGetMutableReference(),
+                (uint)left.Length);
+        }
+
+        /// <summary>
+        /// Computes a hash code over the bytes of this instance, consistent with a
+        /// <see cref="StringComparison.Ordinal"/> comparison.
+        /// </summary>
+        public override int GetHashCode()
+        {
+            // TODO_UTF8STRING: Consider whether this should use a different seed than String.GetHashCode.
+
+            // Marvin takes the 64-bit default seed as two 32-bit halves.
+            ulong seed = Marvin.DefaultSeed;
+            uint seedLow = (uint)seed;
+            uint seedHigh = (uint)(seed >> 32);
+
+            return Marvin.ComputeHash32(ref DangerousGetMutableReference(), _length /* in bytes */, seedLow, seedHigh);
+        }
+
+        /// <summary>
+        /// Returns a read-only reference suitable for a <see langword="fixed"/> statement. Once pinned,
+        /// the reference can be used as a null-terminated <em>LPCUTF8STR</em>.
+        /// </summary>
+        /// <remarks>
+        /// For an empty <see cref="Utf8String"/> the returned reference points at the null terminator.
+        /// </remarks>
+        [EditorBrowsable(EditorBrowsableState.Never)] // for compiler use only
+        public ref readonly byte GetPinnableReference()
+        {
+            return ref _firstByte;
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> if <paramref name="value"/> is <see langword="null"/> or has a
+        /// <see cref="Length"/> of zero; <see langword="false"/> otherwise.
+        /// </summary>
+        public static bool IsNullOrEmpty(Utf8String value)
+        {
+            // Copied from String.IsNullOrEmpty; see that method for why the unusual "0u >= (uint)Length ? true : false" shape is used. Keep the expression exactly as written.
+            return (value is null || 0u >= (uint)value.Length) ? true : false;
+        }
+
+        /// <summary>
+        /// Copies the full contents of this <see cref="Utf8String"/> into a new byte array.
+        /// An empty instance yields <see cref="Array.Empty{T}"/> rather than a fresh allocation.
+        /// </summary>
+        public byte[] ToByteArray()
+        {
+            int byteCount = Length;
+            if (byteCount == 0)
+            {
+                return Array.Empty<byte>();
+            }
+
+            byte[] copy = new byte[byteCount];
+            Buffer.Memmove(ref copy.GetRawSzArrayData(), ref DangerousGetMutableReference(), (uint)byteCount);
+            return copy;
+        }
+
+        /// <summary>
+        /// Copies <paramref name="length"/> bytes of this <see cref="Utf8String"/>, beginning at
+        /// <paramref name="startIndex"/>, into a new byte array. A zero-length request yields
+        /// <see cref="Array.Empty{T}"/>.
+        /// </summary>
+        public byte[] ToByteArray(int startIndex, int length)
+        {
+            ValidateStartIndexAndLength(startIndex, length);
+
+            if (length != 0)
+            {
+                byte[] copy = new byte[length];
+                Buffer.Memmove(ref copy.GetRawSzArrayData(), ref DangerousGetMutableReference(startIndex), (uint)length);
+                return copy;
+            }
+
+            return Array.Empty<byte>();
+        }
+
+        /// <summary>
+        /// Transcodes this <see cref="Utf8String"/> instance into a <see cref="string"/>.
+        /// </summary>
+        /// <remarks>
+        /// Invalid subsequences are replaced with U+FFFD during conversion.
+        /// </remarks>
+        public override string ToString()
+        {
+            // TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
+
+            var utf8Bytes = new ReadOnlySpan<byte>(ref DangerousGetMutableReference(), Length);
+            return Encoding.UTF8.GetString(utf8Bytes);
+        }
+    }
+}
index a90a37a..64914d8 100644 (file)
@@ -253,6 +253,9 @@ FCIMPL1(Object*, ObjectNative::Clone, Object* pThisUNSAFE)
 
     // assert that String has overloaded the Clone() method
     _ASSERTE(pMT != g_pStringClass);
+#ifdef FEATURE_UTF8STRING
+    _ASSERTE(pMT != g_pUtf8StringClass);
+#endif // FEATURE_UTF8STRING
 
     if (pMT->IsArray()) {
         refClone = DupArrayForCloning((BASEARRAYREF)refThis);
index fc5be15..cec6d74 100644 (file)
@@ -168,6 +168,9 @@ DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pObjectClass, ::g_pObjectClass
 DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pRuntimeTypeClass, ::g_pRuntimeTypeClass)
 DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pCanonMethodTableClass, ::g_pCanonMethodTableClass)
 DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pStringClass, ::g_pStringClass)
+#ifdef FEATURE_UTF8STRING
+DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pUtf8StringClass, ::g_pUtf8StringClass)
+#endif // FEATURE_UTF8STRING
 DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pArrayClass, ::g_pArrayClass)
 DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pSZArrayHelperClass, ::g_pSZArrayHelperClass)
 DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pNullableClass, ::g_pNullableClass)
index 26c545c..626d9bb 100644 (file)
@@ -146,6 +146,9 @@ typedef DPTR(class ReJitManager)        PTR_ReJitManager;
 typedef DPTR(struct ReJitInfo)          PTR_ReJitInfo;
 typedef DPTR(struct SharedReJitInfo)    PTR_SharedReJitInfo;
 typedef DPTR(class StringObject)        PTR_StringObject;
+#ifdef FEATURE_UTF8STRING
+typedef DPTR(class Utf8StringObject)    PTR_Utf8StringObject;
+#endif // FEATURE_UTF8STRING
 typedef DPTR(class TypeHandle)          PTR_TypeHandle;
 #ifdef STUB_DISPATCH
 typedef VPTR(class VirtualCallStubManager) PTR_VirtualCallStubManager;
index 4eb7161..9362dd9 100644 (file)
@@ -2485,6 +2485,11 @@ void SystemDomain::LoadBaseSystemClasses()
     // Load String
     g_pStringClass = MscorlibBinder::LoadPrimitiveType(ELEMENT_TYPE_STRING);
 
+#ifdef FEATURE_UTF8STRING
+    // Load Utf8String
+    g_pUtf8StringClass = MscorlibBinder::GetClass(CLASS__UTF8_STRING);
+#endif // FEATURE_UTF8STRING
+
     // Used by Buffer::BlockCopy
     g_pByteArrayMT = ClassLoader::LoadArrayTypeThrowing(
         TypeHandle(MscorlibBinder::GetElementType(ELEMENT_TYPE_U1))).AsArray()->GetMethodTable();
index cb71df3..f45311f 100644 (file)
 #define g_ThreadClassName "System.Threading.Thread"
 #define g_TypeClassName   "System.Type"
 
+#ifdef FEATURE_UTF8STRING
+#define g_Utf8StringName "Utf8String"
+#endif // FEATURE_UTF8STRING
+
 #define g_VariantClassName "System.Variant"
 #define g_GuidClassName "System.Guid"
 
index 2a91e77..61ba2a7 100644 (file)
@@ -167,6 +167,9 @@ typedef DPTR(class ReJitManager)        PTR_ReJitManager;
 typedef DPTR(struct ReJitInfo)          PTR_ReJitInfo;
 typedef DPTR(struct SharedReJitInfo)    PTR_SharedReJitInfo;
 typedef DPTR(class StringObject)        PTR_StringObject;
+#ifdef FEATURE_UTF8STRING
+typedef DPTR(class Utf8StringObject)    PTR_Utf8StringObject;
+#endif // FEATURE_UTF8STRING
 typedef DPTR(class TypeHandle)          PTR_TypeHandle;
 typedef VPTR(class VirtualCallStubManager) PTR_VirtualCallStubManager;
 typedef VPTR(class VirtualCallStubManagerManager) PTR_VirtualCallStubManagerManager;
index b8e0d64..dfeff95 100644 (file)
@@ -29,6 +29,36 @@ extern const int c_nECClasses;
 #endif // CROSSGEN_COMPILE
 
 
+/**********
+
+The constructors of string-like types (String, Utf8String) are special since the JIT will
+replace newobj instructions with calls to the corresponding 'Ctor' method. Depending on the
+CLR in use, the ctor methods may be instance methods (with a null 'this' parameter) or
+static methods. See the managed definitions of String.Ctor and Utf8String.Ctor for more
+information.
+
+To add a new ctor overload, in addition to defining the constructor and Ctor methods on
+the managed side, make changes to the following files. (These instructions are for
+Utf8String, but String is similar.)
+
+- src/vm/ecall.cpp (this file), update the definition of "NumberOfUtf8StringConstructors"
+  and add the appropriate static asserts immediately above the definition.
+
+- src/vm/ecall.h, search for "Utf8StringCtor" and add the DYNAMICALLY_ASSIGNED_FCALL_IMPL
+  definitions corresponding to the new overloads.
+
+- src/vm/ecalllist.h, search for "FCFuncStart(gUtf8StringFuncs)" and add the overloads
+  within that block.
+
+- src/vm/metasig.h, add the new Utf8String-returning metasig declarations, and add any
+  void-returning metasig declarations that haven't already been defined elsewhere.
+  Search for "String_RetUtf8Str" to see an example of how to do this.
+
+- src/vm/mscorlib.h, search for "DEFINE_CLASS(UTF8_STRING" and add the new DEFINE_METHOD
+  declarations for the Utf8String-returning Ctor methods, referencing the new metasig declarations.
+
+**********/
+
 // METHOD__STRING__CTORF_XXX has to be in same order as ECall::CtorCharXxx
 #define METHOD__STRING__CTORF_FIRST METHOD__STRING__CTORF_CHARARRAY
 static_assert_no_msg(METHOD__STRING__CTORF_FIRST + 0 == METHOD__STRING__CTORF_CHARARRAY);
@@ -55,14 +85,38 @@ static_assert_no_msg(ECallCtor_First + 8 == ECall::CtorSBytePtrStartLengthEncodi
 
 #define NumberOfStringConstructors 9
 
+#ifdef FEATURE_UTF8STRING
+// The METHOD__UTF8_STRING__CTORF_XXX binder IDs have to be in the same order as the ECall::Utf8StringCtorXxxManaged entries
+#define METHOD__UTF8STRING__CTORF_FIRST METHOD__UTF8_STRING__CTORF_READONLYSPANOFBYTE
+static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 0 == METHOD__UTF8_STRING__CTORF_READONLYSPANOFBYTE);
+static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 1 == METHOD__UTF8_STRING__CTORF_READONLYSPANOFCHAR);
+static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 2 == METHOD__UTF8_STRING__CTORF_BYTEARRAY_START_LEN);
+static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 3 == METHOD__UTF8_STRING__CTORF_BYTEPTR);
+static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 4 == METHOD__UTF8_STRING__CTORF_CHARARRAY_START_LEN);
+static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 5 == METHOD__UTF8_STRING__CTORF_CHARPTR);
+static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 6 == METHOD__UTF8_STRING__CTORF_STRING);
+
+// The ECall::Utf8StringCtorXxxManaged entries have to be in the same order as the METHOD__UTF8_STRING__CTORF_XXX binder IDs
+#define ECallUtf8String_Ctor_First ECall::Utf8StringCtorReadOnlySpanOfByteManaged
+static_assert_no_msg(ECallUtf8String_Ctor_First + 0 == ECall::Utf8StringCtorReadOnlySpanOfByteManaged);
+static_assert_no_msg(ECallUtf8String_Ctor_First + 1 == ECall::Utf8StringCtorReadOnlySpanOfCharManaged);
+static_assert_no_msg(ECallUtf8String_Ctor_First + 2 == ECall::Utf8StringCtorByteArrayStartLengthManaged);
+static_assert_no_msg(ECallUtf8String_Ctor_First + 3 == ECall::Utf8StringCtorBytePtrManaged);
+static_assert_no_msg(ECallUtf8String_Ctor_First + 4 == ECall::Utf8StringCtorCharArrayStartLengthManaged);
+static_assert_no_msg(ECallUtf8String_Ctor_First + 5 == ECall::Utf8StringCtorCharPtrManaged);
+static_assert_no_msg(ECallUtf8String_Ctor_First + 6 == ECall::Utf8StringCtorStringManaged);
+
+#define NumberOfUtf8StringConstructors 7 // number of entries asserted in each group above; update together with the static asserts
+#endif // FEATURE_UTF8STRING
+
 void ECall::PopulateManagedStringConstructors()
 {
     STANDARD_VM_CONTRACT;
 
     INDEBUG(static bool fInitialized = false);
     _ASSERTE(!fInitialized);    // assume this method is only called once
-    _ASSERTE(g_pStringClass != NULL);
 
+    _ASSERTE(g_pStringClass != NULL);
     for (int i = 0; i < NumberOfStringConstructors; i++)
     {
         MethodDesc* pMD = MscorlibBinder::GetMethod((BinderMethodID)(METHOD__STRING__CTORF_FIRST + i));
@@ -72,6 +126,20 @@ void ECall::PopulateManagedStringConstructors()
 
         ECall::DynamicallyAssignFCallImpl(pDest, ECallCtor_First + i);
     }
+
+#ifdef FEATURE_UTF8STRING
+    _ASSERTE(g_pUtf8StringClass != NULL);
+    for (int i = 0; i < NumberOfUtf8StringConstructors; i++)
+    {
+        MethodDesc* pMD = MscorlibBinder::GetMethod((BinderMethodID)(METHOD__UTF8STRING__CTORF_FIRST + i));
+        _ASSERTE(pMD != NULL);
+    
+        PCODE pDest = pMD->GetMultiCallableAddrOfCode();
+
+        ECall::DynamicallyAssignFCallImpl(pDest, ECallUtf8String_Ctor_First + i);
+    }
+#endif // FEATURE_UTF8STRING
+
     INDEBUG(fInitialized = true);
 }
 
index c809109..58b4f0c 100644 (file)
@@ -103,7 +103,7 @@ class ECall
         static void EnumFCallMethods();
 #endif // DACCESS_COMPILE
 
-#define DYNAMICALLY_ASSIGNED_FCALLS() \
+#define _DYNAMICALLY_ASSIGNED_FCALLS_BASE() \
     DYNAMICALLY_ASSIGNED_FCALL_IMPL(FastAllocateString,                FramedAllocateString) \
     DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorCharArrayManaged,              NULL) \
     DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorCharArrayStartLengthManaged,   NULL) \
@@ -116,6 +116,22 @@ class ECall
     DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorSBytePtrStartLengthEncodingManaged, NULL) \
     DYNAMICALLY_ASSIGNED_FCALL_IMPL(InternalGetCurrentThread,          NULL) \
 
+#define _DYNAMICALLY_ASSIGNED_FCALLS_UTF8STRING() /* Utf8String allocator and ctor entries; the ctor entries must stay in the order checked by the static asserts in ecall.cpp */ \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(FastAllocateUtf8String,            FramedAllocateUtf8String) \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorReadOnlySpanOfByteManaged,   NULL) \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorReadOnlySpanOfCharManaged,   NULL) \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorByteArrayStartLengthManaged, NULL) \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorBytePtrManaged,              NULL) \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorCharArrayStartLengthManaged, NULL) \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorCharPtrManaged,              NULL) \
+    DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorStringManaged,               NULL) \
+
+#ifdef FEATURE_UTF8STRING // feature enabled: base list followed by the Utf8String list
+#define DYNAMICALLY_ASSIGNED_FCALLS() _DYNAMICALLY_ASSIGNED_FCALLS_BASE() _DYNAMICALLY_ASSIGNED_FCALLS_UTF8STRING()
+#else // feature disabled: base list only
+#define DYNAMICALLY_ASSIGNED_FCALLS() _DYNAMICALLY_ASSIGNED_FCALLS_BASE()
+#endif // FEATURE_UTF8STRING
+
         enum
         {
             #undef DYNAMICALLY_ASSIGNED_FCALL_IMPL
index b44669e..7302bb4 100644 (file)
@@ -116,6 +116,19 @@ FCFuncStart(gStringFuncs)
     FCFuncElement("Intern", AppDomainNative::GetOrInternString)
 FCFuncEnd()
 
+#ifdef FEATURE_UTF8STRING
+FCFuncStart(gUtf8StringFuncs) // FCall table for Utf8String: the FastAllocate helper plus one dynamically assigned entry per constructor signature
+    FCDynamic("FastAllocate", CORINFO_INTRINSIC_Illegal, ECall::FastAllocateUtf8String)
+    FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ReadOnlySpanOfByte_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorReadOnlySpanOfByteManaged)
+    FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ReadOnlySpanOfChar_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorReadOnlySpanOfCharManaged)
+    FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ArrByte_Int_Int_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorByteArrayStartLengthManaged)
+    FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_PtrByte_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorBytePtrManaged)
+    FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ArrChar_Int_Int_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorCharArrayStartLengthManaged)
+    FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_PtrChar_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorCharPtrManaged)
+    FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_Str_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorStringManaged)
+FCFuncEnd()
+#endif // FEATURE_UTF8STRING
+
 FCFuncStart(gValueTypeFuncs)
     FCFuncElement("CanCompareBits", ValueTypeHelper::CanCompareBits)
     FCFuncElement("FastEqualsCheck", ValueTypeHelper::FastEqualsCheck)
@@ -1270,6 +1283,9 @@ FCClassElement("TypedReference", "System", gTypedReferenceFuncs)
 #ifdef FEATURE_COMINTEROP
 FCClassElement("UriMarshaler", "System.StubHelpers", gUriMarshalerFuncs)
 #endif
+#ifdef FEATURE_UTF8STRING
+FCClassElement("Utf8String", "System", gUtf8StringFuncs)
+#endif // FEATURE_UTF8STRING
 FCClassElement("ValueClassMarshaler", "System.StubHelpers", gValueClassMarshalerFuncs)
 FCClassElement("ValueType", "System", gValueTypeFuncs)
 #ifdef FEATURE_COMINTEROP
index a52e10b..af3a160 100644 (file)
@@ -981,6 +981,8 @@ STRINGREF SlowAllocateString( DWORD cchStringLength )
 
     // Limit the maximum string size to <2GB to mitigate risk of security issues caused by 32-bit integer
     // overflows in buffer size calculations.
+    //
+    // If the value below is changed, also change SlowAllocateUtf8String.
     if (cchStringLength > 0x3FFFFFDF)
         ThrowOutOfMemory();
 
@@ -1028,6 +1030,81 @@ STRINGREF SlowAllocateString( DWORD cchStringLength )
     return( ObjectToSTRINGREF(orObject) );
 }
 
+#ifdef FEATURE_UTF8STRING
+UTF8STRINGREF SlowAllocateUtf8String(DWORD cchStringLength) // allocates a Utf8StringObject of the given length; throws OOM when the length exceeds the limit below
+{
+    CONTRACTL{
+        THROWS;
+        GC_TRIGGERS;
+        MODE_COOPERATIVE; // returns an objref without pinning it => cooperative
+    } CONTRACTL_END;
+
+    Utf8StringObject    *orObject = NULL;
+
+#ifdef _DEBUG
+    if (g_pConfig->ShouldInjectFault(INJECTFAULT_GCHEAP)) // debug-only: throwaway native allocation, presumably so an injected allocation fault can surface here - TODO confirm
+    {
+        char *a = new char;
+        delete a;
+    }
+#endif
+
+    // Limit the maximum string size to <2GB to mitigate risk of security issues caused by 32-bit integer
+    // overflows in buffer size calculations.
+    //
+    // 0x7FFFFFBF is derived from the const 0x3FFFFFDF in SlowAllocateString.
+    // Adding +1 (for null terminator) and multiplying by sizeof(WCHAR) means that
+    // SlowAllocateString allows a maximum of 0x7FFFFFC0 bytes to be used for the
+    // string data itself, with some additional buffer for object headers and other
+    // data. Since we don't have the sizeof(WCHAR) multiplication here, we only need
+    // -1 to account for the null terminator, leading to a max size of 0x7FFFFFBF.
+    if (cchStringLength > 0x7FFFFFBF)
+        ThrowOutOfMemory();
+
+    SIZE_T ObjectSize = PtrAlign(Utf8StringObject::GetSize(cchStringLength));
+    _ASSERTE(ObjectSize > cchStringLength); // the aligned object size must exceed the payload length
+
+    SetTypeHandleOnThreadForAlloc(TypeHandle(g_pUtf8StringClass));
+
+    orObject = (Utf8StringObject *)Alloc(ObjectSize, FALSE, FALSE);
+
+    // The memory returned by Alloc is already zero-initialized
+    _ASSERTE(orObject->HasEmptySyncBlockInfo());
+
+    // Initialize the object: method table first, then the length field
+    orObject->SetMethodTable(g_pUtf8StringClass);
+    orObject->SetLength(cchStringLength);
+
+    if (ObjectSize >= LARGE_OBJECT_SIZE) // objects at or above the large-object threshold are explicitly published to the GC
+    {
+        GCHeapUtilities::GetGCHeap()->PublishObject((BYTE*)orObject);
+    }
+
+    // Notify the profiler of the allocation; the object is GC-protected for the duration of the callback
+    if (TrackAllocations())
+    {
+        OBJECTREF objref = ObjectToOBJECTREF((Object*)orObject);
+        GCPROTECT_BEGIN(objref);
+        ProfilerObjectAllocatedCallback(objref, (ClassID)orObject->GetTypeHandle().AsPtr());
+        GCPROTECT_END();
+
+        orObject = (Utf8StringObject *)OBJECTREFToObject(objref); // re-read the raw pointer from the protected ref (presumably because the callback may trigger a GC)
+    }
+
+#ifdef FEATURE_EVENT_TRACE
+    // Send ETW event for allocation, if heap allocation events are enabled
+    if (ETW::TypeSystemLog::IsHeapAllocEventEnabled())
+    {
+        ETW::TypeSystemLog::SendObjectAllocatedEvent(orObject);
+    }
+#endif // FEATURE_EVENT_TRACE
+
+    LogAlloc(ObjectSize, g_pUtf8StringClass, orObject);
+
+    return( ObjectToUTF8STRINGREF(orObject) );
+}
+#endif // FEATURE_UTF8STRING
+
 #ifdef FEATURE_COMINTEROP_UNMANAGED_ACTIVATION
 // OBJECTREF AllocateComClassObject(ComClassFactory* pComClsFac)
 void AllocateComClassObject(ComClassFactory* pComClsFac, OBJECTREF* ppRefClass)
index 0e407c6..8f6a16a 100644 (file)
@@ -71,6 +71,10 @@ STRINGREF AllocateString( DWORD cchStringLength );
     // The slow version, implemented in gcscan.cpp
 STRINGREF SlowAllocateString( DWORD cchStringLength );
 
+#ifdef FEATURE_UTF8STRING
+UTF8STRINGREF SlowAllocateUtf8String( DWORD cchStringLength );
+#endif // FEATURE_UTF8STRING
+
 #else
 
 // On other platforms, go to the (somewhat less efficient) implementations in gcscan.cpp
@@ -83,6 +87,10 @@ OBJECTREF AllocateObjectArray(DWORD cElements, TypeHandle ElementType, BOOL bAll
 
 STRINGREF SlowAllocateString( DWORD cchStringLength );
 
+#ifdef FEATURE_UTF8STRING
+UTF8STRINGREF SlowAllocateUtf8String( DWORD cchStringLength );
+#endif // FEATURE_UTF8STRING
+
 inline STRINGREF AllocateString( DWORD cchStringLength )
 {
     WRAPPER_NO_CONTRACT;
@@ -92,6 +100,15 @@ inline STRINGREF AllocateString( DWORD cchStringLength )
 
 #endif
 
+#ifdef FEATURE_UTF8STRING
+inline UTF8STRINGREF AllocateUtf8String(DWORD cchStringLength) // thin wrapper: always forwards to SlowAllocateUtf8String
+{
+    WRAPPER_NO_CONTRACT;
+
+    return SlowAllocateUtf8String(cchStringLength);
+}
+#endif // FEATURE_UTF8STRING
+
 OBJECTREF DupArrayForCloning(BASEARRAYREF pRef, BOOL bAllocateInLargeHeap = FALSE);
 
 // The JIT requests the EE to specify an allocation helper to use at each new-site.
index 303f061..0576ca7 100644 (file)
@@ -2895,6 +2895,61 @@ HCIMPL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength)
 }
 HCIMPLEND
 
+#ifdef FEATURE_UTF8STRING
+HCIMPL1(Utf8StringObject*, AllocateUtf8String_MP_FastPortable, DWORD stringLength) // fast path: bump-allocates from the thread's allocation context, else falls back to FramedAllocateUtf8String
+{
+    FCALL_CONTRACT;
+
+    do // single-pass block: every 'break' diverts to the slow helper call after the loop
+    {
+        _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());
+
+        // Instead of doing elaborate overflow checks, we just limit the number of elements. This will avoid all overflow
+        // problems, as well as making sure large string objects are routed to the slow helper rather than allocated here.
+        if (stringLength >= LARGE_OBJECT_SIZE - 256)
+        {
+            break;
+        }
+
+        // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
+        // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
+        // some reshuffling of intermediate values into nonvolatile registers around the call.
+        Thread *thread = GetThread();
+
+        SIZE_T totalSize = Utf8StringObject::GetSize(stringLength);
+
+        // The method table's base size includes space for a terminating null character
+        _ASSERTE(totalSize >= g_pUtf8StringClass->GetBaseSize());
+        _ASSERTE(totalSize - g_pUtf8StringClass->GetBaseSize() == stringLength);
+
+        SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
+        _ASSERTE(alignedTotalSize >= totalSize);
+        totalSize = alignedTotalSize;
+
+        gc_alloc_context *allocContext = thread->GetAllocContext();
+        BYTE *allocPtr = allocContext->alloc_ptr;
+        _ASSERTE(allocPtr <= allocContext->alloc_limit);
+        if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr)) // not enough room left in this allocation context
+        {
+            break;
+        }
+        allocContext->alloc_ptr = allocPtr + totalSize; // claim the space by advancing the context's allocation pointer
+
+        _ASSERTE(allocPtr != nullptr);
+        Utf8StringObject *stringObject = reinterpret_cast<Utf8StringObject *>(allocPtr);
+        stringObject->SetMethodTable(g_pUtf8StringClass);
+        stringObject->SetLength(stringLength);
+
+        return stringObject;
+    } while (false);
+
+    // Tail call to the slow helper (reached only via one of the 'break' statements above)
+    ENDFORBIDGC();
+    return HCCALL1(FramedAllocateUtf8String, stringLength);
+}
+HCIMPLEND
+#endif // FEATURE_UTF8STRING
+
 #include <optdefault.h>
 
 /*********************************************************************/
@@ -2933,6 +2988,22 @@ HCIMPL1(StringObject*, FramedAllocateString, DWORD stringLength)
 }
 HCIMPLEND
 
+#ifdef FEATURE_UTF8STRING
+HCIMPL1(Utf8StringObject*, FramedAllocateUtf8String, DWORD stringLength) // Framed allocation helper for Utf8String: wraps SlowAllocateUtf8String in a helper method frame.
+{
+    FCALL_CONTRACT;
+
+    UTF8STRINGREF result = NULL;
+    HELPER_METHOD_FRAME_BEGIN_RET_0();    // Set up a frame around the call into the slow allocator
+
+    result = SlowAllocateUtf8String(stringLength); // stringLength is forwarded unchanged
+
+    HELPER_METHOD_FRAME_END();
+    return((Utf8StringObject*) OBJECTREFToObject(result)); // convert the object reference back to a raw object pointer for the caller
+}
+HCIMPLEND
+#endif // FEATURE_UTF8STRING
+
 /*********************************************************************/
 OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok)
 {
index af5fdba..b3ede3b 100644 (file)
@@ -7514,6 +7514,9 @@ bool getILIntrinsicImplementationForRuntimeHelpers(MethodDesc * ftn,
         if (methodTable == MscorlibBinder::GetClass(CLASS__BOOLEAN)
             || methodTable == MscorlibBinder::GetClass(CLASS__BYTE)
             || methodTable == MscorlibBinder::GetClass(CLASS__SBYTE)
+#ifdef FEATURE_UTF8STRING
+            || methodTable == MscorlibBinder::GetClass(CLASS__CHAR8)
+#endif // FEATURE_UTF8STRING
             || methodTable == MscorlibBinder::GetClass(CLASS__CHAR)
             || methodTable == MscorlibBinder::GetClass(CLASS__INT16)
             || methodTable == MscorlibBinder::GetClass(CLASS__UINT16)
index fe7dd4a..af42bd2 100644 (file)
@@ -231,6 +231,11 @@ extern FCDECL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength
 extern FCDECL1(StringObject*, UnframedAllocateString, DWORD stringLength);
 extern FCDECL1(StringObject*, FramedAllocateString, DWORD stringLength);
 
+#ifdef FEATURE_UTF8STRING
+extern FCDECL1(Utf8StringObject*, AllocateUtf8String_MP_FastPortable, DWORD stringLength);
+extern FCDECL1(Utf8StringObject*, FramedAllocateUtf8String, DWORD stringLength);
+#endif // FEATURE_UTF8STRING
+
 extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
 extern FCDECL2(Object*, JIT_NewArr1OBJ_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
 extern FCDECL2(Object*, JIT_NewArr1_R2R, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size);
index f86011d..3a5b618 100644 (file)
@@ -80,6 +80,9 @@ void InitJITHelpers1()
         SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_FastPortable);
 
         ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateString_MP_FastPortable), ECall::FastAllocateString);
+#ifdef FEATURE_UTF8STRING
+        ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateUtf8String_MP_FastPortable), ECall::FastAllocateUtf8String);
+#endif // FEATURE_UTF8STRING
 #else // FEATURE_PAL
         // if (multi-proc || server GC)
         if (GCHeapUtilities::UseThreadAllocationContexts())
@@ -91,6 +94,9 @@ void InitJITHelpers1()
             SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_InlineGetThread);
 
             ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP_InlineGetThread), ECall::FastAllocateString);
+#ifdef FEATURE_UTF8STRING
+            ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateUtf8String_MP_FastPortable), ECall::FastAllocateUtf8String);
+#endif // FEATURE_UTF8STRING
         }
         else
         {
@@ -105,6 +111,9 @@ void InitJITHelpers1()
             SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_UP);
 
             ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastUP), ECall::FastAllocateString);
+#ifdef FEATURE_UTF8STRING
+            ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateUtf8String_MP_FastPortable), ECall::FastAllocateUtf8String);
+#endif // FEATURE_UTF8STRING
         }
 #endif // FEATURE_PAL
     }
index 334a4a8..23df97d 100644 (file)
@@ -266,6 +266,11 @@ FCIMPL1(FC_BOOL_RET, MarshalNative::IsPinnable, Object* obj)
     if (obj->GetMethodTable() == g_pStringClass)
         FC_RETURN_BOOL(TRUE);
 
+#ifdef FEATURE_UTF8STRING
+    if (obj->GetMethodTable() == g_pUtf8StringClass)
+        FC_RETURN_BOOL(TRUE);
+#endif // FEATURE_UTF8STRING
+
     if (obj->GetMethodTable()->IsArray())
     {
         BASEARRAYREF asArray = (BASEARRAYREF)ObjectToOBJECTREF(obj);
@@ -527,6 +532,11 @@ void ValidatePinnedObject(OBJECTREF obj)
     if (obj->GetMethodTable() == g_pStringClass)
         return;
 
+#ifdef FEATURE_UTF8STRING
+    if (obj->GetMethodTable() == g_pUtf8StringClass)
+        return;
+#endif // FEATURE_UTF8STRING
+
     if (obj->GetMethodTable()->IsArray())
     {
         BASEARRAYREF asArray = (BASEARRAYREF) obj;
index 5321fd3..5e0a821 100644 (file)
@@ -402,6 +402,7 @@ DEFINE_METASIG(IM(Bool_Bool_RetStr, F F, s))
 
 DEFINE_METASIG(IM(PtrChar_RetVoid, P(u), v))
 DEFINE_METASIG(IM(PtrChar_Int_Int_RetVoid, P(u) i i, v))
+DEFINE_METASIG_T(IM(ReadOnlySpanOfByte_RetVoid, GI(g(READONLY_SPAN), 1, b), v))
 DEFINE_METASIG_T(IM(ReadOnlySpanOfChar_RetVoid, GI(g(READONLY_SPAN), 1, u), v))
 DEFINE_METASIG(IM(PtrSByt_RetVoid, P(B), v))
 DEFINE_METASIG(IM(PtrSByt_Int_Int_RetVoid, P(B) i i, v))
@@ -420,6 +421,19 @@ DEFINE_METASIG(IM(PtrSByt_Int_Int_RetStr, P(B) i i, s))
 DEFINE_METASIG_T(IM(PtrSByt_Int_Int_Encoding_RetStr, P(B) i i C(ENCODING), s))
 DEFINE_METASIG(IM(Obj_Int_RetIntPtr, j i, I))
 
+DEFINE_METASIG(IM(ArrByte_Int_Int_RetVoid, a(b) i i, v))
+DEFINE_METASIG(IM(PtrByte_RetVoid, P(b), v))
+
+#ifdef FEATURE_UTF8STRING
+DEFINE_METASIG_T(IM(ReadOnlySpanOfByte_RetUtf8Str, GI(g(READONLY_SPAN), 1, b), C(UTF8_STRING)))
+DEFINE_METASIG_T(IM(ReadOnlySpanOfChar_RetUtf8Str, GI(g(READONLY_SPAN), 1, u), C(UTF8_STRING)))
+DEFINE_METASIG_T(IM(ArrByte_Int_Int_RetUtf8Str, a(b) i i, C(UTF8_STRING)))
+DEFINE_METASIG_T(IM(PtrByte_RetUtf8Str, P(b), C(UTF8_STRING)))
+DEFINE_METASIG_T(IM(ArrChar_Int_Int_RetUtf8Str, a(u) i i, C(UTF8_STRING)))
+DEFINE_METASIG_T(IM(PtrChar_RetUtf8Str, P(u), C(UTF8_STRING)))
+DEFINE_METASIG_T(IM(String_RetUtf8Str, s, C(UTF8_STRING)))
+#endif // FEATURE_UTF8STRING
+
 DEFINE_METASIG(IM(Char_Char_RetStr, u u, s))
 DEFINE_METASIG(IM(Char_Int_RetVoid, u i, v))
 DEFINE_METASIG_T(SM(RetCultureInfo, _, C(CULTURE_INFO)))
index 9f9b25e..84f8399 100644 (file)
@@ -1743,7 +1743,7 @@ public:
     BOOL IsString()
     {
         LIMITED_METHOD_DAC_CONTRACT;
-        return HasComponentSize() && !IsArray();
+        return HasComponentSize() && !IsArray() && RawGetComponentSize() == 2; // NOTE(review): presumably 2 == sizeof(WCHAR) for System.String; Utf8String's method table is given component size 1, so it no longer matches here - confirm
     }
 
     BOOL            HasComponentSize() const
index 568a231..d4ce5b0 100644 (file)
@@ -9711,6 +9711,19 @@ void MethodTableBuilder::CheckForSystemTypes()
 
             pMT->SetComponentSize(2);
         }
+#ifdef FEATURE_UTF8STRING
+        else if (strcmp(name, g_Utf8StringName) == 0 && strcmp(nameSpace, g_SystemNS) == 0)
+        {
+            // Utf8Strings are not "normal" objects, so we need to mess with their method table a bit
+            // so that the GC can figure out how big each string is...
+            DWORD baseSize = Utf8StringObject::GetBaseSize();
+            pMT->SetBaseSize(baseSize); // NULL character included
+
+            GetHalfBakedClass()->SetBaseSizePadding(baseSize - bmtFP->NumInstanceFieldBytes);
+
+            pMT->SetComponentSize(1);
+        }
+#endif // FEATURE_UTF8STRING
         else if (strcmp(name, g_CriticalFinalizerObjectName) == 0 && strcmp(nameSpace, g_ConstrainedExecutionNS) == 0)
         {
             // To introduce a class with a critical finalizer,
index 264408f..c54a635 100644 (file)
@@ -322,6 +322,10 @@ DEFINE_CLASS(ENCODING,              Text,                   Encoding)
 
 DEFINE_CLASS(RUNE,                  Text,                   Rune)
 
+#ifdef FEATURE_UTF8STRING
+DEFINE_CLASS(CHAR8,                 System,                 Char8)
+#endif // FEATURE_UTF8STRING
+
 DEFINE_CLASS(ENUM,                  System,                 Enum)
 
 DEFINE_CLASS(ENVIRONMENT,           System,                 Environment)
@@ -818,6 +822,17 @@ DEFINE_METHOD(STRING,               WCSLEN,                 wcslen,
 DEFINE_METHOD(STRING,               STRLEN,                 strlen,                     SM_PtrByte_RetInt)
 DEFINE_PROPERTY(STRING,             LENGTH,                 Length,                     Int)
 
+#ifdef FEATURE_UTF8STRING
+DEFINE_CLASS(UTF8_STRING,           System,                 Utf8String)
+DEFINE_METHOD(UTF8_STRING,          CTORF_READONLYSPANOFBYTE,Ctor,                      IM_ReadOnlySpanOfByte_RetUtf8Str)
+DEFINE_METHOD(UTF8_STRING,          CTORF_READONLYSPANOFCHAR,Ctor,                      IM_ReadOnlySpanOfChar_RetUtf8Str)
+DEFINE_METHOD(UTF8_STRING,          CTORF_BYTEARRAY_START_LEN,Ctor,                     IM_ArrByte_Int_Int_RetUtf8Str)
+DEFINE_METHOD(UTF8_STRING,          CTORF_BYTEPTR,           Ctor,                      IM_PtrByte_RetUtf8Str)
+DEFINE_METHOD(UTF8_STRING,          CTORF_CHARARRAY_START_LEN,Ctor,                     IM_ArrChar_Int_Int_RetUtf8Str)
+DEFINE_METHOD(UTF8_STRING,          CTORF_CHARPTR,           Ctor,                      IM_PtrChar_RetUtf8Str)
+DEFINE_METHOD(UTF8_STRING,          CTORF_STRING,            Ctor,                      IM_String_RetUtf8Str)
+#endif // FEATURE_UTF8STRING
+
 DEFINE_CLASS(STRING_BUILDER,        Text,                   StringBuilder)
 DEFINE_PROPERTY(STRING_BUILDER,     LENGTH,                 Length,                     Int)
 DEFINE_PROPERTY(STRING_BUILDER,     CAPACITY,               Capacity,                   Int)
index 6bc3a74..9087afa 100644 (file)
@@ -35,7 +35,10 @@ void ErectWriteBarrierForMT(MethodTable **dst, MethodTable *ref);
  *  |                        sync block index, which is at a negative offset
  *  |
  *  +-- code:StringObject       - String objects are specialized objects for string
- *  |                        storage/retrieval for higher performance
+ *  |                        storage/retrieval for higher performance (UCS-2 / UTF-16 data)
+ *  |
+ *  +-- code:Utf8StringObject   - Utf8String objects are specialized objects for string
+ *  |                        storage/retrieval for higher performance (UTF-8 data)
  *  |
  *  +-- BaseObjectWithCachedData - Object Plus one object field for caching.
  *  |       |
@@ -870,6 +873,9 @@ typedef DPTR(UPTRArray) PTR_UPTRArray;
 typedef DPTR(PTRArray)  PTR_PTRArray;
 
 class StringObject;
+#ifdef FEATURE_UTF8STRING
+class Utf8StringObject;
+#endif // FEATURE_UTF8STRING
 
 #ifdef USE_CHECKED_OBJECTREFS
 typedef REF<ArrayBase>  BASEARRAYREF;
@@ -888,6 +894,9 @@ typedef REF<UPTRArray>  UPTRARRAYREF;
 typedef REF<CHARArray>  CHARARRAYREF;
 typedef REF<PTRArray>   PTRARRAYREF;  // Warning: Use PtrArray only for single dimensional arrays, not multidim arrays.
 typedef REF<StringObject> STRINGREF;
+#ifdef FEATURE_UTF8STRING
+typedef REF<Utf8StringObject> UTF8STRINGREF;
+#endif // FEATURE_UTF8STRING
 
 #else   // USE_CHECKED_OBJECTREFS
 
@@ -907,6 +916,9 @@ typedef PTR_UPTRArray   UPTRARRAYREF;
 typedef PTR_CHARArray   CHARARRAYREF;
 typedef PTR_PTRArray    PTRARRAYREF;  // Warning: Use PtrArray only for single dimensional arrays, not multidim arrays.
 typedef PTR_StringObject STRINGREF;
+#ifdef FEATURE_UTF8STRING
+typedef PTR_Utf8StringObject UTF8STRINGREF;
+#endif // FEATURE_UTF8STRING
 
 #endif // USE_CHECKED_OBJECTREFS
 
@@ -1199,6 +1211,56 @@ public:
 
 };
 
+#ifdef FEATURE_UTF8STRING
+class Utf8StringObject : public Object
+{
+#ifdef DACCESS_COMPILE
+    friend class ClrDataAccess;
+#endif
+
+private:
+    DWORD   m_StringLength;
+    BYTE    m_FirstChar;
+
+public:
+    VOID    SetLength(DWORD len) { LIMITED_METHOD_CONTRACT; _ASSERTE(len >= 0); m_StringLength = len; }
+
+protected:
+    Utf8StringObject() { LIMITED_METHOD_CONTRACT; }
+    ~Utf8StringObject() { LIMITED_METHOD_CONTRACT; }
+
+public:
+
+    /*=================RefInterpretGetStringValuesDangerousForGC======================
+    **N.B.: This perfoms no range checking and relies on the caller to have done this.
+    **Args: (IN)ref -- the Utf8String to be interpretted.
+    **      (OUT)chars -- a pointer to the characters in the buffer.
+    **      (OUT)length -- a pointer to the length of the buffer.
+    **Returns: void.
+    **Exceptions: None.
+    ==============================================================================*/
+    // !!!! If you use this function, you have to be careful because chars is a pointer
+    // !!!! to the data buffer of ref.  If GC happens after this call, you need to make
+    // !!!! sure that you have a pin handle on ref, or use GCPROTECT_BEGINPINNING on ref.
+    void RefInterpretGetStringValuesDangerousForGC(__deref_out_ecount(*length + 1) CHAR **chars, int *length) {
+        WRAPPER_NO_CONTRACT;
+    
+        _ASSERTE(GetGCSafeMethodTable() == g_pUtf8StringClass);
+        *length = GetStringLength();
+        *chars  = GetBuffer();
+#ifdef _DEBUG
+        EnableStressHeapHelper();
+#endif
+    }
+
+    DWORD   GetStringLength()                           { LIMITED_METHOD_DAC_CONTRACT; return( m_StringLength );}
+    CHAR*   GetBuffer()                                 { LIMITED_METHOD_CONTRACT; _ASSERTE(this != nullptr); return (CHAR*)( dac_cast<TADDR>(this) + offsetof(Utf8StringObject, m_FirstChar) );  }
+
+    static DWORD GetBaseSize();
+    static SIZE_T GetSize(DWORD stringLength);
+};
+#endif // FEATURE_UTF8STRING
+
 // This is the Method version of the Reflection object.
 //  A Method has adddition information.
 //   m_pMD - A pointer to the actual MethodDesc of the method.
index 9652909..ebf9d36 100644 (file)
@@ -71,6 +71,22 @@ __forceinline /*static*/ SIZE_T StringObject::GetSize(DWORD strLen)
     return GetBaseSize() + strLen * sizeof(WCHAR);
 }
 
+#ifdef FEATURE_UTF8STRING
+__forceinline /*static*/ DWORD Utf8StringObject::GetBaseSize()
+{
+    LIMITED_METHOD_DAC_CONTRACT;
+    // Object header, then the 32-bit length field, then one byte reserved for the null terminator.
+    return OBJECT_BASESIZE + sizeof(m_StringLength) + sizeof(m_FirstChar);
+}
+
+__forceinline /*static*/ SIZE_T Utf8StringObject::GetSize(DWORD strLen) // Returns the unaligned object size for strLen units of data: GetBaseSize() + strLen.
+{
+    LIMITED_METHOD_DAC_CONTRACT;
+    // Widen before adding: DWORD + DWORD would wrap in 32-bit arithmetic before the result is converted to SIZE_T.
+    return static_cast<SIZE_T>(GetBaseSize()) + strLen;
+}
+#endif // FEATURE_UTF8STRING
+
 #ifdef DACCESS_COMPILE
 
 inline void Object::EnumMemoryRegions(void)
index 1f8aa04..954d6ae 100644 (file)
@@ -1001,6 +1001,7 @@ FCIMPL5(Object*, RuntimeMethodHandle::InvokeMethod,
     // Skip the activation optimization for remoting because of remoting proxy is not always activated.
     // It would be nice to clean this up and get remoting to always activate methodtable behind the proxy.
     BOOL fForceActivationForRemoting = FALSE;
+    BOOL fCtorOfVariableSizedObject = FALSE;
 
     if (fConstructor)
     {
@@ -1018,7 +1019,8 @@ FCIMPL5(Object*, RuntimeMethodHandle::InvokeMethod,
         MethodTable * pMT = ownerType.AsMethodTable();
 
         {
-            if (pMT != g_pStringClass)
+            fCtorOfVariableSizedObject = pMT->HasComponentSize();
+            if (!fCtorOfVariableSizedObject)
                 gc.retVal = pMT->Allocate();
         }
     }
@@ -1324,7 +1326,11 @@ FCIMPL5(Object*, RuntimeMethodHandle::InvokeMethod,
     if (fConstructor)
     {
         // We have a special case for Strings...The object is returned...
-        if (ownerType == TypeHandle(g_pStringClass)) {
+        if (ownerType == TypeHandle(g_pStringClass)
+#ifdef FEATURE_UTF8STRING
+            || ownerType == TypeHandle(g_pUtf8StringClass)
+#endif // FEATURE_UTF8STRING
+            ) {
             PVOID pReturnValue = &callDescrData.returnValue;
             gc.retVal = *(OBJECTREF *)pReturnValue;
         }
@@ -2590,8 +2596,12 @@ FCIMPL1(Object*, ReflectionSerialization::GetUninitializedObject, ReflectClassBa
     MethodTable *pMT = type.GetMethodTable();
     PREFIX_ASSUME(pMT != NULL);
 
-    //We don't allow unitialized strings.
-    if (pMT == g_pStringClass) {
+    //We don't allow uninitialized Strings or Utf8Strings.
+    if (pMT == g_pStringClass
+#ifdef FEATURE_UTF8STRING
+        || pMT == g_pUtf8StringClass
+#endif // FEATURE_UTF8STRING
+        ) {
         COMPlusThrow(kArgumentException, W("Argument_NoUninitializedStrings"));
     }
 
index 179acda..8b329d4 100644 (file)
@@ -61,6 +61,9 @@ GPTR_IMPL(MethodTable,      g_pObjectClass);
 GPTR_IMPL(MethodTable,      g_pRuntimeTypeClass);
 GPTR_IMPL(MethodTable,      g_pCanonMethodTableClass);  // System.__Canon
 GPTR_IMPL(MethodTable,      g_pStringClass);
+#ifdef FEATURE_UTF8STRING
+GPTR_IMPL(MethodTable,      g_pUtf8StringClass);
+#endif // FEATURE_UTF8STRING
 GPTR_IMPL(MethodTable,      g_pArrayClass);
 GPTR_IMPL(MethodTable,      g_pSZArrayHelperClass);
 GPTR_IMPL(MethodTable,      g_pNullableClass);
index 91ad42a..d8ffc60 100644 (file)
@@ -79,6 +79,9 @@ class LoaderHeap;
 class IGCHeap;
 class Object;
 class StringObject;
+#ifdef FEATURE_UTF8STRING
+class Utf8StringObject;
+#endif // FEATURE_UTF8STRING
 class ArrayClass;
 class MethodTable;
 class MethodDesc;
@@ -313,6 +316,10 @@ class REF : public OBJECTREF
 #define OBJECTREFToObject(objref)  ((objref).operator-> ())
 #define ObjectToSTRINGREF(obj)     (STRINGREF(obj))
 #define STRINGREFToObject(objref)  (*( (StringObject**) &(objref) ))
+#ifdef FEATURE_UTF8STRING
+#define ObjectToUTF8STRINGREF(obj)   (UTF8STRINGREF(obj))
+#define UTF8STRINGREFToObject(objref) (*( (Utf8StringObject**) &(objref) ))
+#endif // FEATURE_UTF8STRING
 
 #else   // _DEBUG_IMPL
 
@@ -323,6 +330,10 @@ class REF : public OBJECTREF
 #define OBJECTREFToObject(objref) ((PTR_Object) (objref))
 #define ObjectToSTRINGREF(obj)    ((PTR_StringObject) (obj))
 #define STRINGREFToObject(objref) ((PTR_StringObject) (objref))
+#ifdef FEATURE_UTF8STRING
+#define ObjectToUTF8STRINGREF(obj)    ((PTR_Utf8StringObject) (obj))
+#define UTF8STRINGREFToObject(objref) ((PTR_Utf8StringObject) (objref))
+#endif // FEATURE_UTF8STRING
 
 #endif // _DEBUG_IMPL
 
@@ -363,6 +374,9 @@ GPTR_DECL(MethodTable,      g_pObjectClass);
 GPTR_DECL(MethodTable,      g_pRuntimeTypeClass);
 GPTR_DECL(MethodTable,      g_pCanonMethodTableClass);  // System.__Canon
 GPTR_DECL(MethodTable,      g_pStringClass);
+#ifdef FEATURE_UTF8STRING
+GPTR_DECL(MethodTable,      g_pUtf8StringClass);
+#endif // FEATURE_UTF8STRING
 GPTR_DECL(MethodTable,      g_pArrayClass);
 GPTR_DECL(MethodTable,      g_pSZArrayHelperClass);
 GPTR_DECL(MethodTable,      g_pNullableClass);