Big endian fixes for dotnet runtime (#47981)
author: Neale Ferguson <neale@sinenomine.net>
Tue, 9 Feb 2021 21:54:23 +0000 (07:54 +1000)
committer: GitHub <noreply@github.com>
Tue, 9 Feb 2021 21:54:23 +0000 (13:54 -0800)
src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs
src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
src/libraries/System.Runtime.Serialization.Formatters/src/System/Runtime/Serialization/Formatters/Binary/BinaryFormatterWriter.cs
src/libraries/System.Runtime.Serialization.Formatters/src/System/Runtime/Serialization/Formatters/Binary/BinaryParser.cs
src/libraries/System.Text.Encoding.CodePages/src/System.Text.Encoding.CodePages.csproj
src/libraries/System.Text.Encoding.CodePages/src/System/Text/BaseCodePageEncoding.cs
src/libraries/System.Text.Encoding.CodePages/src/System/Text/BaseCodePageEncoding.netcoreapp.cs
src/libraries/System.Text.Encoding.CodePages/src/System/Text/DBCSCodePageEncoding.cs
src/libraries/System.Text.Encoding.CodePages/src/System/Text/SBCSCodePageEncoding.cs

index 6a67e78..e490a2d 100644 (file)
@@ -200,6 +200,10 @@ namespace System.Text
                 currentUInt32 = Unsafe.ReadUnaligned<ushort>(pBuffer);
                 if (!AllBytesInUInt32AreAscii(currentUInt32))
                 {
+                    if (!BitConverter.IsLittleEndian)
+                    {
+                        currentUInt32 = currentUInt32 << 16;
+                    }
                     goto FoundNonAsciiData;
                 }
 
@@ -1678,6 +1682,10 @@ namespace System.Text
                 asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
                 if (!AllBytesInUInt32AreAscii(asciiData))
                 {
+                    if (!BitConverter.IsLittleEndian)
+                    {
+                        asciiData = asciiData << 16;
+                    }
                     goto FoundNonAsciiData;
                 }
 
@@ -1719,11 +1727,23 @@ namespace System.Text
 
             // Drain ASCII bytes one at a time.
 
-            while (((byte)asciiData & 0x80) == 0)
+            if (BitConverter.IsLittleEndian)
             {
-                pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
-                currentOffset++;
-                asciiData >>= 8;
+                while (((byte)asciiData & 0x80) == 0)
+                {
+                    pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
+                    currentOffset++;
+                    asciiData >>= 8;
+                }
+            }
+            else
+            {
+                while ((asciiData & 0x80000000) == 0)
+                {
+                    asciiData = BitOperations.RotateLeft(asciiData, 8);
+                    pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
+                    currentOffset++;
+                }
             }
 
             goto Finish;
index ac59f0b..628c8a9 100644 (file)
@@ -143,7 +143,7 @@ namespace System.Text.Unicode
                 tempB |= tempA;
 
                 uint tempC = (value << 2) & 0x0000_0F00u; // = [ 00000000 00000000 0000yyyy 00000000 ]
-                uint tempD = (value >> 6) & 0x0003_0000u; // = [ 00000000 00000000 00yy0000 00000000 ]
+                uint tempD = (value >> 4) & 0x0000_3000u; // = [ 00000000 00000000 00yy0000 00000000 ]
                 tempD |= tempC;
 
                 uint tempE = (value & 0x3Fu) + 0xF080_8080u; // = [ 11110000 10000000 10000000 10xxxxxx ]
@@ -232,7 +232,7 @@ namespace System.Text.Unicode
                 // want to return [ ######## ######## 110yyyyy 10xxxxxx ]
 
                 uint temp = (value >> 16) & 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ]
-                value = (value >> 22) & 0x1F00u; // [ 00000000 00000000 000yyyyy 0000000 ]
+                value = (value >> 14) & 0x1F00u; // [ 00000000 00000000 000yyyyy 0000000 ]
                 return value + temp + 0xC080u;
             }
         }
@@ -498,7 +498,7 @@ namespace System.Text.Unicode
             // Return statement is written this way to work around https://github.com/dotnet/runtime/issues/4207.
 
             return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
-                || (!BitConverter.IsLittleEndian && (((value - 0xF080_8000u) & 0xF8C0_C0C0u) == 0));
+                || (!BitConverter.IsLittleEndian && (((value - 0xF080_8080u) & 0xF8C0_C0C0u) == 0));
         }
 
         /// <summary>
index ce8693b..b13c66c 100644 (file)
@@ -1139,7 +1139,7 @@ namespace System.Text.Unicode
                     }
                     else
                     {
-                        pOutputBuffer[0] = (byte)(thisDWord >> 24); // extract [ AA 00 ## ## ]
+                        pOutputBuffer[0] = (byte)(thisDWord >> 16); // extract [ 00 AA ## ## ]
                     }
 
                     pInputBuffer++;
index e8a1305..9e5f11d 100644 (file)
@@ -268,7 +268,6 @@ namespace System.Runtime.Serialization.Formatters.Binary
                 if (!BitConverter.IsLittleEndian)
                 {
                     // we know that we are writing a primitive type, so just do a simple swap
-                    Debug.Fail("Re-review this code if/when we start running on big endian systems");
                     for (int i = 0; i < bufferUsed; i += typeLength)
                     {
                         for (int j = 0; j < typeLength / 2; j++)
index 628d54d..811b74b 100644 (file)
@@ -894,7 +894,6 @@ namespace System.Runtime.Serialization.Formatters.Binary
                     if (!BitConverter.IsLittleEndian)
                     {
                         // we know that we are reading a primitive type, so just do a simple swap
-                        Debug.Fail("Re-review this code if/when we start running on big endian systems");
                         for (int i = 0; i < bufferUsed; i += typeLength)
                         {
                             for (int j = 0; j < typeLength / 2; j++)
index fc36d49..9c56c17 100644 (file)
     <Reference Include="System.Collections" />
     <Reference Include="System.Diagnostics.Debug" />
     <Reference Include="System.Diagnostics.Tools" />
+    <Reference Include="System.Memory" />
     <Reference Include="System.Resources.ResourceManager" />
     <Reference Include="System.Runtime" />
     <Reference Include="System.Runtime.Extensions" />
     <Reference Include="System.Runtime.InteropServices" />
     <Reference Include="System.Threading" />
   </ItemGroup>
+  <ItemGroup Condition="!$(TargetFramework.StartsWith('$(NetCoreAppCurrent)'))">
+    <PackageReference Include="System.Memory" Version="$(SystemMemoryVersion)" />
+  </ItemGroup>
 </Project>
index 1a48d4a..01208d2 100644 (file)
@@ -1,6 +1,7 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Buffers.Binary;
 using System.Reflection;
 using System.IO;
 using System.Diagnostics;
@@ -99,6 +100,28 @@ namespace System.Text
             internal short unused1;             // Add an unused WORD so that CodePages is aligned with DWORD boundary.
         }
         private const int CODEPAGE_DATA_FILE_HEADER_SIZE = 44;
+        /// <summary>
+        /// Fills <paramref name="codePageDataFileHeader"/> from <paramref name="stream"/> and, when
+        /// running on a big-endian host, byte-swaps the header's multi-byte fields in place so the
+        /// buffer can afterwards be viewed as a <see cref="CodePageDataFileHeader"/>.
+        /// </summary>
+        /// <param name="stream">Stream positioned at the first byte of the file header.</param>
+        /// <param name="codePageDataFileHeader">
+        /// Destination buffer. On big-endian hosts it is reinterpreted as a
+        /// <see cref="CodePageDataFileHeader"/>, so it must be at least that large.
+        /// </param>
+        internal static unsafe void ReadCodePageDataFileHeader(Stream stream, byte[] codePageDataFileHeader)
+        {
+            // Stream.Read is allowed to return fewer bytes than requested; keep reading until the
+            // buffer is full or the stream reports end-of-data (in which case the tail is left as-is).
+            int filled = 0;
+            while (filled < codePageDataFileHeader.Length)
+            {
+                int read = stream.Read(codePageDataFileHeader, filled, codePageDataFileHeader.Length - filled);
+                if (read <= 0)
+                {
+                    break;
+                }
+                filled += read;
+            }
+
+            if (!BitConverter.IsLittleEndian)
+            {
+                fixed (byte* pBytes = &codePageDataFileHeader[0])
+                {
+                    CodePageDataFileHeader* p = (CodePageDataFileHeader*)pBytes;
+
+                    // TableName: 16 UTF-16 code units, swapped one at a time.
+                    char* pTableName = &p->TableName;
+                    for (int i = 0; i < 16; i++)
+                    {
+                        pTableName[i] = (char)BinaryPrimitives.ReverseEndianness((ushort)pTableName[i]);
+                    }
+
+                    // Four consecutive 16-bit words starting at the Version field.
+                    ushort* pVersion = &p->Version;
+                    for (int i = 0; i < 4; i++)
+                    {
+                        pVersion[i] = BinaryPrimitives.ReverseEndianness(pVersion[i]);
+                    }
+
+                    p->CodePageCount = BinaryPrimitives.ReverseEndianness(p->CodePageCount);
+                }
+            }
+        }
 
         [StructLayout(LayoutKind.Explicit, Pack = 2)]
         internal unsafe struct CodePageIndex
@@ -112,6 +135,25 @@ namespace System.Text
             [FieldOffset(0x24)]
             internal int Offset;            // DWORD
         }
+        /// <summary>
+        /// Fills <paramref name="codePageIndex"/> with the next index entry from
+        /// <paramref name="stream"/> and, when running on a big-endian host, byte-swaps the entry's
+        /// multi-byte fields in place so the buffer can be viewed as a <see cref="CodePageIndex"/>.
+        /// </summary>
+        /// <param name="stream">Stream positioned at the first byte of an index entry.</param>
+        /// <param name="codePageIndex">
+        /// Destination buffer. On big-endian hosts it is reinterpreted as a
+        /// <see cref="CodePageIndex"/>, so it must be at least that large.
+        /// </param>
+        internal static unsafe void ReadCodePageIndex(Stream stream, byte[] codePageIndex)
+        {
+            // Stream.Read is allowed to return fewer bytes than requested; keep reading until the
+            // buffer is full or the stream reports end-of-data (in which case the tail is left as-is).
+            int filled = 0;
+            while (filled < codePageIndex.Length)
+            {
+                int read = stream.Read(codePageIndex, filled, codePageIndex.Length - filled);
+                if (read <= 0)
+                {
+                    break;
+                }
+                filled += read;
+            }
+
+            if (!BitConverter.IsLittleEndian)
+            {
+                fixed (byte* pBytes = &codePageIndex[0])
+                {
+                    CodePageIndex* p = (CodePageIndex*)pBytes;
+
+                    // CodePageName: 16 UTF-16 code units, swapped one at a time.
+                    char* pCodePageName = &p->CodePageName;
+                    for (int i = 0; i < 16; i++)
+                    {
+                        pCodePageName[i] = (char)BinaryPrimitives.ReverseEndianness((ushort)pCodePageName[i]);
+                    }
+
+                    p->CodePage = BinaryPrimitives.ReverseEndianness(p->CodePage);
+                    p->ByteCount = BinaryPrimitives.ReverseEndianness(p->ByteCount);
+                    p->Offset = BinaryPrimitives.ReverseEndianness(p->Offset);
+                }
+            }
+        }
 
         [StructLayout(LayoutKind.Explicit)]
         internal unsafe struct CodePageHeader
@@ -136,6 +178,30 @@ namespace System.Text
             internal ushort ByteReplace;    // WORD     // default replacement bytes
         }
         private const int CODEPAGE_HEADER_SIZE = 48;
+        /// <summary>
+        /// Fills <paramref name="codePageHeader"/> with a code page header from
+        /// <paramref name="stream"/> and, when running on a big-endian host, byte-swaps the header's
+        /// multi-byte fields in place so the buffer can be viewed as a <see cref="CodePageHeader"/>.
+        /// </summary>
+        /// <param name="stream">Stream positioned at the first byte of the code page header.</param>
+        /// <param name="codePageHeader">
+        /// Destination buffer. On big-endian hosts it is reinterpreted as a
+        /// <see cref="CodePageHeader"/>, so it must be at least that large.
+        /// </param>
+        internal static unsafe void ReadCodePageHeader(Stream stream, byte[] codePageHeader)
+        {
+            // Stream.Read is allowed to return fewer bytes than requested; keep reading until the
+            // buffer is full or the stream reports end-of-data (in which case the tail is left as-is).
+            int filled = 0;
+            while (filled < codePageHeader.Length)
+            {
+                int read = stream.Read(codePageHeader, filled, codePageHeader.Length - filled);
+                if (read <= 0)
+                {
+                    break;
+                }
+                filled += read;
+            }
+
+            if (!BitConverter.IsLittleEndian)
+            {
+                fixed (byte* pBytes = &codePageHeader[0])
+                {
+                    CodePageHeader* p = (CodePageHeader*)pBytes;
+
+                    // CodePageName: 16 UTF-16 code units, swapped one at a time.
+                    char* pCodePageName = &p->CodePageName;
+                    for (int i = 0; i < 16; i++)
+                    {
+                        pCodePageName[i] = (char)BinaryPrimitives.ReverseEndianness((ushort)pCodePageName[i]);
+                    }
+
+                    p->VersionMajor = BinaryPrimitives.ReverseEndianness(p->VersionMajor);
+                    p->VersionMinor = BinaryPrimitives.ReverseEndianness(p->VersionMinor);
+                    p->VersionRevision = BinaryPrimitives.ReverseEndianness(p->VersionRevision);
+                    p->VersionBuild = BinaryPrimitives.ReverseEndianness(p->VersionBuild);
+                    p->CodePage = BinaryPrimitives.ReverseEndianness(p->CodePage);
+                    p->ByteCount = BinaryPrimitives.ReverseEndianness(p->ByteCount);
+                    p->UnicodeReplace = (char)BinaryPrimitives.ReverseEndianness((ushort)p->UnicodeReplace);
+                    p->ByteReplace = BinaryPrimitives.ReverseEndianness(p->ByteReplace);
+                }
+            }
+        }
 
         // Initialize our global stuff
         private static readonly byte[] s_codePagesDataHeader = new byte[CODEPAGE_DATA_FILE_HEADER_SIZE];
@@ -166,7 +232,7 @@ namespace System.Text
             }
 
             // Read the header
-            stream.Read(s_codePagesDataHeader, 0, s_codePagesDataHeader.Length);
+            ReadCodePageDataFileHeader(stream, s_codePagesDataHeader);
 
             return stream;
         }
@@ -210,14 +276,14 @@ namespace System.Text
                     CodePageIndex* pCodePageIndex = (CodePageIndex*)pBytes;
                     for (int i = 0; i < codePagesCount; i++)
                     {
-                        s_codePagesEncodingDataStream.Read(codePageIndex, 0, codePageIndex.Length);
+                        ReadCodePageIndex(s_codePagesEncodingDataStream, codePageIndex);
 
                         if (pCodePageIndex->CodePage == codePage)
                         {
                             // Found it!
                             long position = s_codePagesEncodingDataStream.Position;
                             s_codePagesEncodingDataStream.Seek((long)pCodePageIndex->Offset, SeekOrigin.Begin);
-                            s_codePagesEncodingDataStream.Read(m_codePageHeader, 0, m_codePageHeader!.Length);
+                            ReadCodePageHeader(s_codePagesEncodingDataStream, m_codePageHeader);
                             m_firstDataWordOffset = (int)s_codePagesEncodingDataStream.Position; // stream now pointing to the codepage data
 
                             if (i == codePagesCount - 1) // last codepage
@@ -229,7 +295,7 @@ namespace System.Text
                                 // Read Next codepage data to get the offset and then calculate the size
                                 s_codePagesEncodingDataStream.Seek(position, SeekOrigin.Begin);
                                 int currentOffset = pCodePageIndex->Offset;
-                                s_codePagesEncodingDataStream.Read(codePageIndex, 0, codePageIndex.Length);
+                                ReadCodePageIndex(s_codePagesEncodingDataStream, codePageIndex);
                                 m_dataSize = pCodePageIndex->Offset - currentOffset - m_codePageHeader.Length;
                             }
 
@@ -266,7 +332,7 @@ namespace System.Text
                     CodePageIndex* pCodePageIndex = (CodePageIndex*)pBytes;
                     for (int i = 0; i < codePagesCount; i++)
                     {
-                        s_codePagesEncodingDataStream.Read(codePageIndex, 0, codePageIndex.Length);
+                        ReadCodePageIndex(s_codePagesEncodingDataStream, codePageIndex);
 
                         if (pCodePageIndex->CodePage == codePage)
                         {
index a0b19e2..7fc23a6 100644 (file)
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.IO;
+using System.Buffers.Binary;
 using System.Runtime.Serialization;
 using System.Runtime.CompilerServices;
 
@@ -9,6 +10,26 @@ namespace System.Text
 {
     internal abstract partial class BaseCodePageEncoding : EncodingNLS, ISerializable
     {
+        /// <summary>
+        /// Span-based overload: fills <paramref name="codePageIndex"/> with the next index entry from
+        /// <paramref name="stream"/> and, when running on a big-endian host, byte-swaps the entry's
+        /// multi-byte fields in place so the span can be viewed as a <see cref="CodePageIndex"/>.
+        /// </summary>
+        /// <param name="stream">Stream positioned at the first byte of an index entry.</param>
+        /// <param name="codePageIndex">
+        /// Destination span. On big-endian hosts it is reinterpreted as a
+        /// <see cref="CodePageIndex"/>, so it must be at least that large.
+        /// </param>
+        internal static unsafe void ReadCodePageIndex(Stream stream, Span<byte> codePageIndex)
+        {
+            // Stream.Read is allowed to return fewer bytes than requested; keep reading until the
+            // span is full or the stream reports end-of-data (in which case the tail is left as-is).
+            int filled = 0;
+            while (filled < codePageIndex.Length)
+            {
+                int read = stream.Read(codePageIndex.Slice(filled));
+                if (read <= 0)
+                {
+                    break;
+                }
+                filled += read;
+            }
+
+            if (!BitConverter.IsLittleEndian)
+            {
+                fixed (byte* pBytes = &codePageIndex[0])
+                {
+                    CodePageIndex* p = (CodePageIndex*)pBytes;
+
+                    // CodePageName: 16 UTF-16 code units, swapped one at a time.
+                    char* pCodePageName = &p->CodePageName;
+                    for (int i = 0; i < 16; i++)
+                    {
+                        pCodePageName[i] = (char)BinaryPrimitives.ReverseEndianness((ushort)pCodePageName[i]);
+                    }
+
+                    p->CodePage = BinaryPrimitives.ReverseEndianness(p->CodePage);
+                    p->ByteCount = BinaryPrimitives.ReverseEndianness(p->ByteCount);
+                    p->Offset = BinaryPrimitives.ReverseEndianness(p->Offset);
+                }
+            }
+        }
+
         internal static unsafe EncodingInfo [] GetEncodings(CodePagesEncodingProvider provider)
         {
             lock (s_streamLock)
@@ -29,7 +50,7 @@ namespace System.Text
 
                 for (int i = 0; i < codePagesCount; i++)
                 {
-                    s_codePagesEncodingDataStream.Read(pCodePageIndex);
+                    ReadCodePageIndex(s_codePagesEncodingDataStream, pCodePageIndex);
 
                     string codePageName;
                     switch (codePageIndex.CodePage)
index 00a03b3..92da49d 100644 (file)
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System;
+using System.Buffers.Binary;
 using System.IO;
 using System.Diagnostics;
 using System.Text;
@@ -41,6 +42,18 @@ namespace System.Text
         {
         }
 
+        /// <summary>
+        /// Reads one UTF-16 code unit from <paramref name="pChar"/>, reversing its two bytes when
+        /// the host is big-endian and returning it unchanged otherwise.
+        /// </summary>
+        /// <param name="pChar">Pointer to the code unit to read.</param>
+        /// <returns>The code unit, byte-swapped on big-endian hosts.</returns>
+        internal static unsafe char ReadChar(char* pChar)
+        {
+            char raw = *pChar;
+            return BitConverter.IsLittleEndian
+                ? raw
+                : (char)BinaryPrimitives.ReverseEndianness((ushort)raw);
+        }
+
         // MBCS data section:
         //
         // We treat each multibyte pattern as 2 bytes in our table.  If it's a single byte, then the high byte
@@ -136,14 +149,14 @@ namespace System.Text
                     while (bytePosition < 0x10000)
                     {
                         // Get the next byte
-                        char input = *pData;
+                        char input = ReadChar(pData);
                         pData++;
 
                         // build our table:
                         if (input == 1)
                         {
                             // Use next data as our byte position
-                            bytePosition = (int)(*pData);
+                            bytePosition = (int)ReadChar(pData);
                             pData++;
                             continue;
                         }
@@ -258,14 +271,14 @@ namespace System.Text
                         while (bytesPosition < 0x10000)
                         {
                             // Get the next byte
-                            char input = *pData;
+                            char input = ReadChar(pData);
                             pData++;
 
                             // build our table:
                             if (input == 1)
                             {
                                 // Use next data as our byte position
-                                bytesPosition = (int)(*pData);
+                                bytesPosition = (int)ReadChar(pData);
                                 pData++;
                             }
                             else if (input < 0x20 && input > 0)
@@ -286,20 +299,20 @@ namespace System.Text
                         // Now pData should be pointing to first word of bytes -> unicode best fit table
                         // (which we're also not using at the moment)
                         int iBestFitCount = 0;
-                        bytesPosition = *pData;
+                        bytesPosition = ReadChar(pData);
                         pData++;
 
                         while (bytesPosition < 0x10000)
                         {
                             // Get the next byte
-                            char input = *pData;
+                            char input = ReadChar(pData);
                             pData++;
 
                             // build our table:
                             if (input == 1)
                             {
                                 // Use next data as our byte position
-                                bytesPosition = (int)(*pData);
+                                bytesPosition = (int)ReadChar(pData);
                                 pData++;
                             }
                             else if (input < 0x20 && input > 0)
@@ -334,7 +347,7 @@ namespace System.Text
                         // Now we know how many best fits we have, so go back & read them in
                         iBestFitCount = 0;
                         pData = pBytes2Unicode;
-                        bytesPosition = *pData;
+                        bytesPosition = ReadChar(pData);
                         pData++;
                         bool bOutOfOrder = false;
 
@@ -342,14 +355,14 @@ namespace System.Text
                         while (bytesPosition < 0x10000)
                         {
                             // Get the next byte
-                            char input = *pData;
+                            char input = ReadChar(pData);
                             pData++;
 
                             // build our table:
                             if (input == 1)
                             {
                                 // Use next data as our byte position
-                                bytesPosition = (int)(*pData);
+                                bytesPosition = (int)ReadChar(pData);
                                 pData++;
                             }
                             else if (input < 0x20 && input > 0)
@@ -421,20 +434,20 @@ namespace System.Text
 
                         // Now were at beginning of Unicode -> Bytes best fit table, need to count them
                         char* pUnicode2Bytes = pData;
-                        int unicodePosition = *(pData++);
+                        int unicodePosition = ReadChar(pData++);
                         iBestFitCount = 0;
 
                         while (unicodePosition < 0x10000)
                         {
                             // Get the next byte
-                            char input = *pData;
+                            char input = ReadChar(pData);
                             pData++;
 
                             // build our table:
                             if (input == 1)
                             {
                                 // Use next data as our byte position
-                                unicodePosition = (int)*pData;
+                                unicodePosition = (int)ReadChar(pData);
                                 pData++;
                             }
                             else if (input < 0x20 && input > 0)
@@ -456,20 +469,20 @@ namespace System.Text
 
                         // Now do it again to fill the array with real values
                         pData = pUnicode2Bytes;
-                        unicodePosition = *(pData++);
+                        unicodePosition = ReadChar(pData++);
                         iBestFitCount = 0;
 
                         while (unicodePosition < 0x10000)
                         {
                             // Get the next byte
-                            char input = *pData;
+                            char input = ReadChar(pData);
                             pData++;
 
                             // build our table:
                             if (input == 1)
                             {
                                 // Use next data as our byte position
-                                unicodePosition = (int)*pData;
+                                unicodePosition = (int)ReadChar(pData);
                                 pData++;
                             }
                             else if (input < 0x20 && input > 0)
index 6bff617..cd969d3 100644 (file)
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System;
+using System.Buffers.Binary;
 using System.IO;
 using System.Diagnostics;
 using System.Text;
@@ -32,6 +33,18 @@ namespace System.Text
         {
         }
 
+        /// <summary>
+        /// Reads the two bytes at <paramref name="pByte"/> as a little-endian 16-bit value,
+        /// independent of the host's byte order.
+        /// </summary>
+        /// <param name="pByte">Pointer to the first of the two bytes to read.</param>
+        /// <returns>The value <c>pByte[0] | (pByte[1] &lt;&lt; 8)</c>.</returns>
+        internal static unsafe ushort ReadUInt16(byte* pByte)
+        {
+            // A byte* carries no alignment guarantee, so go through BinaryPrimitives instead of
+            // dereferencing it as ushort*; it also performs the endianness handling for us.
+            return BinaryPrimitives.ReadUInt16LittleEndian(new ReadOnlySpan<byte>(pByte, sizeof(ushort)));
+        }
+
         // We have a managed code page entry, so load our tables
         // SBCS data section looks like:
         //
@@ -91,16 +104,16 @@ namespace System.Text
 
                 fixed (byte* pBuffer = &buffer[0])
                 {
-                    char* pTemp = (char*)pBuffer;
                     for (int b = 0; b < 256; b++)
                     {
+                        char c = (char)ReadUInt16(pBuffer + 2 * b);
                         // Don't want to force 0's to map Unicode wrong.  0 byte == 0 unicode already taken care of
-                        if (pTemp[b] != 0 || b == 0)
+                        if (c != 0 || b == 0)
                         {
-                            mapBytesToUnicode[b] = pTemp[b];
+                            mapBytesToUnicode[b] = c;
 
-                            if (pTemp[b] != UNKNOWN_CHAR)
-                                mapUnicodeToBytes[pTemp[b]] = (byte)b;
+                            if (c != UNKNOWN_CHAR)
+                                mapUnicodeToBytes[c] = (byte)b;
                         }
                         else
                         {
@@ -162,12 +175,12 @@ namespace System.Text
 
                         // See if our words are zero
                         ushort byteTemp;
-                        while ((byteTemp = *((ushort*)pData)) != 0)
+                        while ((byteTemp = ReadUInt16(pData)) != 0)
                         {
                             Debug.Assert(arrayTemp[byteTemp] == UNKNOWN_CHAR, $"[SBCSCodePageEncoding::ReadBestFitTable] Expected unallocated byte (not 0x{(int)arrayTemp[byteTemp]:X2}) for best fit byte at 0x{byteTemp:X2} for code page {CodePage}");
                             pData += 2;
 
-                            arrayTemp[byteTemp] = *((char*)pData);
+                            arrayTemp[byteTemp] = (char)ReadUInt16(pData);
                             pData += 2;
                         }
 
@@ -184,7 +197,7 @@ namespace System.Text
 
                         // Now do the UnicodeToBytes Best Fit mapping (this is the one we normally think of when we say "best fit")
                         // pData should be pointing at the first data point for Bytes->Unicode table
-                        int unicodePosition = *((ushort*)pData);
+                        int unicodePosition = ReadUInt16(pData);
                         pData += 2;
 
                         while (unicodePosition < 0x10000)
@@ -197,7 +210,7 @@ namespace System.Text
                             if (input == 1)
                             {
                                 // Use next 2 bytes as our byte position
-                                unicodePosition = *((ushort*)pData);
+                                unicodePosition = ReadUInt16(pData);
                                 pData += 2;
                             }
                             else if (input < 0x20 && input > 0 && input != 0x1e)
@@ -222,7 +235,7 @@ namespace System.Text
                         // Now actually read in the data
                         // reset pData should be pointing at the first data point for Bytes->Unicode table
                         pData = pUnicodeToSBCS;
-                        unicodePosition = *((ushort*)pData);
+                        unicodePosition = ReadUInt16(pData);
                         pData += 2;
                         iBestFitCount = 0;
 
@@ -236,7 +249,7 @@ namespace System.Text
                             if (input == 1)
                             {
                                 // Use next 2 bytes as our byte position
-                                unicodePosition = *((ushort*)pData);
+                                unicodePosition = ReadUInt16(pData);
                                 pData += 2;
                             }
                             else if (input < 0x20 && input > 0 && input != 0x1e)