src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 using System.Diagnostics;
   6 using System.Numerics;
   7 using System.Runtime.CompilerServices;
   8 using System.Runtime.Intrinsics;
   9 using System.Runtime.Intrinsics.X86;
  10 using Internal.Runtime.CompilerServices;
  11
  12 #if BIT64
  13 using nint = System.Int64;
  14 using nuint = System.UInt64;
  15 #else // BIT64
  16 using nint = System.Int32;
  17 using nuint = System.UInt32;
  18 #endif // BIT64
  19
  20 namespace System.Text
  21 {
  22     internal static partial class ASCIIUtility
  23     {
  24 #if DEBUG
  25         static ASCIIUtility()
  26         {
  27             Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
  28             Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
  29         }
  30 #endif // DEBUG
  31
  32         [MethodImpl(MethodImplOptions.AggressiveInlining)]
  33         private static bool AllBytesInUInt64AreAscii(ulong value)
  34         {
  35             // If the high bit of any byte is set, that byte is non-ASCII.
  36
  37             return ((value & UInt64HighBitsOnlyMask) == 0);
  38         }
  39
  40         /// <summary>
  41         /// Returns <see langword="true"/> iff all chars in <paramref name="value"/> are ASCII.
  42         /// </summary>
  43         [MethodImpl(MethodImplOptions.AggressiveInlining)]
  44         private static bool AllCharsInUInt32AreAscii(uint value)
  45         {
  46             return ((value & ~0x007F007Fu) == 0);
  47         }
  48
  49         /// <summary>
  50         /// Returns <see langword="true"/> iff all chars in <paramref name="value"/> are ASCII.
  51         /// </summary>
  52         [MethodImpl(MethodImplOptions.AggressiveInlining)]
  53         private static bool AllCharsInUInt64AreAscii(ulong value)
  54         {
  55             return ((value & ~0x007F007F_007F007Ful) == 0);
  56         }
  57
  58         /// <summary>
  59         /// Given a DWORD which represents two packed chars in machine-endian order,
  60         /// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
  61         /// </summary>
  62         /// <param name="value"></param>
  63         /// <returns></returns>
  64         private static bool FirstCharInUInt32IsAscii(uint value)
  65         {
  66             return (BitConverter.IsLittleEndian && (value & 0xFF80u) == 0)
  67                 || (!BitConverter.IsLittleEndian && (value & 0xFF800000u) == 0);
  68         }
  69
  70         /// <summary>
  71         /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII byte is found.
  72         /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
  73         /// </summary>
  74         /// <returns>An ASCII byte is defined as 0x00 - 0x7F, inclusive.</returns>
  75         [MethodImpl(MethodImplOptions.AggressiveInlining)]
  76         public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
  77         {
  78             // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
  79             // code below. This has two benefits: (a) we can take advantage of specific instructions like
  80             // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
  81             // this method is running.
  82
  83             return (Sse2.IsSupported)
  84                 ? GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength)
  85                 : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
  86         }
  87
  88         private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
  89         {
  90             // Squirrel away the original buffer reference. This method works by determining the exact
  91             // byte reference where non-ASCII data begins, so we need this base value to perform the
  92             // final subtraction at the end of the method to get the index into the original buffer.
  93
  94             byte* pOriginalBuffer = pBuffer;
  95
  96             // Before we drain off byte-by-byte, try a generic vectorized loop.
  97             // Only run the loop if we have at least two vectors we can pull out.
  98             // Note use of SBYTE instead of BYTE below; we're using the two's-complement
  99             // representation of negative integers to act as a surrogate for "is ASCII?".
 100
 101             if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<sbyte>.Count)
 102             {
 103                 uint SizeOfVectorInBytes = (uint)Vector<sbyte>.Count; // JIT will make this a const
 104
 105                 if (Vector.GreaterThanOrEqualAll(Unsafe.ReadUnaligned<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
 106                 {
 107                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
 108                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
 109                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
 110
 111                     byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInBytes;
 112                     pBuffer = (byte*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));
 113
 114 #if DEBUG
 115                     long numBytesRead = pBuffer - pOriginalBuffer;
 116                     Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVectorInBytes, "We should've made forward progress of at least one byte.");
 117                     Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
 118 #endif
 119
 120                     Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
 121
 122                     do
 123                     {
 124                         Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned.");
 125                         if (Vector.LessThanAny(Unsafe.Read<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
 126                         {
 127                             break; // found non-ASCII data
 128                         }
 129
 130                         pBuffer += SizeOfVectorInBytes;
 131                     } while (pBuffer <= pFinalVectorReadPos);
 132
 133                     // Adjust the remaining buffer length for the number of elements we just consumed.
 134
 135                     bufferLength -= (nuint)pBuffer;
 136                     bufferLength += (nuint)pOriginalBuffer;
 137                 }
 138             }
 139
 140             // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
 141             // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
 142             // path to drain any remaining ASCII bytes.
 143             //
 144             // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
 145             // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes.
 146
 147             uint currentUInt32;
 148
 149             // Try reading 64 bits at a time in a loop.
 150
 151             for (; bufferLength >= 8; bufferLength -= 8)
 152             {
 153                 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
 154                 uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
 155
 156                 if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32))
 157                 {
 158                     // One of these two values contains non-ASCII bytes.
 159                     // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes.
 160
 161                     if (AllBytesInUInt32AreAscii(currentUInt32))
 162                     {
 163                         currentUInt32 = nextUInt32;
 164                         pBuffer += 4;
 165                     }
 166
 167                     goto FoundNonAsciiData;
 168                 }
 169
 170                 pBuffer += 8; // consumed 8 ASCII bytes
 171             }
 172
 173             // From this point forward we don't need to update bufferLength.
 174             // Try reading 32 bits.
 175
 176             if ((bufferLength & 4) != 0)
 177             {
 178                 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
 179                 if (!AllBytesInUInt32AreAscii(currentUInt32))
 180                 {
 181                     goto FoundNonAsciiData;
 182                 }
 183
 184                 pBuffer += 4;
 185             }
 186
 187             // Try reading 16 bits.
 188
 189             if ((bufferLength & 2) != 0)
 190             {
 191                 currentUInt32 = Unsafe.ReadUnaligned<ushort>(pBuffer);
 192                 if (!AllBytesInUInt32AreAscii(currentUInt32))
 193                 {
 194                     goto FoundNonAsciiData;
 195                 }
 196
 197                 pBuffer += 2;
 198             }
 199
 200             // Try reading 8 bits
 201
 202             if ((bufferLength & 1) != 0)
 203             {
 204                 // If the buffer contains non-ASCII data, the comparison below will fail, and
 205                 // we'll end up not incrementing the buffer reference.
 206
 207                 if (*(sbyte*)pBuffer >= 0)
 208                 {
 209                     pBuffer++;
 210                 }
 211             }
 212
 213         Finish:
 214
 215             nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
 216             return totalNumBytesRead;
 217
 218         FoundNonAsciiData:
 219
 220             Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
 221
 222             // The method being called doesn't bother looking at whether the high byte is ASCII. There are only
 223             // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before
 224             // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be
 225             // non-ASCII. In both cases we only care about the low 24 bits.
 226
 227             pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
 228             goto Finish;
 229         }
 230
  231         private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
  232         {
                  // SSE2 implementation of the "index of first non-ASCII byte" search.
                  // pBuffer is advanced only past bytes known to be ASCII; the value returned at the
                  // 'Finish' label is the distance (in bytes) between pBuffer and the original pointer.
                  // The caller is expected to have checked Sse2.IsSupported (asserted below).

  233             // JIT turns the below into constants
  234
  235             uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
  236             nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
  237
  238             Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
  239             Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
  240
  241             uint currentMask, secondMask;
  242             byte* pOriginalBuffer = pBuffer;
  243
  244             // This method is written such that control generally flows top-to-bottom, avoiding
  245             // jumps as much as possible in the optimistic case of a large enough buffer and
  246             // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets
  247             // after all the main logic.
  248
  249             if (bufferLength < SizeOfVector128)
  250             {
  251                 goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
  252             }
  253
  254             // Read the first vector unaligned.
  255             // MoveMask yields one bit per byte (the byte's high bit), so a non-zero mask means non-ASCII data.
  256             currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
  257
  258             if (currentMask != 0)
  259             {
  260                 goto FoundNonAsciiDataInCurrentMask;
  261             }
  262
  263             // If we have less than 32 bytes to process, just go straight to the final unaligned
  264             // read. There's no need to mess with the loop logic in the middle of this method.
  265
  266             if (bufferLength < 2 * SizeOfVector128)
  267             {
  268                 goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
  269             }
  270
  271             // Now adjust the read pointer so that future reads are aligned.
  272             // This rounds up to the next vector boundary, advancing by between 1 and SizeOfVector128 bytes.
  273             pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128);
  274
  275 #if DEBUG
  276             long numBytesRead = pBuffer - pOriginalBuffer;
  277             Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte.");
  278             Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
  279 #endif
  280
  281             // Adjust the remaining length to account for what we just read.
  282             // (Equivalent to: bufferLength -= pBuffer - pOriginalBuffer.)
  283             bufferLength += (nuint)pOriginalBuffer;
  284             bufferLength -= (nuint)pBuffer;
  285
  286             // The buffer is now properly aligned.
  287             // Read 2 vectors at a time if possible.
  288
  289             if (bufferLength >= 2 * SizeOfVector128)
  290             {
  291                 byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128);
  292
  293                 // After this point, we no longer need to update the bufferLength value.
  294
  295                 do
  296                 {
  297                     Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
  298                     Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
  299
  300                     currentMask = (uint)Sse2.MoveMask(firstVector);
  301                     secondMask = (uint)Sse2.MoveMask(secondVector);
  302
  303                     if ((currentMask | secondMask) != 0)
  304                     {
  305                         goto FoundNonAsciiDataInInnerLoop;
  306                     }
  307
  308                     pBuffer += 2 * SizeOfVector128;
  309                 } while (pBuffer <= pFinalVectorReadPos);
  310             }
  311
  312             // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
  313             // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
  314             // But we _can_ rely on it to tell us how much remaining data must be drained by looking
  315             // at what bits of it are set. This works because had we updated it within the loop above,
  316             // we would've been subtracting 2 * SizeOfVector128 on each iteration, but we only care about
  317             // bits which are less significant than those that the subtraction would've acted on.
  318
  319             // If there is fewer than one vector length remaining, skip the next aligned read.
  320
  321             if ((bufferLength & SizeOfVector128) == 0)
  322             {
  323                 goto DoFinalUnalignedVectorRead;
  324             }
  325
  326             // At least one full vector's worth of data remains, so we can safely read it.
  327             // Remember, at this point pBuffer is still aligned.
  328
  329             currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
  330             if (currentMask != 0)
  331             {
  332                 goto FoundNonAsciiDataInCurrentMask;
  333             }
  334
  335         IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
  336
  337             pBuffer += SizeOfVector128;
  338
  339         DoFinalUnalignedVectorRead:
  340             // Truncating bufferLength to a byte is harmless here: only the bits covered by the mask are kept.
  341             if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0)
  342             {
  343                 // Perform an unaligned read of the last vector.
  344                 // We need to adjust the pointer because we're re-reading data.
  345                 // The unsigned subtraction below wraps, which moves pBuffer backward by (SizeOfVector128 - remainder).
  346                 pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;
  347
  348                 currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
  349                 if (currentMask != 0)
  350                 {
  351                     goto FoundNonAsciiDataInCurrentMask;
  352                 }
  353
  354                 pBuffer += SizeOfVector128;
  355             }
  356
  357         Finish:
  358
  359             return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!
  360
  361         FoundNonAsciiDataInInnerLoop:
  362
  363             // If the current (first) mask isn't the mask that contains non-ASCII data, then it must
  364             // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
  365             // from the second mask.
  366
  367             if (currentMask == 0)
  368             {
  369                 pBuffer += SizeOfVector128;
  370                 currentMask = secondMask;
  371             }
  372
  373         FoundNonAsciiDataInCurrentMask:
  374
  375             // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
  376             // Counting the trailing zero bits therefore gives the number of leading ASCII bytes in the vector
  377             // that was just read.
  378
  379             Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
  380             pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask);
  381
  382             goto Finish;
  383
  384         FoundNonAsciiDataInCurrentDWord:
  385             // currentDWord is declared here but is always assigned by the drain code below before it jumps to this label.
  386             uint currentDWord;
  387             Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
  388             pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
  389
  390             goto Finish;
  391
  392         InputBufferLessThanOneVectorInLength:
  393
  394             // These code paths get hit if the original input length was less than one vector in size.
  395             // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
  396             // directly. Note that all of these reads are unaligned.
  397
  398             Debug.Assert(bufferLength < SizeOfVector128);
  399
  400             // QWORD drain
  401             // (bufferLength is below 16 here, so its low four bits select the 8-, 4-, 2- and 1-byte reads.)
  402             if ((bufferLength & 8) != 0)
  403             {
  404                 if (Bmi1.X64.IsSupported)
  405                 {
  406                     // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.
  407
  408                     ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
  409                     if (!AllBytesInUInt64AreAscii(candidateUInt64))
  410                     {
  411                         // Clear everything but the high bit of each byte, then tzcnt.
  412                         // Remember the / 8 at the end to convert bit count to byte count.
  413
  414                         candidateUInt64 &= UInt64HighBitsOnlyMask;
  415                         pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
  416                         goto Finish;
  417                     }
  418                 }
  419                 else
  420                 {
  421                     // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
  422
  423                     currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
  424                     uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
  425
  426                     if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord))
  427                     {
  428                         // At least one of the values wasn't all-ASCII.
  429                         // We need to figure out which one it was and stick it in the currentDWord local.
  430
  431                         if (AllBytesInUInt32AreAscii(currentDWord))
  432                         {
  433                             currentDWord = nextDWord; // this one is the culprit
  434                             pBuffer += 4;
  435                         }
  436
  437                         goto FoundNonAsciiDataInCurrentDWord;
  438                     }
  439                 }
  440
  441                 pBuffer += 8; // successfully consumed 8 ASCII bytes
  442             }
  443
  444             // DWORD drain
  445
  446             if ((bufferLength & 4) != 0)
  447             {
  448                 currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
  449
  450                 if (!AllBytesInUInt32AreAscii(currentDWord))
  451                 {
  452                     goto FoundNonAsciiDataInCurrentDWord;
  453                 }
  454
  455                 pBuffer += 4; // successfully consumed 4 ASCII bytes
  456             }
  457
  458             // WORD drain
  459             // (We movzx to a DWORD for ease of manipulation.)
  460
  461             if ((bufferLength & 2) != 0)
  462             {
  463                 currentDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);
  464
  465                 if (!AllBytesInUInt32AreAscii(currentDWord))
  466                 {
  467                     // We only care about the 0x0080 bit of the value. If it's not set, then we
  468                     // advance pBuffer by 1. If it's set, we don't advance it at all.
  469                     // (The sign-extended low byte shifted right by 7 is -1 when that bit is set and 0 otherwise.)
  470                     pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1;
  471                     goto Finish;
  472                 }
  473
  474                 pBuffer += 2; // successfully consumed 2 ASCII bytes
  475             }
  476
  477             // BYTE drain
  478
  479             if ((bufferLength & 1) != 0)
  480             {
  481                 // sbyte has non-negative value if byte is ASCII.
  482
  483                 if (*(sbyte*)(pBuffer) >= 0)
  484                 {
  485                     pBuffer++; // successfully consumed a single byte
  486                 }
  487             }
  488
  489             goto Finish;
  490         }
 491
 492         /// <summary>
 493         /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII char is found.
 494         /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
 495         /// </summary>
 496         /// <returns>An ASCII char is defined as 0x0000 - 0x007F, inclusive.</returns>
 497         [MethodImpl(MethodImplOptions.AggressiveInlining)]
 498         public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */)
 499         {
 500             // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
 501             // code below. This has two benefits: (a) we can take advantage of specific instructions like
 502             // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
 503             // this method is running.
 504
 505             return (Sse2.IsSupported)
 506                 ? GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength)
 507                 : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
 508         }
 509
 510         private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */)
 511         {
 512             // Squirrel away the original buffer reference.This method works by determining the exact
 513             // char reference where non-ASCII data begins, so we need this base value to perform the
 514             // final subtraction at the end of the method to get the index into the original buffer.
 515
 516             char* pOriginalBuffer = pBuffer;
 517
 518             Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
 519
 520             // Before we drain off char-by-char, try a generic vectorized loop.
 521             // Only run the loop if we have at least two vectors we can pull out.
 522
 523             if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<ushort>.Count)
 524             {
 525                 uint SizeOfVectorInChars = (uint)Vector<ushort>.Count; // JIT will make this a const
 526                 uint SizeOfVectorInBytes = (uint)Vector<byte>.Count; // JIT will make this a const
 527
 528                 Vector<ushort> maxAscii = new Vector<ushort>(0x007F);
 529
 530                 if (Vector.LessThanOrEqualAll(Unsafe.ReadUnaligned<Vector<ushort>>(pBuffer), maxAscii))
 531                 {
 532                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
 533                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
 534                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
 535
 536                     char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInChars;
 537                     pBuffer = (char*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));
 538
 539 #if DEBUG
 540                     long numCharsRead = pBuffer - pOriginalBuffer;
 541                     Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVectorInChars, "We should've made forward progress of at least one char.");
 542                     Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
 543 #endif
 544
 545                     Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
 546
 547                     do
 548                     {
 549                         Debug.Assert((nuint)pBuffer % SizeOfVectorInChars == 0, "Vector read should be aligned.");
 550                         if (Vector.GreaterThanAny(Unsafe.Read<Vector<ushort>>(pBuffer), maxAscii))
 551                         {
 552                             break; // found non-ASCII data
 553                         }
 554                         pBuffer += SizeOfVectorInChars;
 555                     } while (pBuffer <= pFinalVectorReadPos);
 556
 557                     // Adjust the remaining buffer length for the number of elements we just consumed.
 558
 559                     bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
 560                 }
 561             }
 562
 563             // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
 564             // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
 565             // path to drain any remaining ASCII chars.
 566             //
 567             // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
 568             // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars.
 569
 570             uint currentUInt32;
 571
 572             // Try reading 64 bits at a time in a loop.
 573
 574             for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars
 575             {
 576                 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
 577                 uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
 578
 579                 if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32))
 580                 {
 581                     // One of these two values contains non-ASCII chars.
 582                     // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars.
 583
 584                     if (AllCharsInUInt32AreAscii(currentUInt32))
 585                     {
 586                         currentUInt32 = nextUInt32;
 587                         pBuffer += 2;
 588                     }
 589
 590                     goto FoundNonAsciiData;
 591                 }
 592
 593                 pBuffer += 4; // consumed 4 ASCII chars
 594             }
 595
 596             // From this point forward we don't need to keep track of the remaining buffer length.
 597             // Try reading 32 bits.
 598
 599             if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars
 600             {
 601                 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
 602                 if (!AllCharsInUInt32AreAscii(currentUInt32))
 603                 {
 604                     goto FoundNonAsciiData;
 605                 }
 606
 607                 pBuffer += 2;
 608             }
 609
 610             // Try reading 16 bits.
 611             // No need to try an 8-bit read after this since we're working with chars.
 612
 613             if ((bufferLength & 1) != 0)
 614             {
 615                 // If the buffer contains non-ASCII data, the comparison below will fail, and
 616                 // we'll end up not incrementing the buffer reference.
 617
 618                 if (*pBuffer <= 0x007F)
 619                 {
 620                     pBuffer++;
 621                 }
 622             }
 623
 624         Finish:
 625
 626             nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
 627             Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars.");
 628             return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning
 629
 630         FoundNonAsciiData:
 631
 632             Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
 633
 634             // We don't bother looking at the second char - only the first char.
 635
 636             if (FirstCharInUInt32IsAscii(currentUInt32))
 637             {
 638                 pBuffer++;
 639             }
 640
 641             goto Finish;
 642         }
 643
 644         private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
 645         {
                 // Scans the UTF-16 buffer and returns the index (in chars) of the first element whose value
                 // is >= 0x0080. The result is computed at the Finish label as the distance pBuffer has
                 // advanced from pOriginalBuffer, so an all-ASCII buffer yields bufferLength.
                 //
 646             // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
 647             // will be elided by JIT once we determine which specific ISAs we support.
 648
 649             // Quick check for empty inputs.
 650
 651             if (bufferLength == 0)
 652             {
 653                 return 0;
 654             }
 655
 656             // JIT turns the below into constants
 657
 658             uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
 659             uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
 660
 661             Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
 662             Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
 663
 664             Vector128<short> firstVector, secondVector;
 665             uint currentMask;
 666             char* pOriginalBuffer = pBuffer;
 667
 668             if (bufferLength < SizeOfVector128InChars)
 669             {
 670                 goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
 671             }
 672
 673             // This method is written such that control generally flows top-to-bottom, avoiding
 674             // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
 675             // data, we jump out of the hot paths to targets at the end of the method.
 676
 677             Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
 678             Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
 679             Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
 680             Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
 681
 682             Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
 683
 684             // Read the first vector unaligned.
 685
 686             firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
 687
 688             if (Sse41.IsSupported)
 689             {
 690                 // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
 691                 // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
 692                 // in order to extract the mask.
 693                 currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
 694             }
 695             else
 696             {
 697                 // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
 698                 // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
 699                 // the mask.
                     // (PCMPGTW compares signed WORDs. XORing with 0x8000 flips each sign bit, which makes the signed
                     // comparison against 0x807F equivalent to an unsigned comparison against 0x007F.)
 700                 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
 701             }
 702
 703             if (currentMask != 0)
 704             {
 705                 goto FoundNonAsciiDataInCurrentMask;
 706             }
 707
 708             // If we have less than 32 bytes to process, just go straight to the final unaligned
 709             // read. There's no need to mess with the loop logic in the middle of this method.
 710
 711             // Adjust the remaining length to account for what we just read.
 712             // For the remainder of this code path, bufferLength will be in bytes, not chars.
 713
 714             bufferLength <<= 1; // chars to bytes
 715
 716             if (bufferLength < 2 * SizeOfVector128InBytes)
 717             {
 718                 goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
 719             }
 720
 721             // Now adjust the read pointer so that future reads are aligned.
                 // The expression rounds up to the next vector-size boundary strictly above pBuffer, so the
                 // pointer always advances; everything it skips was already validated by the unaligned read above.
 722
 723             pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
 724
 725 #if DEBUG
 726             long numCharsRead = pBuffer - pOriginalBuffer;
 727             Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
 728             Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
 729 #endif
 730
 731             // Adjust remaining buffer length.
                 // (Equivalent to subtracting the number of bytes skipped by the alignment step.)
 732
 733             bufferLength += (nuint)pOriginalBuffer;
 734             bufferLength -= (nuint)pBuffer;
 735
 736             // The buffer is now properly aligned.
 737             // Read 2 vectors at a time if possible.
 738
 739             if (bufferLength >= 2 * SizeOfVector128InBytes)
 740             {
                     // Last address from which two full vectors can still be read without passing the end of the data.
 741                 char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
 742
 743                 // After this point, we no longer need to update the bufferLength value.
 744
 745                 do
 746                 {
 747                     firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
 748                     secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
 749                     Vector128<short> combinedVector = Sse2.Or(firstVector, secondVector);
 750
 751                     if (Sse41.IsSupported)
 752                     {
 753                         // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
 754                         // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
 755                         if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
 756                         {
 757                             goto FoundNonAsciiDataInFirstOrSecondVector;
 758                         }
 759                     }
 760                     else
 761                     {
 762                         // See comment earlier in the method for an explanation of how the below logic works.
 763                         if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
 764                         {
 765                             goto FoundNonAsciiDataInFirstOrSecondVector;
 766                         }
 767                     }
 768
 769                     pBuffer += 2 * SizeOfVector128InChars;
 770                 } while (pBuffer <= pFinalVectorReadPos);
 771             }
 772
 773             // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
 774             // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
 775             // But we _can_ rely on it to tell us how much remaining data must be drained by looking
 776             // at what bits of it are set. This works because had we updated it within the loop above,
 777             // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
 778             // bits which are less significant than those that the addition would've acted on.
 779
 780             // If there is fewer than one vector length remaining, skip the next aligned read.
 781             // Remember, at this point bufferLength is measured in bytes, not chars.
 782
 783             if ((bufferLength & SizeOfVector128InBytes) == 0)
 784             {
 785                 goto DoFinalUnalignedVectorRead;
 786             }
 787
 788             // At least one full vector's worth of data remains, so we can safely read it.
 789             // Remember, at this point pBuffer is still aligned.
 790
 791             firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
 792
 793             if (Sse41.IsSupported)
 794             {
 795                 // If a non-ASCII bit is set in any WORD of this single vector, we have seen non-ASCII data.
 796                 // Jump to the non-ASCII handler to compute the mask that locates the offending char.
 797                 if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
 798                 {
 799                     goto FoundNonAsciiDataInFirstVector;
 800                 }
 801             }
 802             else
 803             {
 804                 // See comment earlier in the method for an explanation of how the below logic works.
 805                 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
 806                 if (currentMask != 0)
 807                 {
 808                     goto FoundNonAsciiDataInCurrentMask;
 809                 }
 810             }
 811
 812         IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
 813
 814             pBuffer += SizeOfVector128InChars;
 815
 816         DoFinalUnalignedVectorRead:
 817
                 // Only the bits below SizeOfVector128InBytes are examined, so truncating to a byte first loses
                 // nothing. A zero remainder means there is no partial vector left over.
 818             if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
 819             {
 820                 // Perform an unaligned read of the last vector.
 821                 // We need to adjust the pointer because we're re-reading data.
                     // The pointer is moved so that the load ends exactly where the remaining data ends; the
                     // leading part of the load therefore overlaps data that has already been checked.
 822
 823                 pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
 824                 firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
 825
 826                 if (Sse41.IsSupported)
 827                 {
 828                     // If a non-ASCII bit is set in any WORD of this single vector, we have seen non-ASCII data.
 829                     // Jump to the non-ASCII handler to compute the mask that locates the offending char.
 830                     if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
 831                     {
 832                         goto FoundNonAsciiDataInFirstVector;
 833                     }
 834                 }
 835                 else
 836                 {
 837                     // See comment earlier in the method for an explanation of how the below logic works.
 838                     currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
 839                     if (currentMask != 0)
 840                     {
 841                         goto FoundNonAsciiDataInCurrentMask;
 842                     }
 843                 }
 844
 845                 pBuffer += SizeOfVector128InChars;
 846             }
 847
 848         Finish:
 849
                 // Every path ends here; the answer is simply how far pBuffer has advanced from the start.
 850             Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
 851             return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)
 852
 853         FoundNonAsciiDataInFirstOrSecondVector:
 854
 855             // We don't know if the first or the second vector contains non-ASCII data. Check the first
 856             // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
 857             // we'll make sure the first vector local is the one that contains the non-ASCII data.
 858
 859             // See comment earlier in the method for an explanation of how the below logic works.
 860             if (Sse41.IsSupported)
 861             {
 862                 if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
 863                 {
 864                     goto FoundNonAsciiDataInFirstVector;
 865                 }
 866             }
 867             else
 868             {
 869                 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
 870                 if (currentMask != 0)
 871                 {
 872                     goto FoundNonAsciiDataInCurrentMask;
 873                 }
 874             }
 875
 876             // Wasn't the first vector; must be the second.
 877
 878             pBuffer += SizeOfVector128InChars;
 879             firstVector = secondVector;
 880
 881         FoundNonAsciiDataInFirstVector:
 882
 883             // See comment earlier in the method for an explanation of how the below logic works.
 884             if (Sse41.IsSupported)
 885             {
 886                 currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
 887             }
 888             else
 889             {
 890                 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
 891             }
 892
 893         FoundNonAsciiDataInCurrentMask:
 894
 895             // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
 896             // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
 897             // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
 898             // masks work on BYTE elements, and we account for this in the final fixup.)
 899
 900             Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
                 // The trailing-zero count is a BYTE offset into the vector, hence the byte* arithmetic.
 901             pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));
 902
 903             goto Finish;
 904
 905         FoundNonAsciiDataInCurrentDWord:
 906
                 // Reached only from the scalar drains below, which assign currentDWord before jumping here.
 907             uint currentDWord;
 908             Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
 909
 910             if (FirstCharInUInt32IsAscii(currentDWord))
 911             {
 912                 pBuffer++; // skip past the ASCII char
 913             }
 914
 915             goto Finish;
 916
 917         InputBufferLessThanOneVectorInLength:
 918
 919             // These code paths get hit if the original input length was less than one vector in size.
 920             // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
 921             // directly. Note that all of these reads are unaligned.
 922
 923             // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
 924             // We skipped the code path that multiplied the count by sizeof(char).
 925
 926             Debug.Assert(bufferLength < SizeOfVector128InChars);
 927
 928             // QWORD drain
                 // bufferLength is smaller than one vector's worth of chars here, so its low bits (4, 2, 1)
                 // state exactly which of the 4-char, 2-char and 1-char drains below must run.
 929
 930             if ((bufferLength & 4) != 0)
 931             {
 932                 if (Bmi1.X64.IsSupported)
 933                 {
 934                     // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
 935
 936                     ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
 937                     if (!AllCharsInUInt64AreAscii(candidateUInt64))
 938                     {
 939                         // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
 940                         // Remember the / 8 at the end to convert bit count to byte count,
 941                         // then the & ~1 at the end to treat a match in the high byte of
 942                         // any char the same as a match in the low byte of that same char.
 943
 944                         candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
 945                         pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
 946                         goto Finish;
 947                     }
 948                 }
 949                 else
 950                 {
 951                     // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
 952
 953                     currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
 954                     uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
 955
 956                     if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
 957                     {
 958                         // At least one of the values wasn't all-ASCII.
 959                         // We need to figure out which one it was and stick it in the currentDWord local.
 960
 961                         if (AllCharsInUInt32AreAscii(currentDWord))
 962                         {
 963                             currentDWord = nextDWord; // this one is the culprit
 964                             pBuffer += 4 / sizeof(char);
 965                         }
 966
 967                         goto FoundNonAsciiDataInCurrentDWord;
 968                     }
 969                 }
 970
 971                 pBuffer += 4; // successfully consumed 4 ASCII chars
 972             }
 973
 974             // DWORD drain
 975
 976             if ((bufferLength & 2) != 0)
 977             {
 978                 currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
 979
 980                 if (!AllCharsInUInt32AreAscii(currentDWord))
 981                 {
 982                     goto FoundNonAsciiDataInCurrentDWord;
 983                 }
 984
 985                 pBuffer += 2; // successfully consumed 2 ASCII chars
 986             }
 987
 988             // WORD drain
 989             // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
 990
 991             if ((bufferLength & 1) != 0)
 992             {
 993                 if (*pBuffer <= 0x007F)
 994                 {
 995                     pBuffer++; // successfully consumed a single char
 996                 }
 997             }
 998
 999             goto Finish;
1000         }
1001
1002         /// <summary>
1003         /// Takes a QWORD holding four ASCII chars in machine-endian order, narrows every WORD to a BYTE,
1004         /// and stores the resulting four bytes at <paramref name="outputBuffer"/>, also in machine-endian order.
1005         /// </summary>
1006         [MethodImpl(MethodImplOptions.AggressiveInlining)]
1007         private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
1008         {
1009             Debug.Assert(AllCharsInUInt64AreAscii(value));
1010
1011             if (Bmi2.X64.IsSupported)
1012             {
1013                 // PEXT gathers the low byte of every WORD; BMI2 works regardless of the processor's endianness.
1014                 uint packedBytes = (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul);
1015                 Unsafe.WriteUnaligned(ref outputBuffer, packedBytes);
1016                 return;
1017             }
1018
1019             // Scalar fallback: isolate the low byte of each 16-bit lane (lane 0 is the least significant).
1020             byte lane0 = (byte)value;
1021             byte lane1 = (byte)(value >> 16);
1022             byte lane2 = (byte)(value >> 32);
1023             byte lane3 = (byte)(value >> 48);
1024
1025             // Little-endian stores lane 0 first; big-endian stores the lanes in the opposite order.
1026             if (BitConverter.IsLittleEndian)
1027             {
1028                 outputBuffer = lane0;
1029                 Unsafe.Add(ref outputBuffer, 1) = lane1;
1030                 Unsafe.Add(ref outputBuffer, 2) = lane2;
1031                 Unsafe.Add(ref outputBuffer, 3) = lane3;
1032             }
1033             else
1034             {
1035                 outputBuffer = lane3;
1036                 Unsafe.Add(ref outputBuffer, 1) = lane2;
1037                 Unsafe.Add(ref outputBuffer, 2) = lane1;
1038                 Unsafe.Add(ref outputBuffer, 3) = lane0;
1039             }
1040         }
1041
1042         /// <summary>
1043         /// Takes a DWORD holding two ASCII chars in machine-endian order, narrows each WORD to a
1044         /// BYTE, and stores the resulting two bytes at <paramref name="outputBuffer"/>, also in
1045         /// machine-endian order.
1046         /// </summary>
1047         [MethodImpl(MethodImplOptions.AggressiveInlining)]
1048         private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, uint value)
1049         {
1050             Debug.Assert(AllCharsInUInt32AreAscii(value));
1051
1052             // The caller guarantees ASCII input (see the assert), so the low byte of each 16-bit
1053             // lane is the entire payload and the upper byte of each lane can be dropped.
1054             byte lowLane = (byte)value;
1055             byte highLane = (byte)(value >> 16);
1056
1057             // Little-endian stores the low lane first; big-endian stores the high lane first.
1058             bool lowLaneGoesFirst = BitConverter.IsLittleEndian;
1059
1060             outputBuffer = lowLaneGoesFirst ? lowLane : highLane;
1061             Unsafe.Add(ref outputBuffer, 1) = lowLaneGoesFirst ? highLane : lowLane;
1062         }
1063
1064         /// <summary>
1065         /// Copies as many ASCII characters (U+0000..U+007F) as possible from <paramref name="pUtf16Buffer"/>
1066         /// to <paramref name="pAsciiBuffer"/>, stopping when the first non-ASCII character is encountered
1067         /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
1068         /// of elements that were able to be converted.
1069         /// </summary>
             /// <param name="pUtf16Buffer">Source buffer of UTF-16 code units to read.</param>
             /// <param name="pAsciiBuffer">Destination buffer that receives one byte per converted element.</param>
             /// <param name="elementCount">Maximum number of elements to convert.</param>
             /// <returns>The number of elements that were narrowed and written to <paramref name="pAsciiBuffer"/>.</returns>
1070         public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
1071         {
1072             nuint currentOffset = 0;
1073
                 // Naming note: "High" holds the DWORD read from the lower address (the first two chars of a
                 // four-char group) and "Low" holds the DWORD that immediately follows it.
1074             uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
1075             ulong utf16Data64Bits = 0;
1076
1077             // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
1078             // code below. This has two benefits: (a) we can take advantage of specific instructions like
1079             // pmovmskb, ptest, vpminuw which we know are optimized, and (b) we can avoid downclocking the
1080             // processor while this method is running.
1081
1082             if (Sse2.IsSupported)
1083             {
1084                 Debug.Assert(BitConverter.IsLittleEndian, "Assume little endian if SSE2 is supported.");
1085
1086                 if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
1087                 {
1088                     // Since there's overhead to setting up the vectorized code path, we only want to
1089                     // call into it after a quick probe to ensure the next immediate characters really are ASCII.
1090                     // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
1091
1092                     if (IntPtr.Size >= 8)
1093                     {
1094                         utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
1095                         if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
1096                         {
1097                             goto FoundNonAsciiDataIn64BitRead;
1098                         }
1099                     }
1100                     else
1101                     {
1102                         utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
1103                         utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
1104                         if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
1105                         {
1106                             goto FoundNonAsciiDataIn64BitRead;
1107                         }
1108                     }
1109
                         // The helper's return value becomes the starting offset for the scalar drains below,
                         // which finish whatever the vectorized routine left unprocessed.
1110                     currentOffset = NarrowUtf16ToAscii_Sse2(pUtf16Buffer, pAsciiBuffer, elementCount);
1111                 }
1112             }
1113             else if (Vector.IsHardwareAccelerated)
1114             {
1115                 uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const
1116
1117                 // Only bother vectorizing if we have enough data to do so.
1118                 if (elementCount >= 2 * SizeOfVector)
1119                 {
1120                     // Since there's overhead to setting up the vectorized code path, we only want to
1121                     // call into it after a quick probe to ensure the next immediate characters really are ASCII.
1122                     // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
1123
1124                     if (IntPtr.Size >= 8)
1125                     {
1126                         utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
1127                         if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
1128                         {
1129                             goto FoundNonAsciiDataIn64BitRead;
1130                         }
1131                     }
1132                     else
1133                     {
1134                         utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
1135                         utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
1136                         if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
1137                         {
1138                             goto FoundNonAsciiDataIn64BitRead;
1139                         }
1140                     }
1141
1142                     Vector<ushort> maxAscii = new Vector<ushort>(0x007F);
1143
                         // Each iteration reads two Vector<ushort> values (SizeOfVector chars in total) and
                         // writes a single Vector<byte>, so currentOffset advances by SizeOfVector elements.
1144                     nuint finalOffsetWhereCanLoop = elementCount - 2 * SizeOfVector;
1145                     do
1146                     {
1147                         Vector<ushort> utf16VectorHigh = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset);
1148                         Vector<ushort> utf16VectorLow = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count);
1149
1150                         if (Vector.GreaterThanAny(Vector.BitwiseOr(utf16VectorHigh, utf16VectorLow), maxAscii))
1151                         {
1152                             break; // found non-ASCII data
1153                         }
1154
1155                         // TODO: Is the below logic also valid for big-endian platforms?
1156                         Vector<byte> asciiVector = Vector.Narrow(utf16VectorHigh, utf16VectorLow);
1157                         Unsafe.WriteUnaligned<Vector<byte>>(pAsciiBuffer + currentOffset, asciiVector);
1158
1159                         currentOffset += SizeOfVector;
1160                     } while (currentOffset <= finalOffsetWhereCanLoop);
1161                 }
1162             }
1163
1164             Debug.Assert(currentOffset <= elementCount);
1165             nuint remainingElementCount = elementCount - currentOffset;
1166
1167             // Try to narrow 64 bits -> 32 bits at a time.
1168             // We needn't update remainingElementCount after this point.
1169
1170             if (remainingElementCount >= 4)
1171             {
1172                 nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
1173                 do
1174                 {
1175                     if (IntPtr.Size >= 8)
1176                     {
1177                         // Only perform QWORD reads on a 64-bit platform.
1178                         utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer + currentOffset);
1179                         if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
1180                         {
1181                             goto FoundNonAsciiDataIn64BitRead;
1182                         }
1183
1184                         NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data64Bits);
1185                     }
1186                     else
1187                     {
1188                         utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
1189                         utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset + 4 / sizeof(char));
1190                         if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
1191                         {
1192                             goto FoundNonAsciiDataIn64BitRead;
1193                         }
1194
1195                         NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
1196                         NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset + 2], utf16Data32BitsLow);
1197                     }
1198
1199                     currentOffset += 4;
1200                 } while (currentOffset <= finalOffsetWhereCanLoop);
1201             }
1202
1203             // Try to narrow 32 bits -> 16 bits.
                 // The loop above consumes elements four at a time, so the low two bits of the (never updated)
                 // remainingElementCount still describe how many elements are left in the tail.
1204
1205             if (((uint)remainingElementCount & 2) != 0)
1206             {
1207                 utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
1208                 if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
1209                 {
1210                     goto FoundNonAsciiDataInHigh32Bits;
1211                 }
1212
1213                 NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
1214                 currentOffset += 2;
1215             }
1216
1217             // Try to narrow 16 bits -> 8 bits.
1218
1219             if (((uint)remainingElementCount & 1) != 0)
1220             {
1221                 utf16Data32BitsHigh = pUtf16Buffer[currentOffset];
1222                 if (utf16Data32BitsHigh <= 0x007Fu)
1223                 {
1224                     pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
1225                     currentOffset++;
1226                 }
1227             }
1228
1229         Finish:
1230
                 // currentOffset counts the elements written so far, which is exactly the value to return.
1231             return currentOffset;
1232
1233         FoundNonAsciiDataIn64BitRead:
1234
1235             if (IntPtr.Size >= 8)
1236             {
1237                 // Try checking the first 32 bits of the buffer for non-ASCII data.
1238                 // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.
1239
1240                 if (BitConverter.IsLittleEndian)
1241                 {
1242                     utf16Data32BitsHigh = (uint)utf16Data64Bits;
1243                 }
1244                 else
1245                 {
1246                     utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
1247                 }
1248
1249                 if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
1250                 {
1251                     NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
1252
1253                     if (BitConverter.IsLittleEndian)
1254                     {
1255                         utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
1256                     }
1257                     else
1258                     {
1259                         utf16Data32BitsHigh = (uint)utf16Data64Bits;
1260                     }
1261
1262                     currentOffset += 2;
1263                 }
1264             }
1265             else
1266             {
1267                 // Need to determine if the high or the low 32-bit value contained non-ASCII data.
1268                 // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.
1269
1270                 if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
1271                 {
1272                     NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
1273                     utf16Data32BitsHigh = utf16Data32BitsLow;
1274                     currentOffset += 2;
1275                 }
1276             }
1277
1278         FoundNonAsciiDataInHigh32Bits:
1279
                 // Reached by falling through from the block above and directly from the 32-bit drain.
1280             Debug.Assert(!AllCharsInUInt32AreAscii(utf16Data32BitsHigh), "Shouldn't have reached this point if we have an all-ASCII input.");
1281
1282             // There's at most one char that needs to be drained.
1283
1284             if (FirstCharInUInt32IsAscii(utf16Data32BitsHigh))
1285             {
1286                 if (!BitConverter.IsLittleEndian)
1287                 {
1288                     utf16Data32BitsHigh >>= 16; // move high char down to low char
1289                 }
1290
1291                 pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
1292                 currentOffset++;
1293             }
1294
1295             goto Finish;
1296         }
1297
/// <summary>
/// SSE2 worker (using SSE4.1 checks where available) that narrows UTF-16 chars from
/// <paramref name="pUtf16Buffer"/> into ASCII bytes at <paramref name="pAsciiBuffer"/> in
/// vector-sized steps, stopping at the first step that contains a non-ASCII char.
/// </summary>
/// <param name="pUtf16Buffer">Source chars. Every read from it is an unaligned load.</param>
/// <param name="pAsciiBuffer">Destination bytes. The main loop stores to it on 16-byte boundaries.</param>
/// <param name="elementCount">Number of elements to process; asserted to be at least twice the vector size in bytes.</param>
/// <returns>
/// The number of leading elements that were narrowed and written. ASCII data may remain beyond
/// that point; the caller is expected to drain it.
/// </returns>
private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
    // One body serves both SSE2-only and SSE4.1-capable hardware; the JIT is expected to drop
    // whichever side of each Sse41.IsSupported test does not apply.

    // The JIT folds these into constants.
    uint vectorSizeInBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
    uint halfVectorSizeInBytes = vectorSizeInBytes / 2;
    nuint vectorAlignmentMask = (nuint)(vectorSizeInBytes - 1);

    // The all-ASCII case runs straight down the method. Non-ASCII data leaves the hot path,
    // either by returning early or by jumping to the label at the bottom.

    Debug.Assert(Sse2.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * vectorSizeInBytes);

    // PTEST operand: the bits that must all be clear for a char to be ASCII (everything above 0x7F).
    Vector128<short> nonAsciiBitsMask = Vector128.Create(unchecked((short)0xFF80));
    // PXOR operand: flips each lane's sign bit so an unsigned comparison can be done with signed PCMPGTW.
    Vector128<short> signBitFlip = Vector128.Create(unchecked((short)0x8000));
    // PCMPGTW operand: 0x007F after the same sign flip; a flipped lane greater than this is non-ASCII.
    Vector128<short> flippedAsciiLimit = Vector128.Create(unchecked((short)0x807F));

    short* pSource = (short*)pUtf16Buffer;

    // Prologue, step 1: unaligned read of the first 8 chars.
    Vector128<short> firstChars = Sse2.LoadVector128(pSource);

    // Any non-ASCII char among them means nothing can be converted here.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(firstChars, nonAsciiBitsMask))
        {
            return 0;
        }
    }
    else if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstChars, signBitFlip), flippedAsciiLimit).AsByte()) != 0)
    {
        return 0;
    }

    // Pack the 8 chars down to 8 bytes and store them; this 8-byte store may be unaligned.
    Vector128<byte> narrowed = Sse2.PackUnsignedSaturate(firstChars, firstChars);
    Sse2.StoreScalar((ulong*)pAsciiBuffer, narrowed.AsUInt64());

    nuint offset = halfVectorSizeInBytes; // 8 elements handled so far

    // Aligned writes are the goal, at the price of unaligned reads.
    //
    // When bit 0x08 of the destination address is set, the next 16-byte boundary is at most
    // 8 bytes away, so the 8 bytes already written reach it and the loop can start there.
    // When that bit is clear, the boundary is 9..16 bytes away and a second 8-byte write is
    // needed to get to (or just past) it.

    if (((uint)pAsciiBuffer & halfVectorSizeInBytes) == 0)
    {
        // Prologue, step 2: one more 8-element partial write.
        firstChars = Sse2.LoadVector128(pSource + offset); // unaligned load

        // Same ASCII test as above; on failure only the first 8 elements count as converted.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(firstChars, nonAsciiBitsMask))
            {
                return offset;
            }
        }
        else if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstChars, signBitFlip), flippedAsciiLimit).AsByte()) != 0)
        {
            return offset;
        }

        narrowed = Sse2.PackUnsignedSaturate(firstChars, firstChars);
        Sse2.StoreScalar((ulong*)(pAsciiBuffer + offset), narrowed.AsUInt64()); // may be unaligned
    }

    // Restart from the element that sits on the destination's next 16-byte boundary. Elements
    // between that boundary and what the prologue wrote are simply converted a second time.

    offset = vectorSizeInBytes - ((nuint)pAsciiBuffer & vectorAlignmentMask);
    Debug.Assert(0 < offset && offset <= vectorSizeInBytes, "We wrote at least 1 byte but no more than a whole vector.");

    Debug.Assert(offset <= elementCount, "Shouldn't have overrun the destination buffer.");
    Debug.Assert(elementCount - offset >= vectorSizeInBytes, "We should be able to run at least one whole vector.");

    nuint lastOffsetForFullIteration = elementCount - vectorSizeInBytes;
    do
    {
        // Each iteration: two unaligned 8-char reads, one ASCII test over both, one aligned 16-byte write.

        firstChars = Sse2.LoadVector128(pSource + offset); // unaligned load
        Vector128<short> secondChars = Sse2.LoadVector128(pSource + offset + vectorSizeInBytes / sizeof(short)); // unaligned load
        Vector128<short> mergedChars = Sse2.Or(firstChars, secondChars);

        // OR-ing the two inputs lets a single test cover all 16 chars.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(mergedChars, nonAsciiBitsMask))
            {
                goto FoundNonAsciiDataInLoop;
            }
        }
        else if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(mergedChars, signBitFlip), flippedAsciiLimit).AsByte()) != 0)
        {
            goto FoundNonAsciiDataInLoop;
        }

        // Pack both inputs into one vector of 16 ASCII bytes and store it.
        narrowed = Sse2.PackUnsignedSaturate(firstChars, secondChars);

        Debug.Assert(((nuint)pAsciiBuffer + offset) % vectorSizeInBytes == 0, "Write should be aligned.");
        Sse2.StoreAligned(pAsciiBuffer + offset, narrowed); // aligned

        offset += vectorSizeInBytes;
    } while (offset <= lastOffsetForFullIteration);

    // Fewer than 16 elements may remain unprocessed; the caller performs the final drain.
    return offset;

FoundNonAsciiDataInLoop:

    // At least one of the 16 chars just read is non-ASCII. Check whether the first 8 can
    // still be narrowed.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(firstChars, nonAsciiBitsMask))
        {
            return offset; // the non-ASCII char is among the first 8
        }
    }
    else if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstChars, signBitFlip), flippedAsciiLimit).AsByte()) != 0)
    {
        return offset; // the non-ASCII char is among the first 8
    }

    // The first 8 chars are ASCII: narrow them and write only the low 8 bytes of the result.
    narrowed = Sse2.PackUnsignedSaturate(firstChars, firstChars);

    Debug.Assert(((nuint)pAsciiBuffer + offset) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");

    Sse2.StoreScalar((ulong*)(pAsciiBuffer + offset), narrowed.AsUInt64()); // aligned 8-byte store
    return offset + halfVectorSizeInBytes;
}
1465
/// <summary>
/// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
/// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
/// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
/// of elements that were able to be converted.
/// </summary>
/// <param name="pAsciiBuffer">Source bytes; up to <paramref name="elementCount"/> bytes are read.</param>
/// <param name="pUtf16Buffer">Destination chars; up to <paramref name="elementCount"/> chars are written.</param>
/// <param name="elementCount">Maximum number of elements to convert.</param>
/// <returns>
/// The number of leading elements that were widened. A value smaller than
/// <paramref name="elementCount"/> means the byte at that index is not ASCII.
/// </returns>
public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
    nuint currentOffset = 0;

    // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
    // code below. This has two benefits: (a) we can take advantage of specific instructions like
    // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
    // this method is running.

    if (Sse2.IsSupported)
    {
        if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
        {
            currentOffset = WidenAsciiToUtf16_Sse2(pAsciiBuffer, pUtf16Buffer, elementCount);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const

        // Only bother vectorizing if we have enough data to do so.
        if (elementCount >= SizeOfVector)
        {
            // Note use of SBYTE instead of BYTE below; we're using the two's-complement
            // representation of negative integers to act as a surrogate for "is ASCII?".

            nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector;
            do
            {
                Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
                if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
                {
                    break; // found non-ASCII data
                }

                Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);

                // TODO: Is the below logic also valid for big-endian platforms?
                Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
                Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);

                currentOffset += SizeOfVector;
            } while (currentOffset <= finalOffsetWhereCanLoop);
        }
    }

    // Whatever the vectorized stage left behind (possibly everything) is handled below in
    // 4-, 2- and 1-byte steps.

    Debug.Assert(currentOffset <= elementCount);
    nuint remainingElementCount = elementCount - currentOffset;

    // Try to widen 32 bits -> 64 bits at a time.
    // We needn't update remainingElementCount after this point: the 2- and 1-byte steps only
    // look at its two low bits, which the 4-byte loop does not change.

    uint asciiData;

    if (remainingElementCount >= 4)
    {
        nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
        do
        {
            asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
            if (!AllBytesInUInt32AreAscii(asciiData))
            {
                goto FoundNonAsciiData;
            }

            WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
            currentOffset += 4;
        } while (currentOffset <= finalOffsetWhereCanLoop);
    }

    // Try to widen 16 bits -> 32 bits.

    if (((uint)remainingElementCount & 2) != 0)
    {
        asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
        if (!AllBytesInUInt32AreAscii(asciiData))
        {
            if (!BitConverter.IsLittleEndian)
            {
                // The big-endian drain below walks from the most significant byte, so move
                // the two bytes that were read up to the top of the register.
                asciiData <<= 16;
            }

            goto FoundNonAsciiData;
        }

        if (BitConverter.IsLittleEndian)
        {
            pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
            pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
        }
        else
        {
            pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
            pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
        }

        currentOffset += 2;
    }

    // Try to widen 8 bits -> 16 bits.

    if (((uint)remainingElementCount & 1) != 0)
    {
        asciiData = pAsciiBuffer[currentOffset];
        if (((byte)asciiData & 0x80) != 0)
        {
            goto Finish;
        }

        pUtf16Buffer[currentOffset] = (char)asciiData;
        currentOffset += 1;
    }

Finish:

    return currentOffset;

FoundNonAsciiData:

    Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");

    // Drain ASCII bytes one at a time, in memory order, until the non-ASCII byte is reached.
    // At least one byte of asciiData has its high bit set, so both loops terminate.

    if (BitConverter.IsLittleEndian)
    {
        // Little-endian: the first byte in memory is the least significant byte.
        while (((byte)asciiData & 0x80) == 0)
        {
            pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
            currentOffset += 1;
            asciiData >>= 8;
        }
    }
    else
    {
        // Big-endian: the first byte in memory is the most significant byte.
        while ((asciiData & 0x80000000u) == 0)
        {
            pUtf16Buffer[currentOffset] = (char)(asciiData >> 24);
            currentOffset += 1;
            asciiData <<= 8;
        }
    }

    goto Finish;
}
1599
/// <summary>
/// SSE2 worker that widens ASCII bytes from <paramref name="pAsciiBuffer"/> into UTF-16 chars at
/// <paramref name="pUtf16Buffer"/> one vector at a time, stopping at the first vector that
/// contains a non-ASCII byte.
/// </summary>
/// <param name="pAsciiBuffer">Source bytes; always read with unaligned loads.</param>
/// <param name="pUtf16Buffer">
/// Destination chars. No particular alignment is required; a destination on an odd address is
/// only given the initial unaligned write (see the return value).
/// </param>
/// <param name="elementCount">Number of elements to process; asserted to be at least twice the vector size in bytes.</param>
/// <returns>
/// The number of leading elements that were widened and written: 0 if one of the first 8 bytes
/// is non-ASCII, exactly 8 if the destination is not 2-byte aligned, otherwise the offset at
/// which the vector loop stopped. ASCII data may remain past that point for the caller to handle.
/// </returns>
private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
    // JIT turns the below into constants

    uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
    nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

    // This method is written such that control generally flows top-to-bottom, avoiding
    // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
    // data, we jump out of the hot paths to targets at the end of the method.

    Debug.Assert(Sse2.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * SizeOfVector128);

    // We're going to get the best performance when we have aligned writes, so we'll take the
    // hit of potentially unaligned reads in order to hit this sweet spot.

    Vector128<byte> asciiVector;
    Vector128<byte> utf16FirstHalfVector;
    uint mask;

    // First, perform an unaligned read of the first part of the input buffer.

    asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
    mask = (uint)Sse2.MoveMask(asciiVector); // bit i is the high bit of byte i

    // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.

    if ((byte)mask != 0)
    {
        return 0;
    }

    // Then perform an unaligned write of the first part of the input buffer. Interleaving
    // with zero turns each of the low 8 bytes into a 16-bit char.

    Vector128<byte> zeroVector = Vector128<byte>.Zero;

    utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
    Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned

    // The loop below relies on aligned stores. A destination on an odd address can never be
    // brought onto a 16-byte boundary by advancing whole chars, and StoreAligned would fault
    // on it. Report the 8 elements written so far and let the caller's scalar code widen the rest.

    if (((nuint)pUtf16Buffer & 1) != 0)
    {
        return SizeOfVector128 / 2;
    }

    // Calculate how many elements we wrote in order to get pUtf16Buffer to its next alignment
    // point, then use that as the base offset going forward. Remember the >> 1 to account for
    // that we wrote chars, not bytes. This means we may re-read data in the next iteration of
    // the loop, but this is ok.

    nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
    Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));

    nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;

    do
    {
        // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.

        asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
        mask = (uint)Sse2.MoveMask(asciiVector);

        if (mask != 0)
        {
            // non-ASCII byte somewhere
            goto NonAsciiDataSeenInInnerLoop;
        }

        byte* pStore = (byte*)(pUtf16Buffer + currentOffset);
        Debug.Assert((nuint)pStore % SizeOfVector128 == 0, "Write should be aligned.");
        Sse2.StoreAligned(pStore, Sse2.UnpackLow(asciiVector, zeroVector));

        pStore += SizeOfVector128;
        Sse2.StoreAligned(pStore, Sse2.UnpackHigh(asciiVector, zeroVector));

        currentOffset += SizeOfVector128;
    } while (currentOffset <= finalOffsetWhereCanRunLoop);

Finish:

    // There might be some ASCII data left over; the caller handles the final drain.
    return currentOffset;

NonAsciiDataSeenInInnerLoop:

    // Can we at least widen the first part of the vector?

    if ((byte)mask == 0)
    {
        // First 8 bytes were all ASCII: widen them and write a single aligned vector.
        utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
        Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
        currentOffset += SizeOfVector128 / 2;
    }

    goto Finish;
}
1691
/// <summary>
/// Expands the four ASCII bytes packed in <paramref name="value"/> into four 16-bit chars and
/// stores them at <paramref name="outputBuffer"/> and the three chars that follow it, using the
/// machine's byte order.
/// </summary>
/// <param name="outputBuffer">Reference to the first of the four chars to write.</param>
/// <param name="value">Four bytes as read from memory; asserted to be all-ASCII.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
{
    Debug.Assert(AllBytesInUInt32AreAscii(value));

    if (Bmi2.X64.IsSupported)
    {
        // PDEP drops each input byte into the low byte of its own 16-bit lane, which gives
        // all four chars in one 64-bit value. This holds whatever the processor's endianness.
        ulong fourChars = Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
        Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), fourChars);
    }
    else if (BitConverter.IsLittleEndian)
    {
        // Least significant byte is first in memory, so it becomes the first char.
        outputBuffer = (char)(byte)value;
        Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 8);
        Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 16);
        Unsafe.Add(ref outputBuffer, 3) = (char)(value >> 24);
    }
    else
    {
        // Least significant byte is last in memory, so it becomes the last char.
        Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
        Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 8);
        Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 16);
        outputBuffer = (char)(value >> 24);
    }
}
1730     }
1731 }