1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 using System.Diagnostics;
7 using System.Runtime.CompilerServices;
8 using System.Runtime.Intrinsics;
9 using System.Runtime.Intrinsics.X86;
10 using Internal.Runtime.CompilerServices;
13 using nint = System.Int64;
14 using nuint = System.UInt64;
16 using nint = System.Int32;
17 using nuint = System.UInt32;
22 internal static partial class ASCIIUtility
27 Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
28 Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
/// <summary>
/// Returns <see langword="true"/> iff no bit selected by <c>UInt64HighBitsOnlyMask</c> is set
/// in <paramref name="value"/>.
/// </summary>
/// <param name="value">Eight bytes packed into a single 64-bit integer.</param>
/// <returns><see langword="true"/> if every packed byte is ASCII; otherwise <see langword="false"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool AllBytesInUInt64AreAscii(ulong value)
{
    // If the high bit of any byte is set, that byte is non-ASCII.
    // NOTE(review): UInt64HighBitsOnlyMask is declared outside this view; presumably it has the
    // 0x80 bit of each byte set - confirm against its definition.
    return (value & UInt64HighBitsOnlyMask) == 0;
}
/// <summary>
/// Returns <see langword="true"/> iff all chars in <paramref name="value"/> are ASCII.
/// </summary>
/// <param name="value">Two 16-bit chars packed into a single 32-bit integer.</param>
/// <returns><see langword="true"/> if both packed chars are in the range 0x0000 - 0x007F.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool AllCharsInUInt32AreAscii(uint value)
{
    // Any bit outside the low 7 bits of either 16-bit half means that char is above 0x007F.
    return (value & ~0x007F007Fu) == 0;
}
/// <summary>
/// Returns <see langword="true"/> iff all chars in <paramref name="value"/> are ASCII.
/// </summary>
/// <param name="value">Four 16-bit chars packed into a single 64-bit integer.</param>
/// <returns><see langword="true"/> if all four packed chars are in the range 0x0000 - 0x007F.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool AllCharsInUInt64AreAscii(ulong value)
{
    // Any bit outside the low 7 bits of any 16-bit lane means that char is above 0x007F.
    return (value & ~0x007F007F_007F007Ful) == 0;
}
/// <summary>
/// Given a DWORD which represents two packed chars in machine-endian order, returns
/// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
/// </summary>
/// <param name="value">Two 16-bit chars packed into a single 32-bit integer.</param>
/// <returns>
/// <see langword="true"/> if the first packed char is in the range 0x0000 - 0x007F. The second
/// char is not examined.
/// </returns>
private static bool FirstCharInUInt32IsAscii(uint value)
{
    // On little-endian machines the first char occupies the low 16 bits; on big-endian
    // machines it occupies the high 16 bits. Test the non-ASCII bits of that half only.
    return (BitConverter.IsLittleEndian && (value & 0xFF80u) == 0)
        || (!BitConverter.IsLittleEndian && (value & 0xFF800000u) == 0);
}
/// <summary>
/// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII byte is found.
/// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>
/// The zero-based index of the first non-ASCII byte, or <paramref name="bufferLength"/> if there is none.
/// </returns>
/// <remarks>An ASCII byte is defined as 0x00 - 0x7F, inclusive.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
{
    // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
    // code below. This has two benefits: (a) we can take advantage of specific instructions like
    // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
    // this method is running.

    return (Sse2.IsSupported)
        ? GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength)
        : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
}
// NOTE(review): this listing appears to have lost its short lines. Braces, the "do" keyword, the
// declaration of currentUInt32, the label targeted by "goto FoundNonAsciiData", and the pointer
// increments after the 32-/16-/8-bit reads are not visible here - confirm against the canonical source.
/// <summary>
/// Portable fallback for <see cref="GetIndexOfFirstNonAsciiByte"/>, selected when SSE2 is not supported.
/// Tries <see cref="Vector{T}"/> reads first (one unaligned, then aligned), then drains the remainder
/// with 32-, 16- and 8-bit reads.
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>The byte distance between the original <paramref name="pBuffer"/> and the final read position.</returns>
88 private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
90 // Squirrel away the original buffer reference. This method works by determining the exact
91 // byte reference where non-ASCII data begins, so we need this base value to perform the
92 // final subtraction at the end of the method to get the index into the original buffer.
94 byte* pOriginalBuffer = pBuffer;
96 // Before we drain off byte-by-byte, try a generic vectorized loop.
97 // Only run the loop if we have at least two vectors we can pull out.
98 // Note use of SBYTE instead of BYTE below; we're using the two's-complement
99 // representation of negative integers to act as a surrogate for "is ASCII?".
101 if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<sbyte>.Count)
103 uint SizeOfVectorInBytes = (uint)Vector<sbyte>.Count; // JIT will make this a const
// Unaligned probe of the first vector: every element >= 0 means every byte has its high bit clear.
105 if (Vector.GreaterThanOrEqualAll(Unsafe.ReadUnaligned<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
107 // The first several elements of the input buffer were ASCII. Bump up the pointer to the
108 // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
109 // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
// pFinalVectorReadPos is the last address from which a whole vector still fits in the buffer.
111 byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInBytes;
// Round up to the next multiple of the vector size; this always advances by at least one byte.
112 pBuffer = (byte*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));
115 long numBytesRead = pBuffer - pOriginalBuffer;
116 Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVectorInBytes, "We should've made forward progress of at least one byte.");
117 Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
120 Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
124 Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned.");
// Any negative element means a byte with its high bit set, i.e. non-ASCII.
125 if (Vector.LessThanAny(Unsafe.Read<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
127 break; // found non-ASCII data
130 pBuffer += SizeOfVectorInBytes;
131 } while (pBuffer <= pFinalVectorReadPos);
133 // Adjust the remaining buffer length for the number of elements we just consumed.
// Together the next two statements compute bufferLength -= (pBuffer - pOriginalBuffer).
135 bufferLength -= (nuint)pBuffer;
136 bufferLength += (nuint)pOriginalBuffer;
140 // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
141 // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
142 // path to drain any remaining ASCII bytes.
144 // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
145 // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes.
149 // Try reading 64 bits at a time in a loop (as two adjacent 32-bit reads).
151 for (; bufferLength >= 8; bufferLength -= 8)
153 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
154 uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
// OR the two values together so a single check covers all eight bytes.
156 if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32))
158 // One of these two values contains non-ASCII bytes.
159 // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes.
161 if (AllBytesInUInt32AreAscii(currentUInt32))
163 currentUInt32 = nextUInt32;
167 goto FoundNonAsciiData;
170 pBuffer += 8; // consumed 8 ASCII bytes
173 // From this point forward we don't need to update bufferLength.
// Fewer than 8 bytes remain, so the 4, 2 and 1 bits of bufferLength say which reads are still needed.
174 // Try reading 32 bits.
176 if ((bufferLength & 4) != 0)
178 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
179 if (!AllBytesInUInt32AreAscii(currentUInt32))
181 goto FoundNonAsciiData;
187 // Try reading 16 bits.
189 if ((bufferLength & 2) != 0)
191 currentUInt32 = Unsafe.ReadUnaligned<ushort>(pBuffer);
192 if (!AllBytesInUInt32AreAscii(currentUInt32))
194 goto FoundNonAsciiData;
200 // Try reading 8 bits
202 if ((bufferLength & 1) != 0)
204 // If the buffer contains non-ASCII data, the comparison below will fail, and
205 // we'll end up not incrementing the buffer reference.
207 if (*(sbyte*)pBuffer >= 0)
// The result is the distance, in bytes, from the start of the buffer to the current position.
215 nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
216 return totalNumBytesRead;
220 Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
222 // The method being called doesn't bother looking at whether the high byte is ASCII. There are only
223 // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before
224 // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be
225 // non-ASCII. In both cases we only care about the low 24 bits.
227 pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
// NOTE(review): this listing appears to have lost its short lines. Braces, "do"/"else", the
// declaration of currentDWord, a label that the "goto Finish"-style exits would target, and some
// pointer increments are not visible here - confirm against the canonical source.
/// <summary>
/// SSE2 implementation behind <see cref="GetIndexOfFirstNonAsciiByte"/>. Reads 128-bit vectors and
/// uses <c>Sse2.MoveMask</c> to collect the high bit of each byte; buffers shorter than one vector
/// are drained with 64-, 32-, 16- and 8-bit reads instead.
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>The byte distance between the original <paramref name="pBuffer"/> and the final read position.</returns>
231 private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
233 // JIT turns the below into constants
235 uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
236 nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
238 Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
239 Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
241 uint currentMask, secondMask;
242 byte* pOriginalBuffer = pBuffer;
244 // This method is written such that control generally flows top-to-bottom, avoiding
245 // jumps as much as possible in the optimistic case of a large enough buffer and
246 // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets
247 // after all the main logic.
249 if (bufferLength < SizeOfVector128)
251 goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
254 // Read the first vector unaligned.
// A non-zero mask means at least one byte in the vector has its high bit set.
256 currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
258 if (currentMask != 0)
260 goto FoundNonAsciiDataInCurrentMask;
263 // If we have less than 32 bytes to process, just go straight to the final unaligned
264 // read. There's no need to mess with the loop logic in the middle of this method.
266 if (bufferLength < 2 * SizeOfVector128)
268 goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
271 // Now adjust the read pointer so that future reads are aligned.
// Round up to the next multiple of the vector size; this always advances by at least one byte.
273 pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128);
276 long numBytesRead = pBuffer - pOriginalBuffer;
277 Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte.");
278 Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
281 // Adjust the remaining length to account for what we just read.
// Together the next two statements compute bufferLength -= (pBuffer - pOriginalBuffer).
283 bufferLength += (nuint)pOriginalBuffer;
284 bufferLength -= (nuint)pBuffer;
286 // The buffer is now properly aligned.
287 // Read 2 vectors at a time if possible.
289 if (bufferLength >= 2 * SizeOfVector128)
// pFinalVectorReadPos is the last address from which two whole vectors still fit in the buffer.
291 byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128);
293 // After this point, we no longer need to update the bufferLength value.
297 Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
298 Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
300 currentMask = (uint)Sse2.MoveMask(firstVector);
301 secondMask = (uint)Sse2.MoveMask(secondVector);
303 if ((currentMask | secondMask) != 0)
305 goto FoundNonAsciiDataInInnerLoop;
308 pBuffer += 2 * SizeOfVector128;
309 } while (pBuffer <= pFinalVectorReadPos);
312 // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
313 // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
314 // But we _can_ rely on it to tell us how much remaining data must be drained by looking
315 // at what bits of it are set. This works because had we updated it within the loop above,
316 // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
317 // bits which are less significant than those that the addition would've acted on.
319 // If there is fewer than one vector length remaining, skip the next aligned read.
321 if ((bufferLength & SizeOfVector128) == 0)
323 goto DoFinalUnalignedVectorRead;
326 // At least one full vector's worth of data remains, so we can safely read it.
327 // Remember, at this point pBuffer is still aligned.
329 currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
330 if (currentMask != 0)
332 goto FoundNonAsciiDataInCurrentMask;
335 IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
337 pBuffer += SizeOfVector128;
339 DoFinalUnalignedVectorRead:
// Only the bits below the vector size matter here: they give the count of leftover bytes.
341 if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0)
343 // Perform an unaligned read of the last vector.
344 // We need to adjust the pointer because we're re-reading data.
// Step forward by the leftover byte count, then back by one vector, so the read ends at the buffer's end.
346 pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;
348 currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
349 if (currentMask != 0)
351 goto FoundNonAsciiDataInCurrentMask;
354 pBuffer += SizeOfVector128;
359 return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!
361 FoundNonAsciiDataInInnerLoop:
363 // If the current (first) mask isn't the mask that contains non-ASCII data, then it must
364 // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
365 // from the second mask.
367 if (currentMask == 0)
369 pBuffer += SizeOfVector128;
370 currentMask = secondMask;
373 FoundNonAsciiDataInCurrentMask:
375 // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
376 // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
377 // available, we'll fall back to a normal loop.
379 Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
380 pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask);
384 FoundNonAsciiDataInCurrentDWord:
387 Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
388 pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
392 InputBufferLessThanOneVectorInLength:
394 // These code paths get hit if the original input length was less than one vector in size.
395 // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
396 // directly. Note that all of these reads are unaligned.
398 Debug.Assert(bufferLength < SizeOfVector128);
// bufferLength is below the vector size, so its 8, 4, 2 and 1 bits say exactly which reads are needed.
402 if ((bufferLength & 8) != 0)
404 if (Bmi1.X64.IsSupported)
406 // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.
408 ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
409 if (!AllBytesInUInt64AreAscii(candidateUInt64))
411 // Clear everything but the high bit of each byte, then tzcnt.
412 // Remember the / 8 at the end to convert bit count to byte count.
414 candidateUInt64 &= UInt64HighBitsOnlyMask;
415 pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
421 // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
423 currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
424 uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
426 if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord))
428 // At least one of the values wasn't all-ASCII.
429 // We need to figure out which one it was and stick it in the currentDWord local.
431 if (AllBytesInUInt32AreAscii(currentDWord))
433 currentDWord = nextDWord; // this one is the culprit
437 goto FoundNonAsciiDataInCurrentDWord;
441 pBuffer += 8; // successfully consumed 8 ASCII bytes
446 if ((bufferLength & 4) != 0)
448 currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
450 if (!AllBytesInUInt32AreAscii(currentDWord))
452 goto FoundNonAsciiDataInCurrentDWord;
455 pBuffer += 4; // successfully consumed 4 ASCII bytes
459 // (We movzx to a DWORD for ease of manipulation.)
461 if ((bufferLength & 2) != 0)
463 currentDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);
465 if (!AllBytesInUInt32AreAscii(currentDWord))
467 // We only care about the 0x0080 bit of the value. If it's not set, then we
468 // advance pBuffer by 1. If it's set, we don't advance it at all.
// The arithmetic shift of the sign-extended low byte yields 0 (ASCII) or -1 (non-ASCII); adding 1 gives 1 or 0.
470 pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1;
474 pBuffer += 2; // successfully consumed 2 ASCII bytes
479 if ((bufferLength & 1) != 0)
481 // sbyte has non-negative value if byte is ASCII.
483 if (*(sbyte*)(pBuffer) >= 0)
485 pBuffer++; // successfully consumed a single byte
/// <summary>
/// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII char is found.
/// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
/// </summary>
/// <param name="pBuffer">Pointer to the first char to examine.</param>
/// <param name="bufferLength">Number of chars (not bytes) available at <paramref name="pBuffer"/>.</param>
/// <returns>
/// The zero-based index of the first non-ASCII char, or <paramref name="bufferLength"/> if there is none.
/// </returns>
/// <remarks>An ASCII char is defined as 0x0000 - 0x007F, inclusive.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */)
{
    // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
    // code below. This has two benefits: (a) we can take advantage of specific instructions like
    // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
    // this method is running.

    return (Sse2.IsSupported)
        ? GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength)
        : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
}
// NOTE(review): this listing appears to have lost its short lines. Braces, the "do" keyword, the
// declaration of currentUInt32, the label targeted by "goto FoundNonAsciiData", and the pointer
// increments after the 32-/16-bit reads are not visible here - confirm against the canonical source.
/// <summary>
/// Portable fallback for <see cref="GetIndexOfFirstNonAsciiChar"/>, selected when SSE2 is not supported.
/// Tries <see cref="Vector{T}"/> reads first (one unaligned, then aligned), then drains the remainder
/// with 32- and 16-bit reads.
/// </summary>
/// <param name="pBuffer">Pointer to the first char to examine.</param>
/// <param name="bufferLength">Number of chars (not bytes) available at <paramref name="pBuffer"/>.</param>
/// <returns>The distance, in chars, between the original <paramref name="pBuffer"/> and the final read position.</returns>
510 private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */)
512 // Squirrel away the original buffer reference. This method works by determining the exact
513 // char reference where non-ASCII data begins, so we need this base value to perform the
514 // final subtraction at the end of the method to get the index into the original buffer.
516 char* pOriginalBuffer = pBuffer;
// Guards against overflow if the char count were converted to a byte count.
518 Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
520 // Before we drain off char-by-char, try a generic vectorized loop.
521 // Only run the loop if we have at least two vectors we can pull out.
523 if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<ushort>.Count)
525 uint SizeOfVectorInChars = (uint)Vector<ushort>.Count; // JIT will make this a const
526 uint SizeOfVectorInBytes = (uint)Vector<byte>.Count; // JIT will make this a const
// Every element <= 0x007F means every char in the vector is ASCII.
528 Vector<ushort> maxAscii = new Vector<ushort>(0x007F);
530 if (Vector.LessThanOrEqualAll(Unsafe.ReadUnaligned<Vector<ushort>>(pBuffer), maxAscii))
532 // The first several elements of the input buffer were ASCII. Bump up the pointer to the
533 // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
534 // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
// pFinalVectorReadPos is the last address from which a whole vector still fits in the buffer.
536 char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInChars;
// Alignment is computed on the byte address, hence the byte-sized vector length here.
537 pBuffer = (char*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));
540 long numCharsRead = pBuffer - pOriginalBuffer;
541 Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVectorInChars, "We should've made forward progress of at least one char.");
542 Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
545 Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
// NOTE(review): this checks the address modulo the vector length in chars, which is weaker than the
// byte-sized alignment established above - confirm whether SizeOfVectorInBytes was intended.
549 Debug.Assert((nuint)pBuffer % SizeOfVectorInChars == 0, "Vector read should be aligned.");
550 if (Vector.GreaterThanAny(Unsafe.Read<Vector<ushort>>(pBuffer), maxAscii))
552 break; // found non-ASCII data
554 pBuffer += SizeOfVectorInChars;
555 } while (pBuffer <= pFinalVectorReadPos);
557 // Adjust the remaining buffer length for the number of elements we just consumed.
// The pointer difference is taken in bytes, so divide by sizeof(char) to get a char count.
559 bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
563 // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
564 // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
565 // path to drain any remaining ASCII chars.
567 // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
568 // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars.
572 // Try reading 64 bits at a time in a loop (as two adjacent 32-bit reads).
574 for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars
576 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
577 uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
// OR the two values together so a single check covers all four chars.
579 if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32))
581 // One of these two values contains non-ASCII chars.
582 // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars.
584 if (AllCharsInUInt32AreAscii(currentUInt32))
586 currentUInt32 = nextUInt32;
590 goto FoundNonAsciiData;
593 pBuffer += 4; // consumed 4 ASCII chars
596 // From this point forward we don't need to keep track of the remaining buffer length.
// Fewer than 4 chars remain, so the 2 and 1 bits of bufferLength say which reads are still needed.
597 // Try reading 32 bits.
599 if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars
601 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
602 if (!AllCharsInUInt32AreAscii(currentUInt32))
604 goto FoundNonAsciiData;
610 // Try reading 16 bits.
611 // No need to try an 8-bit read after this since we're working with chars.
613 if ((bufferLength & 1) != 0)
615 // If the buffer contains non-ASCII data, the comparison below will fail, and
616 // we'll end up not incrementing the buffer reference.
618 if (*pBuffer <= 0x007F)
// The pointer difference is measured in bytes and converted to chars before returning.
626 nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
627 Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars.");
628 return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning
632 Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
634 // We don't bother looking at the second char - only the first char.
// If the first char is ASCII, the second must be the non-ASCII one (see the assert above).
636 if (FirstCharInUInt32IsAscii(currentUInt32))
// NOTE(review): this listing appears to have lost its short lines. Braces, "do"/"else", the
// declarations of currentMask and currentDWord, the early return for empty input, a label that the
// "goto Finish"-style exits would target, and some pointer increments are not visible here -
// confirm against the canonical source.
/// <summary>
/// SSE2/SSE4.1 implementation behind <see cref="GetIndexOfFirstNonAsciiChar"/>. Reads 128-bit vectors
/// of chars and detects any element above 0x007F; buffers shorter than one vector are drained with
/// 64-, 32- and 16-bit reads instead.
/// </summary>
/// <param name="pBuffer">Pointer to the first char to examine.</param>
/// <param name="bufferLength">Number of chars (not bytes) available at <paramref name="pBuffer"/>.</param>
/// <returns>The distance, in chars, between the original <paramref name="pBuffer"/> and the final read position.</returns>
644 private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
646 // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
647 // will be elided by JIT once we determine which specific ISAs we support.
649 // Quick check for empty inputs.
651 if (bufferLength == 0)
656 // JIT turns the below into constants
658 uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
659 uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
661 Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
662 Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
664 Vector128<short> firstVector, secondVector;
666 char* pOriginalBuffer = pBuffer;
668 if (bufferLength < SizeOfVector128InChars)
670 goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
673 // This method is written such that control generally flows top-to-bottom, avoiding
674 // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
675 // data, we jump out of the hot paths to targets at the end of the method.
677 Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
678 Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
679 Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
680 Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
// Guards against overflow when bufferLength is converted from chars to bytes below.
682 Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
684 // Read the first vector unaligned.
686 firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
688 if (Sse41.IsSupported)
690 // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
691 // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
692 // in order to extract the mask.
693 currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
697 // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
698 // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
// the mask. The XOR with 0x8000 biases each element so the signed PCMPGTW behaves as an unsigned compare.
700 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
703 if (currentMask != 0)
705 goto FoundNonAsciiDataInCurrentMask;
708 // If we have less than 32 bytes to process, just go straight to the final unaligned
709 // read. There's no need to mess with the loop logic in the middle of this method.
711 // Adjust the remaining length to account for what we just read.
712 // For the remainder of this code path, bufferLength will be in bytes, not chars.
714 bufferLength <<= 1; // chars to bytes
716 if (bufferLength < 2 * SizeOfVector128InBytes)
718 goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
721 // Now adjust the read pointer so that future reads are aligned.
// Round the byte address up to the next multiple of the vector size.
723 pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
726 long numCharsRead = pBuffer - pOriginalBuffer;
727 Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
728 Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
731 // Adjust remaining buffer length.
// Together the next two statements subtract the consumed byte count (pBuffer - pOriginalBuffer as addresses).
733 bufferLength += (nuint)pOriginalBuffer;
734 bufferLength -= (nuint)pBuffer;
736 // The buffer is now properly aligned.
737 // Read 2 vectors at a time if possible.
739 if (bufferLength >= 2 * SizeOfVector128InBytes)
// pFinalVectorReadPos is the last address from which two whole vectors still fit in the buffer.
741 char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
743 // After this point, we no longer need to update the bufferLength value.
747 firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
748 secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
// OR the two vectors together so a single test covers both.
749 Vector128<short> combinedVector = Sse2.Or(firstVector, secondVector);
751 if (Sse41.IsSupported)
753 // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
754 // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
755 if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
757 goto FoundNonAsciiDataInFirstOrSecondVector;
762 // See comment earlier in the method for an explanation of how the below logic works.
763 if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
765 goto FoundNonAsciiDataInFirstOrSecondVector;
769 pBuffer += 2 * SizeOfVector128InChars;
770 } while (pBuffer <= pFinalVectorReadPos);
773 // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
774 // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
775 // But we _can_ rely on it to tell us how much remaining data must be drained by looking
776 // at what bits of it are set. This works because had we updated it within the loop above,
777 // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
778 // bits which are less significant than those that the addition would've acted on.
780 // If there is fewer than one vector length remaining, skip the next aligned read.
781 // Remember, at this point bufferLength is measured in bytes, not chars.
783 if ((bufferLength & SizeOfVector128InBytes) == 0)
785 goto DoFinalUnalignedVectorRead;
788 // At least one full vector's worth of data remains, so we can safely read it.
789 // Remember, at this point pBuffer is still aligned.
791 firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
793 if (Sse41.IsSupported)
795 // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
796 // Jump to the non-ASCII handler, which computes the mask for this vector.
797 if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
799 goto FoundNonAsciiDataInFirstVector;
804 // See comment earlier in the method for an explanation of how the below logic works.
805 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
806 if (currentMask != 0)
808 goto FoundNonAsciiDataInCurrentMask;
812 IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
814 pBuffer += SizeOfVector128InChars;
816 DoFinalUnalignedVectorRead:
// Only the bits below the vector size matter here: they give the count of leftover bytes.
818 if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
820 // Perform an unaligned read of the last vector.
821 // We need to adjust the pointer because we're re-reading data.
// Step forward by the leftover byte count, then back by one vector, so the read ends at the buffer's end.
823 pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
824 firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
826 if (Sse41.IsSupported)
828 // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
829 // Jump to the non-ASCII handler, which computes the mask for this vector.
830 if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
832 goto FoundNonAsciiDataInFirstVector;
837 // See comment earlier in the method for an explanation of how the below logic works.
838 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
839 if (currentMask != 0)
841 goto FoundNonAsciiDataInCurrentMask;
845 pBuffer += SizeOfVector128InChars;
850 Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
851 return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)
853 FoundNonAsciiDataInFirstOrSecondVector:
855 // We don't know if the first or the second vector contains non-ASCII data. Check the first
856 // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
857 // we'll make sure the first vector local is the one that contains the non-ASCII data.
859 // See comment earlier in the method for an explanation of how the below logic works.
860 if (Sse41.IsSupported)
862 if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
864 goto FoundNonAsciiDataInFirstVector;
869 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
870 if (currentMask != 0)
872 goto FoundNonAsciiDataInCurrentMask;
876 // Wasn't the first vector; must be the second.
878 pBuffer += SizeOfVector128InChars;
879 firstVector = secondVector;
881 FoundNonAsciiDataInFirstVector:
883 // See comment earlier in the method for an explanation of how the below logic works.
884 if (Sse41.IsSupported)
886 currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
890 currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
893 FoundNonAsciiDataInCurrentMask:
895 // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
896 // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
897 // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
898 // masks work on BYTE elements, and we account for this in the final fixup.)
900 Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
// The trailing-zero count is a byte offset, so it is applied through a byte pointer.
901 pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));
905 FoundNonAsciiDataInCurrentDWord:
908 Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
// If the first char is ASCII, the second must be the non-ASCII one (see the assert above).
910 if (FirstCharInUInt32IsAscii(currentDWord))
912 pBuffer++; // skip past the ASCII char
917 InputBufferLessThanOneVectorInLength:
919 // These code paths get hit if the original input length was less than one vector in size.
920 // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
921 // directly. Note that all of these reads are unaligned.
923 // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
924 // We skipped the code path that multiplied the count by sizeof(char).
926 Debug.Assert(bufferLength < SizeOfVector128InChars);
// bufferLength is below the vector size in chars, so its 4, 2 and 1 bits say exactly which reads are needed.
930 if ((bufferLength & 4) != 0)
932 if (Bmi1.X64.IsSupported)
934 // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
936 ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
937 if (!AllCharsInUInt64AreAscii(candidateUInt64))
939 // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
940 // Remember the / 8 at the end to convert bit count to byte count,
941 // then the & ~1 at the end to treat a match in the high byte of
942 // any char the same as a match in the low byte of that same char.
944 candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
945 pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
951 // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
953 currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
954 uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
956 if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
958 // At least one of the values wasn't all-ASCII.
959 // We need to figure out which one it was and stick it in the currentDWord local.
961 if (AllCharsInUInt32AreAscii(currentDWord))
963 currentDWord = nextDWord; // this one is the culprit
964 pBuffer += 4 / sizeof(char);
967 goto FoundNonAsciiDataInCurrentDWord;
971 pBuffer += 4; // successfully consumed 4 ASCII chars
976 if ((bufferLength & 2) != 0)
978 currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
980 if (!AllCharsInUInt32AreAscii(currentDWord))
982 goto FoundNonAsciiDataInCurrentDWord;
985 pBuffer += 2; // successfully consumed 2 ASCII chars
989 // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
991 if ((bufferLength & 1) != 0)
993 if (*pBuffer <= 0x007F)
995 pBuffer++; // successfully consumed a single char
/// <summary>
/// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order,
/// narrows each WORD to a BYTE, then writes the 4-byte result to the output buffer
/// also in machine-endian order.
/// </summary>
/// <param name="outputBuffer">Reference to the first of the four destination bytes.</param>
/// <param name="value">Four packed 16-bit chars; asserted in debug builds to be all-ASCII.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
{
    Debug.Assert(AllCharsInUInt64AreAscii(value));

    if (Bmi2.X64.IsSupported)
    {
        // BMI2 will work regardless of the processor's endianness.
        // PEXT gathers the low byte of every WORD into one contiguous DWORD.
        Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
    }
    else
    {
        if (BitConverter.IsLittleEndian)
        {
            // The least significant WORD is the first char in memory: emit it first,
            // then shift the next WORD down before each subsequent write.
            outputBuffer = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 1) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 2) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 3) = (byte)value;
        }
        else
        {
            // The least significant WORD is the last char in memory: fill the output back-to-front.
            Unsafe.Add(ref outputBuffer, 3) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 2) = (byte)value;
            value >>= 16;
            Unsafe.Add(ref outputBuffer, 1) = (byte)value;
            value >>= 16;
            outputBuffer = (byte)value;
        }
    }
}
/// <summary>
/// Given a DWORD which represents a buffer of 2 ASCII chars in machine-endian order,
/// narrows each WORD to a BYTE, then writes the 2-byte result to the output buffer also in
/// machine-endian order.
/// </summary>
/// <param name="outputBuffer">Reference to the first of the two destination bytes.</param>
/// <param name="value">Two packed 16-bit chars; asserted in debug builds to be all-ASCII.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, uint value)
{
    Debug.Assert(AllCharsInUInt32AreAscii(value));

    if (BitConverter.IsLittleEndian)
    {
        // Low WORD is the first char in memory.
        outputBuffer = (byte)value;
        Unsafe.Add(ref outputBuffer, 1) = (byte)(value >> 16);
    }
    else
    {
        // High WORD is the first char in memory.
        Unsafe.Add(ref outputBuffer, 1) = (byte)value;
        outputBuffer = (byte)(value >> 16);
    }
}
/// <summary>
/// Copies as many ASCII characters (U+0000..U+007F) as possible from <paramref name="pUtf16Buffer"/>
/// to <paramref name="pAsciiBuffer"/>, stopping when the first non-ASCII character is encountered
/// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
/// of elements that were able to be converted.
/// </summary>
/// <param name="pUtf16Buffer">Source buffer of 16-bit chars; read with unaligned loads.</param>
/// <param name="pAsciiBuffer">Destination buffer of bytes.</param>
/// <param name="elementCount">Number of elements available in both buffers.</param>
/// <returns>The number of leading elements that were narrowed and written.</returns>
public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
    nuint currentOffset = 0;

    uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
    ulong utf16Data64Bits = 0;

    // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
    // code below. This has two benefits: (a) we can take advantage of specific instructions like
    // pmovmskb, ptest, vpminuw which we know are optimized, and (b) we can avoid downclocking the
    // processor while this method is running.

    if (Sse2.IsSupported)
    {
        Debug.Assert(BitConverter.IsLittleEndian, "Assume little endian if SSE2 is supported.");

        if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
        {
            // Since there's overhead to setting up the vectorized code path, we only want to
            // call into it after a quick probe to ensure the next immediate characters really are ASCII.
            // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.

            if (IntPtr.Size >= 8)
            {
                utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
                if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }
            else
            {
                utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
                utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
                if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }

            currentOffset = NarrowUtf16ToAscii_Sse2(pUtf16Buffer, pAsciiBuffer, elementCount);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const

        // Only bother vectorizing if we have enough data to do so.
        if (elementCount >= 2 * SizeOfVector)
        {
            // Since there's overhead to setting up the vectorized code path, we only want to
            // call into it after a quick probe to ensure the next immediate characters really are ASCII.
            // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.

            if (IntPtr.Size >= 8)
            {
                utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
                if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }
            else
            {
                utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
                utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
                if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }

            Vector<ushort> maxAscii = new Vector<ushort>(0x007F);

            nuint finalOffsetWhereCanLoop = elementCount - 2 * SizeOfVector;
            do
            {
                Vector<ushort> utf16VectorHigh = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset);
                Vector<ushort> utf16VectorLow = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count);

                if (Vector.GreaterThanAny(Vector.BitwiseOr(utf16VectorHigh, utf16VectorLow), maxAscii))
                {
                    break; // found non-ASCII data
                }

                // TODO: Is the below logic also valid for big-endian platforms?
                Vector<byte> asciiVector = Vector.Narrow(utf16VectorHigh, utf16VectorLow);
                Unsafe.WriteUnaligned<Vector<byte>>(pAsciiBuffer + currentOffset, asciiVector);

                currentOffset += SizeOfVector;
            } while (currentOffset <= finalOffsetWhereCanLoop);
        }
    }

    Debug.Assert(currentOffset <= elementCount);
    nuint remainingElementCount = elementCount - currentOffset;

    // Try to narrow 64 bits -> 32 bits at a time.
    // We needn't update remainingElementCount after this point.

    if (remainingElementCount >= 4)
    {
        nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
        do
        {
            if (IntPtr.Size >= 8)
            {
                // Only perform QWORD reads on a 64-bit platform.
                utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer + currentOffset);
                if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }

                NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data64Bits);
            }
            else
            {
                utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
                utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset + 4 / sizeof(char));
                if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }

                NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
                NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset + 2], utf16Data32BitsLow);
            }

            currentOffset += 4;
        } while (currentOffset <= finalOffsetWhereCanLoop);
    }

    // Try to narrow 32 bits -> 16 bits.

    if (((uint)remainingElementCount & 2) != 0)
    {
        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
        {
            goto FoundNonAsciiDataInHigh32Bits;
        }

        NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
        currentOffset += 2;
    }

    // Try to narrow 16 bits -> 8 bits.

    if (((uint)remainingElementCount & 1) != 0)
    {
        utf16Data32BitsHigh = pUtf16Buffer[currentOffset];
        if (utf16Data32BitsHigh <= 0x007Fu)
        {
            pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
            currentOffset++;
        }
    }

Finish:

    return currentOffset;

FoundNonAsciiDataIn64BitRead:

    if (IntPtr.Size >= 8)
    {
        // Try checking the first 32 bits of the buffer for non-ASCII data.
        // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.

        if (BitConverter.IsLittleEndian)
        {
            utf16Data32BitsHigh = (uint)utf16Data64Bits;
        }
        else
        {
            utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
        }

        if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
        {
            NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);

            // The first two chars were fine, so the culprit is in the other half of the QWORD.
            if (BitConverter.IsLittleEndian)
            {
                utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
            }
            else
            {
                utf16Data32BitsHigh = (uint)utf16Data64Bits;
            }

            currentOffset += 2;
        }
    }
    else
    {
        // Need to determine if the high or the low 32-bit value contained non-ASCII data.
        // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.

        if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
        {
            NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
            utf16Data32BitsHigh = utf16Data32BitsLow;
            currentOffset += 2;
        }
    }

FoundNonAsciiDataInHigh32Bits:

    Debug.Assert(!AllCharsInUInt32AreAscii(utf16Data32BitsHigh), "Shouldn't have reached this point if we have an all-ASCII input.");

    // There's at most one char that needs to be drained.

    if (FirstCharInUInt32IsAscii(utf16Data32BitsHigh))
    {
        if (!BitConverter.IsLittleEndian)
        {
            utf16Data32BitsHigh >>= 16; // move high char down to low char
        }

        pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
        currentOffset++;
    }

    goto Finish;
}
/// <summary>
/// SSE2/SSE4.1 worker for narrowing 16-bit chars to ASCII bytes. Narrows leading ASCII data
/// using 128-bit vectors and returns the number of elements written; any remaining data is
/// left for the caller to drain.
/// </summary>
/// <param name="pUtf16Buffer">Source buffer of 16-bit chars; read with unaligned loads.</param>
/// <param name="pAsciiBuffer">Destination buffer of bytes; the main loop writes to it at 16-byte aligned addresses.</param>
/// <param name="elementCount">Number of elements available; asserted to be at least two vectors' worth of bytes.</param>
/// <returns>The number of leading elements that were narrowed and written (0 if the first 8 chars contain non-ASCII data).</returns>
private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
    // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
    // will be elided by JIT once we determine which specific ISAs we support.

    // JIT turns the below into constants

    uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
    nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

    // This method is written such that control generally flows top-to-bottom, avoiding
    // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
    // data, we jump out of the hot paths to targets at the end of the method.

    Debug.Assert(Sse2.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * SizeOfVector128);

    Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
    Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
    Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW

    // First, perform an unaligned read of the first part of the input buffer.

    Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load

    // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
    // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.

    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            return 0;
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            return 0;
        }
    }

    // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.

    Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
    Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED

    nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far

    // We're going to get the best performance when we have aligned writes, so we'll take the
    // hit of potentially unaligned reads in order to hit this sweet spot.

    // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
    // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
    // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
    // that case we can immediately back up to the previous aligned boundary and start the main loop.
    // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
    // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
    // just past the next aligned boundary address.

    if (((uint)pAsciiBuffer & (SizeOfVector128 / 2)) == 0)
    {
        // We need to perform one more partial vector write before we can get the alignment we want.

        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load

        // See comments earlier in this method for information about how this works.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
            {
                goto Finish;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto Finish;
            }
        }

        // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
        Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
    }

    // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
    // point, then use that as the base offset going forward.

    currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
    Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");

    Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
    Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");

    nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
    do
    {
        // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.

        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
        Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
        Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);

        // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
            {
                goto FoundNonAsciiDataInLoop;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto FoundNonAsciiDataInLoop;
            }
        }

        // Build up the ASCII vector and perform the store.

        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);

        Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
        Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned

        currentOffsetInElements += SizeOfVector128;
    } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);

Finish:

    // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
    return currentOffsetInElements;

FoundNonAsciiDataInLoop:

    // Can we at least narrow the high vector?
    // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            goto Finish; // found non-ASCII data
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            goto Finish; // found non-ASCII data
        }
    }

    // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
    asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

    Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");

    Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
    currentOffsetInElements += SizeOfVector128 / 2;

    goto Finish;
}
/// <summary>
/// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
/// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
/// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
/// of elements that were able to be converted.
/// </summary>
/// <param name="pAsciiBuffer">Source buffer of bytes; read with unaligned loads.</param>
/// <param name="pUtf16Buffer">Destination buffer of 16-bit chars.</param>
/// <param name="elementCount">Number of elements available in both buffers.</param>
/// <returns>The number of leading elements that were widened and written.</returns>
public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
    nuint currentOffset = 0;

    // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
    // code below. This has two benefits: (a) we can take advantage of specific instructions like
    // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
    // this method is running.

    if (Sse2.IsSupported)
    {
        if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
        {
            currentOffset = WidenAsciiToUtf16_Sse2(pAsciiBuffer, pUtf16Buffer, elementCount);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const

        // Only bother vectorizing if we have enough data to do so.
        if (elementCount >= SizeOfVector)
        {
            // Note use of SBYTE instead of BYTE below; we're using the two's-complement
            // representation of negative integers to act as a surrogate for "is ASCII?".

            nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector;
            do
            {
                Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
                if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
                {
                    break; // found non-ASCII data
                }

                Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);

                // TODO: Is the below logic also valid for big-endian platforms?
                Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
                Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);

                currentOffset += SizeOfVector;
            } while (currentOffset <= finalOffsetWhereCanLoop);
        }
    }

    Debug.Assert(currentOffset <= elementCount);
    nuint remainingElementCount = elementCount - currentOffset;

    // Try to widen 32 bits -> 64 bits at a time.
    // We needn't update remainingElementCount after this point.

    uint asciiData;

    if (remainingElementCount >= 4)
    {
        nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
        do
        {
            asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
            if (!AllBytesInUInt32AreAscii(asciiData))
            {
                goto FoundNonAsciiData;
            }

            WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
            currentOffset += 4;
        } while (currentOffset <= finalOffsetWhereCanLoop);
    }

    // Try to widen 16 bits -> 32 bits.

    if (((uint)remainingElementCount & 2) != 0)
    {
        asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
        if (!AllBytesInUInt32AreAscii(asciiData))
        {
            goto FoundNonAsciiData;
        }

        if (BitConverter.IsLittleEndian)
        {
            pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
            pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
        }
        else
        {
            pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
            pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
        }

        currentOffset += 2;
    }

    // Try to widen 8 bits -> 16 bits.

    if (((uint)remainingElementCount & 1) != 0)
    {
        asciiData = pAsciiBuffer[currentOffset];
        if (((byte)asciiData & 0x80) != 0)
        {
            goto Finish;
        }

        pUtf16Buffer[currentOffset] = (char)asciiData;
        currentOffset++;
    }

Finish:

    return currentOffset;

FoundNonAsciiData:

    Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");

    // Drain ASCII bytes one at a time.
    // The loop terminates because at least one byte of asciiData has its high bit set.

    while (((byte)asciiData & 0x80) == 0)
    {
        pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
        currentOffset++;
        asciiData >>= 8;
    }

    goto Finish;
}
/// <summary>
/// SSE2 worker for widening ASCII bytes to 16-bit chars. Widens leading ASCII data using
/// 128-bit vectors and returns the number of elements written; any remaining data is left
/// for the caller to drain.
/// </summary>
/// <param name="pAsciiBuffer">Source buffer of bytes; read with unaligned loads.</param>
/// <param name="pUtf16Buffer">Destination buffer of 16-bit chars; the main loop writes to it with aligned stores.</param>
/// <param name="elementCount">Number of elements available; asserted to be at least two vectors' worth of bytes.</param>
/// <returns>The number of leading elements that were widened and written (0 if the first 8 bytes contain non-ASCII data).</returns>
private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
    // JIT turns the below into constants

    uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
    nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

    // This method is written such that control generally flows top-to-bottom, avoiding
    // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
    // data, we jump out of the hot paths to targets at the end of the method.

    Debug.Assert(Sse2.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * SizeOfVector128);

    // We're going to get the best performance when we have aligned writes, so we'll take the
    // hit of potentially unaligned reads in order to hit this sweet spot.

    Vector128<byte> asciiVector;
    Vector128<byte> utf16FirstHalfVector;
    uint mask;

    // First, perform an unaligned read of the first part of the input buffer.

    asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
    mask = (uint)Sse2.MoveMask(asciiVector);

    // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.

    if ((byte)mask != 0)
    {
        return 0;
    }

    // Then perform an unaligned write of the first part of the input buffer.

    Vector128<byte> zeroVector = Vector128<byte>.Zero;

    utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
    Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned

    // Calculate how many elements we wrote in order to get pUtf16Buffer to its next alignment
    // point, then use that as the base offset going forward. Remember the >> 1 to account for
    // that we wrote chars, not bytes. This means we may re-read data in the next iteration of
    // the loop, but this is ok.

    nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
    Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));

    nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;

    do
    {
        // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.

        asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
        mask = (uint)Sse2.MoveMask(asciiVector);

        if (mask != 0)
        {
            // non-ASCII byte somewhere
            goto NonAsciiDataSeenInInnerLoop;
        }

        byte* pStore = (byte*)(pUtf16Buffer + currentOffset);
        Sse2.StoreAligned(pStore, Sse2.UnpackLow(asciiVector, zeroVector));

        pStore += SizeOfVector128;
        Sse2.StoreAligned(pStore, Sse2.UnpackHigh(asciiVector, zeroVector));

        currentOffset += SizeOfVector128;
    } while (currentOffset <= finalOffsetWhereCanRunLoop);

Finish:

    return currentOffset;

NonAsciiDataSeenInInnerLoop:

    // Can we at least widen the first part of the vector?

    if ((byte)mask == 0)
    {
        // First part was all ASCII, widen
        utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
        Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
        currentOffset += SizeOfVector128 / 2;
    }

    goto Finish;
}
/// <summary>
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
/// writes them to the output buffer with machine endianness.
/// </summary>
/// <param name="outputBuffer">Reference to the first of the four destination chars.</param>
/// <param name="value">Four packed bytes; asserted in debug builds to be all-ASCII.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
{
    Debug.Assert(AllBytesInUInt32AreAscii(value));

    if (Bmi2.X64.IsSupported)
    {
        // BMI2 will work regardless of the processor's endianness.
        // PDEP scatters each source byte into the low byte of its own WORD.
        Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
    }
    else
    {
        if (BitConverter.IsLittleEndian)
        {
            // The least significant byte is the first byte in memory: emit it first,
            // then shift the next byte down before each subsequent write.
            outputBuffer = (char)(byte)value;
            value >>= 8;
            Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
            value >>= 8;
            Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
            value >>= 8;
            Unsafe.Add(ref outputBuffer, 3) = (char)value;
        }
        else
        {
            // The least significant byte is the last byte in memory: fill the output back-to-front.
            Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
            value >>= 8;
            Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
            value >>= 8;
            Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
            value >>= 8;
            outputBuffer = (char)value;
        }
    }
}