using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
#if SYSTEM_PRIVATE_CORELIB
// the alignment check consumes at most a single DWORD.)
byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
- uint mask;
+ nuint trailingZeroCount;
+
+ Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
+ Vector128.Create((ushort)0x1001).AsByte() :
+ Vector128.Create((ushort)0x0110).AsByte();
do
{
- if (Sse2.IsSupported)
+ // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're
+ // going to perform an unaligned load. We don't necessarily care about aligning
+ // this because we pessimistically assume we'll encounter non-ASCII data at some
+ // point in the not-too-distant future (otherwise we would've stayed entirely
+ // within the all-ASCII vectorized code at the entry to this method).
+ if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)
{
- // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're
- // going to perform an unaligned load. We don't necessarily care about aligning
- // this because we pessimistically assume we'll encounter non-ASCII data at some
- // point in the not-too-distant future (otherwise we would've stayed entirely
- // within the all-ASCII vectorized code at the entry to this method).
-
- mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte*)pInputBuffer));
+ ulong mask = GetNonAsciiBytes(AdvSimd.LoadVector128(pInputBuffer), bitMask128);
+ if (mask != 0)
+ {
+ trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2;
+ goto LoopTerminatedEarlyDueToNonAsciiData;
+ }
+ }
+ else if (Sse2.IsSupported)
+ {
+ uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer));
if (mask != 0)
{
- goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
+ trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask);
+ goto LoopTerminatedEarlyDueToNonAsciiData;
}
}
else
continue; // need to perform a bounds check because we might be running out of data
- Sse2LoopTerminatedEarlyDueToNonAsciiData:
+ LoopTerminatedEarlyDueToNonAsciiData:
+ // x86 can only be little endian, while ARM can be big or little endian
+ // so if we reached this label we need to check both combinations are supported
+ Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported);
- Debug.Assert(BitConverter.IsLittleEndian);
- Debug.Assert(Sse2.IsSupported);
// The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
- // for each non-ASCII byte we saw. We can count the number of ASCII bytes,
+ // for each non-ASCII byte we saw. trailingZeroCount will count the number of ASCII bytes,
// bump our input counter by that amount, and resume processing from the
// "the first byte is no longer ASCII" portion of the main loop.
+ // We should not expect a total number of zeroes equal or larger than 16.
+ Debug.Assert(trailingZeroCount < 16);
- Debug.Assert(mask != 0);
-
- pInputBuffer += BitOperations.TrailingZeroCount(mask);
+ pInputBuffer += trailingZeroCount;
if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
{
goto ProcessRemainingBytesSlow;
scalarCountAdjustment = tempScalarCountAdjustment;
return pInputBuffer;
}
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
+ {
+ if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
+ Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitMask128);
+ extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
+ return extractedBits.AsUInt64().ToScalar();
+ }
}
}