public static unsafe void StoreAlignedNonTemporal<T>(this Vector128<T> source, T* destination)
where T : unmanaged => source.StoreAligned(destination);
+ /// <summary>
+ /// Stores the lower 64 bits of <paramref name="source"/> to the memory destination <paramref name="destination"/>[<paramref name="elementOffset"/>].
+ /// </summary>
+ /// <typeparam name="T">The type of the elements in the vector.</typeparam>
+ /// <param name="source">The vector that will be stored.</param>
+ /// <param name="destination">The destination to which <paramref name="elementOffset" /> will be added before the vector will be stored.</param>
+ /// <param name="elementOffset">The element offset from <paramref name="destination" /> from which the vector will be stored.</param>
+ /// <remarks>
+ /// Uses double instead of long to get a single instruction instead of storing temps in a general purpose register (or on the stack).
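+ /// For example, <c>vector.StoreLowerUnsafe(ref dest, offset)</c> writes the same 8 bytes as
+ /// <c>vector.GetLower().StoreUnsafe(ref dest, offset)</c> without materializing a Vector64 temporary.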
+ /// </remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static void StoreLowerUnsafe<T>(this Vector128<T> source, ref T destination, nuint elementOffset = 0)
+ where T : struct
+ {
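+ // Reinterpret the lower 64 bits as a double so the store is a single 8-byte instruction
+ // (e.g. movsd on x64, str d0 on Arm64) issued straight from the SIMD register.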
+ ref byte address = ref Unsafe.As<T, byte>(ref Unsafe.Add(ref destination, elementOffset));
+ Unsafe.WriteUnaligned<double>(ref address, source.AsDouble().ToScalar());
+ }
+
/// <summary>Stores a vector at the given destination.</summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="source">The vector that will be stored.</param>
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static unsafe void Widen8To16AndAndWriteTo(Vector128<byte> narrowVector, char* pDest, nuint destOffset)
- {
- if (Vector256.IsHardwareAccelerated)
- {
- Vector256<ushort> wide = Vector256.WidenLower(narrowVector.ToVector256Unsafe());
- wide.StoreUnsafe(ref *(ushort*)pDest, destOffset);
- }
- else
- {
- Vector128.WidenLower(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset);
- Vector128.WidenUpper(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset + 8);
- }
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static unsafe void Narrow16To8AndAndWriteTo(Vector128<ushort> wideVector, byte* pDest, nuint destOffset)
- {
- Vector128<byte> narrow = Vector128.Narrow(wideVector, wideVector);
-
- if (Sse2.IsSupported)
- {
- // MOVQ is supported even on x86, unaligned accesses allowed
- Sse2.StoreScalar((ulong*)(pDest + destOffset), narrow.AsUInt64());
- }
- else if (Vector64.IsHardwareAccelerated)
- {
- narrow.GetLower().StoreUnsafe(ref *pDest, destOffset);
- }
- else
- {
- Unsafe.WriteUnaligned<ulong>(pDest + destOffset, narrow.AsUInt64().ToScalar());
- }
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void ChangeWidthAndWriteTo<TFrom, TTo>(Vector128<TFrom> vector, TTo* pDest, nuint elementOffset)
where TFrom : unmanaged
where TTo : unmanaged
}
else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1)
{
- // narrowing operation required
- // since we know data is all-ASCII, special-case SSE2 to avoid unneeded PAND in Narrow call
- Vector128<byte> narrow = (Sse2.IsSupported)
- ? Sse2.PackUnsignedSaturate(vector.AsInt16(), vector.AsInt16())
- : Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16());
- narrow.GetLower().StoreUnsafe(ref *(byte*)pDest, elementOffset);
+ // narrowing operation required; since the data is known to be all-ASCII, use the ExtractAsciiVector helper
+ Vector128<byte> narrow = ExtractAsciiVector(vector.AsUInt16(), vector.AsUInt16());
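+ // both halves of 'narrow' contain the same 8 packed bytes, so storing the lower 64 bits writes the full result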
+ narrow.StoreLowerUnsafe(ref *(byte*)pDest, elementOffset);
}
else
{
}
}
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static unsafe Vector128<TTo> NarrowOrWidenLowerVectorUnsigned<TFrom, TTo>(Vector128<TFrom> vector)
- where TFrom : unmanaged
- where TTo : unmanaged
- {
- if (sizeof(TFrom) == 1 && sizeof(TTo) == 2)
- {
- return Vector128.WidenLower(vector.AsByte()).As<ushort, TTo>();
- }
- else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1)
- {
- return Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16()).As<byte, TTo>();
- }
- else
- {
- throw new NotSupportedException();
- }
- }
-
private struct ToUpperConversion { }
private struct ToLowerConversion { }
}
}
}
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
// This method contains logic optimized using vector instructions for both x64 and Arm64.
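+ // Narrow 8 UTF-16 chars at a time via ExtractAsciiVector and write each 8-byte result with a
+ // single 64-bit StoreLowerUnsafe, aligning pAsciiBuffer before entering the main loop.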
ref byte asciiBuffer = ref *pAsciiBuffer;
Vector128<byte> asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
- asciiVector.GetLower().StoreUnsafe(ref asciiBuffer);
+ asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0);
nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
// We're going to get the best performance when we have aligned writes, so we'll take the
// Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
- asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+ asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
}
// Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
- asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+ asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
currentOffsetInElements += SizeOfVector128 / 2;
goto Finish;