{
byte* toMaskPtr = toMaskBeg;
byte* toMaskEnd = toMaskBeg + toMask.Length;
- byte* maskPtr = (byte*)&mask;
if (toMaskEnd - toMaskPtr >= sizeof(int))
{
- // align our pointer to sizeof(int)
-
- while ((ulong)toMaskPtr % sizeof(int) != 0)
- {
- Debug.Assert(toMaskPtr < toMaskEnd);
-
- *toMaskPtr++ ^= maskPtr[maskIndex];
- maskIndex = (maskIndex + 1) & 3;
- }
-
- int rolledMask;
- if (BitConverter.IsLittleEndian)
- {
- rolledMask = (int)BitOperations.RotateRight((uint)mask, maskIndex * 8);
- }
- else
- {
- rolledMask = (int)BitOperations.RotateLeft((uint)mask, maskIndex * 8);
- }
-
- // use SIMD if possible.
+ int rolledMask = BitConverter.IsLittleEndian ?
+ (int)BitOperations.RotateRight((uint)mask, maskIndex * 8) :
+ (int)BitOperations.RotateLeft((uint)mask, maskIndex * 8);
- if (Vector.IsHardwareAccelerated && Vector<byte>.Count % sizeof(int) == 0 && (toMaskEnd - toMaskPtr) >= Vector<byte>.Count)
+ // Process Vector<byte>.Count bytes at a time.
+ if (Vector.IsHardwareAccelerated && (toMaskEnd - toMaskPtr) >= Vector<byte>.Count)
{
- // align our pointer to Vector<byte>.Count
-
- while ((ulong)toMaskPtr % (uint)Vector<byte>.Count != 0)
- {
- Debug.Assert(toMaskPtr < toMaskEnd);
-
- *(int*)toMaskPtr ^= rolledMask;
- toMaskPtr += sizeof(int);
- }
-
- // use SIMD.
-
- if (toMaskEnd - toMaskPtr >= Vector<byte>.Count)
+ Vector<byte> maskVector = Vector.AsVectorByte(new Vector<int>(rolledMask));
+ do
{
- Vector<byte> maskVector = Vector.AsVectorByte(new Vector<int>(rolledMask));
-
- do
- {
- *(Vector<byte>*)toMaskPtr ^= maskVector;
- toMaskPtr += Vector<byte>.Count;
- }
- while (toMaskEnd - toMaskPtr >= Vector<byte>.Count);
+ *(Vector<byte>*)toMaskPtr ^= maskVector;
+ toMaskPtr += Vector<byte>.Count;
}
+ while (toMaskEnd - toMaskPtr >= Vector<byte>.Count);
}
- // process remaining data (or all, if couldn't use SIMD) 4 bytes at a time.
-
+ // Process 4 bytes at a time.
while (toMaskEnd - toMaskPtr >= sizeof(int))
{
    // XOR one whole 4-byte word with the pre-rotated mask.
    *(int*)toMaskPtr ^= rolledMask;

    // Advance past the word just masked. Without this step the loop
    // condition never changes and the same word is re-XORed forever.
    toMaskPtr += sizeof(int);
}
}
- // do any remaining data a byte at a time.
-
+ // Process 1 byte at a time.
+ byte* maskPtr = (byte*)&mask;
while (toMaskPtr != toMaskEnd)
{
*toMaskPtr++ ^= maskPtr[maskIndex];