using System.Diagnostics.Contracts;
using System.Runtime;
using System.Runtime.CompilerServices;
using System.Runtime.ConstrainedExecution;
using System.Security;

#if BIT64
using nuint = System.UInt64;
#else // BIT64
using nuint = System.UInt32;
#endif // BIT64

[System.Runtime.InteropServices.ComVisible(true)]
public static class Buffer
{
// This method has different signature for x64 and other platforms and is done for performance reasons.
[System.Security.SecurityCritical]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
-#if BIT64
- internal unsafe static void Memmove(byte* dest, byte* src, ulong len)
-#else
- internal unsafe static void Memmove(byte* dest, byte* src, uint len)
-#endif
+ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
{
// P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
// This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
-#if BIT64
- if ((ulong)dest - (ulong)src < len) goto PInvoke;
-#else
- if (((uint)dest - (uint)src) < len) goto PInvoke;
-#endif
- //
+
+ if ((nuint)dest - (nuint)src < len) goto PInvoke;
+
// This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
//
// Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as
// possible yet and so we have this implementation here for now.
- //
+
+ // Note: It's important that this switch handles lengths at least up to 22.
+ // See notes below near the main loop for why.
+
+ // The switch will be very fast since it can be implemented using a jump
+ // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
switch (len)
{
*dest = *src;
return;
case 2:
- *(short *)dest = *(short *)src;
+ *(short*)dest = *(short*)src;
return;
case 3:
- *(short *)dest = *(short *)src;
+ *(short*)dest = *(short*)src;
*(dest + 2) = *(src + 2);
return;
case 4:
- *(int *)dest = *(int *)src;
+ *(int*)dest = *(int*)src;
return;
case 5:
*(int*)dest = *(int*)src;
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
return;
- default:
- break;
+ case 17:
+#if BIT64
+ *(long*)dest = *(long*)src;
+ *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+ *(int*)dest = *(int*)src;
+ *(int*)(dest + 4) = *(int*)(src + 4);
+ *(int*)(dest + 8) = *(int*)(src + 8);
+ *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+ *(dest + 16) = *(src + 16);
+ return;
+ case 18:
+#if BIT64
+ *(long*)dest = *(long*)src;
+ *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+ *(int*)dest = *(int*)src;
+ *(int*)(dest + 4) = *(int*)(src + 4);
+ *(int*)(dest + 8) = *(int*)(src + 8);
+ *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+ *(short*)(dest + 16) = *(short*)(src + 16);
+ return;
+ case 19:
+#if BIT64
+ *(long*)dest = *(long*)src;
+ *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+ *(int*)dest = *(int*)src;
+ *(int*)(dest + 4) = *(int*)(src + 4);
+ *(int*)(dest + 8) = *(int*)(src + 8);
+ *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+ *(short*)(dest + 16) = *(short*)(src + 16);
+ *(dest + 18) = *(src + 18);
+ return;
+ case 20:
+#if BIT64
+ *(long*)dest = *(long*)src;
+ *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+ *(int*)dest = *(int*)src;
+ *(int*)(dest + 4) = *(int*)(src + 4);
+ *(int*)(dest + 8) = *(int*)(src + 8);
+ *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+ *(int*)(dest + 16) = *(int*)(src + 16);
+ return;
+ case 21:
+#if BIT64
+ *(long*)dest = *(long*)src;
+ *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+ *(int*)dest = *(int*)src;
+ *(int*)(dest + 4) = *(int*)(src + 4);
+ *(int*)(dest + 8) = *(int*)(src + 8);
+ *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+ *(int*)(dest + 16) = *(int*)(src + 16);
+ *(dest + 20) = *(src + 20);
+ return;
+ case 22:
+#if BIT64
+ *(long*)dest = *(long*)src;
+ *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+ *(int*)dest = *(int*)src;
+ *(int*)(dest + 4) = *(int*)(src + 4);
+ *(int*)(dest + 8) = *(int*)(src + 8);
+ *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+ *(int*)(dest + 16) = *(int*)(src + 16);
+ *(short*)(dest + 20) = *(short*)(src + 20);
+ return;
}
// P/Invoke into the native version for large lengths
if (len >= 512) goto PInvoke;
+ nuint i = 0; // byte offset at which we're copying
+
if (((int)dest & 3) != 0)
{
if (((int)dest & 1) != 0)
{
- *dest = *src;
- src++;
- dest++;
- len--;
- if (((int)dest & 2) == 0)
- goto Aligned;
+ *(dest + i) = *(src + i);
+ i += 1;
+ if (((int)dest & 2) != 0)
+ goto IntAligned;
}
- *(short *)dest = *(short *)src;
- src += 2;
- dest += 2;
- len -= 2;
- Aligned: ;
+ *(short*)(dest + i) = *(short*)(src + i);
+ i += 2;
}
+ IntAligned:
+
#if BIT64
- if (((int)dest & 4) != 0)
+ // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If
+ // (int)dest % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
+ // bytes to the next aligned address (respectively), so do nothing. On the other hand,
+ // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until
+ // we're aligned.
+ // The thing 1, 2, 3, and 4 have in common that the others don't is that if you
+ // subtract one from them, their 3rd lsb will not be set. Hence, the below check.
+
+ if ((((int)dest - 1) & 4) == 0)
{
- *(int *)dest = *(int *)src;
- src += 4;
- dest += 4;
- len -= 4;
+ *(int*)(dest + i) = *(int*)(src + i);
+ i += 4;
}
-#endif
+#endif // BIT64
-#if BIT64
- ulong count = len / 16;
-#else
- uint count = len / 16;
-#endif
- while (count > 0)
+ nuint end = len - 16;
+ len -= i; // lower 4 bits of len represent how many bytes are left *after* the unrolled loop
+
+ // We know due to the above switch-case that this loop will always run 1 iteration; max
+ // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so
+ // the switch handles lengths 0-22.
+ Contract.Assert(end >= 7 && i <= end);
+
+ // This is separated out into a different variable, so the i + 16 addition can be
+ // performed at the start of the pipeline and the loop condition does not have
+ // a dependency on the writes.
+ nuint counter;
+
+ do
{
+ counter = i + 16;
+
+ // This loop looks very costly since there appear to be a bunch of temporary values
+ // being created with the adds, but the jit (for x86 anyways) will convert each of
+ // these to use memory addressing operands.
+
+ // So the only cost is a bit of code size, which is made up for by the fact that
+ // we save on writes to dest/src.
+
#if BIT64
- ((long*)dest)[0] = ((long*)src)[0];
- ((long*)dest)[1] = ((long*)src)[1];
+ *(long*)(dest + i) = *(long*)(src + i);
+ *(long*)(dest + i + 8) = *(long*)(src + i + 8);
#else
- ((int*)dest)[0] = ((int*)src)[0];
- ((int*)dest)[1] = ((int*)src)[1];
- ((int*)dest)[2] = ((int*)src)[2];
- ((int*)dest)[3] = ((int*)src)[3];
+ *(int*)(dest + i) = *(int*)(src + i);
+ *(int*)(dest + i + 4) = *(int*)(src + i + 4);
+ *(int*)(dest + i + 8) = *(int*)(src + i + 8);
+ *(int*)(dest + i + 12) = *(int*)(src + i + 12);
#endif
- dest += 16;
- src += 16;
- count--;
+
+ i = counter;
+
+ // See notes above for why this wasn't used instead
+ // i += 16;
}
+ while (counter <= end);
if ((len & 8) != 0)
{
#if BIT64
- ((long*)dest)[0] = ((long*)src)[0];
+ *(long*)(dest + i) = *(long*)(src + i);
#else
- ((int*)dest)[0] = ((int*)src)[0];
- ((int*)dest)[1] = ((int*)src)[1];
+ *(int*)(dest + i) = *(int*)(src + i);
+ *(int*)(dest + i + 4) = *(int*)(src + i + 4);
#endif
- dest += 8;
- src += 8;
- }
- if ((len & 4) != 0)
- {
- ((int*)dest)[0] = ((int*)src)[0];
- dest += 4;
- src += 4;
- }
- if ((len & 2) != 0)
- {
- ((short*)dest)[0] = ((short*)src)[0];
- dest += 2;
- src += 2;
- }
- if ((len & 1) != 0)
- *dest = *src;
+ i += 8;
+ }
+ if ((len & 4) != 0)
+ {
+ *(int*)(dest + i) = *(int*)(src + i);
+ i += 4;
+ }
+ if ((len & 2) != 0)
+ {
+ *(short*)(dest + i) = *(short*)(src + i);
+ i += 2;
+ }
+ if ((len & 1) != 0)
+ {
+ *(dest + i) = *(src + i);
+ // We're not using i after this, so not needed
+ // i += 1;
+ }
return;
[SecurityCritical]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
[MethodImplAttribute(MethodImplOptions.NoInlining)]
-#if BIT64
- private unsafe static void _Memmove(byte* dest, byte* src, ulong len)
-#else
- private unsafe static void _Memmove(byte* dest, byte* src, uint len)
-#endif
+ private unsafe static void _Memmove(byte* dest, byte* src, nuint len)
{
__Memmove(dest, src, len);
}
[SuppressUnmanagedCodeSecurity]
[SecurityCritical]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
-#if BIT64
- extern private unsafe static void __Memmove(byte* dest, byte* src, ulong len);
-#else
- extern private unsafe static void __Memmove(byte* dest, byte* src, uint len);
-#endif
-
+ extern private unsafe static void __Memmove(byte* dest, byte* src, nuint len);
// The attributes on this method are chosen for best JIT performance.
// Please do not edit unless intentional.
{
ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
}
-#if BIT64
- Memmove((byte*)destination, (byte*)source, checked((ulong) sourceBytesToCopy));
-#else
- Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
-#endif // BIT64
+ Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
}
}
#if BIT64
Memmove((byte*)destination, (byte*)source, sourceBytesToCopy);
-#else
+#else // BIT64
Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
#endif // BIT64
}