// Feature flags bit positions. They are mostly based on the CPUID spec.
// (We assign CPUID itself to one of the currently reserved bits --
// feel free to change this if needed.)
-enum CpuFeature { SSE3 = 32, // x86
+// On X86/X64, values below 32 are bits in EDX; values 32 and above are bits in ECX.
+enum CpuFeature { SSE4_1 = 32 + 19, // x86
+ SSE3 = 32 + 0, // x86
SSE2 = 26, // x86
CMOV = 15, // x86
RDTSC = 4, // x86
}
+void Assembler::movntdqa(XMMRegister dst, const Operand& src) {  // Emits MOVNTDQA dst, src.
+ ASSERT(CpuFeatures::IsEnabled(SSE4_1));  // Caller must have enabled SSE4_1.
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ EMIT(0x66);  // Opcode bytes: 66 0F 38 2A.
+ EMIT(0x0F);
+ EMIT(0x38);
+ EMIT(0x2A);
+ emit_sse_operand(dst, src);  // Operand bytes: dst register, then the src memory operand.
+}
+
+
+void Assembler::movntdq(const Operand& dst, XMMRegister src) {  // Emits MOVNTDQ dst, src.
+ ASSERT(CpuFeatures::IsEnabled(SSE2));  // Caller must have enabled SSE2.
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ EMIT(0x66);  // Opcode bytes: 66 0F E7.
+ EMIT(0x0F);
+ EMIT(0xE7);
+ emit_sse_operand(src, dst);  // Register is passed first even though it is the source here.
+}
+
+
+void Assembler::prefetch(const Operand& src, int level) {  // Emits a prefetch of src with hint `level`.
+ ASSERT(is_uint2(level));  // Hint must pass is_uint2; the disassembler knows hints 0..3.
+ EnsureSpace ensure_space(this);  // NOTE(review): no CpuFeatures check is asserted here, unlike movntdq/movntdqa - confirm.
+ last_pc_ = pc_;
+ EMIT(0x0F);  // Opcode bytes: 0F 18.
+ EMIT(0x18);
+ XMMRegister code = { level }; // Wrap the hint number so it is emitted in the Reg position of RegR/M.
+ emit_sse_operand(code, src);
+}
+
+
void Assembler::movdbl(XMMRegister dst, const Operand& src) {
EnsureSpace ensure_space(this);
last_pc_ = pc_;
emit_sse_operand(dst, src);
}
-
void Assembler::emit_sse_operand(XMMRegister reg, const Operand& adr) {
Register ireg = { reg.code() };
emit_operand(ireg, adr);
void pxor(XMMRegister dst, XMMRegister src);
void ptest(XMMRegister dst, XMMRegister src);
+ // Parallel XMM operations.
+ // movntdqa reads from the memory operand src into dst; movntdq writes the
+ // register src to the memory operand dst.
+ void movntdqa(XMMRegister dst, const Operand& src);
+ void movntdq(const Operand& dst, XMMRegister src);
+ // Prefetch src position into cache level.
+ // Level 1, 2 or 3 specifies CPU cache level. Level 0 specifies a
+ // non-temporal prefetch.
+ void prefetch(const Operand& src, int level);
+ // TODO(lrn): Need SFENCE for movnt?
+
// Debugging
void Print();
#undef __
+#define __ masm.
+
+MemCopyFunction CreateMemCopyFunction() {  // Assembles the copy routine into a new buffer and returns it.
+ size_t actual_size;
+ byte* buffer = static_cast<byte*>(OS::Allocate(Assembler::kMinimalBufferSize,
+ &actual_size,
+ true));  // NOTE(review): presumably the "executable" flag - confirm against OS::Allocate.
+ CHECK(buffer);
+ HandleScope handles;
+ MacroAssembler masm(buffer, static_cast<int>(actual_size));
+
+ // Generated code is put into a fixed, unmovable, buffer, and not into
+ // the V8 heap. We can't, and don't, refer to any relocatable addresses
+ // (e.g. the JavaScript nan-object).
+
+ // 32-bit C declaration function calls pass arguments on stack.
+
+ // Stack layout:
+ // esp[12]: Third argument, size.
+ // esp[8]: Second argument, source pointer.
+ // esp[4]: First argument, destination pointer.
+ // esp[0]: return address
+
+ const int kDestinationOffset = 1 * kPointerSize;
+ const int kSourceOffset = 2 * kPointerSize;
+ const int kSizeOffset = 3 * kPointerSize;
+
+ int stack_offset = 0; // Update if we change the stack height.
+
+ if (FLAG_debug_code) {  // Debug only: trap with int3 when size < kMinComplexMemCopy.
+ __ cmp(Operand(esp, kSizeOffset + stack_offset),
+ Immediate(kMinComplexMemCopy));
+ Label ok;
+ __ j(greater_equal, &ok);  // NOTE(review): signed condition, but size is a size_t - confirm intent.
+ __ int3();
+ __ bind(&ok);
+ }
+ if (CpuFeatures::IsSupported(SSE2)) {
+ CpuFeatures::Scope enable(SSE2);
+ __ push(edi);  // edi and esi are restored before each ret below.
+ __ push(esi);
+ stack_offset += 2 * kPointerSize;  // Account for the two pushes when addressing the arguments.
+ Register dst = edi;
+ Register src = esi;
+ Register count = ecx;
+ __ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
+ __ mov(src, Operand(esp, stack_offset + kSourceOffset));
+ __ mov(count, Operand(esp, stack_offset + kSizeOffset));
+
+ // Copy the first 16 bytes unaligned, then advance dst and src by 16 - (dst & 15).
+ __ movdqu(xmm0, Operand(src, 0));
+ __ movdqu(Operand(dst, 0), xmm0);
+ __ mov(edx, dst);
+ __ and_(edx, 0xF);
+ __ neg(edx);
+ __ add(Operand(edx), Immediate(16));  // edx = 16 - (dst & 15)
+ __ add(dst, Operand(edx));
+ __ add(src, Operand(edx));
+ __ sub(Operand(count), edx);  // count = bytes remaining after the skipped prefix.
+
+ // edi is now aligned. Check if esi is also aligned.
+ Label unaligned_source;
+ __ test(Operand(src), Immediate(0x0F));
+ __ j(not_zero, &unaligned_source);
+ {
+ __ IncrementCounter(&Counters::memcopy_aligned, 1);
+ // Copy loop for aligned source and destination.
+ __ mov(edx, count);  // Keep the byte count in edx; ecx is reused as the loop counter.
+ Register loop_count = ecx;
+ Register count = edx;
+ __ shr(loop_count, 5);  // 32-byte iterations; the loop below assumes this is non-zero.
+ {
+ // Main copy loop.
+ Label loop;
+ __ bind(&loop);
+ __ prefetch(Operand(src, 0x20), 1);
+ __ movdqa(xmm0, Operand(src, 0x00));
+ __ movdqa(xmm1, Operand(src, 0x10));
+ __ add(Operand(src), Immediate(0x20));
+
+ __ movdqa(Operand(dst, 0x00), xmm0);
+ __ movdqa(Operand(dst, 0x10), xmm1);
+ __ add(Operand(dst), Immediate(0x20));
+
+ __ dec(loop_count);
+ __ j(not_zero, &loop);
+ }
+
+ // At most 31 bytes to copy.
+ Label move_less_16;
+ __ test(Operand(count), Immediate(0x10));
+ __ j(zero, &move_less_16);
+ __ movdqa(xmm0, Operand(src, 0));
+ __ add(Operand(src), Immediate(0x10));
+ __ movdqa(Operand(dst, 0), xmm0);
+ __ add(Operand(dst), Immediate(0x10));
+ __ bind(&move_less_16);
+
+ // At most 15 bytes to copy. Copy 16 bytes at end of string.
+ __ and_(count, 0xF);
+ __ movdqu(xmm0, Operand(src, count, times_1, -0x10));  // Re-copies some bytes already written.
+ __ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
+
+ __ pop(esi);
+ __ pop(edi);
+ __ ret(0);
+ }
+ __ Align(16);
+ {
+ // Copy loop for unaligned source and aligned destination.
+ // If source is not aligned, we can't read it as efficiently.
+ __ bind(&unaligned_source);
+ __ IncrementCounter(&Counters::memcopy_unaligned, 1);
+ __ mov(edx, ecx);  // Keep the byte count in edx; ecx is reused as the loop counter.
+ Register loop_count = ecx;
+ Register count = edx;
+ __ shr(loop_count, 5);  // 32-byte iterations; the loop below assumes this is non-zero.
+ {
+ // Main copy loop
+ Label loop;
+ __ bind(&loop);
+ __ prefetch(Operand(src, 0x20), 1);
+ __ movdqu(xmm0, Operand(src, 0x00));  // Unaligned loads, aligned stores.
+ __ movdqu(xmm1, Operand(src, 0x10));
+ __ add(Operand(src), Immediate(0x20));
+
+ __ movdqa(Operand(dst, 0x00), xmm0);
+ __ movdqa(Operand(dst, 0x10), xmm1);
+ __ add(Operand(dst), Immediate(0x20));
+
+ __ dec(loop_count);
+ __ j(not_zero, &loop);
+ }
+
+ // At most 31 bytes to copy.
+ Label move_less_16;
+ __ test(Operand(count), Immediate(0x10));
+ __ j(zero, &move_less_16);
+ __ movdqu(xmm0, Operand(src, 0));
+ __ add(Operand(src), Immediate(0x10));
+ __ movdqa(Operand(dst, 0), xmm0);
+ __ add(Operand(dst), Immediate(0x10));
+ __ bind(&move_less_16);
+
+ // At most 15 bytes to copy. Copy 16 bytes at end of string.
+ __ and_(count, 0x0F);
+ __ movdqu(xmm0, Operand(src, count, times_1, -0x10));  // Re-copies some bytes already written.
+ __ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
+
+ __ pop(esi);
+ __ pop(edi);
+ __ ret(0);
+ }
+
+ } else {
+ __ IncrementCounter(&Counters::memcopy_noxmm, 1);
+ // SSE2 not supported. Unlikely to happen in practice.
+ __ push(edi);  // edi and esi are restored before the ret below.
+ __ push(esi);
+ stack_offset += 2 * kPointerSize;  // Account for the two pushes when addressing the arguments.
+ __ cld();
+ Register dst = edi;
+ Register src = esi;
+ Register count = ecx;
+ __ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
+ __ mov(src, Operand(esp, stack_offset + kSourceOffset));
+ __ mov(count, Operand(esp, stack_offset + kSizeOffset));
+
+ // Copy the first word.
+ __ mov(eax, Operand(src, 0));
+ __ mov(Operand(dst, 0), eax);
+
+ // Increment src and dst so that dst is aligned.
+ __ mov(edx, dst);
+ __ and_(edx, 0x03);
+ __ neg(edx);
+ __ add(Operand(edx), Immediate(4)); // edx = 4 - (dst & 3)
+ __ add(dst, Operand(edx));
+ __ add(src, Operand(edx));
+ __ sub(Operand(count), edx);
+ // edi is now aligned, ecx holds number of remaining bytes to copy.
+
+ __ mov(edx, count);  // Keep the byte count in edx for the tail copy.
+ count = edx;
+ __ shr(ecx, 2); // Make word count instead of byte count.
+ __ rep_movs();
+
+ // At most 3 bytes left to copy. Copy 4 bytes at end of string.
+ __ and_(count, 3);
+ __ mov(eax, Operand(src, count, times_1, -4));  // Re-copies some bytes already written.
+ __ mov(Operand(dst, count, times_1, -4), eax);
+
+ __ pop(esi);
+ __ pop(edi);
+ __ ret(0);
+ }
+
+ CodeDesc desc;
+ masm.GetCode(&desc);
+ // Call the function from C++.
+ return FUNCTION_CAST<MemCopyFunction>(buffer);  // The code starts at the beginning of buffer.
+}
+
+#undef __
+
} } // namespace v8::internal
#endif // V8_TARGET_ARCH_IA32
// Returns NULL if the instruction is not handled here.
static const char* F0Mnem(byte f0byte) {
switch (f0byte) {
+ case 0x18: return "prefetch";
case 0xA2: return "cpuid";
case 0x31: return "rdtsc";
case 0xBE: return "movsx_b";
case 0x0F:
{ byte f0byte = *(data+1);
const char* f0mnem = F0Mnem(f0byte);
- if (f0byte == 0xA2 || f0byte == 0x31) {
+ if (f0byte == 0x18) {
+ // data still points at the 0x0F escape byte here. Step over both opcode
+ // bytes (0x0F 0x18) so that the ModR/M byte is decoded, not the opcode.
+ data += 2;
+ int mod, regop, rm;
+ get_modrm(*data, &mod, &regop, &rm);
+ // The Reg field of the ModR/M byte carries the prefetch hint.
+ const char* suffix[] = {"nta", "1", "2", "3"};
+ AppendToBuffer("%s%s ", f0mnem, suffix[regop & 0x03]);
+ data += PrintRightOperand(data);
+ } else if (f0byte == 0xA2 || f0byte == 0x31) {
AppendToBuffer("%s", f0mnem);
data += 2;
} else if ((f0byte & 0xF0) == 0x80) {
NameOfXMMRegister(regop),
NameOfXMMRegister(rm));
data++;
+ } else if (*data == 0x2A) {
+ // movntdqa: skip the 0x2A opcode byte, then decode the ModR/M byte.
+ data++;
+ int mod, regop, rm;
+ get_modrm(*data, &mod, &regop, &rm);
+ AppendToBuffer("movntdqa %s,", NameOfXMMRegister(regop));
+ data += PrintRightOperand(data);
} else {
UnimplementedInstruction();
}
get_modrm(*data, &mod, &regop, &rm);
data += PrintRightOperand(data);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
+ } else if (*data == 0xE7) {
+ // movntdq: memory operand is printed first, then the XMM source register.
+ AppendToBuffer("movntdq ");
+ data++;
+ int mod, regop, rm;
+ get_modrm(*data, &mod, &regop, &rm);
+ data += PrintRightOperand(data);
+ AppendToBuffer(",%s", NameOfXMMRegister(regop));
} else if (*data == 0xEF) {
data++;
int mod, regop, rm;
};
+// Custom memcpy implementation for platforms where the standard version
+// may not be good enough.
+// TODO(lrn): Check whether some IA32 platforms should be excluded.
+#if defined(V8_TARGET_ARCH_IA32)
+
+// TODO(lrn): Extend to other platforms as needed.
+
+typedef void (*MemCopyFunction)(void* dest, const void* src, size_t size);
+
+// Implemented in codegen-<arch>.cc.
+MemCopyFunction CreateMemCopyFunction();
+
+// Limit below which the extra overhead of the MemCopy function is likely
+// to outweigh the benefits of faster copying.
+// TODO(lrn): Try to find a more precise value.
+static const int kMinComplexMemCopy = 256;
+
+
+// Copy memory area to disjoint memory area.
+// The generated routine is only valid for sizes of at least
+// kMinComplexMemCopy: it copies 16 bytes unconditionally at both ends and
+// its main loop runs at least once. Smaller copies are therefore handed to
+// memcpy instead of the generated code.
+static inline void MemCopy(void* dest, const void* src, size_t size) {
+  if (size < static_cast<size_t>(kMinComplexMemCopy)) {
+    memcpy(dest, src, size);
+    return;
+  }
+  // The routine is generated on first use.
+  static MemCopyFunction memcopy = CreateMemCopyFunction();
+  (*memcopy)(dest, src, size);
+#ifdef DEBUG
+  // Verify the generated code against the source bytes.
+  CHECK_EQ(0, memcmp(dest, src, size));
+#endif
+}
+
+#else // V8_TARGET_ARCH_IA32
+
+static inline void MemCopy(void* dest, const void* src, size_t size) {  // Non-IA32 build: plain memcpy.
+ memcpy(dest, src, size);
+}
+
+static const int kMinComplexMemCopy = 256;
+
+#endif // V8_TARGET_ARCH_IA32
+
+
// Copy from ASCII/16bit chars to ASCII/16bit chars.
template <typename sourcechar, typename sinkchar>
static inline void CopyChars(sinkchar* dest, const sourcechar* src, int chars) {
sinkchar* limit = dest + chars;
#ifdef V8_HOST_CAN_READ_UNALIGNED
if (sizeof(*dest) == sizeof(*src)) {
+ if (chars >= static_cast<int>(kMinComplexMemCopy / sizeof(*dest))) {
+ MemCopy(dest, src, chars * sizeof(*dest));
+ return;
+ }
// Number of characters in a uintptr_t.
static const int kStepSize = sizeof(uintptr_t) / sizeof(*dest); // NOLINT
while (dest <= limit - kStepSize) {
SC(array_function_runtime, V8.ArrayFunctionRuntime) \
SC(array_function_native, V8.ArrayFunctionNative) \
SC(for_in, V8.ForIn) \
+ SC(memcopy_aligned, V8.MemCopyAligned) \
+ SC(memcopy_unaligned, V8.MemCopyUnaligned) \
+ SC(memcopy_noxmm, V8.MemCopyNoXMM) \
SC(enum_cache_hits, V8.EnumCacheHits) \
SC(enum_cache_misses, V8.EnumCacheMisses) \
SC(reloc_info_count, V8.RelocInfoCount) \
buffer.Dispose();
}
}
+
+
+// Copies kMinComplexMemCopy + length_alignment bytes with MemCopy, reading
+// from src at offset source_alignment and writing to dst at offset
+// 32 + destination_alignment. Checks every copied byte and the byte on
+// either side of the written range.
+void TestMemCopy(Vector<byte> src,
+                 Vector<byte> dst,
+                 int source_alignment,
+                 int destination_alignment,
+                 int length_alignment) {
+  const int length = kMinComplexMemCopy + length_alignment;
+  byte* const from = src.start() + source_alignment;
+  byte* const to = dst.start() + 32 + destination_alignment;
+
+  // Fill the destination with 0xFF so writes outside the range show up.
+  memset(dst.start(), 0xFF, dst.length());
+  MemCopy(to, from, static_cast<size_t>(length));
+  printf("[%d,%d,%d]\n",
+         source_alignment, destination_alignment, length_alignment);
+
+  // The copied range must match the source byte for byte.
+  int i = 0;
+  while (i < length) {
+    CHECK_EQ(from[i], to[i]);
+    i++;
+  }
+
+  // The bytes just before and just after the range must be untouched.
+  CHECK_EQ(0xFF, to[-1]);
+  CHECK_EQ(0xFF, to[length]);
+}
+
+
+
+// Exercises MemCopy over combinations of source alignment, destination
+// alignment and length.
+TEST(MemCopy) {
+  const int kBufferSize = kMinComplexMemCopy + 128;
+  Vector<byte> source = Vector<byte>::New(kBufferSize);
+  Vector<byte> destination = Vector<byte>::New(kBufferSize);
+
+  // Fill the source with a repeating 0..127 pattern.
+  for (int pos = 0; pos != kBufferSize; ++pos) {
+    source[pos] = static_cast<byte>(pos & 0x7F);
+  }
+
+  // Source and destination share the same offset; length grows with it.
+  for (int offset = 0; offset < 32; ++offset) {
+    TestMemCopy(source, destination, offset, offset, offset * 2);
+  }
+
+  // Destination offset differs from the source offset (modulo 32).
+  for (int src_offset = 0; src_offset < 32; ++src_offset) {
+    for (int delta = 1; delta < 32; ++delta) {
+      const int dst_offset = (src_offset + delta) & 0x1F;
+      TestMemCopy(source, destination, src_offset, dst_offset, 0);
+    }
+  }
+
+  // Fixed offsets, varying length.
+  for (int extra = 0; extra < 32; ++extra) {
+    TestMemCopy(source, destination, 3, 7, extra);
+  }
+
+  destination.Dispose();
+  source.Dispose();
+}