From: lrn@chromium.org
Date: Fri, 4 Jun 2010 11:30:55 +0000 (+0000)
Subject: Add optimized version of memcpy on ia32.
X-Git-Tag: upstream/4.7.83~21689
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d3d295efa799515edd6609364b3655ce07111234;p=platform%2Fupstream%2Fv8.git

Add optimized version of memcpy on ia32.
Only used in one place right now. Still room for tweaking.

Review URL: http://codereview.chromium.org/2582001

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4796 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---

diff --git a/src/globals.h b/src/globals.h
index 24ff8cb..6cf2626 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -647,7 +647,9 @@ F FUNCTION_CAST(Address addr) {
 // Feature flags bit positions. They are mostly based on the CPUID spec.
 // (We assign CPUID itself to one of the currently reserved bits --
 // feel free to change this if needed.)
-enum CpuFeature { SSE3 = 32,   // x86
+// On X86/X64, values below 32 are bits in EDX; values 32 and above are bits in ECX.
+enum CpuFeature { SSE4_1 = 32 + 19,  // x86
+                  SSE3 = 32 + 0,     // x86
                   SSE2 = 26,   // x86
                   CMOV = 15,   // x86
                   RDTSC = 4,   // x86
diff --git a/src/ia32/assembler-ia32.cc b/src/ia32/assembler-ia32.cc
index 0d31dd7..a436827 100644
--- a/src/ia32/assembler-ia32.cc
+++ b/src/ia32/assembler-ia32.cc
@@ -2230,6 +2230,40 @@ void Assembler::movdqu(XMMRegister dst, const Operand& src) {
 }
 
 
+void Assembler::movntdqa(XMMRegister dst, const Operand& src) {
+  ASSERT(CpuFeatures::IsEnabled(SSE4_1));
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0x66);
+  EMIT(0x0F);
+  EMIT(0x38);
+  EMIT(0x2A);
+  emit_sse_operand(dst, src);
+}
+
+
+void Assembler::movntdq(const Operand& dst, XMMRegister src) {
+  ASSERT(CpuFeatures::IsEnabled(SSE2));
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0x66);
+  EMIT(0x0F);
+  EMIT(0xE7);
+  emit_sse_operand(src, dst);
+}
+
+
+void Assembler::prefetch(const Operand& src, int level) {
+  ASSERT(is_uint2(level));
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0x0F);
+  EMIT(0x18);
+  XMMRegister code = { level };  // Emit hint number in Reg position of ModR/M byte.
+  emit_sse_operand(code, src);
+}
+
+
 void Assembler::movdbl(XMMRegister dst, const Operand& src) {
   EnsureSpace ensure_space(this);
   last_pc_ = pc_;
@@ -2309,7 +2343,6 @@ void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   emit_sse_operand(dst, src);
 }
 
-
 void Assembler::emit_sse_operand(XMMRegister reg, const Operand& adr) {
   Register ireg = { reg.code() };
   emit_operand(ireg, adr);
diff --git a/src/ia32/assembler-ia32.h b/src/ia32/assembler-ia32.h
index cf09896..76dfe7c 100644
--- a/src/ia32/assembler-ia32.h
+++ b/src/ia32/assembler-ia32.h
@@ -791,6 +791,15 @@ class Assembler : public Malloced {
   void pxor(XMMRegister dst, XMMRegister src);
   void ptest(XMMRegister dst, XMMRegister src);
 
+  // Non-temporal XMM load/store operations.
+  void movntdqa(XMMRegister dst, const Operand& src);
+  void movntdq(const Operand& dst, XMMRegister src);
+  // Prefetch src position into cache level.
+  // Level 1, 2 or 3 specifies CPU cache level. Level 0 specifies a
+  // non-temporal prefetch.
+  void prefetch(const Operand& src, int level);
+  // TODO(lrn): Need SFENCE for movnt?
+
   // Debugging
   void Print();
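As a reading aid (not part of the patch): the new SSE4_1 constant follows the enum's convention that a value below 32 names a bit in the EDX output of CPUID leaf 1, while a value of 32 or above names bit (value - 32) in ECX. A minimal stand-alone sketch of that decoding, with a hypothetical helper name:

  // Illustrative only -- not V8 code. Decodes a CpuFeature value against
  // the ECX/EDX results of CPUID leaf 1.
  #include <stdint.h>

  static bool CpuidHasFeature(uint32_t ecx, uint32_t edx, int feature) {
    if (feature < 32) return ((edx >> feature) & 1) != 0;  // e.g. SSE2 = 26, CMOV = 15.
    return ((ecx >> (feature - 32)) & 1) != 0;             // e.g. SSE3 = 32, SSE4_1 = 51.
  }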
diff --git a/src/ia32/codegen-ia32.cc b/src/ia32/codegen-ia32.cc
index a72bbd6..7cd83de 100644
--- a/src/ia32/codegen-ia32.cc
+++ b/src/ia32/codegen-ia32.cc
@@ -13498,6 +13498,212 @@ void StringCompareStub::Generate(MacroAssembler* masm) {
 
 #undef __
 
+#define __ masm.
+
+MemCopyFunction CreateMemCopyFunction() {
+  size_t actual_size;
+  byte* buffer = static_cast<byte*>(OS::Allocate(Assembler::kMinimalBufferSize,
+                                                 &actual_size,
+                                                 true));
+  CHECK(buffer);
+  HandleScope handles;
+  MacroAssembler masm(buffer, static_cast<int>(actual_size));
+
+  // Generated code is put into a fixed, unmovable buffer, and not into
+  // the V8 heap. We can't, and don't, refer to any relocatable addresses
+  // (e.g. the JavaScript NaN object).
+
+  // 32-bit C declaration function calls pass arguments on stack.
+
+  // Stack layout:
+  // esp[12]: Third argument, size.
+  // esp[8]: Second argument, source pointer.
+  // esp[4]: First argument, destination pointer.
+  // esp[0]: Return address.
+
+  const int kDestinationOffset = 1 * kPointerSize;
+  const int kSourceOffset = 2 * kPointerSize;
+  const int kSizeOffset = 3 * kPointerSize;
+
+  int stack_offset = 0;  // Update if we change the stack height.
+
+  if (FLAG_debug_code) {
+    __ cmp(Operand(esp, kSizeOffset + stack_offset),
+           Immediate(kMinComplexMemCopy));
+    Label ok;
+    __ j(greater_equal, &ok);
+    __ int3();
+    __ bind(&ok);
+  }
+  if (CpuFeatures::IsSupported(SSE2)) {
+    CpuFeatures::Scope enable(SSE2);
+    __ push(edi);
+    __ push(esi);
+    stack_offset += 2 * kPointerSize;
+    Register dst = edi;
+    Register src = esi;
+    Register count = ecx;
+    __ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
+    __ mov(src, Operand(esp, stack_offset + kSourceOffset));
+    __ mov(count, Operand(esp, stack_offset + kSizeOffset));
+
+    // Copy the first 16 bytes unaligned, then advance src and dst so
+    // that dst is aligned to a 16-byte boundary.
+    __ movdqu(xmm0, Operand(src, 0));
+    __ movdqu(Operand(dst, 0), xmm0);
+    __ mov(edx, dst);
+    __ and_(edx, 0xF);
+    __ neg(edx);
+    __ add(Operand(edx), Immediate(16));
+    __ add(dst, Operand(edx));
+    __ add(src, Operand(edx));
+    __ sub(Operand(count), edx);
+
+    // edi is now aligned. Check if esi is also aligned.
+    Label unaligned_source;
+    __ test(Operand(src), Immediate(0x0F));
+    __ j(not_zero, &unaligned_source);
+    {
+      __ IncrementCounter(&Counters::memcopy_aligned, 1);
+      // Copy loop for aligned source and destination.
+      __ mov(edx, count);
+      Register loop_count = ecx;
+      Register count = edx;
+      __ shr(loop_count, 5);
+      {
+        // Main copy loop.
+        Label loop;
+        __ bind(&loop);
+        __ prefetch(Operand(src, 0x20), 1);
+        __ movdqa(xmm0, Operand(src, 0x00));
+        __ movdqa(xmm1, Operand(src, 0x10));
+        __ add(Operand(src), Immediate(0x20));
+
+        __ movdqa(Operand(dst, 0x00), xmm0);
+        __ movdqa(Operand(dst, 0x10), xmm1);
+        __ add(Operand(dst), Immediate(0x20));
+
+        __ dec(loop_count);
+        __ j(not_zero, &loop);
+      }
+
+      // At most 31 bytes to copy.
+      Label move_less_16;
+      __ test(Operand(count), Immediate(0x10));
+      __ j(zero, &move_less_16);
+      __ movdqa(xmm0, Operand(src, 0));
+      __ add(Operand(src), Immediate(0x10));
+      __ movdqa(Operand(dst, 0), xmm0);
+      __ add(Operand(dst), Immediate(0x10));
+      __ bind(&move_less_16);
+
+      // At most 15 bytes to copy. Copy 16 bytes at end of string.
+      __ and_(count, 0xF);
+      __ movdqu(xmm0, Operand(src, count, times_1, -0x10));
+      __ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
+
+      __ pop(esi);
+      __ pop(edi);
+      __ ret(0);
+    }
+    __ Align(16);
+    {
+      // Copy loop for unaligned source and aligned destination.
+      // If source is not aligned, we can't read it as efficiently.
+      __ bind(&unaligned_source);
+      __ IncrementCounter(&Counters::memcopy_unaligned, 1);
+      __ mov(edx, ecx);
+      Register loop_count = ecx;
+      Register count = edx;
+      __ shr(loop_count, 5);
+      {
+        // Main copy loop.
+        Label loop;
+        __ bind(&loop);
+        __ prefetch(Operand(src, 0x20), 1);
+        __ movdqu(xmm0, Operand(src, 0x00));
+        __ movdqu(xmm1, Operand(src, 0x10));
+        __ add(Operand(src), Immediate(0x20));
+
+        __ movdqa(Operand(dst, 0x00), xmm0);
+        __ movdqa(Operand(dst, 0x10), xmm1);
+        __ add(Operand(dst), Immediate(0x20));
+
+        __ dec(loop_count);
+        __ j(not_zero, &loop);
+      }
+
+      // At most 31 bytes to copy.
+      Label move_less_16;
+      __ test(Operand(count), Immediate(0x10));
+      __ j(zero, &move_less_16);
+      __ movdqu(xmm0, Operand(src, 0));
+      __ add(Operand(src), Immediate(0x10));
+      __ movdqa(Operand(dst, 0), xmm0);
+      __ add(Operand(dst), Immediate(0x10));
+      __ bind(&move_less_16);
+
+      // At most 15 bytes to copy. Copy 16 bytes at end of string.
+      __ and_(count, 0x0F);
+      __ movdqu(xmm0, Operand(src, count, times_1, -0x10));
+      __ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
+
+      __ pop(esi);
+      __ pop(edi);
+      __ ret(0);
+    }
+
+  } else {
+    __ IncrementCounter(&Counters::memcopy_noxmm, 1);
+    // SSE2 not supported. Unlikely to happen in practice.
+    __ push(edi);
+    __ push(esi);
+    stack_offset += 2 * kPointerSize;
+    __ cld();
+    Register dst = edi;
+    Register src = esi;
+    Register count = ecx;
+    __ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
+    __ mov(src, Operand(esp, stack_offset + kSourceOffset));
+    __ mov(count, Operand(esp, stack_offset + kSizeOffset));
+
+    // Copy the first word.
+    __ mov(eax, Operand(src, 0));
+    __ mov(Operand(dst, 0), eax);
+
+    // Increment src and dst so that dst is aligned.
+    __ mov(edx, dst);
+    __ and_(edx, 0x03);
+    __ neg(edx);
+    __ add(Operand(edx), Immediate(4));  // edx = 4 - (dst & 3)
+    __ add(dst, Operand(edx));
+    __ add(src, Operand(edx));
+    __ sub(Operand(count), edx);
+    // edi is now aligned, ecx holds number of remaining bytes to copy.
+
+    __ mov(edx, count);
+    count = edx;
+    __ shr(ecx, 2);  // Make word count instead of byte count.
+    __ rep_movs();
+
+    // At most 3 bytes left to copy. Copy 4 bytes at end of string.
+    __ and_(count, 3);
+    __ mov(eax, Operand(src, count, times_1, -4));
+    __ mov(Operand(dst, count, times_1, -4), eax);
+
+    __ pop(esi);
+    __ pop(edi);
+    __ ret(0);
+  }
+
+  CodeDesc desc;
+  masm.GetCode(&desc);
+  // Call the function from C++.
+  return FUNCTION_CAST<MemCopyFunction>(buffer);
+}
+
+#undef __
+
 } }  // namespace v8::internal
 
 #endif  // V8_TARGET_ARCH_IA32
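As a reading aid (not part of the patch), the strategy of the generated SSE2 routine can be sketched in plain C++: copy an unaligned 16-byte head, advance both pointers so the destination is 16-byte aligned, move 32-byte blocks in the main loop (the two generated loops differ only in aligned vs. unaligned loads), then finish with one 16-byte block and an overlapping copy of the final 16 bytes. The sketch assumes disjoint buffers and size >= kMinComplexMemCopy, matching the debug check above:

  // Illustrative only: the control flow of the generated code, using memcpy
  // where the real code uses movdqu/movdqa on XMM registers.
  #include <stdint.h>
  #include <string.h>

  void MemCopySketch(uint8_t* dst, const uint8_t* src, size_t size) {
    memcpy(dst, src, 16);  // Unaligned head; size is known to be >= 16.
    size_t adjust = 16 - (reinterpret_cast<uintptr_t>(dst) & 0xF);
    dst += adjust;
    src += adjust;
    size -= adjust;
    while (size >= 32) {  // Main loop: two 16-byte blocks per iteration.
      memcpy(dst, src, 32);
      dst += 32;
      src += 32;
      size -= 32;
    }
    if (size >= 16) {  // At most 31 bytes left: one more 16-byte block.
      memcpy(dst, src, 16);
      dst += 16;
      src += 16;
      size -= 16;
    }
    // At most 15 bytes left: re-copy the last 16 bytes, overlapping bytes
    // already written. Safe because the two areas are disjoint.
    memcpy(dst + size - 16, src + size - 16, 16);
  }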
diff --git a/src/ia32/disasm-ia32.cc b/src/ia32/disasm-ia32.cc
index 58c22af..44afdd6 100644
--- a/src/ia32/disasm-ia32.cc
+++ b/src/ia32/disasm-ia32.cc
@@ -817,6 +817,7 @@ int DisassemblerIA32::RegisterFPUInstruction(int escape_opcode,
 // Returns NULL if the instruction is not handled here.
 static const char* F0Mnem(byte f0byte) {
   switch (f0byte) {
+    case 0x18: return "prefetch";
     case 0xA2: return "cpuid";
     case 0x31: return "rdtsc";
     case 0xBE: return "movsx_b";
@@ -942,7 +943,14 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
     case 0x0F:
       { byte f0byte = *(data+1);
         const char* f0mnem = F0Mnem(f0byte);
-        if (f0byte == 0xA2 || f0byte == 0x31) {
+        if (f0byte == 0x18) {
+          data += 2;
+          int mod, regop, rm;
+          get_modrm(*data, &mod, &regop, &rm);
+          const char* suffix[] = {"nta", "1", "2", "3"};
+          AppendToBuffer("%s%s ", f0mnem, suffix[regop & 0x03]);
+          data += PrintRightOperand(data);
+        } else if (f0byte == 0xA2 || f0byte == 0x31) {
           AppendToBuffer("%s", f0mnem);
           data += 2;
         } else if ((f0byte & 0xF0) == 0x80) {
@@ -1070,6 +1077,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
                          NameOfXMMRegister(regop),
                          NameOfXMMRegister(rm));
           data++;
+        } else if (*data == 0x2A) {
+          // movntdqa
+          data++;
+          int mod, regop, rm;
+          get_modrm(*data, &mod, &regop, &rm);
+          AppendToBuffer("movntdqa %s,", NameOfXMMRegister(regop));
+          data += PrintRightOperand(data);
         } else {
           UnimplementedInstruction();
         }
@@ -1122,6 +1136,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
           get_modrm(*data, &mod, &regop, &rm);
           data += PrintRightOperand(data);
           AppendToBuffer(",%s", NameOfXMMRegister(regop));
+        } else if (*data == 0xE7) {
+          AppendToBuffer("movntdq ");
+          data++;
+          int mod, regop, rm;
+          get_modrm(*data, &mod, &regop, &rm);
+          data += PrintRightOperand(data);
+          AppendToBuffer(",%s", NameOfXMMRegister(regop));
         } else if (*data == 0xEF) {
           data++;
           int mod, regop, rm;
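For illustration (byte sequences worked out here, not taken from the patch), the new disassembler cases should produce output along these lines:

  // Hypothetical encodings and the text the new cases render:
  //   0F 18 06         prefetchnta [esi]     (reg field 0 selects "nta")
  //   0F 18 4E 20      prefetch1 [esi+0x20]  (reg field 1, as the code generator emits)
  //   66 0F 38 2A 06   movntdqa xmm0,[esi]
  //   66 0F E7 07      movntdq [edi],xmm0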
diff --git a/src/utils.h b/src/utils.h
index 7c81867..1c7120e 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -525,12 +525,54 @@ class StringBuilder {
 };
 
 
+// Custom memcpy implementation for platforms where the standard version
+// may not be good enough.
+// TODO(lrn): Check whether some IA32 platforms should be excluded.
+#if defined(V8_TARGET_ARCH_IA32)
+
+// TODO(lrn): Extend to other platforms as needed.
+
+typedef void (*MemCopyFunction)(void* dest, const void* src, size_t size);
+
+// Implemented in codegen-<arch>.cc.
+MemCopyFunction CreateMemCopyFunction();
+
+// Copy memory area to disjoint memory area.
+static inline void MemCopy(void* dest, const void* src, size_t size) {
+  static MemCopyFunction memcopy = CreateMemCopyFunction();
+  (*memcopy)(dest, src, size);
+#ifdef DEBUG
+  CHECK_EQ(0, memcmp(dest, src, size));
+#endif
+}
+
+
+// Limit below which the extra overhead of the MemCopy function is likely
+// to outweigh the benefits of faster copying.
+// TODO(lrn): Try to find a more precise value.
+static const int kMinComplexMemCopy = 256;
+
+#else  // V8_TARGET_ARCH_IA32
+
+static inline void MemCopy(void* dest, const void* src, size_t size) {
+  memcpy(dest, src, size);
+}
+
+static const int kMinComplexMemCopy = 256;
+
+#endif  // V8_TARGET_ARCH_IA32
+
+
 // Copy from ASCII/16bit chars to ASCII/16bit chars.
 template <typename sourcechar, typename sinkchar>
 static inline void CopyChars(sinkchar* dest, const sourcechar* src, int chars) {
   sinkchar* limit = dest + chars;
 #ifdef V8_HOST_CAN_READ_UNALIGNED
   if (sizeof(*dest) == sizeof(*src)) {
+    if (chars >= static_cast<int>(kMinComplexMemCopy / sizeof(*dest))) {
+      MemCopy(dest, src, chars * sizeof(*dest));
+      return;
+    }
     // Number of characters in a uintptr_t.
     static const int kStepSize = sizeof(uintptr_t) / sizeof(*dest);  // NOLINT
     while (dest <= limit - kStepSize) {
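A hedged usage sketch (CopyBytes is a hypothetical wrapper, not in the patch): callers are expected to reach MemCopy only at or above kMinComplexMemCopy, as CopyChars does above, since below the threshold the call and alignment overhead of the generated routine likely outweighs its benefit:

  // Illustrative only: dispatching on the size threshold from utils.h.
  #include <string.h>

  void CopyBytes(void* dest, const void* src, size_t size) {
    if (size >= static_cast<size_t>(kMinComplexMemCopy)) {
      MemCopy(dest, src, size);  // Generated SSE2/rep-movs routine on ia32.
    } else {
      memcpy(dest, src, size);   // Small copies: overhead dominates.
    }
  }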
diff --git a/src/v8-counters.h b/src/v8-counters.h
index 8c69cf3..00e8f43 100644
--- a/src/v8-counters.h
+++ b/src/v8-counters.h
@@ -157,6 +157,9 @@ namespace internal {
   SC(array_function_runtime, V8.ArrayFunctionRuntime)           \
   SC(array_function_native, V8.ArrayFunctionNative)             \
   SC(for_in, V8.ForIn)                                          \
+  SC(memcopy_aligned, V8.MemCopyAligned)                        \
+  SC(memcopy_unaligned, V8.MemCopyUnaligned)                    \
+  SC(memcopy_noxmm, V8.MemCopyNoXMM)                            \
   SC(enum_cache_hits, V8.EnumCacheHits)                         \
   SC(enum_cache_misses, V8.EnumCacheMisses)                     \
   SC(reloc_info_count, V8.RelocInfoCount)                       \
diff --git a/test/cctest/test-utils.cc b/test/cctest/test-utils.cc
index 24b3c90..bcb185d 100644
--- a/test/cctest/test-utils.cc
+++ b/test/cctest/test-utils.cc
@@ -79,3 +79,55 @@ TEST(SNPrintF) {
     buffer.Dispose();
   }
 }
+
+
+void TestMemCopy(Vector<byte> src,
+                 Vector<byte> dst,
+                 int source_alignment,
+                 int destination_alignment,
+                 int length_alignment) {
+  memset(dst.start(), 0xFF, dst.length());
+  byte* to = dst.start() + 32 + destination_alignment;
+  byte* from = src.start() + source_alignment;
+  int length = kMinComplexMemCopy + length_alignment;
+  MemCopy(to, from, static_cast<size_t>(length));
+  printf("[%d,%d,%d]\n",
+         source_alignment, destination_alignment, length_alignment);
+  for (int i = 0; i < length; i++) {
+    CHECK_EQ(from[i], to[i]);
+  }
+  CHECK_EQ(0xFF, to[-1]);
+  CHECK_EQ(0xFF, to[length]);
+}
+
+
+TEST(MemCopy) {
+  const int N = kMinComplexMemCopy + 128;
+  Vector<byte> buffer1 = Vector<byte>::New(N);
+  Vector<byte> buffer2 = Vector<byte>::New(N);
+
+  for (int i = 0; i < N; i++) {
+    buffer1[i] = static_cast<byte>(i & 0x7F);
+  }
+
+  // Same alignment.
+  for (int i = 0; i < 32; i++) {
+    TestMemCopy(buffer1, buffer2, i, i, i * 2);
+  }
+
+  // Different alignment.
+  for (int i = 0; i < 32; i++) {
+    for (int j = 1; j < 32; j++) {
+      TestMemCopy(buffer1, buffer2, i, (i + j) & 0x1F, 0);
+    }
+  }
+
+  // Different lengths.
+  for (int i = 0; i < 32; i++) {
+    TestMemCopy(buffer1, buffer2, 3, 7, i);
+  }
+
+  buffer2.Dispose();
+  buffer1.Dispose();
+}
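A closing observation (not part of the patch): TestMemCopy poisons the whole destination with 0xFF and re-checks the bytes on both sides of the copied range afterwards, so an under- or overrun by the overlapping 16-byte tail store would be caught. The essence of that guard-byte pattern:

  // Illustrative only: the guard-byte checks performed by TestMemCopy above.
  memset(dst.start(), 0xFF, dst.length());              // Poison the destination.
  byte* to = dst.start() + 32 + destination_alignment;  // Leave poisoned bytes before.
  MemCopy(to, from, static_cast<size_t>(length));
  CHECK_EQ(0xFF, to[-1]);      // Byte before the range is untouched...
  CHECK_EQ(0xFF, to[length]);  // ...and so is the byte after it.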