From: lrn@chromium.org
Date: Fri, 4 Jun 2010 11:30:55 +0000 (+0000)
Subject: Add optimized version of memcpy on ia32.
X-Git-Tag: upstream/4.7.83~21689
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d3d295efa799515edd6609364b3655ce07111234;p=platform%2Fupstream%2Fv8.git

Add optimized version of memcpy on ia32.
Only used in one place right now. Still room for tweaking.

Review URL: http://codereview.chromium.org/2582001

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4796 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---

diff --git a/src/globals.h b/src/globals.h
index 24ff8cb..6cf2626 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -647,7 +647,9 @@ F FUNCTION_CAST(Address addr) {
 // Feature flags bit positions. They are mostly based on the CPUID spec.
 // (We assign CPUID itself to one of the currently reserved bits --
 // feel free to change this if needed.)
-enum CpuFeature { SSE3 = 32,   // x86
+// On X86/X64, values below 32 are bits in EDX; values 32 and above are bits in ECX.
+enum CpuFeature { SSE4_1 = 32 + 19,  // x86
+                  SSE3 = 32 + 0,     // x86
                   SSE2 = 26,   // x86
                   CMOV = 15,   // x86
                   RDTSC = 4,   // x86
diff --git a/src/ia32/assembler-ia32.cc b/src/ia32/assembler-ia32.cc
index 0d31dd7..a436827 100644
--- a/src/ia32/assembler-ia32.cc
+++ b/src/ia32/assembler-ia32.cc
@@ -2230,6 +2230,40 @@ void Assembler::movdqu(XMMRegister dst, const Operand& src) {
 }
 
 
+void Assembler::movntdqa(XMMRegister dst, const Operand& src) {
+  ASSERT(CpuFeatures::IsEnabled(SSE4_1));
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0x66);
+  EMIT(0x0F);
+  EMIT(0x38);
+  EMIT(0x2A);
+  emit_sse_operand(dst, src);
+}
+
+
+void Assembler::movntdq(const Operand& dst, XMMRegister src) {
+  ASSERT(CpuFeatures::IsEnabled(SSE2));
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0x66);
+  EMIT(0x0F);
+  EMIT(0xE7);
+  emit_sse_operand(src, dst);
+}
+
+
+void Assembler::prefetch(const Operand& src, int level) {
+  ASSERT(is_uint2(level));
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0x0F);
+  EMIT(0x18);
+  XMMRegister code = { level };  // Emit hint number in Reg position of ModR/M byte.
+  emit_sse_operand(code, src);
+}
+
+
 void Assembler::movdbl(XMMRegister dst, const Operand& src) {
   EnsureSpace ensure_space(this);
   last_pc_ = pc_;
@@ -2309,7 +2343,6 @@ void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   emit_sse_operand(dst, src);
 }
 
-
 void Assembler::emit_sse_operand(XMMRegister reg, const Operand& adr) {
   Register ireg = { reg.code() };
   emit_operand(ireg, adr);
diff --git a/src/ia32/assembler-ia32.h b/src/ia32/assembler-ia32.h
index cf09896..76dfe7c 100644
--- a/src/ia32/assembler-ia32.h
+++ b/src/ia32/assembler-ia32.h
@@ -791,6 +791,15 @@ class Assembler : public Malloced {
   void pxor(XMMRegister dst, XMMRegister src);
   void ptest(XMMRegister dst, XMMRegister src);
 
+  // Non-temporal XMM load/store operations.
+  void movntdqa(XMMRegister dst, const Operand& src);
+  void movntdq(const Operand& dst, XMMRegister src);
+  // Prefetch src position into cache level.
+  // Level 1, 2 or 3 specifies CPU cache level. Level 0 specifies a
+  // non-temporal prefetch.
+  void prefetch(const Operand& src, int level);
+  // TODO(lrn): Need SFENCE for movnt?
+
   // Debugging
   void Print();
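As a reading aid (not part of the patch): the new SSE4_1 constant follows the enum's convention that a value below 32 names a bit in the EDX output of CPUID leaf 1, while a value of 32 or above names bit (value - 32) in ECX. A minimal stand-alone sketch of that decoding, with a hypothetical helper name:

  // Illustrative only -- not V8 code. Decodes a CpuFeature value against
  // the ECX/EDX results of CPUID leaf 1.
  #include <stdint.h>

  static bool CpuidHasFeature(uint32_t ecx, uint32_t edx, int feature) {
    if (feature < 32) return ((edx >> feature) & 1) != 0;  // e.g. SSE2 = 26, CMOV = 15.
    return ((ecx >> (feature - 32)) & 1) != 0;             // e.g. SSE3 = 32, SSE4_1 = 51.
  }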
diff --git a/src/ia32/codegen-ia32.cc b/src/ia32/codegen-ia32.cc
index a72bbd6..7cd83de 100644
--- a/src/ia32/codegen-ia32.cc
+++ b/src/ia32/codegen-ia32.cc
@@ -13498,6 +13498,212 @@ void StringCompareStub::Generate(MacroAssembler* masm) {
 
 #undef __
 
+#define __ masm.
+
+MemCopyFunction CreateMemCopyFunction() {
+  size_t actual_size;
+  byte* buffer = static_cast<byte*>(OS::Allocate(Assembler::kMinimalBufferSize,
+                                                 &actual_size,
+                                                 true));
+  CHECK(buffer);
+  HandleScope handles;
+  MacroAssembler masm(buffer, static_cast<int>(actual_size));
+
+  // Generated code is put into a fixed, unmovable buffer, and not into
+  // the V8 heap. We can't, and don't, refer to any relocatable addresses
+  // (e.g. the JavaScript NaN object).
+
+  // 32-bit C declaration function calls pass arguments on stack.
+
+  // Stack layout:
+  // esp[12]: Third argument, size.
+  // esp[8]: Second argument, source pointer.
+  // esp[4]: First argument, destination pointer.
+  // esp[0]: Return address.
+
+  const int kDestinationOffset = 1 * kPointerSize;
+  const int kSourceOffset = 2 * kPointerSize;
+  const int kSizeOffset = 3 * kPointerSize;
+
+  int stack_offset = 0;  // Update if we change the stack height.
+
+  if (FLAG_debug_code) {
+    __ cmp(Operand(esp, kSizeOffset + stack_offset),
+           Immediate(kMinComplexMemCopy));
+    Label ok;
+    __ j(greater_equal, &ok);
+    __ int3();
+    __ bind(&ok);
+  }
+  if (CpuFeatures::IsSupported(SSE2)) {
+    CpuFeatures::Scope enable(SSE2);
+    __ push(edi);
+    __ push(esi);
+    stack_offset += 2 * kPointerSize;
+    Register dst = edi;
+    Register src = esi;
+    Register count = ecx;
+    __ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
+    __ mov(src, Operand(esp, stack_offset + kSourceOffset));
+    __ mov(count, Operand(esp, stack_offset + kSizeOffset));
+
+    // Copy the first 16 bytes unaligned, then advance src and dst so
+    // that dst is aligned to a 16-byte boundary.
+    __ movdqu(xmm0, Operand(src, 0));
+    __ movdqu(Operand(dst, 0), xmm0);
+    __ mov(edx, dst);
+    __ and_(edx, 0xF);
+    __ neg(edx);
+    __ add(Operand(edx), Immediate(16));
+    __ add(dst, Operand(edx));
+    __ add(src, Operand(edx));
+    __ sub(Operand(count), edx);
+
+    // edi is now aligned. Check if esi is also aligned.
+    Label unaligned_source;
+    __ test(Operand(src), Immediate(0x0F));
+    __ j(not_zero, &unaligned_source);
+    {
+      __ IncrementCounter(&Counters::memcopy_aligned, 1);
+      // Copy loop for aligned source and destination.
+      __ mov(edx, count);
+      Register loop_count = ecx;
+      Register count = edx;
+      __ shr(loop_count, 5);
+      {
+        // Main copy loop.
+        Label loop;
+        __ bind(&loop);
+        __ prefetch(Operand(src, 0x20), 1);
+        __ movdqa(xmm0, Operand(src, 0x00));
+        __ movdqa(xmm1, Operand(src, 0x10));
+        __ add(Operand(src), Immediate(0x20));
+
+        __ movdqa(Operand(dst, 0x00), xmm0);
+        __ movdqa(Operand(dst, 0x10), xmm1);
+        __ add(Operand(dst), Immediate(0x20));
+
+        __ dec(loop_count);
+        __ j(not_zero, &loop);
+      }
+
+      // At most 31 bytes to copy.
+      Label move_less_16;
+      __ test(Operand(count), Immediate(0x10));
+      __ j(zero, &move_less_16);
+      __ movdqa(xmm0, Operand(src, 0));
+      __ add(Operand(src), Immediate(0x10));
+      __ movdqa(Operand(dst, 0), xmm0);
+      __ add(Operand(dst), Immediate(0x10));
+      __ bind(&move_less_16);
+
+      // At most 15 bytes to copy. Copy 16 bytes at end of string.
+      __ and_(count, 0xF);
+      __ movdqu(xmm0, Operand(src, count, times_1, -0x10));
+      __ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
+
+      __ pop(esi);
+      __ pop(edi);
+      __ ret(0);
+    }
+    __ Align(16);
+    {
+      // Copy loop for unaligned source and aligned destination.
+      // If source is not aligned, we can't read it as efficiently.
+      __ bind(&unaligned_source);
+      __ IncrementCounter(&Counters::memcopy_unaligned, 1);
+      __ mov(edx, ecx);
+      Register loop_count = ecx;
+      Register count = edx;
+      __ shr(loop_count, 5);
+      {
+        // Main copy loop.
+        Label loop;
+        __ bind(&loop);
+        __ prefetch(Operand(src, 0x20), 1);
+        __ movdqu(xmm0, Operand(src, 0x00));
+        __ movdqu(xmm1, Operand(src, 0x10));
+        __ add(Operand(src), Immediate(0x20));
+
+        __ movdqa(Operand(dst, 0x00), xmm0);
+        __ movdqa(Operand(dst, 0x10), xmm1);
+        __ add(Operand(dst), Immediate(0x20));
+
+        __ dec(loop_count);
+        __ j(not_zero, &loop);
+      }
+
+      // At most 31 bytes to copy.
+      Label move_less_16;
+      __ test(Operand(count), Immediate(0x10));
+      __ j(zero, &move_less_16);
+      __ movdqu(xmm0, Operand(src, 0));
+      __ add(Operand(src), Immediate(0x10));
+      __ movdqa(Operand(dst, 0), xmm0);
+      __ add(Operand(dst), Immediate(0x10));
+      __ bind(&move_less_16);
+
+      // At most 15 bytes to copy. Copy 16 bytes at end of string.
+      __ and_(count, 0x0F);
+      __ movdqu(xmm0, Operand(src, count, times_1, -0x10));
+      __ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
+
+      __ pop(esi);
+      __ pop(edi);
+      __ ret(0);
+    }
+
+  } else {
+    __ IncrementCounter(&Counters::memcopy_noxmm, 1);
+    // SSE2 not supported. Unlikely to happen in practice.
+    __ push(edi);
+    __ push(esi);
+    stack_offset += 2 * kPointerSize;
+    __ cld();
+    Register dst = edi;
+    Register src = esi;
+    Register count = ecx;
+    __ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
+    __ mov(src, Operand(esp, stack_offset + kSourceOffset));
+    __ mov(count, Operand(esp, stack_offset + kSizeOffset));
+
+    // Copy the first word.
+    __ mov(eax, Operand(src, 0));
+    __ mov(Operand(dst, 0), eax);
+
+    // Increment src and dst so that dst is aligned.
+    __ mov(edx, dst);
+    __ and_(edx, 0x03);
+    __ neg(edx);
+    __ add(Operand(edx), Immediate(4));  // edx = 4 - (dst & 3)
+    __ add(dst, Operand(edx));
+    __ add(src, Operand(edx));
+    __ sub(Operand(count), edx);
+    // edi is now aligned, ecx holds number of remaining bytes to copy.
+
+    __ mov(edx, count);
+    count = edx;
+    __ shr(ecx, 2);  // Make word count instead of byte count.
+    __ rep_movs();
+
+    // At most 3 bytes left to copy. Copy 4 bytes at end of string.
+    __ and_(count, 3);
+    __ mov(eax, Operand(src, count, times_1, -4));
+    __ mov(Operand(dst, count, times_1, -4), eax);
+
+    __ pop(esi);
+    __ pop(edi);
+    __ ret(0);
+  }
+
+  CodeDesc desc;
+  masm.GetCode(&desc);
+  // Call the function from C++.
+  return FUNCTION_CAST<MemCopyFunction>(buffer);
+}
+
+#undef __
+
 } }  // namespace v8::internal
 
 #endif  // V8_TARGET_ARCH_IA32
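As a reading aid (not part of the patch), the strategy of the generated SSE2 routine can be sketched in plain C++: copy an unaligned 16-byte head, advance both pointers so the destination is 16-byte aligned, move 32-byte blocks in the main loop (the two generated loops differ only in aligned vs. unaligned loads), then finish with one 16-byte block and an overlapping copy of the final 16 bytes. The sketch assumes disjoint buffers and size >= kMinComplexMemCopy, matching the debug check above:

  // Illustrative only: the control flow of the generated code, using memcpy
  // where the real code uses movdqu/movdqa on XMM registers.
  #include <stdint.h>
  #include <string.h>

  void MemCopySketch(uint8_t* dst, const uint8_t* src, size_t size) {
    memcpy(dst, src, 16);  // Unaligned head; size is known to be >= 16.
    size_t adjust = 16 - (reinterpret_cast<uintptr_t>(dst) & 0xF);
    dst += adjust;
    src += adjust;
    size -= adjust;
    while (size >= 32) {  // Main loop: two 16-byte blocks per iteration.
      memcpy(dst, src, 32);
      dst += 32;
      src += 32;
      size -= 32;
    }
    if (size >= 16) {  // At most 31 bytes left: one more 16-byte block.
      memcpy(dst, src, 16);
      dst += 16;
      src += 16;
      size -= 16;
    }
    // At most 15 bytes left: re-copy the last 16 bytes, overlapping bytes
    // already written. Safe because the two areas are disjoint.
    memcpy(dst + size - 16, src + size - 16, 16);
  }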
diff --git a/src/ia32/disasm-ia32.cc b/src/ia32/disasm-ia32.cc
index 58c22af..44afdd6 100644
--- a/src/ia32/disasm-ia32.cc
+++ b/src/ia32/disasm-ia32.cc
@@ -817,6 +817,7 @@ int DisassemblerIA32::RegisterFPUInstruction(int escape_opcode,
 // Returns NULL if the instruction is not handled here.
 static const char* F0Mnem(byte f0byte) {
   switch (f0byte) {
+    case 0x18: return "prefetch";
     case 0xA2: return "cpuid";
     case 0x31: return "rdtsc";
     case 0xBE: return "movsx_b";
@@ -942,7 +943,14 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
     case 0x0F:
       { byte f0byte = *(data+1);
         const char* f0mnem = F0Mnem(f0byte);
-        if (f0byte == 0xA2 || f0byte == 0x31) {
+        if (f0byte == 0x18) {
+          data += 2;
+          int mod, regop, rm;
+          get_modrm(*data, &mod, &regop, &rm);
+          const char* suffix[] = {"nta", "1", "2", "3"};
+          AppendToBuffer("%s%s ", f0mnem, suffix[regop & 0x03]);
+          data += PrintRightOperand(data);
+        } else if (f0byte == 0xA2 || f0byte == 0x31) {
           AppendToBuffer("%s", f0mnem);
           data += 2;
         } else if ((f0byte & 0xF0) == 0x80) {
@@ -1070,6 +1077,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
                          NameOfXMMRegister(regop),
                          NameOfXMMRegister(rm));
           data++;
+        } else if (*data == 0x2A) {
+          // movntdqa
+          data++;
+          int mod, regop, rm;
+          get_modrm(*data, &mod, &regop, &rm);
+          AppendToBuffer("movntdqa %s,", NameOfXMMRegister(regop));
+          data += PrintRightOperand(data);
         } else {
           UnimplementedInstruction();
         }
@@ -1122,6 +1136,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
           get_modrm(*data, &mod, &regop, &rm);
           data += PrintRightOperand(data);
           AppendToBuffer(",%s", NameOfXMMRegister(regop));
+        } else if (*data == 0xE7) {
+          AppendToBuffer("movntdq ");
+          data++;
+          int mod, regop, rm;
+          get_modrm(*data, &mod, &regop, &rm);
+          data += PrintRightOperand(data);
+          AppendToBuffer(",%s", NameOfXMMRegister(regop));
         } else if (*data == 0xEF) {
           data++;
           int mod, regop, rm;
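For illustration (byte sequences worked out here, not taken from the patch), the new disassembler cases should produce output along these lines:

  // Hypothetical encodings and the text the new cases render:
  //   0F 18 06         prefetchnta [esi]     (reg field 0 selects "nta")
  //   0F 18 4E 20      prefetch1 [esi+0x20]  (reg field 1, as the code generator emits)
  //   66 0F 38 2A 06   movntdqa xmm0,[esi]
  //   66 0F E7 07      movntdq [edi],xmm0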
diff --git a/src/utils.h b/src/utils.h
index 7c81867..1c7120e 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -525,12 +525,54 @@ class StringBuilder {
 };
 
 
+// Custom memcpy implementation for platforms where the standard version
+// may not be good enough.
+// TODO(lrn): Check whether some IA32 platforms should be excluded.
+#if defined(V8_TARGET_ARCH_IA32)
+
+// TODO(lrn): Extend to other platforms as needed.
+
+typedef void (*MemCopyFunction)(void* dest, const void* src, size_t size);
+
+// Implemented in codegen-<arch>.cc.
+MemCopyFunction CreateMemCopyFunction();
+
+// Copy memory area to disjoint memory area.
+static inline void MemCopy(void* dest, const void* src, size_t size) {
+  static MemCopyFunction memcopy = CreateMemCopyFunction();
+  (*memcopy)(dest, src, size);
+#ifdef DEBUG
+  CHECK_EQ(0, memcmp(dest, src, size));
+#endif
+}
+
+
+// Limit below which the extra overhead of the MemCopy function is likely
+// to outweigh the benefits of faster copying.
+// TODO(lrn): Try to find a more precise value.
+static const int kMinComplexMemCopy = 256;
+
+#else  // V8_TARGET_ARCH_IA32
+
+static inline void MemCopy(void* dest, const void* src, size_t size) {
+  memcpy(dest, src, size);
+}
+
+static const int kMinComplexMemCopy = 256;
+
+#endif  // V8_TARGET_ARCH_IA32
+
+
 // Copy from ASCII/16bit chars to ASCII/16bit chars.
 template <typename sourcechar, typename sinkchar>
 static inline void CopyChars(sinkchar* dest, const sourcechar* src, int chars) {
   sinkchar* limit = dest + chars;
 #ifdef V8_HOST_CAN_READ_UNALIGNED
   if (sizeof(*dest) == sizeof(*src)) {
+    if (chars >= static_cast<int>(kMinComplexMemCopy / sizeof(*dest))) {
+      MemCopy(dest, src, chars * sizeof(*dest));
+      return;
+    }
     // Number of characters in a uintptr_t.
     static const int kStepSize = sizeof(uintptr_t) / sizeof(*dest);  // NOLINT
     while (dest <= limit - kStepSize) {
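A hedged usage sketch (CopyBytes is a hypothetical wrapper, not in the patch): callers are expected to reach MemCopy only at or above kMinComplexMemCopy, as CopyChars does above, since below the threshold the call and alignment overhead of the generated routine likely outweighs its benefit:

  // Illustrative only: dispatching on the size threshold from utils.h.
  #include <string.h>

  void CopyBytes(void* dest, const void* src, size_t size) {
    if (size >= static_cast<size_t>(kMinComplexMemCopy)) {
      MemCopy(dest, src, size);  // Generated SSE2/rep-movs routine on ia32.
    } else {
      memcpy(dest, src, size);   // Small copies: overhead dominates.
    }
  }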
diff --git a/src/v8-counters.h b/src/v8-counters.h
index 8c69cf3..00e8f43 100644
--- a/src/v8-counters.h
+++ b/src/v8-counters.h
@@ -157,6 +157,9 @@ namespace internal {
   SC(array_function_runtime, V8.ArrayFunctionRuntime)           \
   SC(array_function_native, V8.ArrayFunctionNative)             \
   SC(for_in, V8.ForIn)                                          \
+  SC(memcopy_aligned, V8.MemCopyAligned)                        \
+  SC(memcopy_unaligned, V8.MemCopyUnaligned)                    \
+  SC(memcopy_noxmm, V8.MemCopyNoXMM)                            \
   SC(enum_cache_hits, V8.EnumCacheHits)                         \
   SC(enum_cache_misses, V8.EnumCacheMisses)                     \
   SC(reloc_info_count, V8.RelocInfoCount)                       \
diff --git a/test/cctest/test-utils.cc b/test/cctest/test-utils.cc
index 24b3c90..bcb185d 100644
--- a/test/cctest/test-utils.cc
+++ b/test/cctest/test-utils.cc
@@ -79,3 +79,55 @@ TEST(SNPrintF) {
     buffer.Dispose();
   }
 }
+
+
+void TestMemCopy(Vector<byte> src,
+                 Vector<byte> dst,
+                 int source_alignment,
+                 int destination_alignment,
+                 int length_alignment) {
+  memset(dst.start(), 0xFF, dst.length());
+  byte* to = dst.start() + 32 + destination_alignment;
+  byte* from = src.start() + source_alignment;
+  int length = kMinComplexMemCopy + length_alignment;
+  MemCopy(to, from, static_cast<size_t>(length));
+  printf("[%d,%d,%d]\n",
+         source_alignment, destination_alignment, length_alignment);
+  for (int i = 0; i < length; i++) {
+    CHECK_EQ(from[i], to[i]);
+  }
+  CHECK_EQ(0xFF, to[-1]);
+  CHECK_EQ(0xFF, to[length]);
+}
+
+
+TEST(MemCopy) {
+  const int N = kMinComplexMemCopy + 128;
+  Vector<byte> buffer1 = Vector<byte>::New(N);
+  Vector<byte> buffer2 = Vector<byte>::New(N);
+
+  for (int i = 0; i < N; i++) {
+    buffer1[i] = static_cast<byte>(i & 0x7F);
+  }
+
+  // Same alignment.
+  for (int i = 0; i < 32; i++) {
+    TestMemCopy(buffer1, buffer2, i, i, i * 2);
+  }
+
+  // Different alignment.
+  for (int i = 0; i < 32; i++) {
+    for (int j = 1; j < 32; j++) {
+      TestMemCopy(buffer1, buffer2, i, (i + j) & 0x1F, 0);
+    }
+  }
+
+  // Different lengths.
+  for (int i = 0; i < 32; i++) {
+    TestMemCopy(buffer1, buffer2, 3, 7, i);
+  }
+
+  buffer2.Dispose();
+  buffer1.Dispose();
+}
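A closing observation (not part of the patch): TestMemCopy poisons the whole destination with 0xFF and re-checks the bytes on both sides of the copied range afterwards, so an under- or overrun by the overlapping 16-byte tail store would be caught. The essence of that guard-byte pattern:

  // Illustrative only: the guard-byte checks performed by TestMemCopy above.
  memset(dst.start(), 0xFF, dst.length());              // Poison the destination.
  byte* to = dst.start() + 32 + destination_alignment;  // Leave poisoned bytes before.
  MemCopy(to, from, static_cast<size_t>(length));
  CHECK_EQ(0xFF, to[-1]);      // Byte before the range is untouched...
  CHECK_EQ(0xFF, to[length]);  // ...and so is the byte after it.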