* sysdeps/x86_64/cacheinfo.c (__x86_64_data_cache_size_half): Renamed from __x86_64_core_cache_size_half.

author Ulrich Drepper <drepper@redhat.com>

Sat, 22 Sep 2007 05:54:03 +0000 (05:54 +0000)

committer Ulrich Drepper <drepper@redhat.com>

Sat, 22 Sep 2007 05:54:03 +0000 (05:54 +0000)
author Ulrich Drepper <drepper@redhat.com>
Sat, 22 Sep 2007 05:54:03 +0000 (05:54 +0000)
committer Ulrich Drepper <drepper@redhat.com>
Sat, 22 Sep 2007 05:54:03 +0000 (05:54 +0000)
diff --git a/ChangeLog b/ChangeLog

index 3845390..d0dcf62 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2007-09-21  Ulrich Drepper  <drepper@redhat.com>
+
+       * sysdeps/x86_64/cacheinfo.c (__x86_64_data_cache_size_half): Renamed
+       from __x86_64_core_cache_size_half.
+       (init_cacheinfo): Compute shared cache size for AMD processors with
+       shared L3 correctly.
+       * sysdeps/x86_64/memcpy.S: Adjust for __x86_64_data_cache_size_half
+       name change.
+       Patch in large parts by Evandro Menezes.
+
  2007-09-19  Ulrich Drepper  <drepper@redhat.com>
  
         * elf/dl-lookup.c (add_dependency): Handle failing memory
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c

index 793dc2d..5b92bd5 100644 (file)
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -398,13 +398,13 @@ __cache_sysconf (int name)
  }
  
  
-/* Half the core cache size for use in memory and string routines, typically
-   L1 size. */
-long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2;
+/* Half the data cache size for use in memory and string routines, typically
+   L1 size.  */
+long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
  /* Shared cache size for use in memory and string routines, typically
-   L2 or L3 size. */
+   L2 or L3 size.  */
  long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
-/* PREFETCHW support flag for use in memory and string routines. */
+/* PREFETCHW support flag for use in memory and string routines.  */
  int __x86_64_prefetchw attribute_hidden;
  
  
@@ -419,7 +419,7 @@ init_cacheinfo (void)
    unsigned int edx;
    int max_cpuid;
    int max_cpuid_ex;
-  long int core = -1;
+  long int data = -1;
    long int shared = -1;
    unsigned int level;
    unsigned int threads = 0;
@@ -431,26 +431,26 @@ init_cacheinfo (void)
    /* This spells out "GenuineIntel".  */
    if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
      {
-      core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
  
-      /* Try L3 first. */
+      /* Try L3 first.  */
        level  = 3;
        shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
  
        if (shared <= 0)
          {
-         /* Try L2 otherwise. */
+         /* Try L2 otherwise.  */
            level  = 2;
            shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
         }
  
        /* Figure out the number of logical threads that share the
-        highest cache level. */
+        highest cache level.  */
        if (max_cpuid >= 4)
          {
           int i = 0;
  
-         /* Query until desired cache level is enumerated. */
+         /* Query until desired cache level is enumerated.  */
           do
             {
                asm volatile ("cpuid"
@@ -463,7 +463,7 @@ init_cacheinfo (void)
         }
        else
          {
-         /* Assume that all logical threads share the highest cache level. */
+         /* Assume that all logical threads share the highest cache level.  */
            asm volatile ("cpuid"
                         : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                         : "0" (1));
@@ -472,33 +472,73 @@ init_cacheinfo (void)
         }
  
        /* Cap usage of highest cache level to the number of supported
-        threads. */
+        threads.  */
        if (shared > 0 && threads > 0)
          shared /= threads;
      }
    /* This spells out "AuthenticAMD".  */
    else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
      {
-      core   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
-      shared = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+      data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+      long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
  
+      /* Get maximum extended function.  */
        asm volatile ("cpuid"
                     : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx)
                     : "0" (0x80000000));
  
+      if (shared <= 0)
+       /* No shared L3 cache.  All we have is the L2 cache.  */
+       shared = core;
+      else
+       {
+         /* Figure out the number of logical threads that share L3.  */
+         if (max_cpuid_ex >= 0x80000008)
+           {
+             /* Get width of APIC ID.  */
+             asm volatile ("cpuid"
+                           : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx),
+                             "=d" (edx)
+                           : "0" (0x80000008));
+             threads = 1 << ((ecx >> 12) & 0x0f);
+           }
+
+         if (threads == 0)
+           {
+             /* If APIC ID width is not available, use logical
+                processor count.  */
+             asm volatile ("cpuid"
+                           : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx),
+                             "=d" (edx)
+                           : "0" (0x00000001));
+
+             if ((edx & (1 << 28)) != 0)
+               threads = (ebx >> 16) & 0xff;
+           }
+
+         /* Cap usage of highest cache level to the number of
+            supported threads.  */
+         if (threads > 0)
+           shared /= threads;
+
+         /* Account for exclusive L2 and L3 caches.  */
+         shared += core;
+       }
+
        if (max_cpuid_ex >= 0x80000001)
         {
           asm volatile ("cpuid"
                         : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                         : "0" (0x80000001));
-         /*  PREFETCHW     || 3DNow! */
+         /*  PREFETCHW     || 3DNow!  */
           if ((ecx & 0x100) || (edx & 0x80000000))
             __x86_64_prefetchw = -1;
         }
      }
  
-  if (core > 0)
-    __x86_64_core_cache_size_half = core / 2;
+  if (data > 0)
+    __x86_64_data_cache_size_half = data / 2;
  
    if (shared > 0)
      __x86_64_shared_cache_size_half = shared / 2;
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S

index 2313298..b25646b 100644 (file)
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -114,15 +114,15 @@ L(1d):                                    /* 16-byte loop */
         .p2align 4
  
  L(1loop):
-       movq      (%rsi), %rcx
-       movq    8 (%rsi), %r8
-       movq    %rcx,   (%rdi)
-       movq     %r8, 8 (%rdi)
+       movq     (%rsi), %rcx
+       movq    8(%rsi), %r8
+       movq    %rcx,  (%rdi)
+       movq     %r8, 8(%rdi)
  
         subl    $16, %edx
  
-       leaq    16 (%rsi), %rsi
-       leaq    16 (%rdi), %rdi
+       leaq    16(%rsi), %rsi
+       leaq    16(%rdi), %rdi
  
         jnz     L(1loop)
  
@@ -140,19 +140,19 @@ L(exit):                          /* exit */
  
  L(1after):
  #ifndef USE_AS_MEMPCPY
-       movq    %rax, RETVAL (%rsp)     /* save return value */
+       movq    %rax, RETVAL(%rsp)      /* save return value */
  #endif
  
  /* Align to the natural word size. */
  
  L(aligntry):
-       movl    %esi, %ecx              /* align by destination */
+       movl    %esi, %ecx              /* align by source */
  
         andl    $7, %ecx
         jz      L(alignafter)           /* already aligned */
  
  L(align):                              /* align */
-       leaq    -8 (%rcx, %rdx), %rdx   /* calculate remaining bytes */
+       leaq    -8(%rcx, %rdx), %rdx    /* calculate remaining bytes */
         subl    $8, %ecx
  
         .p2align 4
@@ -163,8 +163,8 @@ L(alignloop):                               /* 1-byte alignment loop */
  
         incl    %ecx
  
-       leaq    1 (%rsi), %rsi
-       leaq    1 (%rdi), %rdi
+       leaq    1(%rsi), %rsi
+       leaq    1(%rdi), %rdi
  
         jnz     L(alignloop)
  
@@ -172,7 +172,7 @@ L(alignloop):                               /* 1-byte alignment loop */
  
  L(alignafter):
  
-/* Loop to handle mid-sized blocks. */
+/* Handle mid-sized blocks. */
  
  L(32try):                              /* up to 1KB */
         cmpq    $1024, %rdx
@@ -188,15 +188,15 @@ L(32):                                    /* 32-byte loop */
  L(32loop):
         decl    %ecx
  
-       movq    (%rsi), %rax
-       movq     8 (%rsi), %r8
-       movq    16 (%rsi), %r9
-       movq    24 (%rsi), %r10
+       movq      (%rsi), %rax
+       movq     8(%rsi), %r8
+       movq    16(%rsi), %r9
+       movq    24(%rsi), %r10
  
-       movq    %rax, (%rdi)
-       movq     %r8,  8 (%rdi)
-       movq     %r9, 16 (%rdi)
-       movq    %r10, 24 (%rdi)
+       movq    %rax,   (%rdi)
+       movq     %r8,  8(%rdi)
+       movq     %r9, 16(%rdi)
+       movq    %r10, 24(%rdi)
  
         leaq    32(%rsi), %rsi
         leaq    32(%rdi), %rdi
@@ -205,18 +205,18 @@ L(32loop):
  
         decl    %ecx
  
-       movq       (%rsi), %rax
-       movq     8 (%rsi), %r8
-       movq    16 (%rsi), %r9
-       movq    24 (%rsi), %r10
+       movq      (%rsi), %rax
+       movq     8(%rsi), %r8
+       movq    16(%rsi), %r9
+       movq    24(%rsi), %r10
  
-       movq    %rax,    (%rdi)
-       movq     %r8,  8 (%rdi)
-       movq     %r9, 16 (%rdi)
-       movq    %r10, 24 (%rdi)
+       movq    %rax,   (%rdi)
+       movq     %r8,  8(%rdi)
+       movq     %r9, 16(%rdi)
+       movq    %r10, 24(%rdi)
  
-       leaq    32 (%rsi), %rsi
-       leaq    32 (%rdi), %rdi
+       leaq    32(%rsi), %rsi
+       leaq    32(%rdi), %rdi
  
         jnz     L(32loop)
  
@@ -229,9 +229,9 @@ L(32skip):
  
         movq    %rdi, %rax
  #else
-       movq    RETVAL (%rsp), %rax
+       movq    RETVAL(%rsp), %rax
         jnz     L(1)
-       
+
         rep
  #endif
         retq                            /* exit */
@@ -245,11 +245,11 @@ L(32after):
         larger blocks are excluded when building for RTLD.
  */
  
-/* Handle large blocks smaller than 1/2 L1. */
+/* Handle blocks smaller than 1/2 L1. */
  
  L(fasttry):                            /* first 1/2 L1 */
  #ifndef NOT_IN_libc                    /* only up to this algorithm outside of libc.so */
-       movq    __x86_64_core_cache_size_half (%rip), %r11
+       movq    __x86_64_data_cache_size_half(%rip), %r11
         cmpq    %rdx, %r11              /* calculate the smaller of */
         cmovaq  %rdx, %r11              /* remaining bytes and 1/2 L1 */
  #endif
@@ -282,7 +282,7 @@ L(fastskip):
  
         movq    %rdi, %rax
  #else
-       movq    RETVAL (%rsp), %rax
+       movq    RETVAL(%rsp), %rax
         jnz     L(1)
  
         rep
@@ -308,16 +308,16 @@ L(pre):                                   /* 64-byte with prefetching */
         shrq    $6, %rcx
         jz      L(preskip)
  
-       movq    %r14, SAVE0 (%rsp)
+       movq    %r14, SAVE0(%rsp)
         cfi_rel_offset (%r14, SAVE0)
-       movq    %r13, SAVE1 (%rsp)
+       movq    %r13, SAVE1(%rsp)
         cfi_rel_offset (%r13, SAVE1)
-       movq    %r12, SAVE2 (%rsp)
+       movq    %r12, SAVE2(%rsp)
         cfi_rel_offset (%r12, SAVE2)
-       movq    %rbx, SAVE3 (%rsp)
+       movq    %rbx, SAVE3(%rsp)
         cfi_rel_offset (%rbx, SAVE3)
  
-       cmpl    $0, __x86_64_prefetchw (%rip)
+       cmpl    $0, __x86_64_prefetchw(%rip)
         jz      L(preloop)              /* check if PREFETCHW OK */
  
         .p2align 4
@@ -339,45 +339,45 @@ L(prewloop):                              /* cache-line in state M */
         prefetcht0       0 + 896 (%rsi)
         prefetcht0      64 + 896 (%rsi)
  
-       movq    %rax,    (%rdi)
-       movq    %rbx,  8 (%rdi)
-       movq     %r9, 16 (%rdi)
-       movq    %r10, 24 (%rdi)
-       movq    %r11, 32 (%rdi)
-       movq    %r12, 40 (%rdi)
-       movq    %r13, 48 (%rdi)
-       movq    %r14, 56 (%rdi)
+       movq    %rax,   (%rdi)
+       movq    %rbx,  8(%rdi)
+       movq     %r9, 16(%rdi)
+       movq    %r10, 24(%rdi)
+       movq    %r11, 32(%rdi)
+       movq    %r12, 40(%rdi)
+       movq    %r13, 48(%rdi)
+       movq    %r14, 56(%rdi)
  
-       leaq    64 (%rsi), %rsi
-       leaq    64 (%rdi), %rdi
+       leaq    64(%rsi), %rsi
+       leaq    64(%rdi), %rdi
  
         jz      L(prebail)
  
         decq    %rcx
  
-       movq       (%rsi), %rax
-       movq     8 (%rsi), %rbx
-       movq    16 (%rsi), %r9
-       movq    24 (%rsi), %r10
-       movq    32 (%rsi), %r11
-       movq    40 (%rsi), %r12
-       movq    48 (%rsi), %r13
-       movq    56 (%rsi), %r14
-
-       movq    %rax,    (%rdi)
-       movq    %rbx,  8 (%rdi)
-       movq     %r9, 16 (%rdi)
-       movq    %r10, 24 (%rdi)
-       movq    %r11, 32 (%rdi)
-       movq    %r12, 40 (%rdi)
-       movq    %r13, 48 (%rdi)
-       movq    %r14, 56 (%rdi)
-
-       prefetchw       896 - 64 (%rdi)
-       prefetchw       896 -  0 (%rdi)
-
-       leaq    64 (%rsi), %rsi
-       leaq    64 (%rdi), %rdi
+       movq      (%rsi), %rax
+       movq     8(%rsi), %rbx
+       movq    16(%rsi), %r9
+       movq    24(%rsi), %r10
+       movq    32(%rsi), %r11
+       movq    40(%rsi), %r12
+       movq    48(%rsi), %r13
+       movq    56(%rsi), %r14
+
+       movq    %rax,   (%rdi)
+       movq    %rbx,  8(%rdi)
+       movq     %r9, 16(%rdi)
+       movq    %r10, 24(%rdi)
+       movq    %r11, 32(%rdi)
+       movq    %r12, 40(%rdi)
+       movq    %r13, 48(%rdi)
+       movq    %r14, 56(%rdi)
+
+       prefetchw       896 - 64(%rdi)
+       prefetchw       896 -  0(%rdi)
+
+       leaq    64(%rsi), %rsi
+       leaq    64(%rdi), %rdi
  
         jnz     L(prewloop)
         jmp     L(prebail)
@@ -389,26 +389,26 @@ L(prewloop):                              /* cache-line in state M */
  L(preloop):                            /* cache-line in state E */
         decq    %rcx
  
-       movq       (%rsi), %rax
-       movq     8 (%rsi), %rbx
-       movq    16 (%rsi), %r9
-       movq    24 (%rsi), %r10
-       movq    32 (%rsi), %r11
-       movq    40 (%rsi), %r12
-       movq    48 (%rsi), %r13
-       movq    56 (%rsi), %r14
-
-       prefetcht0      896 +  0 (%rsi)
-       prefetcht0      896 + 64 (%rsi)
-
-       movq    %rax,    (%rdi)
-       movq    %rbx,  8 (%rdi)
-       movq     %r9, 16 (%rdi)
-       movq    %r10, 24 (%rdi)
-       movq    %r11, 32 (%rdi)
-       movq    %r12, 40 (%rdi)
-       movq    %r13, 48 (%rdi)
-       movq    %r14, 56 (%rdi)
+       movq      (%rsi), %rax
+       movq     8(%rsi), %rbx
+       movq    16(%rsi), %r9
+       movq    24(%rsi), %r10
+       movq    32(%rsi), %r11
+       movq    40(%rsi), %r12
+       movq    48(%rsi), %r13
+       movq    56(%rsi), %r14
+
+       prefetcht0      896 +  0(%rsi)
+       prefetcht0      896 + 64(%rsi)
+
+       movq    %rax,   (%rdi)
+       movq    %rbx,  8(%rdi)
+       movq     %r9, 16(%rdi)
+       movq    %r10, 24(%rdi)
+       movq    %r11, 32(%rdi)
+       movq    %r12, 40(%rdi)
+       movq    %r13, 48(%rdi)
+       movq    %r14, 56(%rdi)
  
         leaq    64 (%rsi), %rsi
         leaq    64 (%rdi), %rdi
@@ -417,40 +417,40 @@ L(preloop):                               /* cache-line in state E */
  
         decq    %rcx
  
-       movq       (%rsi), %rax
-       movq     8 (%rsi), %rbx
-       movq    16 (%rsi), %r9
-       movq    24 (%rsi), %r10
-       movq    32 (%rsi), %r11
-       movq    40 (%rsi), %r12
-       movq    48 (%rsi), %r13
-       movq    56 (%rsi), %r14
-
-       prefetcht0      896 - 64 (%rdi)
-       prefetcht0      896 -  0 (%rdi)
-
-       movq    %rax,    (%rdi)
-       movq    %rbx,  8 (%rdi)
-       movq     %r9, 16 (%rdi)
-       movq    %r10, 24 (%rdi)
-       movq    %r11, 32 (%rdi)
-       movq    %r12, 40 (%rdi)
-       movq    %r13, 48 (%rdi)
-       movq    %r14, 56 (%rdi)
-
-       leaq    64 (%rsi), %rsi
-       leaq    64 (%rdi), %rdi
+       movq      (%rsi), %rax
+       movq     8(%rsi), %rbx
+       movq    16(%rsi), %r9
+       movq    24(%rsi), %r10
+       movq    32(%rsi), %r11
+       movq    40(%rsi), %r12
+       movq    48(%rsi), %r13
+       movq    56(%rsi), %r14
+
+       prefetcht0      896 - 64(%rdi)
+       prefetcht0      896 -  0(%rdi)
+
+       movq    %rax,   (%rdi)
+       movq    %rbx,  8(%rdi)
+       movq     %r9, 16(%rdi)
+       movq    %r10, 24(%rdi)
+       movq    %r11, 32(%rdi)
+       movq    %r12, 40(%rdi)
+       movq    %r13, 48(%rdi)
+       movq    %r14, 56(%rdi)
+
+       leaq    64(%rsi), %rsi
+       leaq    64(%rdi), %rdi
  
         jnz     L(preloop)
  
  L(prebail):
-       movq    SAVE3 (%rsp), %rbx
+       movq    SAVE3(%rsp), %rbx
         cfi_restore (%rbx)
-       movq    SAVE2 (%rsp), %r12
+       movq    SAVE2(%rsp), %r12
         cfi_restore (%r12)
-       movq    SAVE1 (%rsp), %r13
+       movq    SAVE1(%rsp), %r13
         cfi_restore (%r13)
-       movq    SAVE0 (%rsp), %r14
+       movq    SAVE0(%rsp), %r14
         cfi_restore (%r14)
  
  /*       .p2align 4 */
@@ -466,7 +466,7 @@ L(preskip):
  
         movq    %rdi, %rax
  #else
-       movq    RETVAL (%rsp), %rax
+       movq    RETVAL(%rsp), %rax
         jnz     L(1)
  
         rep
@@ -477,7 +477,7 @@ L(preskip):
  
  L(preafter):
  
-/* Loop to handle huge blocks. */
+/* Handle huge blocks. */
  
  L(NTtry):
  
@@ -486,69 +486,69 @@ L(NT):                                    /* non-temporal 128-byte */
         shrq    $7, %rcx
         jz      L(NTskip)
  
-       movq    %r14, SAVE0 (%rsp)
+       movq    %r14, SAVE0(%rsp)
         cfi_rel_offset (%r14, SAVE0)
-       movq    %r13, SAVE1 (%rsp)
+       movq    %r13, SAVE1(%rsp)
         cfi_rel_offset (%r13, SAVE1)
-       movq    %r12, SAVE2 (%rsp)
+       movq    %r12, SAVE2(%rsp)
         cfi_rel_offset (%r12, SAVE2)
  
         .p2align 4
  
  L(NTloop):
-       prefetchnta     768 (%rsi)
-       prefetchnta     832 (%rsi)
+       prefetchnta     768(%rsi)
+       prefetchnta     832(%rsi)
  
         decq    %rcx
  
-       movq       (%rsi), %rax
-       movq     8 (%rsi), %r8
-       movq    16 (%rsi), %r9
-       movq    24 (%rsi), %r10
-       movq    32 (%rsi), %r11
-       movq    40 (%rsi), %r12
-       movq    48 (%rsi), %r13
-       movq    56 (%rsi), %r14
-
-       movntiq %rax,    (%rdi)
-       movntiq  %r8,  8 (%rdi)
-       movntiq  %r9, 16 (%rdi)
-       movntiq %r10, 24 (%rdi)
-       movntiq %r11, 32 (%rdi)
-       movntiq %r12, 40 (%rdi)
-       movntiq %r13, 48 (%rdi)
-       movntiq %r14, 56 (%rdi)
-
-       movq     64 (%rsi), %rax
-       movq     72 (%rsi), %r8
-       movq     80 (%rsi), %r9
-       movq     88 (%rsi), %r10
-       movq     96 (%rsi), %r11
-       movq    104 (%rsi), %r12
-       movq    112 (%rsi), %r13
-       movq    120 (%rsi), %r14
-
-       movntiq %rax,  64 (%rdi)
-       movntiq  %r8,  72 (%rdi)
-       movntiq  %r9,  80 (%rdi)
-       movntiq %r10,  88 (%rdi)
-       movntiq %r11,  96 (%rdi)
-       movntiq %r12, 104 (%rdi)
-       movntiq %r13, 112 (%rdi)
-       movntiq %r14, 120 (%rdi)
-
-       leaq    128 (%rsi), %rsi
-       leaq    128 (%rdi), %rdi
+       movq      (%rsi), %rax
+       movq     8(%rsi), %r8
+       movq    16(%rsi), %r9
+       movq    24(%rsi), %r10
+       movq    32(%rsi), %r11
+       movq    40(%rsi), %r12
+       movq    48(%rsi), %r13
+       movq    56(%rsi), %r14
+
+       movntiq %rax,   (%rdi)
+       movntiq  %r8,  8(%rdi)
+       movntiq  %r9, 16(%rdi)
+       movntiq %r10, 24(%rdi)
+       movntiq %r11, 32(%rdi)
+       movntiq %r12, 40(%rdi)
+       movntiq %r13, 48(%rdi)
+       movntiq %r14, 56(%rdi)
+
+       movq     64(%rsi), %rax
+       movq     72(%rsi), %r8
+       movq     80(%rsi), %r9
+       movq     88(%rsi), %r10
+       movq     96(%rsi), %r11
+       movq    104(%rsi), %r12
+       movq    112(%rsi), %r13
+       movq    120(%rsi), %r14
+
+       movntiq %rax,  64(%rdi)
+       movntiq  %r8,  72(%rdi)
+       movntiq  %r9,  80(%rdi)
+       movntiq %r10,  88(%rdi)
+       movntiq %r11,  96(%rdi)
+       movntiq %r12, 104(%rdi)
+       movntiq %r13, 112(%rdi)
+       movntiq %r14, 120(%rdi)
+
+       leaq    128(%rsi), %rsi
+       leaq    128(%rdi), %rdi
  
         jnz     L(NTloop)
  
         sfence                          /* serialize memory stores */
  
-       movq    SAVE2 (%rsp), %r12
+       movq    SAVE2(%rsp), %r12
         cfi_restore (%r12)
-       movq    SAVE1 (%rsp), %r13
+       movq    SAVE1(%rsp), %r13
         cfi_restore (%r13)
-       movq    SAVE0 (%rsp), %r14
+       movq    SAVE0(%rsp), %r14
         cfi_restore (%r14)
  
  L(NTskip):
@@ -558,7 +558,7 @@ L(NTskip):
  
         movq    %rdi, %rax
  #else
-       movq    RETVAL (%rsp), %rax
+       movq    RETVAL(%rsp), %rax
         jnz     L(1)
  
         rep
author	Ulrich Drepper <drepper@redhat.com>
	Sat, 22 Sep 2007 05:54:03 +0000 (05:54 +0000)
committer	Ulrich Drepper <drepper@redhat.com>
	Sat, 22 Sep 2007 05:54:03 +0000 (05:54 +0000)
ChangeLog		patch \| blob \| history
sysdeps/x86_64/cacheinfo.c		patch \| blob \| history
sysdeps/x86_64/memcpy.S		patch \| blob \| history