}
-/* Half the core cache size for use in memory and string routines, typically
- L1 size. */
-long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2;
+/* Half the data cache size for use in memory and string routines, typically
+ L1 size. */
+long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
/* Shared cache size for use in memory and string routines, typically
- L2 or L3 size. */
+ L2 or L3 size. */
long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
-/* PREFETCHW support flag for use in memory and string routines. */
+/* PREFETCHW support flag for use in memory and string routines. */
int __x86_64_prefetchw attribute_hidden;
unsigned int edx;
int max_cpuid;
int max_cpuid_ex;
- long int core = -1;
+ long int data = -1;
long int shared = -1;
unsigned int level;
unsigned int threads = 0;
/* This spells out "GenuineIntel". */
if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
{
- core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+ data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
- /* Try L3 first. */
+ /* Try L3 first. */
level = 3;
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
if (shared <= 0)
{
- /* Try L2 otherwise. */
+ /* Try L2 otherwise. */
level = 2;
shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
}
/* Figure out the number of logical threads that share the
- highest cache level. */
+ highest cache level. */
if (max_cpuid >= 4)
{
int i = 0;
- /* Query until desired cache level is enumerated. */
+ /* Query until desired cache level is enumerated. */
do
{
asm volatile ("cpuid"
}
else
{
- /* Assume that all logical threads share the highest cache level. */
+ /* Assume that all logical threads share the highest cache level. */
asm volatile ("cpuid"
: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
: "0" (1));
}
/* Cap usage of highest cache level to the number of supported
- threads. */
+ threads. */
if (shared > 0 && threads > 0)
shared /= threads;
}
/* This spells out "AuthenticAMD". */
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
{
- core = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
- shared = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+ long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ /* Get maximum extended function. */
asm volatile ("cpuid"
: "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx)
: "0" (0x80000000));
+ if (shared <= 0)
+ /* No shared L3 cache. All we have is the L2 cache. */
+ shared = core;
+ else
+ {
+ /* Figure out the number of logical threads that share L3. */
+ if (max_cpuid_ex >= 0x80000008)
+ {
+ /* Get width of APIC ID. */
+ asm volatile ("cpuid"
+ : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx),
+ "=d" (edx)
+ : "0" (0x80000008));
+ threads = 1 << ((ecx >> 12) & 0x0f);
+ }
+
+ if (threads == 0)
+ {
+ /* If APIC ID width is not available, use logical
+ processor count. */
+ asm volatile ("cpuid"
+ : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx),
+ "=d" (edx)
+ : "0" (0x00000001));
+
+ if ((edx & (1 << 28)) != 0)
+ threads = (ebx >> 16) & 0xff;
+ }
+
+ /* Cap usage of highest cache level to the number of
+ supported threads. */
+ if (threads > 0)
+ shared /= threads;
+
+ /* Account for exclusive L2 and L3 caches. */
+ shared += core;
+ }
+
if (max_cpuid_ex >= 0x80000001)
{
asm volatile ("cpuid"
: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
: "0" (0x80000001));
- /* PREFETCHW || 3DNow! */
+ /* PREFETCHW || 3DNow! */
if ((ecx & 0x100) || (edx & 0x80000000))
__x86_64_prefetchw = -1;
}
}
- if (core > 0)
- __x86_64_core_cache_size_half = core / 2;
+ if (data > 0)
+ __x86_64_data_cache_size_half = data / 2;
if (shared > 0)
__x86_64_shared_cache_size_half = shared / 2;
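For reference, a minimal standalone C sketch (not part of the patch) of the
arithmetic the new AMD path performs once the CPUID queries above have run:
the per-thread share of the L3 is added to the L2 size because the two levels
are exclusive on these processors.  The core, shared and threads values below
are hypothetical placeholders for what CPUID would actually report.

#include <stdio.h>

int
main (void)
{
  long int core = 512 * 1024;           /* L2 size, e.g. 512 KiB.  */
  long int shared = 2 * 1024 * 1024;    /* L3 size, e.g. 2 MiB.  */
  unsigned int threads = 4;             /* logical threads sharing the L3.  */

  if (shared <= 0)
    /* No shared L3 cache.  All we have is the L2 cache.  */
    shared = core;
  else
    {
      if (threads > 0)
        shared /= threads;              /* per-thread share of the L3 */
      shared += core;                   /* account for exclusive L2 and L3 */
    }

  printf ("per-thread shared cache estimate: %ld bytes\n", shared);
  return 0;
}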
.p2align 4
L(1loop):
- movq (%rsi), %rcx
- movq 8 (%rsi), %r8
- movq %rcx, (%rdi)
- movq %r8, 8 (%rdi)
+ movq (%rsi), %rcx
+ movq 8(%rsi), %r8
+ movq %rcx, (%rdi)
+ movq %r8, 8(%rdi)
subl $16, %edx
- leaq 16 (%rsi), %rsi
- leaq 16 (%rdi), %rdi
+ leaq 16(%rsi), %rsi
+ leaq 16(%rdi), %rdi
jnz L(1loop)
L(1after):
#ifndef USE_AS_MEMPCPY
- movq %rax, RETVAL (%rsp) /* save return value */
+ movq %rax, RETVAL(%rsp) /* save return value */
#endif
/* Align to the natural word size. */
L(aligntry):
- movl %esi, %ecx /* align by destination */
+ movl %esi, %ecx /* align by source */
andl $7, %ecx
jz L(alignafter) /* already aligned */
L(align): /* align */
- leaq -8 (%rcx, %rdx), %rdx /* calculate remaining bytes */
+ leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */
subl $8, %ecx
.p2align 4
incl %ecx
- leaq 1 (%rsi), %rsi
- leaq 1 (%rdi), %rdi
+ leaq 1(%rsi), %rsi
+ leaq 1(%rdi), %rdi
jnz L(alignloop)
L(alignafter):
-/* Loop to handle mid-sized blocks. */
+/* Handle mid-sized blocks. */
L(32try): /* up to 1KB */
cmpq $1024, %rdx
L(32loop):
decl %ecx
- movq (%rsi), %rax
- movq 8 (%rsi), %r8
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
+ movq (%rsi), %rax
+ movq 8(%rsi), %r8
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
- movq %rax, (%rdi)
- movq %r8, 8 (%rdi)
- movq %r9, 16 (%rdi)
- movq %r10, 24 (%rdi)
+ movq %rax, (%rdi)
+ movq %r8, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
leaq 32(%rsi), %rsi
leaq 32(%rdi), %rdi
decl %ecx
- movq (%rsi), %rax
- movq 8 (%rsi), %r8
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
+ movq (%rsi), %rax
+ movq 8(%rsi), %r8
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
- movq %rax, (%rdi)
- movq %r8, 8 (%rdi)
- movq %r9, 16 (%rdi)
- movq %r10, 24 (%rdi)
+ movq %rax, (%rdi)
+ movq %r8, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
- leaq 32 (%rsi), %rsi
- leaq 32 (%rdi), %rdi
+ leaq 32(%rsi), %rsi
+ leaq 32(%rdi), %rdi
jnz L(32loop)
movq %rdi, %rax
#else
- movq RETVAL (%rsp), %rax
+ movq RETVAL(%rsp), %rax
jnz L(1)
-
+
rep
#endif
retq /* exit */
larger blocks are excluded when building for RTLD.
*/
-/* Handle large blocks smaller than 1/2 L1. */
+/* Handle blocks smaller than 1/2 L1. */
L(fasttry): /* first 1/2 L1 */
#ifndef NOT_IN_libc /* only up to this algorithm outside of libc.so */
- movq __x86_64_core_cache_size_half (%rip), %r11
+ movq __x86_64_data_cache_size_half(%rip), %r11
cmpq %rdx, %r11 /* calculate the smaller of */
cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */
#endif
movq %rdi, %rax
#else
- movq RETVAL (%rsp), %rax
+ movq RETVAL(%rsp), %rax
jnz L(1)
rep
shrq $6, %rcx
jz L(preskip)
- movq %r14, SAVE0 (%rsp)
+ movq %r14, SAVE0(%rsp)
cfi_rel_offset (%r14, SAVE0)
- movq %r13, SAVE1 (%rsp)
+ movq %r13, SAVE1(%rsp)
cfi_rel_offset (%r13, SAVE1)
- movq %r12, SAVE2 (%rsp)
+ movq %r12, SAVE2(%rsp)
cfi_rel_offset (%r12, SAVE2)
- movq %rbx, SAVE3 (%rsp)
+ movq %rbx, SAVE3(%rsp)
cfi_rel_offset (%rbx, SAVE3)
- cmpl $0, __x86_64_prefetchw (%rip)
+ cmpl $0, __x86_64_prefetchw(%rip)
jz L(preloop) /* check if PREFETCHW OK */
.p2align 4
prefetcht0 0 + 896 (%rsi)
prefetcht0 64 + 896 (%rsi)
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %r9, 16 (%rdi)
- movq %r10, 24 (%rdi)
- movq %r11, 32 (%rdi)
- movq %r12, 40 (%rdi)
- movq %r13, 48 (%rdi)
- movq %r14, 56 (%rdi)
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
jz L(prebail)
decq %rcx
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
- movq 32 (%rsi), %r11
- movq 40 (%rsi), %r12
- movq 48 (%rsi), %r13
- movq 56 (%rsi), %r14
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %r9, 16 (%rdi)
- movq %r10, 24 (%rdi)
- movq %r11, 32 (%rdi)
- movq %r12, 40 (%rdi)
- movq %r13, 48 (%rdi)
- movq %r14, 56 (%rdi)
-
- prefetchw 896 - 64 (%rdi)
- prefetchw 896 - 0 (%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
+ movq (%rsi), %rax
+ movq 8(%rsi), %rbx
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
+
+ prefetchw 896 - 64(%rdi)
+ prefetchw 896 - 0(%rdi)
+
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
jnz L(prewloop)
jmp L(prebail)
L(preloop): /* cache-line in state E */
decq %rcx
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
- movq 32 (%rsi), %r11
- movq 40 (%rsi), %r12
- movq 48 (%rsi), %r13
- movq 56 (%rsi), %r14
-
- prefetcht0 896 + 0 (%rsi)
- prefetcht0 896 + 64 (%rsi)
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %r9, 16 (%rdi)
- movq %r10, 24 (%rdi)
- movq %r11, 32 (%rdi)
- movq %r12, 40 (%rdi)
- movq %r13, 48 (%rdi)
- movq %r14, 56 (%rdi)
+ movq (%rsi), %rax
+ movq 8(%rsi), %rbx
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ prefetcht0 896 + 0(%rsi)
+ prefetcht0 896 + 64(%rsi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi
decq %rcx
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
- movq 32 (%rsi), %r11
- movq 40 (%rsi), %r12
- movq 48 (%rsi), %r13
- movq 56 (%rsi), %r14
-
- prefetcht0 896 - 64 (%rdi)
- prefetcht0 896 - 0 (%rdi)
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %r9, 16 (%rdi)
- movq %r10, 24 (%rdi)
- movq %r11, 32 (%rdi)
- movq %r12, 40 (%rdi)
- movq %r13, 48 (%rdi)
- movq %r14, 56 (%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
+ movq (%rsi), %rax
+ movq 8(%rsi), %rbx
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ prefetcht0 896 - 64(%rdi)
+ prefetcht0 896 - 0(%rdi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ movq %r11, 32(%rdi)
+ movq %r12, 40(%rdi)
+ movq %r13, 48(%rdi)
+ movq %r14, 56(%rdi)
+
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
jnz L(preloop)
L(prebail):
- movq SAVE3 (%rsp), %rbx
+ movq SAVE3(%rsp), %rbx
cfi_restore (%rbx)
- movq SAVE2 (%rsp), %r12
+ movq SAVE2(%rsp), %r12
cfi_restore (%r12)
- movq SAVE1 (%rsp), %r13
+ movq SAVE1(%rsp), %r13
cfi_restore (%r13)
- movq SAVE0 (%rsp), %r14
+ movq SAVE0(%rsp), %r14
cfi_restore (%r14)
/* .p2align 4 */
movq %rdi, %rax
#else
- movq RETVAL (%rsp), %rax
+ movq RETVAL(%rsp), %rax
jnz L(1)
rep
L(preafter):
-/* Loop to handle huge blocks. */
+/* Handle huge blocks. */
L(NTtry):
shrq $7, %rcx
jz L(NTskip)
- movq %r14, SAVE0 (%rsp)
+ movq %r14, SAVE0(%rsp)
cfi_rel_offset (%r14, SAVE0)
- movq %r13, SAVE1 (%rsp)
+ movq %r13, SAVE1(%rsp)
cfi_rel_offset (%r13, SAVE1)
- movq %r12, SAVE2 (%rsp)
+ movq %r12, SAVE2(%rsp)
cfi_rel_offset (%r12, SAVE2)
.p2align 4
L(NTloop):
- prefetchnta 768 (%rsi)
- prefetchnta 832 (%rsi)
+ prefetchnta 768(%rsi)
+ prefetchnta 832(%rsi)
decq %rcx
- movq (%rsi), %rax
- movq 8 (%rsi), %r8
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
- movq 32 (%rsi), %r11
- movq 40 (%rsi), %r12
- movq 48 (%rsi), %r13
- movq 56 (%rsi), %r14
-
- movntiq %rax, (%rdi)
- movntiq %r8, 8 (%rdi)
- movntiq %r9, 16 (%rdi)
- movntiq %r10, 24 (%rdi)
- movntiq %r11, 32 (%rdi)
- movntiq %r12, 40 (%rdi)
- movntiq %r13, 48 (%rdi)
- movntiq %r14, 56 (%rdi)
-
- movq 64 (%rsi), %rax
- movq 72 (%rsi), %r8
- movq 80 (%rsi), %r9
- movq 88 (%rsi), %r10
- movq 96 (%rsi), %r11
- movq 104 (%rsi), %r12
- movq 112 (%rsi), %r13
- movq 120 (%rsi), %r14
-
- movntiq %rax, 64 (%rdi)
- movntiq %r8, 72 (%rdi)
- movntiq %r9, 80 (%rdi)
- movntiq %r10, 88 (%rdi)
- movntiq %r11, 96 (%rdi)
- movntiq %r12, 104 (%rdi)
- movntiq %r13, 112 (%rdi)
- movntiq %r14, 120 (%rdi)
-
- leaq 128 (%rsi), %rsi
- leaq 128 (%rdi), %rdi
+ movq (%rsi), %rax
+ movq 8(%rsi), %r8
+ movq 16(%rsi), %r9
+ movq 24(%rsi), %r10
+ movq 32(%rsi), %r11
+ movq 40(%rsi), %r12
+ movq 48(%rsi), %r13
+ movq 56(%rsi), %r14
+
+ movntiq %rax, (%rdi)
+ movntiq %r8, 8(%rdi)
+ movntiq %r9, 16(%rdi)
+ movntiq %r10, 24(%rdi)
+ movntiq %r11, 32(%rdi)
+ movntiq %r12, 40(%rdi)
+ movntiq %r13, 48(%rdi)
+ movntiq %r14, 56(%rdi)
+
+ movq 64(%rsi), %rax
+ movq 72(%rsi), %r8
+ movq 80(%rsi), %r9
+ movq 88(%rsi), %r10
+ movq 96(%rsi), %r11
+ movq 104(%rsi), %r12
+ movq 112(%rsi), %r13
+ movq 120(%rsi), %r14
+
+ movntiq %rax, 64(%rdi)
+ movntiq %r8, 72(%rdi)
+ movntiq %r9, 80(%rdi)
+ movntiq %r10, 88(%rdi)
+ movntiq %r11, 96(%rdi)
+ movntiq %r12, 104(%rdi)
+ movntiq %r13, 112(%rdi)
+ movntiq %r14, 120(%rdi)
+
+ leaq 128(%rsi), %rsi
+ leaq 128(%rdi), %rdi
jnz L(NTloop)
sfence /* serialize memory stores */
- movq SAVE2 (%rsp), %r12
+ movq SAVE2(%rsp), %r12
cfi_restore (%r12)
- movq SAVE1 (%rsp), %r13
+ movq SAVE1(%rsp), %r13
cfi_restore (%r13)
- movq SAVE0 (%rsp), %r14
+ movq SAVE0(%rsp), %r14
cfi_restore (%r14)
L(NTskip):
movq %rdi, %rax
#else
- movq RETVAL (%rsp), %rax
+ movq RETVAL(%rsp), %rax
jnz L(1)
rep
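For reference, a minimal sketch (not part of the patch) of the non-temporal
copy pattern that the L(NTtry)/L(NTloop) path implements: movntiq stores that
bypass the cache, followed by a single sfence once the loop is done.  It is
written here with the SSE2 intrinsics _mm_stream_si64 and _mm_sfence; the
function name and argument types are illustrative only.

#include <immintrin.h>
#include <stddef.h>

void
copy_nt (long long *dst, const long long *src, size_t nquads)
{
  for (size_t i = 0; i < nquads; i++)
    /* Equivalent of movntiq: store without polluting the caches.  */
    _mm_stream_si64 (dst + i, src[i]);

  /* Equivalent of the sfence after L(NTloop): make the streaming stores
     globally visible before any later stores.  */
  _mm_sfence ();
}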