[PATCH] x86-64: Add __copy_from_user_nocache

author Andi Kleen <ak@suse.de>

Tue, 13 Feb 2007 12:26:19 +0000 (13:26 +0100)

committer Andi Kleen <andi@basil.nowhere.org>

Tue, 13 Feb 2007 12:26:19 +0000 (13:26 +0100)
author Andi Kleen <ak@suse.de>
Tue, 13 Feb 2007 12:26:19 +0000 (13:26 +0100)
committer Andi Kleen <andi@basil.nowhere.org>
Tue, 13 Feb 2007 12:26:19 +0000 (13:26 +0100)
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c

index 6d77e47..23a7da3 100644 (file)
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
  EXPORT_SYMBOL(__put_user_8);
  
  EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
  EXPORT_SYMBOL(copy_from_user);
  EXPORT_SYMBOL(copy_to_user);
  EXPORT_SYMBOL(__copy_from_user_inatomic);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile

index b78d417..8d5f835 100644 (file)
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
  lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
         usercopy.o getuser.o putuser.o  \
         thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S

new file mode 100644 (file)

index 0000000..4620efb
--- /dev/null
+++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * __copy_user_nocache - memory copy with exception handling that stores
+ * to the destination with movnti (non-temporal hint).
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag: when non-zero, zero the uncopied rest on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+       CFI_STARTPROC
+       pushq %rbx
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rbx, 0
+       pushq %rcx              /* save zero flag; tested at .Lzero_rest */
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rcx, 0
+
+       xorl %eax,%eax          /* success return value and fixup offset base */
+
+#ifdef FIX_ALIGNMENT
+       /* check for bad alignment of destination */
+       movl %edi,%ecx
+       andl $7,%ecx
+       jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+       movq %rdx,%rcx          /* rcx keeps the byte count, rdx counts blocks */
+
+       movl $64,%ebx           /* block size, read by the main loop fixup */
+       shrq $6,%rdx
+       decq %rdx               /* rdx = full blocks left after the current one */
+       js   .Lhandle_tail      /* no full 64-byte block */
+
+       .p2align 4
+.Lloop:
+.Ls1:  movq (%rsi),%r11
+.Ls2:  movq 1*8(%rsi),%r8
+.Ls3:  movq 2*8(%rsi),%r9
+.Ls4:  movq 3*8(%rsi),%r10
+.Ld1:  movnti %r11,(%rdi)
+.Ld2:  movnti %r8,1*8(%rdi)
+.Ld3:  movnti %r9,2*8(%rdi)
+.Ld4:  movnti %r10,3*8(%rdi)
+
+.Ls5:  movq 4*8(%rsi),%r11
+.Ls6:  movq 5*8(%rsi),%r8
+.Ls7:  movq 6*8(%rsi),%r9
+.Ls8:  movq 7*8(%rsi),%r10
+.Ld5:  movnti %r11,4*8(%rdi)
+.Ld6:  movnti %r8,5*8(%rdi)
+.Ld7:  movnti %r9,6*8(%rdi)
+.Ld8:  movnti %r10,7*8(%rdi)
+
+       dec  %rdx
+
+       leaq 64(%rsi),%rsi      /* lea leaves the flags of dec for jns */
+       leaq 64(%rdi),%rdi
+
+       jns  .Lloop
+
+       .p2align 4
+.Lhandle_tail:
+       movl %ecx,%edx          /* edx: byte count, ecx becomes quad word counter */
+       andl $63,%ecx
+       shrl $3,%ecx
+       jz   .Lhandle_7
+       movl $8,%ebx
+       .p2align 4
+.Lloop_8:
+.Ls9:  movq (%rsi),%r8
+.Ld9:  movnti %r8,(%rdi)
+       decl %ecx
+       leaq 8(%rdi),%rdi       /* lea leaves the flags of decl for jnz */
+       leaq 8(%rsi),%rsi
+       jnz .Lloop_8
+
+.Lhandle_7:
+       /* count the byte tail in edx so a fault sees the exact remainder */
+       andl $7,%edx
+       jz   .Lende
+       .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+       incq %rdi
+       incq %rsi
+       decl %edx
+       jnz .Lloop_1
+
+       CFI_REMEMBER_STATE
+.Lende:
+       popq %rcx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rcx
+       popq %rbx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rbx
+       ret
+       CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+       /* align destination */
+       .p2align 4
+.Lbad_alignment:
+       movl $8,%r9d
+       subl %ecx,%r9d          /* r9: bytes up to the next 8-byte boundary */
+       movl %r9d,%ecx
+       cmpq %r9,%rdx
+       jz   .Lhandle_7         /* count does not reach past the boundary: */
+       js   .Lhandle_7         /* copy it all in the byte loop */
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+       incq %rsi
+       incq %rdi
+       decq %rdx               /* keep rdx = bytes left, exact if we fault */
+       decl %ecx
+       jnz .Lalign_1
+       jmp .Lafter_bad_alignment
+#endif
+
+       /* table sorted by exception address */
+       .section __ex_table,"a"
+       .align 8
+       .quad .Ls1,.Ls1e
+       .quad .Ls2,.Ls1e        /* load fault: first half not stored yet */
+       .quad .Ls3,.Ls1e
+       .quad .Ls4,.Ls1e
+       .quad .Ld1,.Ls1e
+       .quad .Ld2,.Ls2e
+       .quad .Ld3,.Ls3e
+       .quad .Ld4,.Ls4e
+       .quad .Ls5,.Ls5e
+       .quad .Ls6,.Ls5e        /* load fault: second half not stored yet */
+       .quad .Ls7,.Ls5e
+       .quad .Ls8,.Ls5e
+       .quad .Ld5,.Ls5e
+       .quad .Ld6,.Ls6e
+       .quad .Ld7,.Ls7e
+       .quad .Ld8,.Ls8e
+       .quad .Ls9,.Le_quad
+       .quad .Ld9,.Le_quad
+       .quad .Ls10,.Lzero_rest /* edx holds the exact byte remainder */
+       .quad .Ld10,.Lzero_rest
+#ifdef FIX_ALIGNMENT
+       .quad .Ls11,.Lzero_rest /* rdx holds the exact byte remainder */
+       .quad .Ld11,.Lzero_rest
+#endif
+       .quad .Le5,.Le_zero
+       .previous
+
+       /* main loop fixup: eax = bytes of the current 64-byte block not yet
+          stored (a multiple of 8).  Load faults enter at .Ls1e/.Ls5e since
+          each group of four loads runs before its four stores. */
+       /* eax: zero, ebx: 64, rdx: blocks left after this one, rcx: count */
+.Ls1e:         addl $8,%eax
+.Ls2e:         addl $8,%eax
+.Ls3e:         addl $8,%eax
+.Ls4e:         addl $8,%eax
+.Ls5e:         addl $8,%eax
+.Ls6e:         addl $8,%eax
+.Ls7e:         addl $8,%eax
+.Ls8e:         addl $8,%eax
+       addq %rbx,%rdi  /* +64 */
+       subq %rax,%rdi  /* correct destination with computed offset */
+
+       shlq $6,%rdx    /* loop counter * 64 (stride length) */
+       addq %rax,%rdx  /* add offset to loopcnt */
+       andl $63,%ecx   /* remaining bytes */
+       addq %rcx,%rdx  /* add them */
+       jmp .Lzero_rest
+
+       /* exception on quad word loop in tail handling */
+       /* ecx: quad words left, %edx: length, rdi: correct */
+.Le_quad:
+       shll $3,%ecx
+       andl $7,%edx
+       addl %ecx,%edx
+       /* edx: bytes left (to zero and to report), rdi: dest */
+.Lzero_rest:
+       cmpl $0,(%rsp)  /* zero flag set? */
+       jz   .Le_zero
+       movq %rdx,%rcx
+       /* al = 0 is the fill byte; eax may still hold the block offset */
+       xorl %eax,%eax
+.Le5:  rep
+       stosb
+       /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+       movq %rdx,%rax
+       jmp .Lende
+       CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h

index 8079e29..1981f70 100644 (file)
--- a/include/asm-x86_64/uaccess.h
+++ b/include/asm-x86_64/uaccess.h
@@ -367,4 +367,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
         return copy_user_generic((__force void *)dst, src, size);
  }
  
+#define ARCH_HAS_NOCACHE_UACCESS 1
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest);
+/* Calls might_sleep(), then __copy_user_nocache() with zerorest = 1; the long result is returned as int. */
+static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
+{
+       might_sleep();
+       return __copy_user_nocache(dst, (__force void *)src, size, 1);
+}
+/* Calls __copy_user_nocache() with zerorest = 0 and without might_sleep(); the long result is returned as int. */
+static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size)
+{
+       return __copy_user_nocache(dst, (__force void *)src, size, 0);
+}
+
  #endif /* __X86_64_UACCESS_H */
author	Andi Kleen <ak@suse.de>
	Tue, 13 Feb 2007 12:26:19 +0000 (13:26 +0100)
committer	Andi Kleen <andi@basil.nowhere.org>
	Tue, 13 Feb 2007 12:26:19 +0000 (13:26 +0100)
arch/x86_64/kernel/x8664_ksyms.c		patch \| blob \| history
arch/x86_64/lib/Makefile		patch \| blob \| history
arch/x86_64/lib/copy_user_nocache.S	[new file with mode: 0644]	patch \| blob
include/asm-x86_64/uaccess.h		patch \| blob \| history