x86: improve on the non-rep 'clear_user' function
author: Linus Torvalds <torvalds@linux-foundation.org>
Sun, 16 Apr 2023 21:06:58 +0000 (14:06 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Wed, 19 Apr 2023 00:05:28 +0000 (17:05 -0700)
The old version was oddly written to have the repeat count in multiple
registers.  So instead of taking advantage of %rax being zero, it had
some sub-counts in it.  All just for a "single word clearing" loop,
which isn't even efficient to begin with.

So get rid of those games, and just keep all the state in the same
registers we got it in (and that we should return things in).  That not
only makes this act much more like 'rep stos' (which this function is
replacing), but makes it much easier to actually do the obvious loop
unrolling.

Also rename the function from the now nonsensical 'clear_user_original'
to what it now clearly is: 'rep_stos_alternative'.

End result: if we don't have a fast 'rep stosb', at least we can have a
fast fallback for it.
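
For context only (not part of this commit): a minimal C sketch of how the
fallback is reached, assuming the usual clear_user() entry point; the helper
name below is made up for illustration.

	#include <linux/uaccess.h>

	/* Hypothetical helper: zero a user-space buffer. */
	static int zero_user_buffer(void __user *buf, size_t len)
	{
		/*
		 * clear_user() returns the number of bytes that could not be
		 * cleared; 0 means the whole range was zeroed.  On x86-64 it
		 * lands in __clear_user(), whose inline asm is patched to
		 * either "rep stosb" (X86_FEATURE_FSRS) or a call to
		 * rep_stos_alternative.
		 */
		if (clear_user(buf, len))
			return -EFAULT;
		return 0;
	}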

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
arch/x86/include/asm/uaccess_64.h
arch/x86/lib/clear_page_64.S
tools/objtool/check.c

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8cc918a..a0533e6 100644
@@ -83,7 +83,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
  */
 
 __must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);
 
 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep stosb",
-                           "call clear_user_original", ALT_NOT(X86_FEATURE_FSRS))
+                           "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
                "2:\n"
               _ASM_EXTABLE_UA(1b, 2b)
               : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index fcd01b9..f74a3e7 100644
@@ -57,59 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
  * Input:
  * rdi destination
  * rcx count
+ * rax is zero
  *
  * Output:
  * rcx: uncleared bytes or 0 if successful.
  */
-SYM_FUNC_START(clear_user_original)
-       /*
-        * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
-        * i.e., no need for a 'q' suffix and thus a REX prefix.
-        */
-       mov %ecx,%eax
-       shr $3,%rcx
-       jz .Lrest_bytes
+SYM_FUNC_START(rep_stos_alternative)
+       cmpq $64,%rcx
+       jae .Lunrolled
 
-       # do the qwords first
-       .p2align 4
-.Lqwords:
-       movq $0,(%rdi)
-       lea 8(%rdi),%rdi
-       dec %rcx
-       jnz .Lqwords
+       cmp $8,%ecx
+       jae .Lword
 
-.Lrest_bytes:
-       and $7,  %eax
-       jz .Lexit
+       testl %ecx,%ecx
+       je .Lexit
 
-       # now do the rest bytes
-.Lbytes:
-       movb $0,(%rdi)
+.Lclear_user_tail:
+0:     movb %al,(%rdi)
        inc %rdi
-       dec %eax
-       jnz .Lbytes
-
+       dec %rcx
+       jnz .Lclear_user_tail
 .Lexit:
-       /*
-        * %rax still needs to be cleared in the exception case because this function is called
-        * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
-        * in case it might reuse it somewhere.
-        */
-        xor %eax,%eax
-        RET
+       RET
 
-.Lqwords_exception:
-        # convert remaining qwords back into bytes to return to caller
-        shl $3, %rcx
-        and $7, %eax
-        add %rax,%rcx
-        jmp .Lexit
+       _ASM_EXTABLE_UA( 0b, .Lexit)
 
-.Lbytes_exception:
-        mov %eax,%ecx
-        jmp .Lexit
+.Lword:
+1:     movq %rax,(%rdi)
+       addq $8,%rdi
+       sub $8,%ecx
+       je .Lexit
+       cmp $8,%ecx
+       jae .Lword
+       jmp .Lclear_user_tail
 
-        _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
-        _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
-SYM_FUNC_END(clear_user_original)
-EXPORT_SYMBOL(clear_user_original)
+       .p2align 4
+.Lunrolled:
+10:    movq %rax,(%rdi)
+11:    movq %rax,8(%rdi)
+12:    movq %rax,16(%rdi)
+13:    movq %rax,24(%rdi)
+14:    movq %rax,32(%rdi)
+15:    movq %rax,40(%rdi)
+16:    movq %rax,48(%rdi)
+17:    movq %rax,56(%rdi)
+       addq $64,%rdi
+       subq $64,%rcx
+       cmpq $64,%rcx
+       jae .Lunrolled
+       cmpl $8,%ecx
+       jae .Lword
+       testl %ecx,%ecx
+       jne .Lclear_user_tail
+       RET
+
+       /*
+        * If we take an exception on any of the
+        * word stores, we know that %rcx isn't zero,
+        * so we can just go to the tail clearing to
+        * get the exact count.
+        *
+        * The unrolled case might end up clearing
+        * some bytes twice. Don't care.
+        *
+        * We could use the value in %rdi to avoid
+        * a second fault on the exact count case,
+        * but do we really care? No.
+        *
+        * Finally, we could try to align %rdi at the
+        * top of the unrolling. But unaligned stores
+        * just aren't that common or expensive.
+        */
+       _ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(10b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(11b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(12b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(13b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(14b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(15b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(16b, .Lclear_user_tail)
+       _ASM_EXTABLE_UA(17b, .Lclear_user_tail)
+SYM_FUNC_END(rep_stos_alternative)
+EXPORT_SYMBOL(rep_stos_alternative)
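
Readability aid only (not part of the patch): the new routine's control flow
corresponds roughly to the C sketch below; the function name is made up, and
the real code additionally recovers from a faulting word store by dropping
into the byte tail, so %rcx ends up holding the exact uncleared count.

	/* Illustrative C equivalent of rep_stos_alternative(). */
	static unsigned long rep_stos_sketch(unsigned char *dst, unsigned long len)
	{
		while (len >= 64) {		/* .Lunrolled: eight 8-byte stores */
			for (int i = 0; i < 8; i++)
				((unsigned long *)dst)[i] = 0;
			dst += 64;
			len -= 64;
		}
		while (len >= 8) {		/* .Lword: single 8-byte stores */
			*(unsigned long *)dst = 0;
			dst += 8;
			len -= 8;
		}
		while (len) {			/* .Lclear_user_tail: byte stores */
			*dst++ = 0;
			len--;
		}
		return len;			/* 0 on success; a fault returns the remainder */
	}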
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 44817bb..ac96c99 100644
@@ -1284,7 +1284,7 @@ static const char *uaccess_safe_builtin[] = {
        "copy_mc_fragile_handle_tail",
        "copy_mc_enhanced_fast_string",
        "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
-       "clear_user_original",
+       "rep_stos_alternative",
        "copy_user_generic_unrolled",
        "__copy_user_nocache",
        NULL