arch/x86/lib/memset_64.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /* Copyright 2002 Andi Kleen, SuSE Labs */
   3
   4 #include <linux/linkage.h>
   5 #include <asm/cpufeatures.h>
   6 #include <asm/alternative.h>
   7 #include <asm/export.h>
   8
   9 .section .noinstr.text, "ax"
  10
  11 /*
  12  * ISO C memset - set a memory block to a byte value. This function uses fast
  13  * string to get better performance than the original function. The code is
  14  * simpler and shorter than the original function as well.
  15  *
  16  * rdi   destination
  17  * rsi   value (char)
  18  * rdx   count (bytes)
  19  *
  20  * rax   original destination
  21  *
  22  * The FSRS alternative should be done inline (avoiding the call and
  23  * the disgusting return handling), but that would require some help
  24  * from the compiler for better calling conventions.
  25  *
  26  * The 'rep stosb' itself is small enough to replace the call, but all
  27  * the register moves blow up the code. And two of them are "needed"
  28  * only for the return value that is the same as the destination input,
  29  * which the compiler could/should do much better anyway.
  30  */
  31 SYM_FUNC_START(__memset)                /* in: rdi = dst, sil = fill byte, rdx = byte count; out: rax = dst */
  32         ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS /* NOTE(review): presumably the jmp is patched out on CPUs with FSRS, so only those fall through - confirm in <asm/alternative.h> */
  33
  34         movq %rdi,%r9                   /* rep stosb advances rdi, so keep the original dst in r9 */
  35         movb %sil,%al                   /* al = the byte that stosb stores */
  36         movq %rdx,%rcx                  /* rcx = repeat count consumed by the rep prefix */
  37         rep stosb                       /* store al at (rdi) rcx times; assumes DF is clear on entry - TODO confirm */
  38         movq %r9,%rax                   /* return value = original dst */
  39         RET
  40 SYM_FUNC_END(__memset)
  41 EXPORT_SYMBOL(__memset)
  42
  43 SYM_FUNC_ALIAS(memset, __memset)        /* NOTE(review): presumably makes memset another name for __memset - macro is defined outside this file */
  44 EXPORT_SYMBOL(memset)
  45
  46 SYM_FUNC_START_LOCAL(memset_orig)       /* target of "jmp memset_orig" in __memset; in: rdi = dst, sil = byte, rdx = count; out: rax = dst */
  47         movq %rdi,%r10                  /* r10 = original dst, handed back in rax at .Lende */
  48
  49         /* expand byte value: replicate the fill byte into all 8 bytes of rax */
  50         movzbl %sil,%ecx                /* rcx = fill byte, zero-extended */
  51         movabs $0x0101010101010101,%rax /* 0x01 in each of the 8 bytes */
  52         imulq  %rcx,%rax                /* rax = fill byte repeated 8 times */
  53
  54         /* align dst: r9 = dst & 7, non-zero means dst is not 8-byte aligned */
  55         movl  %edi,%r9d
  56         andl  $7,%r9d
  57         jnz  .Lbad_alignment
  58 .Lafter_bad_alignment:                  /* rdi is 8-byte aligned here, rdx = bytes still to set */
  59
  60         movq  %rdx,%rcx
  61         shrq  $6,%rcx                   /* rcx = number of whole 64-byte chunks */
  62         jz       .Lhandle_tail          /* fewer than 64 bytes: skip the unrolled loop */
  63
  64         .p2align 4
  65 .Lloop_64:                              /* one iteration stores 64 bytes as 8 qwords */
  66         decq  %rcx                      /* sets ZF for the jnz below; the mov/lea in between leave flags alone */
  67         movq  %rax,(%rdi)
  68         movq  %rax,8(%rdi)
  69         movq  %rax,16(%rdi)
  70         movq  %rax,24(%rdi)
  71         movq  %rax,32(%rdi)
  72         movq  %rax,40(%rdi)
  73         movq  %rax,48(%rdi)
  74         movq  %rax,56(%rdi)
  75         leaq  64(%rdi),%rdi             /* dst += 64; lea is used so ZF from decq survives */
  76         jnz    .Lloop_64                /* repeat until the chunk counter reached zero */
  77
  78         /* Handle tail in loops. The loops should be faster than hard
  79            to predict jump tables. */
  80         .p2align 4
  81 .Lhandle_tail:                          /* rdx was not changed by the 64-byte loop; only its low 6 bits matter now */
  82         movl    %edx,%ecx
  83         andl    $63&(~7),%ecx           /* ecx = count & 56: bytes that still form whole qwords */
  84         jz              .Lhandle_7      /* no whole qword left */
  85         shrl    $3,%ecx                 /* ecx = number of qwords to store (1..7) */
  86         .p2align 4
  87 .Lloop_8:                               /* one iteration stores one qword */
  88         decl   %ecx                     /* ZF consumed by the jnz below */
  89         movq  %rax,(%rdi)
  90         leaq  8(%rdi),%rdi              /* dst += 8 without touching flags */
  91         jnz    .Lloop_8
  92
  93 .Lhandle_7:                             /* also entered from .Lbad_alignment when count <= 7 */
  94         andl    $7,%edx                 /* edx = trailing byte count (0..7) */
  95         jz      .Lende                  /* nothing left to store */
  96         .p2align 4
  97 .Lloop_1:                               /* one iteration stores one byte */
  98         decl    %edx                    /* ZF consumed by the jnz below */
  99         movb    %al,(%rdi)
 100         leaq    1(%rdi),%rdi            /* dst += 1 without touching flags */
 101         jnz     .Lloop_1
 102
 103 .Lende:
 104         movq    %r10,%rax               /* return value = original dst saved on entry */
 105         RET
 106
 107 .Lbad_alignment:                        /* dst not 8-byte aligned; r9 = dst & 7 (1..7) */
 108         cmpq $7,%rdx
 109         jbe     .Lhandle_7              /* count <= 7 (unsigned): the byte loop does everything */
 110         movq %rax,(%rdi)        /* unaligned store; count >= 8 here, so all 8 bytes lie inside the buffer */
 111         movq $8,%r8
 112         subq %r9,%r8                    /* r8 = 8 - (dst & 7): bytes up to the next 8-byte boundary */
 113         addq %r8,%rdi                   /* dst is now 8-byte aligned */
 114         subq %r8,%rdx                   /* only those r8 bytes count as done; the rest of the qword is rewritten with the same value later */
 115         jmp .Lafter_bad_alignment
 116 .Lfinal:                                /* NOTE(review): no jump to this label is visible in this file */
 117 SYM_FUNC_END(memset_orig)