arch/x86/lib/memcpy_64.S

   1 /* Copyright 2002 Andi Kleen */
   2
   3 #include <linux/linkage.h>
   4 #include <asm/errno.h>
   5 #include <asm/cpufeatures.h>
   6 #include <asm/alternative-asm.h>
   7
   8 /*
   9  * We build a jump to memcpy_orig by default which gets NOPped out on
  10  * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
  11  * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
  12  * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
  13  */
  14
  15 .weak memcpy
  16
  17 /*
  18  * memcpy - Copy a memory block.
  19  *
  20  * Input:
  21  *  rdi destination
  22  *  rsi source
  23  *  rdx count
  24  *
  25  * Output:
  26  * rax original destination
  27  */
  28 ENTRY(__memcpy)
  29 ENTRY(memcpy)
  30         ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
  31                       "jmp memcpy_erms", X86_FEATURE_ERMS
  32
  33         movq %rdi, %rax
  34         movq %rdx, %rcx
  35         shrq $3, %rcx
  36         andl $7, %edx
  37         rep movsq
  38         movl %edx, %ecx
  39         rep movsb
  40         ret
  41 ENDPROC(memcpy)
  42 ENDPROC(__memcpy)
  43
  44 /*
  45  * memcpy_erms() - enhanced fast string memcpy. This is faster and
  46  * simpler than memcpy. Use memcpy_erms when possible.
  47  */
  48 ENTRY(memcpy_erms)
  49         movq %rdi, %rax
  50         movq %rdx, %rcx
  51         rep movsb
  52         ret
  53 ENDPROC(memcpy_erms)
  54
  55 ENTRY(memcpy_orig)
  56         movq %rdi, %rax
  57
  58         cmpq $0x20, %rdx
  59         jb .Lhandle_tail
  60
  61         /*
  62          * We check whether memory false dependence could occur,
  63          * then jump to corresponding copy mode.
  64          */
  65         cmp  %dil, %sil
  66         jl .Lcopy_backward
  67         subq $0x20, %rdx
  68 .Lcopy_forward_loop:
  69         subq $0x20,     %rdx
  70
  71         /*
  72          * Move in blocks of 4x8 bytes:
  73          */
  74         movq 0*8(%rsi), %r8
  75         movq 1*8(%rsi), %r9
  76         movq 2*8(%rsi), %r10
  77         movq 3*8(%rsi), %r11
  78         leaq 4*8(%rsi), %rsi
  79
  80         movq %r8,       0*8(%rdi)
  81         movq %r9,       1*8(%rdi)
  82         movq %r10,      2*8(%rdi)
  83         movq %r11,      3*8(%rdi)
  84         leaq 4*8(%rdi), %rdi
  85         jae  .Lcopy_forward_loop
  86         addl $0x20,     %edx
  87         jmp  .Lhandle_tail
  88
  89 .Lcopy_backward:
  90         /*
  91          * Calculate copy position to tail.
  92          */
  93         addq %rdx,      %rsi
  94         addq %rdx,      %rdi
  95         subq $0x20,     %rdx
  96         /*
  97          * At most 3 ALU operations in one cycle,
  98          * so append NOPS in the same 16 bytes trunk.
  99          */
 100         .p2align 4
 101 .Lcopy_backward_loop:
 102         subq $0x20,     %rdx
 103         movq -1*8(%rsi),        %r8
 104         movq -2*8(%rsi),        %r9
 105         movq -3*8(%rsi),        %r10
 106         movq -4*8(%rsi),        %r11
 107         leaq -4*8(%rsi),        %rsi
 108         movq %r8,               -1*8(%rdi)
 109         movq %r9,               -2*8(%rdi)
 110         movq %r10,              -3*8(%rdi)
 111         movq %r11,              -4*8(%rdi)
 112         leaq -4*8(%rdi),        %rdi
 113         jae  .Lcopy_backward_loop
 114
 115         /*
 116          * Calculate copy position to head.
 117          */
 118         addl $0x20,     %edx
 119         subq %rdx,      %rsi
 120         subq %rdx,      %rdi
 121 .Lhandle_tail:
 122         cmpl $16,       %edx
 123         jb   .Lless_16bytes
 124
 125         /*
 126          * Move data from 16 bytes to 31 bytes.
 127          */
 128         movq 0*8(%rsi), %r8
 129         movq 1*8(%rsi), %r9
 130         movq -2*8(%rsi, %rdx),  %r10
 131         movq -1*8(%rsi, %rdx),  %r11
 132         movq %r8,       0*8(%rdi)
 133         movq %r9,       1*8(%rdi)
 134         movq %r10,      -2*8(%rdi, %rdx)
 135         movq %r11,      -1*8(%rdi, %rdx)
 136         retq
 137         .p2align 4
 138 .Lless_16bytes:
 139         cmpl $8,        %edx
 140         jb   .Lless_8bytes
 141         /*
 142          * Move data from 8 bytes to 15 bytes.
 143          */
 144         movq 0*8(%rsi), %r8
 145         movq -1*8(%rsi, %rdx),  %r9
 146         movq %r8,       0*8(%rdi)
 147         movq %r9,       -1*8(%rdi, %rdx)
 148         retq
 149         .p2align 4
 150 .Lless_8bytes:
 151         cmpl $4,        %edx
 152         jb   .Lless_3bytes
 153
 154         /*
 155          * Move data from 4 bytes to 7 bytes.
 156          */
 157         movl (%rsi), %ecx
 158         movl -4(%rsi, %rdx), %r8d
 159         movl %ecx, (%rdi)
 160         movl %r8d, -4(%rdi, %rdx)
 161         retq
 162         .p2align 4
 163 .Lless_3bytes:
 164         subl $1, %edx
 165         jb .Lend
 166         /*
 167          * Move data from 1 bytes to 3 bytes.
 168          */
 169         movzbl (%rsi), %ecx
 170         jz .Lstore_1byte
 171         movzbq 1(%rsi), %r8
 172         movzbq (%rsi, %rdx), %r9
 173         movb %r8b, 1(%rdi)
 174         movb %r9b, (%rdi, %rdx)
 175 .Lstore_1byte:
 176         movb %cl, (%rdi)
 177
 178 .Lend:
 179         retq
 180 ENDPROC(memcpy_orig)
 181
 182 #ifndef CONFIG_UML
 183 /*
 184  * memcpy_mcsafe - memory copy with machine check exception handling
 185  * Note that we only catch machine checks when reading the source addresses.
 186  * Writes to target are posted and don't generate machine checks.
 187  */
 188 ENTRY(memcpy_mcsafe)
 189         cmpl $8, %edx
 190         /* Less than 8 bytes? Go to byte copy loop */
 191         jb .L_no_whole_words
 192
 193         /* Check for bad alignment of source */
 194         testl $7, %esi
 195         /* Already aligned */
 196         jz .L_8byte_aligned
 197
 198         /* Copy one byte at a time until source is 8-byte aligned */
 199         movl %esi, %ecx
 200         andl $7, %ecx
 201         subl $8, %ecx
 202         negl %ecx
 203         subl %ecx, %edx
 204 .L_copy_leading_bytes:
 205         movb (%rsi), %al
 206         movb %al, (%rdi)
 207         incq %rsi
 208         incq %rdi
 209         decl %ecx
 210         jnz .L_copy_leading_bytes
 211
 212 .L_8byte_aligned:
 213         /* Figure out how many whole cache lines (64-bytes) to copy */
 214         movl %edx, %ecx
 215         andl $63, %edx
 216         shrl $6, %ecx
 217         jz .L_no_whole_cache_lines
 218
 219         /* Loop copying whole cache lines */
 220 .L_cache_w0: movq (%rsi), %r8
 221 .L_cache_w1: movq 1*8(%rsi), %r9
 222 .L_cache_w2: movq 2*8(%rsi), %r10
 223 .L_cache_w3: movq 3*8(%rsi), %r11
 224         movq %r8, (%rdi)
 225         movq %r9, 1*8(%rdi)
 226         movq %r10, 2*8(%rdi)
 227         movq %r11, 3*8(%rdi)
 228 .L_cache_w4: movq 4*8(%rsi), %r8
 229 .L_cache_w5: movq 5*8(%rsi), %r9
 230 .L_cache_w6: movq 6*8(%rsi), %r10
 231 .L_cache_w7: movq 7*8(%rsi), %r11
 232         movq %r8, 4*8(%rdi)
 233         movq %r9, 5*8(%rdi)
 234         movq %r10, 6*8(%rdi)
 235         movq %r11, 7*8(%rdi)
 236         leaq 64(%rsi), %rsi
 237         leaq 64(%rdi), %rdi
 238         decl %ecx
 239         jnz .L_cache_w0
 240
 241         /* Are there any trailing 8-byte words? */
 242 .L_no_whole_cache_lines:
 243         movl %edx, %ecx
 244         andl $7, %edx
 245         shrl $3, %ecx
 246         jz .L_no_whole_words
 247
 248         /* Copy trailing words */
 249 .L_copy_trailing_words:
 250         movq (%rsi), %r8
 251         mov %r8, (%rdi)
 252         leaq 8(%rsi), %rsi
 253         leaq 8(%rdi), %rdi
 254         decl %ecx
 255         jnz .L_copy_trailing_words
 256
 257         /* Any trailing bytes? */
 258 .L_no_whole_words:
 259         andl %edx, %edx
 260         jz .L_done_memcpy_trap
 261
 262         /* Copy trailing bytes */
 263         movl %edx, %ecx
 264 .L_copy_trailing_bytes:
 265         movb (%rsi), %al
 266         movb %al, (%rdi)
 267         incq %rsi
 268         incq %rdi
 269         decl %ecx
 270         jnz .L_copy_trailing_bytes
 271
 272         /* Copy successful. Return zero */
 273 .L_done_memcpy_trap:
 274         xorq %rax, %rax
 275         ret
 276 ENDPROC(memcpy_mcsafe)
 277
 278         .section .fixup, "ax"
 279         /* Return -EFAULT for any failure */
 280 .L_memcpy_mcsafe_fail:
 281         mov     $-EFAULT, %rax
 282         ret
 283
 284         .previous
 285
 286         _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
 287         _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
 288         _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
 289         _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
 290         _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
 291         _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
 292         _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
 293         _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
 294         _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
 295         _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
 296         _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
 297 #endif