x86_64/memxor.asm

   1 C nettle, low-level cryptographics library
   2 C
   3 C Copyright (C) 2010, Niels Möller
   4 C
   5 C The nettle library is free software; you can redistribute it and/or modify
   6 C it under the terms of the GNU Lesser General Public License as published by
   7 C the Free Software Foundation; either version 2.1 of the License, or (at your
   8 C option) any later version.
   9 C
  10 C The nettle library is distributed in the hope that it will be useful, but
  11 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  12 C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  13 C License for more details.
  14 C
  15 C You should have received a copy of the GNU Lesser General Public License
  16 C along with the nettle library; see the file COPYING.LIB.  If not, write to
  17 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 C MA 02111-1301, USA.
  19
   20 C Register usage:
   21 define(<DST>, <%rax>) C Originally in %rdi
   22 define(<AP>, <%rsi>) C First source pointer
   23 define(<BP>, <%rdx>) C Second source pointer
   24 define(<N>, <%r10>) C Bytes left to do, also the offset added to DST, AP and BP
   25 define(<TMP>, <%r8>) C Scratch
   26 define(<TMP2>, <%r9>) C Second scratch, used by the unrolled word loop
   27 define(<CNT>, <%rdi>) C Number of unaligned bytes at the end of dst
   28 define(<S0>, <%r11>) C S0 and S1 hold alternating source words in the shifted loop
   29 define(<S1>, <%rdi>) C Overlaps with CNT
   30
   31 define(<USE_SSE2>, <no>) C Defining this as yes enables the .Lsse2_case code
   32
   33         .file "memxor.asm"
   34
   35         .text
  36
  37         C memxor(uint8_t *dst, const uint8_t *src, size_t n)
  38         C                 %rdi               %rsi      %rdx
  39         ALIGN(16)
  40
  41 PROLOGUE(memxor)
  42         W64_ENTRY(3, 0)
  43         mov     %rdx, %r10
  44         mov     %rdi, %rdx
  45         jmp     .Lmemxor3_entry
  46 EPILOGUE(memxor)
  47
   48         C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
   49         C                 %rdi              %rsi              %rdx      %rcx
              C
              C Stores a[i] ^ b[i] into dst[i] for n bytes, working from the
              C end of the buffers towards the start: N is both the number of
              C bytes still to do and the offset used in (AP, N), (BP, N) and
              C (DST, N). dst is returned in %rax.
              C
              C Phases:
              C   1. Bytewise until DST + N is a multiple of 8 (.Lalign_loop).
              C   2. 8-byte words, two per iteration. If AP and BP share an
              C      alignment that differs from that of DST, aligned source
              C      words are combined using shifts (.Lshift_loop); otherwise
              C      the words are loaded directly (.Lword_loop).
              C   3. Bytewise for whatever is left at the start (.Lfinal_loop).
   50         ALIGN(16)
   51
   52 PROLOGUE(memxor3)
   53         W64_ENTRY(4, 0)
   54         C %cl needed for shift count, so move away N
   55         mov     %rcx, N
              C memxor jumps in here with N and BP already set up.
   56 .Lmemxor3_entry:
   57         test    N, N
   58         C Get number of unaligned bytes at the end
   59         C %rdi is used as CNT, %rax as DST and as return value
   60         mov     %rdi, %rax      C mov keeps the flags set by the test
   61         jz      .Ldone          C n == 0, nothing to do
   62         add     N, CNT
   63         and     $7, CNT         C CNT = (dst + n) mod 8
   64
   65         jz      .Laligned       C End of dst is already 8-byte aligned
   66
   67         cmp     $8, N
   68         jc      .Lfinal_next    C n below 8: do all bytes in the final loop
   69
   70         C FIXME: Instead of this loop, could try cmov with memory
   71         C destination, as a sequence of one 8-bit, one 16-bit and one
   72         C 32-bit operations. (Except that cmov can't do 8-bit ops, so
   73         C that step has to use a conditional).
              C
              C XOR the CNT trailing bytes one at a time. N is at least 8 and
              C CNT at most 7 here, so N stays positive. Afterwards DST + N is
              C a multiple of 8.
              C LREG is defined elsewhere; presumably it names the byte-sized
              C low part of its argument register.
   74 .Lalign_loop:
   75
   76         sub     $1, N
   77         movb    (AP, N), LREG(TMP)
   78         xorb    (BP, N), LREG(TMP)
   79         movb    LREG(TMP), (DST, N)
   80         sub     $1, CNT
   81         jnz     .Lalign_loop
   82
   83 .Laligned:
              C With USE_SSE2 defined as yes, 16 bytes or more go to .Lsse2_case.
   84 ifelse(USE_SSE2, yes, <
   85         cmp     $16, N
   86         jnc     .Lsse2_case
   87 >)
   88         C Check for the case that AP and BP have the same alignment,
   89         C but different from DST.
   90         mov     AP, TMP
   91         sub     BP, TMP
   92         test    $7, TMP
   93         jnz     .Lno_shift_case         C AP and BP are aligned differently
   94         mov     AP, %rcx
   95         sub     DST, %rcx
   96         and     $7, %rcx                C %rcx = (AP - DST) mod 8
   97         jz      .Lno_shift_case         C Sources are aligned like DST
              C Step AP and BP back by %rcx bytes, so that (AP, N) and (BP, N)
              C are 8-byte aligned like (DST, N), then turn %rcx into the
              C corresponding shift count in bits.
              C NOTE(review): the aligned word loads in the shifted code can
              C include up to 7 bytes outside the a and b buffers, always
              C within an aligned word that also holds a valid byte. Confirm
              C that this is acceptable for the callers and tools in use.
   98         sub     %rcx, AP
   99         sub     %rcx, BP
  100         shl     $3, %rcx
  101
  102         C Unrolling, with aligned values alternating in S0 and S1
  103         test    $8, N
  104         jnz     .Lshift_odd     C Odd number of whole words left
              C Even: preload S1 with the a ^ b word at offset N; the loop
              C combines it with the word below it.
  105         mov     (AP, N), S1
  106         xor     (BP, N), S1
  107         jmp     .Lshift_next
  108
              C Do a single word first. S1 = a ^ b word at offset N - 8,
              C S0 = a ^ b word at offset N. The destination word is S1
              C shifted right by %cl bits, combined with S0 shifted left by
              C 64 - %cl bits.
  109 .Lshift_odd:
  110         mov     -8(AP, N), S1
  111         mov     (AP, N), S0
  112         xor     -8(BP, N), S1
  113         xor     (BP, N), S0
  114         mov     S1, TMP
  115         shr     %cl, TMP
  116         neg     %cl             C 64-bit shifts use the count mod 64, so this acts as 64 - %cl
  117         shl     %cl, S0
  118         neg     %cl             C Restore the original shift count
  119
  120         or      S0, TMP
  121         mov     TMP, -8(DST, N)
  122         sub     $8, N
  123         jz      .Ldone
  124         jmp     .Lshift_next    C S1 is the word at the new offset N
  125
  126         ALIGN(16)
  127
              C Each iteration produces two destination words. On entry S1
              C holds the a ^ b word at 16(AP, N); when falling into
              C .Lshift_next it holds the word at (AP, N).
  128 .Lshift_loop:
  129         mov     8(AP, N), S0
  130         xor     8(BP, N), S0
  131         mov     S0, TMP
  132         shr     %cl, TMP
  133         neg     %cl
  134         shl     %cl, S1
  135         neg     %cl
  136         or      S1, TMP
  137         mov     TMP, 8(DST, N)
  138
  139         mov     (AP, N), S1
  140         xor     (BP, N), S1
  141         mov     S1, TMP
  142         shr     %cl, TMP
  143         neg     %cl
  144         shl     %cl, S0
  145         neg     %cl
  146         or      S0, TMP
  147         mov     TMP, (DST, N)
  148 .Lshift_next:
  149         sub     $16, N
  150         C FIXME: Handle the case N == 16 specially,
  151         C like in the non-shifted case?
  152 C       ja      .Lshift_loop
  153 C       jz      .Ldone
  154         jnc     .Lshift_loop    C No borrow: 16 or more bytes were left
  155
  156         add     $15, N          C N = leftover bytes - 1
  157         jnc     .Ldone          C No carry: there were no leftover bytes
  158
              C Undo the pointer adjustment (%rcx goes back from bits to
              C bytes) so the byte loop reads from the real source addresses.
  159         shr     $3, %rcx
  160         add     %rcx, AP
  161         add     %rcx, BP
  162         jmp     .Lfinal_loop
  163
  164 .Lno_shift_case:
  165         C Next destination word is -8(DST, N)
  166         C Setup for unrolling
  167         test    $8, N
  168         jz      .Lword_next     C Even number of whole words left
  169
  170         sub     $8, N
  171         jz      .Lone_word      C Exactly one word left, at offset 0
  172
              C Odd number of words: do one word first, then enter the loop.
  173         mov     (AP, N), TMP
  174         xor     (BP, N), TMP
  175         mov     TMP, (DST, N)
  176
  177         jmp     .Lword_next
  178
  179         ALIGN(16)
  180
              C Two words per iteration, at offsets N + 8 and N.
  181 .Lword_loop:
  182         mov     8(AP, N), TMP
  183         mov     (AP, N), TMP2
  184         xor     8(BP, N), TMP
  185         xor     (BP, N), TMP2
  186         mov     TMP, 8(DST, N)
  187         mov     TMP2, (DST, N)
  188
  189 .Lword_next:
  190         sub     $16, N
  191         ja      .Lword_loop     C Not zero and no carry
  192         jnz     .Lfinal         C Borrow: fewer than 16 bytes were left
  193
  194         C Final operation is word aligned
              C N is zero here: exactly 16 bytes were left, done as two words.
  195         mov     8(AP, N), TMP
  196         xor     8(BP, N), TMP
  197         mov     TMP, 8(DST, N)
  198
  199 .Lone_word:
  200         mov     (AP, N), TMP
  201         xor     (BP, N), TMP
  202         mov     TMP, (DST, N)
  203
  204         C ENTRY might have been 3 args, too, but it doesn't matter for the exit
  205         W64_EXIT(4, 0)
  206         ret
  207
  208 .Lfinal:
  209         add     $15, N          C N = leftover bytes - 1, offset of the last byte to do
  210
              C Bytewise tail: handles offsets N down to 0.
  211 .Lfinal_loop:
  212         movb    (AP, N), LREG(TMP)
  213         xorb    (BP, N), LREG(TMP)
  214         movb    LREG(TMP), (DST, N)
  215 .Lfinal_next:
  216         sub     $1, N
  217         jnc     .Lfinal_loop    C Stop once N borrows below zero
  218
  219 .Ldone:
  220         C ENTRY might have been 3 args, too, but it doesn't matter for the exit
  221         W64_EXIT(4, 0)
  222         ret
  223
              C The code below is only assembled when USE_SSE2 is defined as yes.
  224 ifelse(USE_SSE2, yes, <
  225
  226 .Lsse2_case:
              C Reached from .Laligned with N of 16 or more and DST + N a
              C multiple of 8. If DST + N is not also a multiple of 16, do
              C one 8-byte word first.
  227         lea     (DST, N), TMP
  228         test    $8, TMP
  229         jz      .Lsse2_next     C DST + N is already 16-byte aligned
  230         sub     $8, N
  231         mov     (AP, N), TMP
  232         xor     (BP, N), TMP
  233         mov     TMP, (DST, N)
  234         jmp     .Lsse2_next
  235
  236         ALIGN(16)
              C 16 bytes per iteration: unaligned loads from both sources,
              C aligned store to the destination.
  237 .Lsse2_loop:
  238         movdqu  (AP, N), %xmm0
  239         movdqu  (BP, N), %xmm1
  240         pxor    %xmm0, %xmm1
  241         movdqa  %xmm1, (DST, N)
  242 .Lsse2_next:
  243         sub     $16, N
  244         ja      .Lsse2_loop     C Not zero and no borrow
  245
  246         C FIXME: See if we can do a full word first, before the
  247         C byte-wise final loop.
  248         jnz     .Lfinal         C Borrow: fewer than 16 bytes were left
  249
  250         C Final operation is aligned
              C N is zero here, so the last 16 bytes start at offset 0.
  251         movdqu  (AP), %xmm0
  252         movdqu  (BP), %xmm1
  253         pxor    %xmm0, %xmm1
  254         movdqa  %xmm1, (DST)
  255         C ENTRY might have been 3 args, too, but it doesn't matter for the exit
  256         W64_EXIT(4, 0)
  257         ret
  258 >)
  259
  260
  261 EPILOGUE(memxor3)