x86_64/memxor3.asm

   1 C x86_64/memxor3.asm
   2
   3 ifelse(<
   4    Copyright (C) 2010, 2014 Niels Möller
   5
   6    This file is part of GNU Nettle.
   7
   8    GNU Nettle is free software: you can redistribute it and/or
   9    modify it under the terms of either:
  10
  11      * the GNU Lesser General Public License as published by the Free
  12        Software Foundation; either version 3 of the License, or (at your
  13        option) any later version.
  14
  15    or
  16
  17      * the GNU General Public License as published by the Free
  18        Software Foundation; either version 2 of the License, or (at your
  19        option) any later version.
  20
  21    or both in parallel, as here.
  22
  23    GNU Nettle is distributed in the hope that it will be useful,
  24    but WITHOUT ANY WARRANTY; without even the implied warranty of
  25    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26    General Public License for more details.
  27
  28    You should have received copies of the GNU General Public License and
  29    the GNU Lesser General Public License along with this program.  If
  30    not, see http://www.gnu.org/licenses/.
  31 >)
  32
  33 C Register usage (m4 aliases for the registers used by nettle_memxor3):
  34 define(<DST>, <%rax>) C Destination pointer; originally in %rdi, also the return value
  35 define(<AP>, <%rsi>) C First source pointer (argument a)
  36 define(<BP>, <%rdx>) C Second source pointer (argument b)
  37 define(<N>, <%r10>) C Byte count, copied from %rcx so that %cl is free for shift counts
  38 define(<TMP>, <%r8>) C Scratch
  39 define(<TMP2>, <%r9>) C Second scratch, used by the unrolled word loop
  40 define(<CNT>, <%rdi>) C Number of bytes past the last 8-byte boundary at the end of dst
  41 define(<S0>, <%r11>) C Shifted case: XOR of two aligned source words, alternates with S1
  42 define(<S1>, <%rdi>) C Shifted case: the other XORed source word; overlaps with CNT
  43
  44 define(<USE_SSE2>, <no>) C Only the value yes enables the SSE2 code in the ifelse sections
  45
  46         .file "memxor3.asm"
  47
  48         .text
  49
  50         C memxor3(void *dst, const void *a, const void *b, size_t n)
  51         C                 %rdi              %rsi              %rdx      %rcx
  52         ALIGN(16)
  53         C XORs the n bytes at a and b into dst, from the last byte towards the first; dst is returned in %rax
  54 PROLOGUE(nettle_memxor3)
  55         W64_ENTRY(4, 0)         C NOTE(review): macro defined outside this file; presumably Win64 ABI entry glue, confirm
  56         C %cl needed for shift count, so move away N
  57         mov     %rcx, N
  58 .Lmemxor3_entry:                C NOTE(review): no jump to this label is visible in this file; confirm it is still needed
  59         test    N, N
  60         C Get number of unaligned bytes at the end
  61         C %rdi is used as CNT, %rax as DST and as return value
  62         mov     %rdi, %rax      C mov leaves the flags from the test above intact
  63         jz      .Ldone          C n == 0: nothing to do, return dst
  64         add     N, CNT          C CNT = dst + n, the end of the destination
  65         and     $7, CNT         C CNT = (dst + n) mod 8
  66
  67         jz      .Laligned       C End of the destination is already 8-byte aligned
  68
  69         cmp     $8, N
  70         jc      .Lfinal_next    C Fewer than 8 bytes in total: do everything bytewise
  71
  72         C FIXME: Instead of this loop, could try cmov with memory
  73         C destination, as a sequence of one 8-bit, one 16-bit and one
  74         C 32-bit operations. (Except that cmov can't do 8-bit ops, so
  75         C that step has to use a conditional).
  76 .Lalign_loop:                   C XOR the CNT trailing bytes one at a time; afterwards DST + N is 8-byte aligned
  77
  78         sub     $1, N
  79         movb    (AP, N), LREG(TMP)      C LREG is defined outside this file; presumably names the low byte of TMP
  80         xorb    (BP, N), LREG(TMP)
  81         movb    LREG(TMP), (DST, N)
  82         sub     $1, CNT
  83         jnz     .Lalign_loop
  84
  85 .Laligned:                      C Here DST + N is 8-byte aligned and N is nonzero
  86 ifelse(USE_SSE2, yes, <
  87         cmp     $16, N
  88         jnc     .Lsse2_case     C At least 16 bytes left: use the SSE2 loop
  89 >)
  90         C Check for the case that AP and BP have the same alignment,
  91         C but different from DST.
  92         mov     AP, TMP
  93         sub     BP, TMP
  94         test    $7, TMP
  95         jnz     .Lno_shift_case C AP and BP differ mod 8: plain word loads, not all of them aligned
  96         mov     AP, %rcx
  97         sub     DST, %rcx
  98         and     $7, %rcx        C %rcx = (AP - DST) mod 8, byte offset of the sources relative to dst
  99         jz      .Lno_shift_case C All three pointers share the same alignment
 100         sub     %rcx, AP        C Round the sources down so that AP + N and BP + N are 8-byte aligned
 101         sub     %rcx, BP
 102         shl     $3, %rcx        C Byte offset to bit count, used via %cl by the shifts below
 103
 104         C Unrolling, with aligned values alternating in S0 and S1
 105         test    $8, N
 106         jnz     .Lshift_odd     C Bit 3 of N set: peel off one word first
 107         mov     (AP, N), S1     C S1 = XOR of the aligned source words just above the next output word
 108         xor     (BP, N), S1
 109         jmp     .Lshift_next
 110
 111 .Lshift_odd:
 112         mov     -8(AP, N), S1
 113         mov     (AP, N), S0
 114         xor     -8(BP, N), S1
 115         xor     (BP, N), S0
 116         mov     S1, TMP
 117         shr     %cl, TMP        C Low part of the output comes from the lower source word
 118         neg     %cl             C 64-bit shifts use the count mod 64, so the negated count acts as 64 - count
 119         shl     %cl, S0         C High part of the output comes from the upper source word
 120         neg     %cl             C Restore the shift count
 121
 122         or      S0, TMP
 123         mov     TMP, -8(DST, N)
 124         sub     $8, N
 125         jz      .Ldone
 126         jmp     .Lshift_next    C S1 again holds the word just above the next output word
 127
 128         ALIGN(16)
 129
 130 .Lshift_loop:                   C Two output words per iteration, at offsets N + 8 and N
 131         mov     8(AP, N), S0
 132         xor     8(BP, N), S0
 133         mov     S0, TMP
 134         shr     %cl, TMP
 135         neg     %cl
 136         shl     %cl, S1         C S1 still holds the word loaded at the old offset, 8 above S0
 137         neg     %cl
 138         or      S1, TMP
 139         mov     TMP, 8(DST, N)
 140
 141         mov     (AP, N), S1
 142         xor     (BP, N), S1
 143         mov     S1, TMP
 144         shr     %cl, TMP
 145         neg     %cl
 146         shl     %cl, S0
 147         neg     %cl
 148         or      S0, TMP
 149         mov     TMP, (DST, N)
 150 .Lshift_next:
 151         sub     $16, N
 152         C FIXME: Handle the case N == 16 specially,
 153         C like in the non-shifted case?
 154 C       ja      .Lshift_loop
 155 C       jz      .Ldone
 156         jnc     .Lshift_loop    C No borrow: at least 16 bytes were left
 157
 158         add     $15, N          C N = remaining bytes - 1; no carry here means nothing remained
 159         jnc     .Ldone
 160
 161         shr     $3, %rcx        C Back to a byte offset, to undo the pointer rounding for the bytewise tail
 162         add     %rcx, AP
 163         add     %rcx, BP
 164         jmp     .Lfinal_loop
 165
 166 .Lno_shift_case:
 167         C Next destination word is -8(DST, N)
 168         C Setup for unrolling
 169         test    $8, N
 170         jz      .Lword_next     C Bit 3 of N clear: go straight to the two-word loop
 171
 172         sub     $8, N
 173         jz      .Lone_word      C N was exactly 8: a single word at offset 0
 174
 175         mov     (AP, N), TMP
 176         xor     (BP, N), TMP
 177         mov     TMP, (DST, N)
 178
 179         jmp     .Lword_next
 180
 181         ALIGN(16)
 182
 183 .Lword_loop:                    C Two words per iteration, at offsets N + 8 and N
 184         mov     8(AP, N), TMP
 185         mov     (AP, N), TMP2
 186         xor     8(BP, N), TMP
 187         xor     (BP, N), TMP2
 188         mov     TMP, 8(DST, N)
 189         mov     TMP2, (DST, N)
 190
 191 .Lword_next:
 192         sub     $16, N
 193         ja      .Lword_loop     C Not zero and no carry
 194         jnz     .Lfinal         C Borrow: fewer than 16 bytes were left
 195
 196         C Final operation is word aligned: N is 0, so the last two words are at offsets 8 and 0
 197         mov     8(AP, N), TMP
 198         xor     8(BP, N), TMP
 199         mov     TMP, 8(DST, N)
 200
 201 .Lone_word:                     C N is 0 here; handles the word at offset 0
 202         mov     (AP, N), TMP
 203         xor     (BP, N), TMP
 204         mov     TMP, (DST, N)
 205
 206         C ENTRY might have been 3 args, too, but it doesn't matter for the exit
 207         W64_EXIT(4, 0)
 208         ret
 209
 210 .Lfinal:
 211         add     $15, N          C N was remaining - 16; now the index of the last remaining byte
 212
 213 .Lfinal_loop:                   C Bytewise tail: handles indices N down to 0
 214         movb    (AP, N), LREG(TMP)
 215         xorb    (BP, N), LREG(TMP)
 216         movb    LREG(TMP), (DST, N)
 217 .Lfinal_next:
 218         sub     $1, N
 219         jnc     .Lfinal_loop    C A borrow means index 0 has already been processed
 220
 221 .Ldone:
 222         C ENTRY might have been 3 args, too, but it doesn't matter for the exit
 223         W64_EXIT(4, 0)
 224         ret
 225
 226 ifelse(USE_SSE2, yes, <
 227
 228 .Lsse2_case:                    C Entered from .Laligned with N at least 16 and DST + N 8-byte aligned
 229         lea     (DST, N), TMP
 230         test    $8, TMP
 231         jz      .Lsse2_next     C DST + N is already 16-byte aligned
 232         sub     $8, N           C Otherwise do one 8-byte word to reach 16-byte alignment
 233         mov     (AP, N), TMP
 234         xor     (BP, N), TMP
 235         mov     TMP, (DST, N)
 236         jmp     .Lsse2_next
 237
 238         ALIGN(16)
 239 .Lsse2_loop:
 240         movdqu  (AP, N), %xmm0  C Unaligned loads: the sources need not be 16-byte aligned
 241         movdqu  (BP, N), %xmm1
 242         pxor    %xmm0, %xmm1
 243         movdqa  %xmm1, (DST, N) C Aligned store: DST + N is kept 16-byte aligned
 244 .Lsse2_next:
 245         sub     $16, N
 246         ja      .Lsse2_loop
 247
 248         C FIXME: See if we can do a full word first, before the
 249         C byte-wise final loop.
 250         jnz     .Lfinal         C Borrow: fewer than 16 bytes were left
 251
 252         C Final operation is aligned: N is 0, so the last 16 bytes start at DST itself
 253         movdqu  (AP), %xmm0
 254         movdqu  (BP), %xmm1
 255         pxor    %xmm0, %xmm1
 256         movdqa  %xmm1, (DST)
 257         C ENTRY might have been 3 args, too, but it doesn't matter for the exit
 258         W64_EXIT(4, 0)
 259         ret
 260 >)
 261
 262
 263 EPILOGUE(nettle_memxor3)