x86_64/memxor.asm

   1 C x86_64/memxor.asm
   2
   3 ifelse(<
   4    Copyright (C) 2010, 2014, Niels Möller
   5
   6    This file is part of GNU Nettle.
   7
   8    GNU Nettle is free software: you can redistribute it and/or
   9    modify it under the terms of either:
  10
  11      * the GNU Lesser General Public License as published by the Free
  12        Software Foundation; either version 3 of the License, or (at your
  13        option) any later version.
  14
  15    or
  16
  17      * the GNU General Public License as published by the Free
  18        Software Foundation; either version 2 of the License, or (at your
  19        option) any later version.
  20
  21    or both in parallel, as here.
  22
  23    GNU Nettle is distributed in the hope that it will be useful,
  24    but WITHOUT ANY WARRANTY; without even the implied warranty of
  25    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26    General Public License for more details.
  27
  28    You should have received copies of the GNU General Public License and
  29    the GNU Lesser General Public License along with this program.  If
  30    not, see http://www.gnu.org/licenses/.
  31 >)
  32
  33 C Register usage:
  34 define(<DST>, <%rax>) C Originally in %rdi
  35 define(<SRC>, <%rsi>)
  36 define(<N>, <%rdx>)
  37 define(<TMP>, <%r8>)
  38 define(<TMP2>, <%r9>)
  39 define(<CNT>, <%rdi>)
  40 define(<S0>, <%r11>)
  41 define(<S1>, <%rdi>) C Overlaps with CNT
  42
  43 define(<USE_SSE2>, <no>)
  44
  45         .file "memxor.asm"
  46
  47         .text
  48
  49         C memxor(void *dst, const void *src, size_t n)
  50         C                 %rdi               %rsi      %rdx
  51         ALIGN(16)
  52
  53 PROLOGUE(nettle_memxor)
  54         W64_ENTRY(3, 0)
  55
  56         test    N, N
  57         C Get number of unaligned bytes at the end
  58         C %rdi is used as CNT, %rax as DST and as return value
  59         mov     %rdi, %rax
  60         jz      .Ldone
  61         add     N, CNT
  62         and     $7, CNT
  63
  64         jz      .Laligned
  65
  66         cmp     $8, N
  67         jc      .Lfinal_next
  68
  69         C FIXME: Instead of this loop, could try cmov with memory
  70         C destination, as a sequence of one 8-bit, one 16-bit and one
  71         C 32-bit operations. (Except that cmov can't do 8-bit ops, so
  72         C that step has to use a conditional).
  73 .Lalign_loop:
  74
  75         sub     $1, N
  76         movb    (SRC, N), LREG(TMP)
  77         xorb    LREG(TMP), (DST, N)
  78         sub     $1, CNT
  79         jnz     .Lalign_loop
  80
  81 .Laligned:
  82 ifdef(<USE_SSE2>, <
  83         cmp     $16, N
  84         jnc     .Lsse2_case
  85 >)
  86
  87         C Next destination word is -8(DST, N)
  88         C Setup for unrolling
  89         test    $8, N
  90         jz      .Lword_next
  91
  92         sub     $8, N
  93         jz      .Lone_word
  94
  95         mov     (SRC, N), TMP
  96         xor     TMP, (DST, N)
  97
  98         jmp     .Lword_next
  99
 100         ALIGN(16)
 101
 102 .Lword_loop:
 103         mov     8(SRC, N), TMP
 104         mov     (SRC, N), TMP2
 105         xor     TMP, 8(DST, N)
 106         xor     TMP2, (DST, N)
 107
 108 .Lword_next:
 109         sub     $16, N
 110         ja      .Lword_loop     C Not zero and no carry
 111         jnz     .Lfinal
 112
 113         C Final operation is word aligned
 114         mov     8(SRC, N), TMP
 115         xor     TMP, 8(DST, N)
 116
 117 .Lone_word:
 118         mov     (SRC, N), TMP
 119         xor     TMP, (DST, N)
 120
 121         W64_EXIT(3, 0)
 122         ret
 123
 124 .Lfinal:
 125         add     $15, N
 126
 127 .Lfinal_loop:
 128         movb    (SRC, N), LREG(TMP)
 129         xorb    LREG(TMP), (DST, N)
 130 .Lfinal_next:
 131         sub     $1, N
 132         jnc     .Lfinal_loop
 133
 134 .Ldone:
 135         W64_EXIT(3, 0)
 136         ret
 137
 138 ifdef(<USE_SSE2>, <
 139
 140 .Lsse2_case:
 141         lea     (DST, N), TMP
 142         test    $8, TMP
 143         jz      .Lsse2_next
 144         sub     $8, N
 145         mov     (SRC, N), TMP
 146         xor     TMP, (DST, N)
 147         jmp     .Lsse2_next
 148
 149         ALIGN(16)
 150 .Lsse2_loop:
 151         movdqu  (SRC, N), %xmm0
 152         movdqa  (DST, N), %xmm1
 153         pxor    %xmm0, %xmm1
 154         movdqa  %xmm1, (DST, N)
 155 .Lsse2_next:
 156         sub     $16, N
 157         ja      .Lsse2_loop
 158
 159         C FIXME: See if we can do a full word first, before the
 160         C byte-wise final loop.
 161         jnz     .Lfinal
 162
 163         C Final operation is aligned
 164         movdqu  (SRC), %xmm0
 165         movdqa  (DST), %xmm1
 166         pxor    %xmm0, %xmm1
 167         movdqa  %xmm1, (DST)
 168
 169         W64_EXIT(3, 0)
 170         ret
 171 >)
 172
 173 EPILOGUE(nettle_memxor)