arm/memxor.asm

   1 C arm/memxor.asm
   2
   3 ifelse(<
   4    Copyright (C) 2013 Niels Möller
   5
   6    This file is part of GNU Nettle.
   7
   8    GNU Nettle is free software: you can redistribute it and/or
   9    modify it under the terms of either:
  10
  11      * the GNU Lesser General Public License as published by the Free
  12        Software Foundation; either version 3 of the License, or (at your
  13        option) any later version.
  14
  15    or
  16
  17      * the GNU General Public License as published by the Free
  18        Software Foundation; either version 2 of the License, or (at your
  19        option) any later version.
  20
  21    or both in parallel, as here.
  22
  23    GNU Nettle is distributed in the hope that it will be useful,
  24    but WITHOUT ANY WARRANTY; without even the implied warranty of
  25    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26    General Public License for more details.
  27
  28    You should have received copies of the GNU General Public License and
  29    the GNU Lesser General Public License along with this program.  If
  30    not, see http://www.gnu.org/licenses/.
  31 >)
  32
  33 C Possible speedups:
  34 C
  35 C The ldm instruction can load two registers per cycle
  36 C if the address is two-word aligned, or three registers in two
  37 C cycles regardless of alignment.
  38
  39 C Register usage:
  40
     C DST, SRC and N carry the dst, src and n arguments of memxor and are
     C updated in place as bytes are processed.
     C CNT and TNC are the right and left shift counts, in bits, used only
     C on the path where SRC is not word aligned; they always sum to 32.
     C Outside that path r6 and r12 are used as ordinary data registers.
  41 define(<DST>, <r0>)
  42 define(<SRC>, <r1>)
  43 define(<N>, <r2>)
  44 define(<CNT>, <r6>)
  45 define(<TNC>, <r12>)
  46
  47         .syntax unified         C unified ARM/Thumb syntax: flags are set only by s-suffixed instructions
  48
  49         .file "memxor.asm"
  50
  51         .text
  52         .arm                    C assemble the code below as 32-bit ARM instructions
  53
  54         C memxor(void *dst, const void *src, size_t n)
             C
             C XOR the n bytes at src into the n bytes at dst, i.e.
             C dst[i] ^= src[i] for every i below n.
             C
             C On entry DST (r0) = dst, SRC (r1) = src, N (r2) = n. DST and SRC
             C are advanced as bytes are processed. N is a running byte count
             C that several paths keep biased by a constant; the bias is noted
             C at each label.
             C
             C Strategy:
             C   n below 7     plain byte loop.
             C   n at least 7  single byte steps until DST is word aligned, then
             C                 either .Lmemxor_same (SRC is word aligned too,
             C                 12 bytes per iteration with ldm/stm) or a word
             C                 loop that shifts and merges two aligned source
             C                 words per destination word.
             C
             C r3 and r12 are used as scratch without being saved. r4-r8, r10,
             C r11 and r14 are pushed before use and popped before returning.
             C
             C NOTE(review): every store writes back to DST, so r0 holds
             C dst + n on return rather than dst. Confirm that no caller
             C expects the original pointer as a return value.
  55         .align 4                        C power-of-two argument in GNU as for ARM: 16-byte boundary
  56 PROLOGUE(nettle_memxor)
  57         cmp     N, #0
  58         beq     .Lmemxor_done           C n is zero: nothing to do
  59
  60         cmp     N, #7
  61         bcs     .Lmemxor_large          C unsigned n at least 7: take the word paths
  62
  63         C Byte loop: one byte per iteration until N reaches zero.
             C Always entered with N nonzero; the word paths also branch here
             C to finish their last few bytes.
  64 .Lmemxor_bytes:
  65         ldrb    r3, [SRC], #+1          C source byte, SRC advanced
  66         ldrb    r12, [DST]
  67         eor     r3, r12
  68         strb    r3, [DST], #+1          C store the XOR, DST advanced
  69         subs    N, #1
  70         bne     .Lmemxor_bytes
  71
  72 .Lmemxor_done:
  73         bx      lr
  74
             C Alignment step: process a single byte, then fall through into
             C the alignment test at .Lmemxor_large. Repeats until the low two
             C bits of DST are zero.
  75 .Lmemxor_align_loop:
  76         ldrb    r3, [SRC], #+1
  77         ldrb    r12, [DST]
  78         eor     r3, r12
  79         strb    r3, [DST], #+1
  80         sub     N, #1                   C no flags needed: the tst below sets them
  81
  82 .Lmemxor_large:
  83         tst     DST, #3
  84         bne     .Lmemxor_align_loop
  85
  86         C DST is word aligned here. N was at least 7 and at most 3 bytes
             C were consumed by the alignment steps, so at least 4 bytes are
             C left. From this point N holds (bytes remaining - 4).
  87         sub     N, #4
  88
  89         ands    r3, SRC, #3             C r3 = byte offset of SRC within its word
  90         beq     .Lmemxor_same           C offset zero: SRC is word aligned as well
  91
  92         C Different alignment case.
  93         C     v original SRC
  94         C +-------+------+
  95         C |SRC    |SRC+4 |
  96         C +---+---+------+
  97         C     |DST    |
  98         C     +-------+
  99         C
 100         C With little-endian, we need to do
 101         C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
             C where SRC[i] are the aligned words starting at SRC rounded down.
 102
 103         push    {r4,r5,r6}
 104
 105         lsl     CNT, r3, #3             C CNT = 8 * byte offset, a shift count in bits
 106         bic     SRC, #3                 C round SRC down to a word boundary
 107         rsb     TNC, CNT, #32           C TNC = 32 - CNT
 108
 109         ldr     r4, [SRC], #+4          C first aligned word; its high TNC bits are the first source bytes
 110
             C The loop below handles two words (8 bytes) per iteration and
             C alternates r4 and r5 as the older and newer source word.
             C Bit 2 of N picks the entry point:
             C   clear: enter at .Lmemxor_odd so the first pass does one word,
             C          with r5 taking the role of the word already loaded.
             C   set:   subtract 4 and enter at the top so the first pass does
             C          two words.
             C Either way N equals the bytes remaining when the subs at the
             C bottom of the loop is reached.
 111         tst     N, #4
 112         itet    eq                      C IT block: then/else/then for the three conditional instructions below
 113         moveq   r5, r4
 114         subne   N, #4
 115         beq     .Lmemxor_odd
 116
 117 .Lmemxor_word_loop:
 118         ldr     r5, [SRC], #+4
 119         ldr     r3, [DST]
 120         eor     r3, r3, r4, lsr CNT     C older word supplies the low-order bits
 121         eor     r3, r3, r5, lsl TNC     C newer word supplies the high-order bits
 122         str     r3, [DST], #+4
 123 .Lmemxor_odd:
 124         ldr     r4, [SRC], #+4
 125         ldr     r3, [DST]
 126         eor     r3, r3, r5, lsr CNT
 127         eor     r3, r3, r4, lsl TNC
 128         str     r3, [DST], #+4
 129         subs    N, #8                   C carry set when at least 8 bytes remain
 130         bcs     .Lmemxor_word_loop
 131         adds    N, #8                   C undo the last subtraction: N = bytes remaining, 0 to 3
 132         beq     .Lmemxor_odd_done
 133
 134         C We have TNC/8 left-over bytes in r4, high end
 135         lsr     r4, CNT                 C move them down to the low end
 136         ldr     r3, [DST]               C full word read from aligned DST; reaches up to 3 bytes past the end of dst
 137         eor     r3, r4
 138
 139         pop     {r4,r5,r6}              C CNT (r6) is finished with; TNC is r12, outside the pushed set, so it stays valid
 140
 141         C Store bytes, one by one.
             C Each pass stores the low byte of r3 and shifts the next byte
             C down. TNC counts the source bits still held in r3. If it runs
             C out while N is still nonzero, the rest goes to the byte loop;
             C SRC already points at the next unread source byte.
 142 .Lmemxor_leftover:
 143         strb    r3, [DST], #+1
 144         subs    N, #1
 145         beq     .Lmemxor_done
 146         subs    TNC, #8
 147         lsr     r3, #8                  C no s suffix: the flags from subs TNC reach the bne
 148         bne     .Lmemxor_leftover
 149         b       .Lmemxor_bytes
 150 .Lmemxor_odd_done:
 151         pop     {r4,r5,r6}
 152         bx      lr
 153
             C Same alignment case: DST and SRC are both word aligned.
             C Entered with N = bytes remaining - 4.
 154 .Lmemxor_same:
 155         push    {r4,r5,r6,r7,r8,r10,r11,r14}    C r14 is lr; saved because it is reused as a load pointer below
 156
 157         subs    N, #8                   C N = bytes remaining - 12
 158         bcc     .Lmemxor_same_end       C fewer than 12 bytes: skip the 12-byte blocks
 159
 160         ldmia   SRC!, {r3, r4, r5}
 161         C Keep address for loads in r14
             C r14 reads DST ahead of the stores, which still go through DST.
 162         mov     r14, DST
 163         ldmia   r14!, {r6, r7, r8}
 164         subs    N, #12
 165         eor     r10, r3, r6
 166         eor     r11, r4, r7
 167         eor     r12, r5, r8
 168         bcc     .Lmemxor_same_final_store       C flags from the subs above; no second block available
 169         subs    N, #12
 170         ldmia   r14!, {r6, r7, r8}
 171         bcc     .Lmemxor_same_wind_down         C exactly one more block: it is already loaded into r6-r8
 172
 173         C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
 174         C loop starts at offset 0x11c in the object file.
 175
 176 .Lmemxor_same_loop:
 177         C r10-r12 contain values to be stored at DST
 178         C r6-r8 contain values read from r14, in advance
 179         ldmia   SRC!, {r3, r4, r5}
 180         subs    N, #12                  C sets the flags for the bcs at the bottom
 181         stmia   DST!, {r10, r11, r12}
 182         eor     r10, r3, r6
 183         eor     r11, r4, r7
 184         eor     r12, r5, r8
 185         ldmia   r14!, {r6, r7, r8}
 186         bcs     .Lmemxor_same_loop
 187
 188 .Lmemxor_same_wind_down:
 189         C Wind down code
             C r6-r8 already hold the last DST block that was read ahead.
             C Store the pending result, then combine that block with one
             C more SRC block.
 190         ldmia   SRC!, {r3, r4, r5}
 191         stmia   DST!, {r10, r11, r12}
 192         eor     r10, r3, r6
 193         eor     r11, r4, r7
 194         eor     r12, r5, r8
 195 .Lmemxor_same_final_store:
 196         stmia   DST!, {r10, r11, r12}
 197
 198 .Lmemxor_same_end:
 199         C We have 0-11 bytes left to do, and N holds number of bytes -12.
 200         adds    N, #4                   C N = bytes remaining - 8; carry clear when that is negative
 201         bcc     .Lmemxor_same_lt_8
 202         C Do 8 bytes more, leftover is in N
 203         ldmia   SRC!, {r3, r4}
 204         ldmia   DST, {r6, r7}
 205         eor     r3, r6
 206         eor     r4, r7
 207         stmia   DST!, {r3, r4}
 208         pop     {r4,r5,r6,r7,r8,r10,r11,r14}
 209         beq     .Lmemxor_done           C Z is still from the adds above: no leftover bytes
 210         b       .Lmemxor_bytes
 211
 212 .Lmemxor_same_lt_8:
 213         pop     {r4,r5,r6,r7,r8,r10,r11,r14}
 214         adds    N, #4                   C N = bytes remaining - 4; carry clear when that is negative
 215         bcc     .Lmemxor_same_lt_4
 216
 217         ldr     r3, [SRC], #+4          C one more whole word
 218         ldr     r12, [DST]
 219         eor     r3, r12
 220         str     r3, [DST], #+4
 221         beq     .Lmemxor_done           C Z is still from the adds above
 222         b       .Lmemxor_bytes
 223
 224 .Lmemxor_same_lt_4:
 225         adds    N, #4                   C N = bytes remaining, 0 to 3
 226         beq     .Lmemxor_done
 227         b       .Lmemxor_bytes
 228
 229 EPILOGUE(nettle_memxor)