arm/memxor3.asm

   1 C arm/memxor3.asm
   2
   3 ifelse(<
   4    Copyright (C) 2013, 2015 Niels Möller
   5
   6    This file is part of GNU Nettle.
   7
   8    GNU Nettle is free software: you can redistribute it and/or
   9    modify it under the terms of either:
  10
  11      * the GNU Lesser General Public License as published by the Free
  12        Software Foundation; either version 3 of the License, or (at your
  13        option) any later version.
  14
  15    or
  16
  17      * the GNU General Public License as published by the Free
  18        Software Foundation; either version 2 of the License, or (at your
  19        option) any later version.
  20
  21    or both in parallel, as here.
  22
  23    GNU Nettle is distributed in the hope that it will be useful,
  24    but WITHOUT ANY WARRANTY; without even the implied warranty of
  25    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26    General Public License for more details.
  27
  28    You should have received copies of the GNU General Public License and
  29    the GNU Lesser General Public License along with this program.  If
  30    not, see http://www.gnu.org/licenses/.
  31 >)
  32
  33 C Possible speedups:
  34 C
  35 C The ldm instruction can load two registers per cycle,
  36 C if the address is two-word aligned. Or three registers in two
  37 C cycles, regardless of alignment.
  38
  39 C Register usage:
     C
     C DST, AP, BP and N are the four arguments in the order given by the
     C memxor3(dst, a, b, n) signature comment further down. The function
     C advances all three pointers by N on entry and works downwards, so
     C while the code runs they point just past the bytes still to be done.
  40
  41 define(<DST>, <r0>)
  42 define(<AP>, <r1>)
  43 define(<BP>, <r2>)
  44 define(<N>, <r3>)
  45
  46 C Temporaries r4-r7
     C
     C Shift counts, in bits, used when a source pointer is not word aligned:
     C   ACNT = 8 * (pointer & 3)   ATNC = 32 - ACNT
     C   BCNT = 8 * (pointer & 3)   BTNC = 32 - BCNT
     C r8 and r10 are additionally used as plain data temporaries in the
     C fully aligned word loop. r9 is not used. r12 (BTNC) is not part of
     C the function's push/pop list.
  47 define(<ACNT>, <r8>)
  48 define(<ATNC>, <r10>)
  49 define(<BCNT>, <r11>)
  50 define(<BTNC>, <r12>)
  51
     C Unified ARM/Thumb assembler syntax.
  52         .syntax unified
  53
  54         .file "memxor3.asm"
  55
     C Code section, assembled as ARM (not Thumb) instructions.
  56         .text
  57         .arm
  58
  59         C memxor3(void *dst, const void *a, const void *b, size_t n)
             C
             C Writes a[i] ^ b[i] to dst[i] for each of the n bytes.
             C
             C In:   DST (r0) = dst, AP (r1) = a, BP (r2) = b, N (r3) = n
             C Out:  DST ends up back at the original dst: all three pointers
             C       are first advanced by n and every access pre-decrements.
             C       NOTE(review): presumably this doubles as the return
             C       value; confirm against the C prototype.
             C Regs: r4-r8, r10 and r11 are pushed and popped here. r12 (BTNC)
             C       is written in the .Lmemxor3_uud path without being saved.
             C       Nothing is pushed when n == 0.
             C
             C The buffers are processed from the highest address downwards.
             C Fewer than 7 bytes are handled by the plain byte loop. Otherwise
             C single bytes are done until DST is word aligned, and the rest
             C is done a word at a time on one of four paths, chosen by the
             C alignment of AP and BP at that point:
             C   both aligned                 .Lmemxor3_a_aligned (ldm/stm)
             C   exactly one aligned          .Lmemxor3_au
             C   both unaligned, same way     .Lmemxor3_uu
             C   both unaligned, differently  .Lmemxor3_uud
             C Any 1-3 bytes left after the word loops are finished either
             C from a register or by the byte loop.
             C
             C The unaligned paths round the source pointer(s) down and load
             C whole aligned words, so they may load bytes that lie outside a
             C source buffer, but only from aligned words that also contain
             C bytes of that buffer. The shift directions assume little-endian
             C data layout, as noted in the .Lmemxor3_au comment.
  60         .align 2
  61 PROLOGUE(nettle_memxor3)
             C n == 0: return immediately, before anything is pushed.
  62         cmp     N, #0
  63         beq     .Lmemxor3_ret
  64
  65         push    {r4,r5,r6,r7,r8,r10,r11}
             C The result of this compare is consumed by the bcs below; the
             C adds in between have no s suffix and leave the flags alone.
  66         cmp     N, #7
  67
             C Make all three pointers point just past the end of their buffer.
  68         add     AP, N
  69         add     BP, N
  70         add     DST, N
  71
             C N >= 7 (unsigned): use the word-oriented code.
  72         bcs     .Lmemxor3_large
  73
  74         C Simple byte loop
             C Handles N bytes, one per iteration. N is non-zero on every
             C entry: every branch to this label is preceded by a zero check.
  75 .Lmemxor3_bytes:
  76         ldrb    r4, [AP, #-1]!
  77         ldrb    r5, [BP, #-1]!
  78         eor     r4, r5
  79         strb    r4, [DST, #-1]!
  80         subs    N, #1
  81         bne     .Lmemxor3_bytes
  82
             C Common exit: restore the pushed registers and return.
  83 .Lmemxor3_done:
  84         pop     {r4,r5,r6,r7,r8,r10,r11}
  85 .Lmemxor3_ret:
  86         bx      lr
  87
             C Does one byte and falls through to the alignment test again,
             C until DST is a multiple of 4. N is adjusted without touching
             C the flags.
  88 .Lmemxor3_align_loop:
  89         ldrb    r4, [AP, #-1]!
  90         ldrb    r5, [BP, #-1]!
  91         eor     r5, r4
  92         strb    r5, [DST, #-1]!
  93         sub     N, #1
  94
  95 .Lmemxor3_large:
  96         tst     DST, #3
  97         bne     .Lmemxor3_align_loop
  98
  99         C We have at least 4 bytes left to do here.
             C (Entered with N >= 7, and the align loop removes at most 3.)
             C From here on N is kept biased: N = bytes left - 4.
 100         sub     N, #4
             C ACNT = misalignment of AP in bits. The lsl has no s suffix,
             C so the beq tests the result of the ands.
 101         ands    ACNT, AP, #3
 102         lsl     ACNT, #3
 103         beq     .Lmemxor3_a_aligned
 104
             C AP is unaligned. BCNT = misalignment of BP in bits; if it is
             C non-zero as well, both sources are unaligned.
 105         ands    BCNT, BP, #3
 106         lsl     BCNT, #3
 107         bne     .Lmemxor3_uu
 108
 109         C Swap
             C AP is unaligned and BP is aligned: exchange the two pointers so
             C that the shared code below sees the aligned one in AP. ACNT
             C already holds the shift count of the unaligned pointer.
 110         mov     r4, AP
 111         mov     AP, BP
 112         mov     BP, r4
 113
 114 .Lmemxor3_au:
 115         C NOTE: We have the relevant shift count in ACNT, not BCNT
 116
 117         C AP is aligned, BP is not
 118         C           v original SRC
 119         C +-------+------+
 120         C |SRC-4  |SRC   |
 121         C +---+---+------+
 122         C     |DST-4  |
 123         C     +-------+
 124         C
 125         C With little-endian, we need to do
 126         C DST[i-1] = AP[i-1] ^ (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
             C where SRC is BP rounded down to a word boundary, CNT is ACNT
             C and TNC is ATNC = 32 - ACNT.
 127         rsb     ATNC, ACNT, #32
 128         bic     BP, #3
 129
             C r4 = the aligned word containing the topmost bytes of BP's data.
 130         ldr     r4, [BP]
 131
             C The loop below does two words (8 bytes) per iteration, with r4
             C and r5 alternating as the higher/lower BP word. With N = bytes
             C left - 4: if bit 2 of N is clear, enter in the middle of the
             C loop (one word first) with the high word copied to r5;
             C otherwise take 4 more off N and start at the top.
 132         tst     N, #4
 133         itet    eq
 134         moveq   r5, r4
 135         subne   N, #4
 136         beq     .Lmemxor3_au_odd
 137
 138 .Lmemxor3_au_loop:
             C r4 = higher BP word, r5 = next lower BP word, r6 = AP word.
 139         ldr     r5, [BP, #-4]!
 140         ldr     r6, [AP, #-4]!
 141         eor     r6, r6, r4, lsl ATNC
 142         eor     r6, r6, r5, lsr ACNT
 143         str     r6, [DST, #-4]!
 144 .Lmemxor3_au_odd:
             C Same again with the roles of r4 and r5 exchanged.
 145         ldr     r4, [BP, #-4]!
 146         ldr     r6, [AP, #-4]!
 147         eor     r6, r6, r5, lsl ATNC
 148         eor     r6, r6, r4, lsr ACNT
 149         str     r6, [DST, #-4]!
             C Continue while at least 8 more bytes remain; afterwards undo
             C the bias so that N = bytes left, and finish if that is zero.
 150         subs    N, #8
 151         bcs     .Lmemxor3_au_loop
 152         adds    N, #8
 153         beq     .Lmemxor3_done
 154
 155         C Leftover bytes in r4, low end
             C The low ACNT/8 bytes of r4 have not been used yet. Move them to
             C the top of the word and xor with the next AP word, which is
             C loaded without updating AP.
 156         ldr     r5, [AP, #-4]
 157         eor     r4, r5, r4, lsl ATNC
 158
 159 .Lmemxor3_au_leftover:
 160         C Store a byte at a time
             C ror #24 brings the current top byte of r4 down to bits 0-7 for
             C the strb. ACNT counts the bits of r4 still unused; AP is
             C stepped down by one for each byte stored when more bytes
             C follow (the sub leaves the flags from subs ACNT intact). When
             C r4 is used up and bytes remain, the byte loop does the rest.
 161         ror     r4, #24
 162         strb    r4, [DST, #-1]!
 163         subs    N, #1
 164         beq     .Lmemxor3_done
 165         subs    ACNT, #8
 166         sub     AP, #1
 167         bne     .Lmemxor3_au_leftover
 168         b       .Lmemxor3_bytes
 169
 170 .Lmemxor3_a_aligned:
             C AP is aligned. Put BP's misalignment (in bits) into ACNT, which
             C is where .Lmemxor3_au expects the shift count; the lsl again
             C leaves the flags from the ands for the bne.
 171         ands    ACNT, BP, #3
 172         lsl     ACNT, #3
 173         bne     .Lmemxor3_au ;
 174
 175         C a, b and dst all have the same alignment.
             C N becomes bytes left - 12; skip the 12-byte loop if fewer than
             C 12 bytes remain.
 176         subs    N, #8
 177         bcc     .Lmemxor3_aligned_word_end
 178
 179         C This loop runs at 8 cycles per iteration. It has been
 180         C observed running at only 7 cycles, for this speed, the loop
 181         C started at offset 0x2ac in the object file.
 182
 183         C FIXME: consider software pipelining, similarly to the memxor
 184         C loop.
 185
             C Three words (12 bytes) per iteration. r8 and r10 serve as data
             C registers here. The flags set by the subs are still intact at
             C the bcs, since the eor and stmdb instructions do not set them.
 186 .Lmemxor3_aligned_word_loop:
 187         ldmdb   AP!, {r4,r5,r6}
 188         ldmdb   BP!, {r7,r8,r10}
 189         subs    N, #12
 190         eor     r4, r7
 191         eor     r5, r8
 192         eor     r6, r10
 193         stmdb   DST!, {r4, r5,r6}
 194         bcs     .Lmemxor3_aligned_word_loop
 195
 196 .Lmemxor3_aligned_word_end:
 197         C We have 0-11 bytes left to do, and N holds number of bytes -12.
             C After the adds, N = bytes left - 8; carry set means at least 8.
 198         adds    N, #4
 199         bcc     .Lmemxor3_aligned_lt_8
 200         C Do 8 bytes more, leftover is in N
             C The beq still tests the result of the adds above.
 201         ldmdb   AP!, {r4, r5}
 202         ldmdb   BP!, {r6, r7}
 203         eor     r4, r6
 204         eor     r5, r7
 205         stmdb   DST!, {r4,r5}
 206         beq     .Lmemxor3_done
 207         b       .Lmemxor3_bytes
 208
 209 .Lmemxor3_aligned_lt_8:
             C N becomes bytes left - 4; carry set means at least 4 remain,
             C so do one word. The beq tests the result of this adds.
 210         adds    N, #4
 211         bcc     .Lmemxor3_aligned_lt_4
 212
 213         ldr     r4, [AP,#-4]!
 214         ldr     r5, [BP,#-4]!
 215         eor     r4, r5
 216         str     r4, [DST,#-4]!
 217         beq     .Lmemxor3_done
 218         b       .Lmemxor3_bytes
 219
 220 .Lmemxor3_aligned_lt_4:
             C N becomes the true number of bytes left (0-3).
 221         adds    N, #4
 222         beq     .Lmemxor3_done
 223         b       .Lmemxor3_bytes
 224
 225 .Lmemxor3_uu:
 226
             C Both AP and BP are unaligned. Compare their shift counts, round
             C both pointers down to a word boundary and set ATNC = 32 - ACNT.
             C The bic and rsb instructions leave the flags of the cmp intact
             C for the bne.
 227         cmp     ACNT, BCNT
 228         bic     AP, #3
 229         bic     BP, #3
 230         rsb     ATNC, ACNT, #32
 231
 232         bne     .Lmemxor3_uud
 233
 234         C AP and BP are unaligned in the same way
             C so corresponding words can be xored first and the result
             C shifted into place afterwards, with a single pair of shift
             C counts.
 235
 236         ldr     r4, [AP]
 237         ldr     r6, [BP]
 238         eor     r4, r6
 239
             C Same odd/even entry into the two-word loop as in .Lmemxor3_au.
 240         tst     N, #4
 241         itet    eq
 242         moveq   r5, r4
 243         subne   N, #4
 244         beq     .Lmemxor3_uu_odd
 245
 246 .Lmemxor3_uu_loop:
             C r4 = xor of the higher words, r5 = xor of the next lower words.
             C r4 is shifted in place; its low bytes are no longer needed.
 247         ldr     r5, [AP, #-4]!
 248         ldr     r6, [BP, #-4]!
 249         eor     r5, r6
 250         lsl     r4, ATNC
 251         eor     r4, r4, r5, lsr ACNT
 252         str     r4, [DST, #-4]!
 253 .Lmemxor3_uu_odd:
             C Same again with the roles of r4 and r5 exchanged.
 254         ldr     r4, [AP, #-4]!
 255         ldr     r6, [BP, #-4]!
 256         eor     r4, r6
 257         lsl     r5, ATNC
 258         eor     r5, r5, r4, lsr ACNT
 259         str     r5, [DST, #-4]!
 260         subs    N, #8
 261         bcs     .Lmemxor3_uu_loop
 262         adds    N, #8
 263         beq     .Lmemxor3_done
 264
 265         C Leftover bytes in r4, low end
             C Rotate the unused low ACNT bits of r4 up to the top of the word.
 266         ror     r4, ACNT
 267 .Lmemxor3_uu_leftover:
             C Store the top byte of r4, one byte per iteration, until either
             C N or the unused part of r4 (tracked in ACNT) runs out. AP and
             C BP are word aligned here, so the byte loop can carry on from
             C them directly.
 268         ror     r4, #24
 269         strb    r4, [DST, #-1]!
 270         subs    N, #1
 271         beq     .Lmemxor3_done
 272         subs    ACNT, #8
 273         bne     .Lmemxor3_uu_leftover
 274         b       .Lmemxor3_bytes
 275
 276 .Lmemxor3_uud:
 277         C Both AP and BP unaligned, and in different ways
             C Each source needs its own pair of shift counts:
             C ACNT/ATNC for AP and BCNT/BTNC for BP.
 278         rsb     BTNC, BCNT, #32
 279
 280         ldr     r4, [AP]
 281         ldr     r6, [BP]
 282
             C Same odd/even entry as above, now with two registers per
             C source: r4/r5 alternate for AP words, r6/r7 for BP words.
 283         tst     N, #4
 284         ittet   eq
 285         moveq   r5, r4
 286         moveq   r7, r6
 287         subne   N, #4
 288         beq     .Lmemxor3_uud_odd
 289
 290 .Lmemxor3_uud_loop:
             C Output word = (AP high << ATNC) ^ (BP high << BTNC)
             C             ^ (AP low >> ACNT)  ^ (BP low >> BCNT)
             C with high words in r4/r6 and low words in r5/r7.
 291         ldr     r5, [AP, #-4]!
 292         ldr     r7, [BP, #-4]!
 293         lsl     r4, ATNC
 294         eor     r4, r4, r6, lsl BTNC
 295         eor     r4, r4, r5, lsr ACNT
 296         eor     r4, r4, r7, lsr BCNT
 297         str     r4, [DST, #-4]!
 298 .Lmemxor3_uud_odd:
             C Same again with high words in r5/r7 and low words in r4/r6.
 299         ldr     r4, [AP, #-4]!
 300         ldr     r6, [BP, #-4]!
 301         lsl     r5, ATNC
 302         eor     r5, r5, r7, lsl BTNC
 303         eor     r5, r5, r4, lsr ACNT
 304         eor     r5, r5, r6, lsr BCNT
 305         str     r5, [DST, #-4]!
 306         subs    N, #8
 307         bcs     .Lmemxor3_uud_loop
 308         adds    N, #8
 309         beq     .Lmemxor3_done
 310
 311         C FIXME: More clever left-over handling? For now, just adjust pointers.
             C AP and BP were rounded down to word boundaries; adding back the
             C misalignment in bytes (shift count / 8) makes them point just
             C past the next unprocessed byte again, as the byte loop expects.
 312         add     AP, AP, ACNT, lsr #3
 313         add     BP, BP, BCNT, lsr #3
 314         b       .Lmemxor3_bytes
 315 EPILOGUE(nettle_memxor3)