1 C -*- mode: asm; asm-comment-char: ?C; -*-
2 C nettle, low-level cryptographic library
4 C Copyright (C) 2013, Niels Möller
6 C The nettle library is free software; you can redistribute it and/or modify
7 C it under the terms of the GNU Lesser General Public License as published by
8 C the Free Software Foundation; either version 2.1 of the License, or (at your
9 C option) any later version.
11 C The nettle library is distributed in the hope that it will be useful, but
12 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 C License for more details.
16 C You should have received a copy of the GNU Lesser General Public License
17 C along with the nettle library; see the file COPYING.LIB. If not, write to
18 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 C The ldm instruction can load two registers per cycle,
24 C if the address is two-word aligned. Or three registers in two
25 C cycles, regardless of alignment.
42 C memxor(uint8_t *dst, const uint8_t *src, size_t n)
72 bne .Lmemxor_align_loop
74 C We have at least 4 bytes left to do here.
80 C Different alignment case.
88 C With little-endian, we need to do
89 C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
108 eor r3, r3, r4, lsr CNT
109 eor r3, r3, r5, lsl TNC
114 eor r3, r3, r5, lsr CNT
115 eor r3, r3, r4, lsl TNC
118 bcs .Lmemxor_word_loop
120 beq .Lmemxor_odd_done
122 C We have TNC/8 left-over bytes in r4, high end
129 C Store bytes, one by one.
136 bne .Lmemxor_leftover
143 push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
146 bcc .Lmemxor_same_end
148 ldmia SRC!, {r3, r4, r5}
149 C Keep address for loads in r14
151 ldmia r14!, {r6, r7, r8}
156 bcc .Lmemxor_same_final_store
158 ldmia r14!, {r6, r7, r8}
159 bcc .Lmemxor_same_wind_down
161 C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
162 C loop starts at offset 0x11c in the object file.
165 C r10-r12 contains values to be stored at DST
166 C r6-r8 contains values read from r14, in advance
167 ldmia SRC!, {r3, r4, r5}
169 stmia DST!, {r10, r11, r12}
173 ldmia r14!, {r6, r7, r8}
174 bcs .Lmemxor_same_loop
176 .Lmemxor_same_wind_down:
178 ldmia SRC!, {r3, r4, r5}
179 stmia DST!, {r10, r11, r12}
183 .Lmemxor_same_final_store:
184 stmia DST!, {r10, r11, r12}
187 C We have 0-11 bytes left to do, and N holds number of bytes -12.
189 bcc .Lmemxor_same_lt_8
190 C Do 8 bytes more, leftover is in N
196 pop {r4,r5,r6,r7,r8,r10,r11,r14}
201 pop {r4,r5,r6,r7,r8,r10,r11,r14}
203 bcc .Lmemxor_same_lt_4
228 define(<ATNC>, <r10>)
229 define(<BCNT>, <r11>)
230 define(<BTNC>, <r12>)
232 C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
238 push {r4,r5,r6,r7,r8,r10,r11}
257 pop {r4,r5,r6,r7,r8,r10,r11}
261 .Lmemxor3_align_loop:
270 bne .Lmemxor3_align_loop
272 C We have at least 4 bytes left to do here.
276 beq .Lmemxor3_a_aligned
288 C NOTE: We have the relevant shift count in ACNT, not BCNT
290 C AP is aligned, BP is not
298 C With little-endian, we need to do
299 C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
314 eor r6, r6, r4, lsl ATNC
315 eor r6, r6, r5, lsr ACNT
320 eor r6, r6, r5, lsl ATNC
321 eor r6, r6, r4, lsr ACNT
324 bcs .Lmemxor3_au_loop
328 C Leftover bytes in r4, low end
330 eor r4, r5, r4, lsl ATNC
332 .Lmemxor3_au_leftover:
333 C Store a byte at a time
340 bne .Lmemxor3_au_leftover
348 C a, b and dst all have the same alignment.
350 bcc .Lmemxor3_aligned_word_end
352 C This loop runs at 8 cycles per iteration. It has been
353 C observed running at only 7 cycles, for this speed, the loop
354 C started at offset 0x2ac in the object file.
356 C FIXME: consider software pipelining, similarly to the memxor
359 .Lmemxor3_aligned_word_loop:
360 ldmdb AP!, {r4,r5,r6}
361 ldmdb BP!, {r7,r8,r10}
366 stmdb DST!, {r4, r5,r6}
367 bcs .Lmemxor3_aligned_word_loop
369 .Lmemxor3_aligned_word_end:
370 C We have 0-11 bytes left to do, and N holds number of bytes -12.
372 bcc .Lmemxor3_aligned_lt_8
373 C Do 8 bytes more, leftover is in N
382 .Lmemxor3_aligned_lt_8:
384 bcc .Lmemxor3_aligned_lt_4
393 .Lmemxor3_aligned_lt_4:
407 C AP and BP are unaligned in the same way
424 eor r4, r4, r5, lsr ACNT
431 eor r5, r5, r4, lsr ACNT
434 bcs .Lmemxor3_uu_loop
438 C Leftover bytes in r4, low end
440 .Lmemxor3_uu_leftover:
446 bne .Lmemxor3_uu_leftover
450 C Both AP and BP unaligned, and in different ways
461 beq .Lmemxor3_uud_odd
467 eor r4, r4, r6, lsl BTNC
468 eor r4, r4, r5, lsr ACNT
469 eor r4, r4, r7, lsr BCNT
475 eor r5, r5, r7, lsl BTNC
476 eor r5, r5, r4, lsr ACNT
477 eor r5, r5, r6, lsr BCNT
480 bcs .Lmemxor3_uud_loop
484 C FIXME: More clever left-over handling? For now, just adjust pointers.
485 add AP, AP, ACNT, lsr #3
486 add BP, BP, BCNT, lsr #3