C -*- mode: asm; asm-comment-char: ?C; -*-
C nettle, low-level cryptographic library
C
C Copyright (C) 2013, Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02110-1301, USA.

C Possible speedups:
C
C The ldm instruction can load two registers per cycle,
C if the address is two-word aligned. Or three registers in two
C cycles, regardless of alignment.

C Register usage:

define(<DST>, <r0>)
define(<SRC>, <r1>)
define(<N>, <r2>)
define(<CNT>, <r6>)
define(<TNC>, <r12>)

	.syntax unified

	.file "memxor.asm"

	.text
	.arm

	C memxor(uint8_t *dst, const uint8_t *src, size_t n)
	.align 4
PROLOGUE(memxor)
	cmp	N, #0
	beq	.Lmemxor_done

	cmp	N, #7
	bcs	.Lmemxor_large

	C Simple byte loop
.Lmemxor_bytes:
	ldrb	r3, [SRC], #+1
	ldrb	r12, [DST]
	eor	r3, r12
	strb	r3, [DST], #+1
	subs	N, #1
	bne	.Lmemxor_bytes

.Lmemxor_done:
	bx	lr

.Lmemxor_align_loop:
	ldrb	r3, [SRC], #+1
	ldrb	r12, [DST]
	eor	r3, r12
	strb	r3, [DST], #+1
	sub	N, #1

.Lmemxor_large:
	tst	DST, #3
	bne	.Lmemxor_align_loop

	C We have at least 4 bytes left to do here.
	sub	N, #4

	ands	r3, SRC, #3
	beq	.Lmemxor_same

	C Different alignment case.
	C  v original SRC
	C +-------+------+
	C |SRC    |SRC+4 |
	C +---+---+------+
	C     |DST    |
	C     +-------+
	C
	C With little-endian, we need to do
	C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)

	push	{r4,r5,r6}

	lsl	CNT, r3, #3
	bic	SRC, #3
	rsb	TNC, CNT, #32

	ldr	r4, [SRC], #+4
	tst	N, #4
	itet	eq
	moveq	r5, r4
	subne	N, #4
	beq	.Lmemxor_odd

.Lmemxor_word_loop:
	ldr	r5, [SRC], #+4
	ldr	r3, [DST]
	eor	r3, r3, r4, lsr CNT
	eor	r3, r3, r5, lsl TNC
	str	r3, [DST], #+4
.Lmemxor_odd:
	ldr	r4, [SRC], #+4
	ldr	r3, [DST]
	eor	r3, r3, r5, lsr CNT
	eor	r3, r3, r4, lsl TNC
	str	r3, [DST], #+4
	subs	N, #8
	bcs	.Lmemxor_word_loop
	adds	N, #8
	beq	.Lmemxor_odd_done

	C We have TNC/8 left-over bytes in r4, high end
	lsr	r4, CNT
	ldr	r3, [DST]
	eor	r3, r4

	pop	{r4,r5,r6}

	C Store bytes, one by one.
.Lmemxor_leftover:
	strb	r3, [DST], #+1
	subs	N, #1
	beq	.Lmemxor_done
	subs	TNC, #8
	lsr	r3, #8
	bne	.Lmemxor_leftover
	b	.Lmemxor_bytes

.Lmemxor_odd_done:
	pop	{r4,r5,r6}
	bx	lr

.Lmemxor_same:
	push	{r4,r5,r6,r7,r8,r10,r11,r14}	C lr is the link register

	subs	N, #8
	bcc	.Lmemxor_same_end

	ldmia	SRC!, {r3, r4, r5}
	C Keep address for loads in r14
	mov	r14, DST
	ldmia	r14!, {r6, r7, r8}
	subs	N, #12
	eor	r10, r3, r6
	eor	r11, r4, r7
	eor	r12, r5, r8
	bcc	.Lmemxor_same_final_store
	subs	N, #12
	ldmia	r14!, {r6, r7, r8}
	bcc	.Lmemxor_same_wind_down

	C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
	C the loop starts at offset 0x11c in the object file.
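	C For illustration only (not part of the build): the software
	C pipelining of the loop below corresponds roughly to this C
	C sketch, where the names s0-s2, w0-w2, d0-d2, dst_store and
	C dst_read are hypothetical. dst_read (r14 in the code) runs
	C one iteration ahead of dst_store (DST):
	C
	C   uint32_t w0, w1, w2;   /* dst words read in advance (r6-r8)  */
	C   uint32_t d0, d1, d2;   /* results waiting to be stored (r10-r12) */
	C   while (n >= 12) {      /* modulo the bias kept in N */
	C     uint32_t s0 = src[0], s1 = src[1], s2 = src[2];
	C     src += 3;
	C     dst_store[0] = d0; dst_store[1] = d1; dst_store[2] = d2;
	C     dst_store += 3;
	C     d0 = s0 ^ w0; d1 = s1 ^ w1; d2 = s2 ^ w2;
	C     w0 = dst_read[0]; w1 = dst_read[1]; w2 = dst_read[2];
	C     dst_read += 3;
	C     n -= 12;
	C   }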
.Lmemxor_same_loop:
	C r10-r12 contain values to be stored at DST
	C r6-r8 contain values read from r14, in advance
	ldmia	SRC!, {r3, r4, r5}
	subs	N, #12
	stmia	DST!, {r10, r11, r12}
	eor	r10, r3, r6
	eor	r11, r4, r7
	eor	r12, r5, r8
	ldmia	r14!, {r6, r7, r8}
	bcs	.Lmemxor_same_loop

.Lmemxor_same_wind_down:
	C Wind down code
	ldmia	SRC!, {r3, r4, r5}
	stmia	DST!, {r10, r11, r12}
	eor	r10, r3, r6
	eor	r11, r4, r7
	eor	r12, r5, r8
.Lmemxor_same_final_store:
	stmia	DST!, {r10, r11, r12}

.Lmemxor_same_end:
	C We have 0-11 bytes left to do, and N holds number of bytes -12.
	adds	N, #4
	bcc	.Lmemxor_same_lt_8
	C Do 8 bytes more, leftover is in N
	ldmia	SRC!, {r3, r4}
	ldmia	DST, {r6, r7}
	eor	r3, r6
	eor	r4, r7
	stmia	DST!, {r3, r4}
	pop	{r4,r5,r6,r7,r8,r10,r11,r14}
	beq	.Lmemxor_done
	b	.Lmemxor_bytes

.Lmemxor_same_lt_8:
	pop	{r4,r5,r6,r7,r8,r10,r11,r14}
	adds	N, #4
	bcc	.Lmemxor_same_lt_4

	ldr	r3, [SRC], #+4
	ldr	r12, [DST]
	eor	r3, r12
	str	r3, [DST], #+4
	beq	.Lmemxor_done
	b	.Lmemxor_bytes

.Lmemxor_same_lt_4:
	adds	N, #4
	beq	.Lmemxor_done
	b	.Lmemxor_bytes

EPILOGUE(memxor)

define(<DST>, <r0>)
define(<AP>, <r1>)
define(<BP>, <r2>)
define(<N>, <r3>)
undefine(<CNT>)
undefine(<TNC>)

C Temporaries r4-r7
define(<ACNT>, <r8>)
define(<ATNC>, <r10>)
define(<BCNT>, <r11>)
define(<BTNC>, <r12>)

	C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
	.align 2
PROLOGUE(memxor3)
	cmp	N, #0
	beq	.Lmemxor3_ret

	push	{r4,r5,r6,r7,r8,r10,r11}
	cmp	N, #7

	add	AP, N
	add	BP, N
	add	DST, N

	bcs	.Lmemxor3_large

	C Simple byte loop
.Lmemxor3_bytes:
	ldrb	r4, [AP, #-1]!
	ldrb	r5, [BP, #-1]!
	eor	r4, r5
	strb	r4, [DST, #-1]!
	subs	N, #1
	bne	.Lmemxor3_bytes

.Lmemxor3_done:
	pop	{r4,r5,r6,r7,r8,r10,r11}
.Lmemxor3_ret:
	bx	lr

.Lmemxor3_align_loop:
	ldrb	r4, [AP, #-1]!
	ldrb	r5, [BP, #-1]!
	eor	r5, r4
	strb	r5, [DST, #-1]!
	sub	N, #1

.Lmemxor3_large:
	tst	DST, #3
	bne	.Lmemxor3_align_loop

	C We have at least 4 bytes left to do here.
	sub	N, #4
	ands	ACNT, AP, #3
	lsl	ACNT, #3
	beq	.Lmemxor3_a_aligned

	ands	BCNT, BP, #3
	lsl	BCNT, #3
	bne	.Lmemxor3_uu

	C Swap
	mov	r4, AP
	mov	AP, BP
	mov	BP, r4

.Lmemxor3_au:
	C NOTE: We have the relevant shift count in ACNT, not BCNT
	C
	C AP is aligned, BP is not
	C          v original SRC
	C +-------+------+
	C |SRC-4  |SRC   |
	C +---+---+------+
	C     |DST-4  |
	C     +-------+
	C
	C With little-endian, we need to do
	C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
	rsb	ATNC, ACNT, #32
	bic	BP, #3

	ldr	r4, [BP]

	tst	N, #4
	itet	eq
	moveq	r5, r4
	subne	N, #4
	beq	.Lmemxor3_au_odd

.Lmemxor3_au_loop:
	ldr	r5, [BP, #-4]!
	ldr	r6, [AP, #-4]!
	eor	r6, r6, r4, lsl ATNC
	eor	r6, r6, r5, lsr ACNT
	str	r6, [DST, #-4]!
.Lmemxor3_au_odd:
	ldr	r4, [BP, #-4]!
	ldr	r6, [AP, #-4]!
	eor	r6, r6, r5, lsl ATNC
	eor	r6, r6, r4, lsr ACNT
	str	r6, [DST, #-4]!
	subs	N, #8
	bcs	.Lmemxor3_au_loop
	adds	N, #8
	beq	.Lmemxor3_done

	C Leftover bytes in r4, low end
	ldr	r5, [AP, #-4]
	eor	r4, r5, r4, lsl ATNC

.Lmemxor3_au_leftover:
	C Store a byte at a time
	ror	r4, #24
	strb	r4, [DST, #-1]!
	subs	N, #1
	beq	.Lmemxor3_done
	subs	ACNT, #8
	sub	AP, #1
	bne	.Lmemxor3_au_leftover
	b	.Lmemxor3_bytes

.Lmemxor3_a_aligned:
	ands	ACNT, BP, #3
	lsl	ACNT, #3
	bne	.Lmemxor3_au

	C a, b and dst all have the same alignment.
	subs	N, #8
	bcc	.Lmemxor3_aligned_word_end

	C This loop runs at 8 cycles per iteration. It has been
	C observed running at only 7 cycles; for that speed, the loop
	C started at offset 0x2ac in the object file.
	C FIXME: consider software pipelining, similarly to the memxor
	C loop.
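	C For illustration only (not part of the build): ignoring the
	C bias kept in N, the aligned word loop below amounts to this C
	C sketch, where ap, bp and dst are hypothetical uint32_t
	C pointers just past the remaining data:
	C
	C   while (n >= 12) {
	C     ap -= 3; bp -= 3; dst -= 3;
	C     dst[0] = ap[0] ^ bp[0];
	C     dst[1] = ap[1] ^ bp[1];
	C     dst[2] = ap[2] ^ bp[2];
	C     n -= 12;
	C   }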
.Lmemxor3_aligned_word_loop:
	ldmdb	AP!, {r4,r5,r6}
	ldmdb	BP!, {r7,r8,r10}
	subs	N, #12
	eor	r4, r7
	eor	r5, r8
	eor	r6, r10
	stmdb	DST!, {r4,r5,r6}
	bcs	.Lmemxor3_aligned_word_loop

.Lmemxor3_aligned_word_end:
	C We have 0-11 bytes left to do, and N holds number of bytes -12.
	adds	N, #4
	bcc	.Lmemxor3_aligned_lt_8
	C Do 8 bytes more, leftover is in N
	ldmdb	AP!, {r4, r5}
	ldmdb	BP!, {r6, r7}
	eor	r4, r6
	eor	r5, r7
	stmdb	DST!, {r4,r5}
	beq	.Lmemxor3_done
	b	.Lmemxor3_bytes

.Lmemxor3_aligned_lt_8:
	adds	N, #4
	bcc	.Lmemxor3_aligned_lt_4

	ldr	r4, [AP,#-4]!
	ldr	r5, [BP,#-4]!
	eor	r4, r5
	str	r4, [DST,#-4]!
	beq	.Lmemxor3_done
	b	.Lmemxor3_bytes

.Lmemxor3_aligned_lt_4:
	adds	N, #4
	beq	.Lmemxor3_done
	b	.Lmemxor3_bytes

.Lmemxor3_uu:
	cmp	ACNT, BCNT
	bic	AP, #3
	bic	BP, #3
	rsb	ATNC, ACNT, #32

	bne	.Lmemxor3_uud

	C AP and BP are unaligned in the same way

	ldr	r4, [AP]
	ldr	r6, [BP]
	eor	r4, r6

	tst	N, #4
	itet	eq
	moveq	r5, r4
	subne	N, #4
	beq	.Lmemxor3_uu_odd

.Lmemxor3_uu_loop:
	ldr	r5, [AP, #-4]!
	ldr	r6, [BP, #-4]!
	eor	r5, r6
	lsl	r4, ATNC
	eor	r4, r4, r5, lsr ACNT
	str	r4, [DST, #-4]!
.Lmemxor3_uu_odd:
	ldr	r4, [AP, #-4]!
	ldr	r6, [BP, #-4]!
	eor	r4, r6
	lsl	r5, ATNC
	eor	r5, r5, r4, lsr ACNT
	str	r5, [DST, #-4]!
	subs	N, #8
	bcs	.Lmemxor3_uu_loop
	adds	N, #8
	beq	.Lmemxor3_done

	C Leftover bytes in r4, low end
	ror	r4, ACNT
.Lmemxor3_uu_leftover:
	ror	r4, #24
	strb	r4, [DST, #-1]!
	subs	N, #1
	beq	.Lmemxor3_done
	subs	ACNT, #8
	bne	.Lmemxor3_uu_leftover
	b	.Lmemxor3_bytes

.Lmemxor3_uud:
	C Both AP and BP unaligned, and in different ways
	rsb	BTNC, BCNT, #32

	ldr	r4, [AP]
	ldr	r6, [BP]

	tst	N, #4
	ittet	eq
	moveq	r5, r4
	moveq	r7, r6
	subne	N, #4
	beq	.Lmemxor3_uud_odd

.Lmemxor3_uud_loop:
	ldr	r5, [AP, #-4]!
	ldr	r7, [BP, #-4]!
	lsl	r4, ATNC
	eor	r4, r4, r6, lsl BTNC
	eor	r4, r4, r5, lsr ACNT
	eor	r4, r4, r7, lsr BCNT
	str	r4, [DST, #-4]!
.Lmemxor3_uud_odd:
	ldr	r4, [AP, #-4]!
	ldr	r6, [BP, #-4]!
	lsl	r5, ATNC
	eor	r5, r5, r7, lsl BTNC
	eor	r5, r5, r4, lsr ACNT
	eor	r5, r5, r6, lsr BCNT
	str	r5, [DST, #-4]!
	subs	N, #8
	bcs	.Lmemxor3_uud_loop
	adds	N, #8
	beq	.Lmemxor3_done

	C FIXME: More clever left-over handling? For now, just adjust
	C the pointers.
	add	AP, AP, ACNT, lsr #3
	add	BP, BP, BCNT, lsr #3
	b	.Lmemxor3_bytes

EPILOGUE(memxor3)
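
	C For reference (not part of the build): the semantics of the
	C two entry points above correspond to this portable C sketch,
	C without any of the alignment and pipelining work done here.
	C The names memxor_ref and memxor3_ref are hypothetical:
	C
	C   #include <stddef.h>
	C   #include <stdint.h>
	C
	C   void memxor_ref(uint8_t *dst, const uint8_t *src, size_t n)
	C   {
	C     for (size_t i = 0; i < n; i++)
	C       dst[i] ^= src[i];
	C   }
	C
	C   /* memxor3 iterates from the high end, which is why the code
	C      above first advances DST, AP and BP past the areas. */
	C   void memxor3_ref(uint8_t *dst, const uint8_t *a,
	C                    const uint8_t *b, size_t n)
	C   {
	C     while (n-- > 0)
	C       dst[n] = a[n] ^ b[n];
	C   }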