4 Copyright (C) 2013, 2015 Niels Möller
6 This file is part of GNU Nettle.
8 GNU Nettle is free software: you can redistribute it and/or
9 modify it under the terms of either:
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
17 * the GNU General Public License as published by the Free
18 Software Foundation; either version 2 of the License, or (at your
19 option) any later version.
21 or both in parallel, as here.
23 GNU Nettle is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 General Public License for more details.
28 You should have received copies of the GNU General Public License and
29 the GNU Lesser General Public License along with this program. If
30 not, see http://www.gnu.org/licenses/.
35 C The ldm instruction can load two registers per cycle,
36 C if the address is two-word aligned. Or three registers in two
37 C cycles, regardless of alignment.
59 C memxor3(void *dst, const void *a, const void *b, size_t n)
61 PROLOGUE(nettle_memxor3)
65 push {r4,r5,r6,r7,r8,r10,r11}
84 pop {r4,r5,r6,r7,r8,r10,r11}
97 bne .Lmemxor3_align_loop
99 C We have at least 4 bytes left to do here.
103 beq .Lmemxor3_a_aligned
115 C NOTE: We have the relevant shift count in ACNT, not BCNT
117 C AP is aligned, BP is not
125 C With little-endian, we need to do
126 C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
141 eor r6, r6, r4, lsl ATNC
142 eor r6, r6, r5, lsr ACNT
147 eor r6, r6, r5, lsl ATNC
148 eor r6, r6, r4, lsr ACNT
151 bcs .Lmemxor3_au_loop
155 C Leftover bytes in r4, low end
157 eor r4, r5, r4, lsl ATNC
159 .Lmemxor3_au_leftover:
160 C Store a byte at a time
167 bne .Lmemxor3_au_leftover
175 C a, b and dst all have the same alignment.
177 bcc .Lmemxor3_aligned_word_end
179 C This loop runs at 8 cycles per iteration. It has been
180 C observed running at only 7 cycles; in that case, the loop
181 C started at offset 0x2ac in the object file.
183 C FIXME: consider software pipelining, similarly to the memxor
186 .Lmemxor3_aligned_word_loop:
187 ldmdb AP!, {r4,r5,r6}
188 ldmdb BP!, {r7,r8,r10}
193 stmdb DST!, {r4, r5,r6}
194 bcs .Lmemxor3_aligned_word_loop
196 .Lmemxor3_aligned_word_end:
197 C We have 0-11 bytes left to do, and N holds number of bytes -12.
199 bcc .Lmemxor3_aligned_lt_8
200 C Do 8 bytes more, leftover is in N
209 .Lmemxor3_aligned_lt_8:
211 bcc .Lmemxor3_aligned_lt_4
220 .Lmemxor3_aligned_lt_4:
234 C AP and BP are unaligned in the same way
251 eor r4, r4, r5, lsr ACNT
258 eor r5, r5, r4, lsr ACNT
261 bcs .Lmemxor3_uu_loop
265 C Leftover bytes in r4, low end
267 .Lmemxor3_uu_leftover:
273 bne .Lmemxor3_uu_leftover
277 C Both AP and BP unaligned, and in different ways
288 beq .Lmemxor3_uud_odd
294 eor r4, r4, r6, lsl BTNC
295 eor r4, r4, r5, lsr ACNT
296 eor r4, r4, r7, lsr BCNT
302 eor r5, r5, r7, lsl BTNC
303 eor r5, r5, r4, lsr ACNT
304 eor r5, r5, r6, lsr BCNT
307 bcs .Lmemxor3_uud_loop
311 C FIXME: More clever left-over handling? For now, just adjust pointers.
312 add AP, AP, ACNT, lsr #3
313 add BP, BP, BCNT, lsr #3
315 EPILOGUE(nettle_memxor3)