1 C -*- mode: asm; asm-comment-char: ?C; -*-
2 C nettle, low-level cryptographic library
4 C Copyright (C) 2013, Niels Möller
6 C The nettle library is free software; you can redistribute it and/or modify
7 C it under the terms of the GNU Lesser General Public License as published by
8 C the Free Software Foundation; either version 2.1 of the License, or (at your
9 C option) any later version.
11 C The nettle library is distributed in the hope that it will be useful, but
12 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 C License for more details.
16 C You should have received a copy of the GNU Lesser General Public License
17 C along with the nettle library; see the file COPYING.LIB. If not, write to
18 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 C The ldm instruction can load two registers per cycle,
24 C if the address is two-word aligned. Or three registers in two
25 C cycles, regardless of alignment.
42 C memxor(uint8_t *dst, const uint8_t *src, size_t n)
72 bne .Lmemxor_align_loop
74 C We have at least 4 bytes left to do here.
80 C Different alignment case.
88 C With little-endian, we need to do
89 C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
108 eor r3, r3, r4, lsr CNT
109 eor r3, r3, r5, lsl TNC
114 eor r3, r3, r5, lsr CNT
115 eor r3, r3, r4, lsl TNC
118 bcs .Lmemxor_word_loop
120 beq .Lmemxor_odd_done
122 C We have TNC/8 left-over bytes in r4, high end
129 C Store bytes, one by one.
136 bne .Lmemxor_leftover
143 push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
146 bcc .Lmemxor_same_end
148 ldmia SRC!, {r3, r4, r5}
149 C Keep address for loads in r14
151 ldmia r14!, {r6, r7, r8}
156 bcc .Lmemxor_same_final_store
158 ldmia r14!, {r6, r7, r8}
159 bcc .Lmemxor_same_wind_down
161 C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
162 C loop starts at offset 0x11c in the object file.
165 C r10-r12 contains values to be stored at DST
166 C r6-r8 contains values read from r14, in advance
167 ldmia SRC!, {r3, r4, r5}
169 stmia DST!, {r10, r11, r12}
173 ldmia r14!, {r6, r7, r8}
174 bcs .Lmemxor_same_loop
176 .Lmemxor_same_wind_down:
178 ldmia SRC!, {r3, r4, r5}
179 stmia DST!, {r10, r11, r12}
183 .Lmemxor_same_final_store:
184 stmia DST!, {r10, r11, r12}
187 C We have 0-11 bytes left to do, and N holds number of bytes -12.
189 bcc .Lmemxor_same_lt_8
190 C Do 8 bytes more, leftover is in N
196 pop {r4,r5,r6,r7,r8,r10,r11,r14}
201 pop {r4,r5,r6,r7,r8,r10,r11,r14}
203 bcc .Lmemxor_same_lt_4
228 define(<ATNC>, <r10>)
229 define(<BCNT>, <r11>)
230 define(<BTNC>, <r12>)
232 C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
238 push {r4,r5,r6,r7,r8,r10,r11}
257 pop {r4,r5,r6,r7,r8,r10,r11}
261 .Lmemxor3_align_loop:
270 bne .Lmemxor3_align_loop
272 C We have at least 4 bytes left to do here.
276 beq .Lmemxor3_a_aligned
288 C NOTE: We have the relevant shift count in ACNT, not BCNT
290 C AP is aligned, BP is not
298 C With little-endian, we need to do
299 C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
314 eor r6, r6, r4, lsl ATNC
315 eor r6, r6, r5, lsr ACNT
320 eor r6, r6, r5, lsl ATNC
321 eor r6, r6, r4, lsr ACNT
324 bcs .Lmemxor3_au_loop
328 C Leftover bytes in r4, low end
330 eor r4, r5, r4, lsl ATNC
332 .Lmemxor3_au_leftover:
333 C Store a byte at a time
340 bne .Lmemxor3_au_leftover
348 C a, b and dst all have the same alignment.
350 bcc .Lmemxor3_aligned_word_end
352 C This loop runs at 8 cycles per iteration. It has been
353 C observed running at only 7 cycles, for this speed, the loop
354 C started at offset 0x2ac in the object file.
356 C FIXME: consider software pipelining, similarly to the memxor
359 .Lmemxor3_aligned_word_loop:
360 ldmdb AP!, {r4,r5,r6}
361 ldmdb BP!, {r7,r8,r10}
366 stmdb DST!, {r4, r5,r6}
367 bcs .Lmemxor3_aligned_word_loop
369 .Lmemxor3_aligned_word_end:
370 C We have 0-11 bytes left to do, and N holds number of bytes -12.
372 bcc .Lmemxor3_aligned_lt_8
373 C Do 8 bytes more, leftover is in N
382 .Lmemxor3_aligned_lt_8:
384 bcc .Lmemxor3_aligned_lt_4
393 .Lmemxor3_aligned_lt_4:
407 C AP and BP are unaligned in the same way
424 eor r4, r4, r5, lsr ACNT
431 eor r5, r5, r4, lsr ACNT
434 bcs .Lmemxor3_uu_loop
438 C Leftover bytes in r4, low end
440 .Lmemxor3_uu_leftover:
446 bne .Lmemxor3_uu_leftover
450 C Both AP and BP unaligned, and in different ways
461 beq .Lmemxor3_uud_odd
467 eor r4, r4, r6, lsl BTNC
468 eor r4, r4, r5, lsr ACNT
469 eor r4, r4, r7, lsr BCNT
475 eor r5, r5, r7, lsl BTNC
476 eor r5, r5, r4, lsr ACNT
477 eor r5, r5, r6, lsr BCNT
480 bcs .Lmemxor3_uud_loop
484 C FIXME: More clever left-over handling? For now, just adjust pointers.
485 add AP, AP, ACNT, lsr #3
486 add BP, BP, BCNT, lsr #3