/* cast5-arm.S  -  ARM assembly implementation of CAST5 cipher
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>
#if defined(__ARMEL__)
#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS

.text

.syntax unified
.arm

.extern _gcry_cast5_s1to4;

#ifdef __PIC__
#  define GET_DATA_POINTER(reg, name, rtmp) \
		ldr reg, 1f; \
		ldr rtmp, 2f; \
		b 3f; \
	1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
	2:	.word name(GOT); \
	3:	add reg, pc, reg; \
		ldr reg, [reg, rtmp];
#else
#  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
#endif
/* structure of crypto context */
#define Km 0
#define Kr (Km + (16 * 4))
#define Kr_arm_enc (Kr + (16))
#define Kr_arm_dec (Kr_arm_enc + (16))

/* register macros */
#define CTX %r0
#define Rs1 %r7
#define Rs2 %r8
#define Rs3 %r9
#define Rs4 %r10
#define RMASK %r11
#define RKM %r1
#define RKR %r2

#define RL0 %r3
#define RR0 %r4

#define RL1 %r9
#define RR1 %r10

#define RT0 %lr
#define RT1 %ip
#define RT2 %r5
#define RT3 %r6
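
/* Helper macros.  The byte-wise load/store variants below are needed on
 * cores where unaligned word access is unavailable or may fault (no
 * __ARM_FEATURE_UNALIGNED); the dispatch macros further down test the
 * pointer's low bits and pick the aligned or byte-wise path at run time. */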
#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
	ldrb rout, [rsrc, #((offs) + 0)]; \
	ldrb rtmp, [rsrc, #((offs) + 1)]; \
	orr rout, rout, rtmp, lsl #8; \
	ldrb rtmp, [rsrc, #((offs) + 2)]; \
	orr rout, rout, rtmp, lsl #16; \
	ldrb rtmp, [rsrc, #((offs) + 3)]; \
	orr rout, rout, rtmp, lsl #24;

#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
	mov rtmp0, rin, lsr #8; \
	strb rin, [rdst, #((offs) + 0)]; \
	mov rtmp1, rin, lsr #16; \
	strb rtmp0, [rdst, #((offs) + 1)]; \
	mov rtmp0, rin, lsr #24; \
	strb rtmp1, [rdst, #((offs) + 2)]; \
	strb rtmp0, [rdst, #((offs) + 3)];

#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
	ldrb rout, [rsrc, #((offs) + 3)]; \
	ldrb rtmp, [rsrc, #((offs) + 2)]; \
	orr rout, rout, rtmp, lsl #8; \
	ldrb rtmp, [rsrc, #((offs) + 1)]; \
	orr rout, rout, rtmp, lsl #16; \
	ldrb rtmp, [rsrc, #((offs) + 0)]; \
	orr rout, rout, rtmp, lsl #24;

#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
	mov rtmp0, rin, lsr #8; \
	strb rin, [rdst, #((offs) + 3)]; \
	mov rtmp1, rin, lsr #16; \
	strb rtmp0, [rdst, #((offs) + 2)]; \
	mov rtmp0, rin, lsr #24; \
	strb rtmp1, [rdst, #((offs) + 1)]; \
	strb rtmp0, [rdst, #((offs) + 0)];
#ifdef __ARMEL__
	#define ldr_unaligned_host ldr_unaligned_le
	#define str_unaligned_host str_unaligned_le

	/* bswap on little-endian */
#ifdef HAVE_ARM_ARCH_V6
	#define host_to_be(reg, rtmp) \
		rev reg, reg;
	#define be_to_host(reg, rtmp) \
		rev reg, reg;
#else
	#define host_to_be(reg, rtmp) \
		eor rtmp, reg, reg, ror #16; \
		mov rtmp, rtmp, lsr #8; \
		bic rtmp, rtmp, #65280; \
		eor reg, rtmp, reg, ror #8;
	#define be_to_host(reg, rtmp) \
		eor rtmp, reg, reg, ror #16; \
		mov rtmp, rtmp, lsr #8; \
		bic rtmp, rtmp, #65280; \
		eor reg, rtmp, reg, ror #8;
#endif
#else
	#define ldr_unaligned_host ldr_unaligned_be
	#define str_unaligned_host str_unaligned_be

	/* nop on big-endian */
	#define host_to_be(reg, rtmp) /*_*/
	#define be_to_host(reg, rtmp) /*_*/
#endif

#define host_to_host(x, y) /*_*/
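
/* The four-instruction sequence in the pre-ARMv6 fallback above is the
 * classic ARM byte swap: eor with ror #16 XORs each byte with the byte two
 * lanes away, lsr #8 plus bic #65280 keeps only the correction terms that
 * may feed back, and the final eor against reg ror #8 assembles the bytes
 * in reversed order.  On ARMv6+ a single rev instruction does the same. */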
/**********************************************************************
  1-way cast5
 **********************************************************************/

#define dummy(n) /*_*/
#define load_kr(n) \
	ldr RKR, [CTX, #(Kr_arm_enc + (n))]; /* Kr[n] */

#define load_dec_kr(n) \
	ldr RKR, [CTX, #(Kr_arm_dec + (n) - 3)]; /* Kr[n] */

#define load_km(n) \
	ldr RKM, [CTX, #(Km + (n) * 4)]; /* Km[n] */

#define shift_kr(dummy) \
	mov RKR, RKR, lsr #8;
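
/* Km holds 16 32-bit masking keys; Kr_arm_enc/Kr_arm_dec hold 16 per-round
 * rotate values packed one per byte, four rounds to a 32-bit word (hence 16
 * bytes each in the context layout above).  A round rotates by the low byte
 * of RKR, shift_kr exposes the next byte, and load_kr refills RKR every
 * fourth round.  Decryption walks the schedule downwards, so load_dec_kr
 * fetches the aligned word whose bytes cover rounds n-3..n ('- 3'). */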
#define F(n, rl, rr, op1, op2, op3, op4, dec, loadkm, shiftkr, loadkr) \
	op1 RKM, rr; \
	mov RKM, RKM, ror RKR; \
	\
	and RT0, RMASK, RKM, ror #(24); \
	and RT1, RMASK, RKM, lsr #(16); \
	and RT2, RMASK, RKM, lsr #(8); \
	ldr RT0, [Rs1, RT0]; \
	and RT3, RMASK, RKM; \
	ldr RT1, [Rs2, RT1]; \
	shiftkr(RKR); \
	\
	ldr RT2, [Rs3, RT2]; \
	\
	op2 RT0, RT1; \
	ldr RT3, [Rs4, RT3]; \
	op3 RT0, RT2; \
	loadkm((n) + (1 - ((dec) * 2))); \
	op4 RT0, RT3; \
	loadkr((n) + (1 - ((dec) * 2))); \
	eor rl, RT0;
#define F1(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
	F(n, rl, rr, add, eor, sub, add, dec, loadkm, shiftkr, loadkr)
#define F2(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
	F(n, rl, rr, eor, sub, add, eor, dec, loadkm, shiftkr, loadkr)
#define F3(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
	F(n, rl, rr, sub, add, eor, sub, dec, loadkm, shiftkr, loadkr)
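
/* These map to the three CAST5 round function types of RFC 2144,
 * section 2.2, with I = ((Km op1 D) <<< Kr) and
 * f = ((S1[Ia] op2 S2[Ib]) op3 S3[Ic]) op4 S4[Id]:
 *   F1 (type 1): op1..op4 = +, ^, -, +
 *   F2 (type 2): op1..op4 = ^, -, +, ^
 *   F3 (type 3): op1..op4 = -, +, ^, -
 */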
#define enc_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
	Fx(n, rl, rr, 0, loadkm, shiftkr, loadkr)

#define dec_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
	Fx(n, rl, rr, 1, loadkm, shiftkr, loadkr)
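
/* Each round prefetches the next round's subkeys: the offset
 * (1 - dec * 2) inside F evaluates to +1 for encryption and -1 for
 * decryption, since decryption consumes the key schedule in reverse. */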
#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
	ldr l0, [rin, #((offs) + 0)]; \
	ldr r0, [rin, #((offs) + 4)]; \
	convert(l0, rtmp); \
	convert(r0, rtmp);

#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
	convert(l0, rtmp); \
	convert(r0, rtmp); \
	str l0, [rout, #((offs) + 0)]; \
	str r0, [rout, #((offs) + 4)];
#ifdef __ARM_FEATURE_UNALIGNED
	/* unaligned word reads allowed */
	#define read_block(rin, offs, l0, r0, rtmp0) \
		read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)

	#define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
		write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)

	#define read_block_host(rin, offs, l0, r0, rtmp0) \
		read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)

	#define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
		write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
#else
	/* need to handle unaligned reads by byte reads */
	#define read_block(rin, offs, l0, r0, rtmp0) \
		tst rin, #3; \
		beq 1f; \
			ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
			ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
			b 2f; \
		1:;\
			read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
		2:;

	#define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
		tst rout, #3; \
		beq 1f; \
			str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
			str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
			b 2f; \
		1:;\
			write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
		2:;

	#define read_block_host(rin, offs, l0, r0, rtmp0) \
		tst rin, #3; \
		beq 1f; \
			ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
			ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
			b 2f; \
		1:;\
			read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
		2:;

	#define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
		tst rout, #3; \
		beq 1f; \
			str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
			str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
			b 2f; \
		1:;\
			write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
		2:;
#endif
.align 3
.globl _gcry_cast5_arm_encrypt_block
.type _gcry_cast5_arm_encrypt_block,%function;

_gcry_cast5_arm_encrypt_block:
	/* input:
	 *	%r0: CTX
	 *	%r1: dst
	 *	%r2: src
	 */
	push {%r1, %r4-%r11, %ip, %lr};
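
	/* _gcry_cast5_s1to4 packs S1..S4 as four consecutive 256-entry
	 * tables of 32-bit words.  RMASK = 0xff << 2 masks out one input
	 * byte already scaled by 4 for word indexing; the key setup in
	 * cast5.c is expected to pre-adjust the stored rotate amounts so
	 * that the single 'ror RKR' in F both applies CAST5's left rotate
	 * and leaves the word pre-shifted by 2 for this masking. */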
	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
	mov RMASK, #(0xff << 2);
	add Rs2, Rs1, #(0x100 * 4);
	add Rs3, Rs1, #(0x100 * 4 * 2);
	add Rs4, Rs1, #(0x100 * 4 * 3);
	read_block(%r2, 0, RL0, RR0, RT0);

	load_km(0);
	load_kr(0);
	enc_round(0, F1, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(1, F2, RR0, RL0, load_km, shift_kr, dummy);
	enc_round(2, F3, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(3, F1, RR0, RL0, load_km, dummy, load_kr);
	enc_round(4, F2, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(5, F3, RR0, RL0, load_km, shift_kr, dummy);
	enc_round(6, F1, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(7, F2, RR0, RL0, load_km, dummy, load_kr);
	enc_round(8, F3, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(9, F1, RR0, RL0, load_km, shift_kr, dummy);
	enc_round(10, F2, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(11, F3, RR0, RL0, load_km, dummy, load_kr);
	enc_round(12, F1, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(13, F2, RR0, RL0, load_km, shift_kr, dummy);
	enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy);
	enc_round(15, F1, RR0, RL0, dummy, dummy, dummy);
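
	/* No explicit swap between rounds: the Feistel halves simply
	 * alternate in the (rl, rr) argument positions from one round to
	 * the next. */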
	/* store result */
	pop {%r1}; /* dst */
	write_block(%r1, 0, RR0, RL0, RT0, RT1);

	pop {%r4-%r11, %ip, %pc};
.size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block;
.align 3
.globl _gcry_cast5_arm_decrypt_block
.type _gcry_cast5_arm_decrypt_block,%function;

_gcry_cast5_arm_decrypt_block:
	/* input:
	 *	%r0: CTX
	 *	%r1: dst
	 *	%r2: src
	 */
	push {%r1, %r4-%r11, %ip, %lr};
	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
	mov RMASK, #(0xff << 2);
	add Rs2, Rs1, #(0x100 * 4);
	add Rs3, Rs1, #(0x100 * 4 * 2);
	add Rs4, Rs1, #(0x100 * 4 * 3);
	read_block(%r2, 0, RL0, RR0, RT0);

	load_km(15);
	load_dec_kr(15);
	dec_round(15, F1, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(14, F3, RR0, RL0, load_km, shift_kr, dummy);
	dec_round(13, F2, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(12, F1, RR0, RL0, load_km, dummy, load_dec_kr);
	dec_round(11, F3, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(10, F2, RR0, RL0, load_km, shift_kr, dummy);
	dec_round(9, F1, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(8, F3, RR0, RL0, load_km, dummy, load_dec_kr);
	dec_round(7, F2, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(6, F1, RR0, RL0, load_km, shift_kr, dummy);
	dec_round(5, F3, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(4, F2, RR0, RL0, load_km, dummy, load_dec_kr);
	dec_round(3, F1, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(2, F3, RR0, RL0, load_km, shift_kr, dummy);
	dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy);
	dec_round(0, F1, RR0, RL0, dummy, dummy, dummy);
	/* store result */
	pop {%r1}; /* dst */
	write_block(%r1, 0, RR0, RL0, RT0, RT1);

	pop {%r4-%r11, %ip, %pc};
.size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block;
/**********************************************************************
  2-way cast5
 **********************************************************************/
#define F_2w(n, rl0, rr0, rl1, rr1, op1, op2, op3, op4, dec, loadkm, shiftkr, \
	     loadkr) \
	op1 RT3, RKM, rr0; \
	op1 RKM, rr1; \
	mov RT3, RT3, ror RKR; \
	mov RKM, RKM, ror RKR; \
	\
	and RT0, RMASK, RT3, ror #(24); \
	and RT1, RMASK, RT3, lsr #(16); \
	and RT2, RMASK, RT3, lsr #(8); \
	and RT3, RMASK, RT3; \
	\
	ldr RT0, [Rs1, RT0]; \
	add RT2, #(0x100 * 4); \
	ldr RT1, [Rs2, RT1]; \
	add RT3, #(0x100 * 4 * 2); \
	op2 RT0, RT1; \
	ldr RT2, [Rs2, RT2]; \
	op3 RT0, RT2; \
	\
	ldr RT3, [Rs2, RT3]; \
	and RT1, RMASK, RKM, ror #(24); \
	op4 RT0, RT3; \
	and RT2, RMASK, RKM, lsr #(16); \
	eor rl0, RT0; \
	and RT3, RMASK, RKM, lsr #(8); \
	shiftkr(RKR); \
	add RT3, #(0x100 * 4); \
	ldr RT1, [Rs1, RT1]; \
	and RT0, RMASK, RKM; \
	ldr RT2, [Rs2, RT2]; \
	add RT0, #(0x100 * 4 * 2); \
	op2 RT1, RT2; \
	ldr RT3, [Rs2, RT3]; \
	op3 RT1, RT3; \
	\
	ldr RT0, [Rs2, RT0]; \
	op4 RT1, RT0; \
	loadkm((n) + (1 - ((dec) * 2))); \
	\
	loadkr((n) + (1 - ((dec) * 2))); \
	eor rl1, RT1;
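
/* In the 2-way variant, RL1/RR1 are assigned to the same registers that the
 * 1-way code uses as Rs3/Rs4 (see the register macros above), so only Rs1
 * and Rs2 hold table pointers here; S3 and S4 are reached by adding 0x400
 * or 0x800 byte offsets to the index and loading through Rs2. */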
#define F1_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
	F_2w(n, rl0, rr0, rl1, rr1, add, eor, sub, add, dec, \
	     loadkm, shiftkr, loadkr)
#define F2_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
	F_2w(n, rl0, rr0, rl1, rr1, eor, sub, add, eor, dec, \
	     loadkm, shiftkr, loadkr)
#define F3_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
	F_2w(n, rl0, rr0, rl1, rr1, sub, add, eor, sub, dec, \
	     loadkm, shiftkr, loadkr)

#define enc_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 0, loadkm, shiftkr, loadkr)

#define dec_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 1, loadkm, shiftkr, loadkr)
#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
	ldr l0, [rin, #(0)]; \
	ldr r0, [rin, #(4)]; \
	convert(l0, rtmp); \
	ldr l1, [rin, #(8)]; \
	convert(r0, rtmp); \
	ldr r1, [rin, #(12)]; \
	convert(l1, rtmp); \
	convert(r1, rtmp);

#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
	convert(l0, rtmp); \
	convert(r0, rtmp); \
	convert(l1, rtmp); \
	str l0, [rout, #(0)]; \
	convert(r1, rtmp); \
	str r0, [rout, #(4)]; \
	str l1, [rout, #(8)]; \
	str r1, [rout, #(12)];
#ifdef __ARM_FEATURE_UNALIGNED
	/* unaligned word reads allowed */
	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
		read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)

	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
		write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)

	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
		read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)

	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
		write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
#else
	/* need to handle unaligned reads by byte reads */
	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
		tst rin, #3; \
		beq 1f; \
			ldr_unaligned_be(l0, rin, 0, rtmp0); \
			ldr_unaligned_be(r0, rin, 4, rtmp0); \
			ldr_unaligned_be(l1, rin, 8, rtmp0); \
			ldr_unaligned_be(r1, rin, 12, rtmp0); \
			b 2f; \
		1:;\
			read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
		2:;

	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
		tst rout, #3; \
		beq 1f; \
			str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
			str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
			str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
			str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
			b 2f; \
		1:;\
			write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
		2:;

	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
		tst rin, #3; \
		beq 1f; \
			ldr_unaligned_host(l0, rin, 0, rtmp0); \
			ldr_unaligned_host(r0, rin, 4, rtmp0); \
			ldr_unaligned_host(l1, rin, 8, rtmp0); \
			ldr_unaligned_host(r1, rin, 12, rtmp0); \
			b 2f; \
		1:;\
			read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
		2:;

	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
		tst rout, #3; \
		beq 1f; \
			str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
			str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
			str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
			str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
			b 2f; \
		1:;\
			write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
		2:;
#endif
.align 3
.type _gcry_cast5_arm_enc_blk2,%function;

_gcry_cast5_arm_enc_blk2:
	/* input:
	 *	preloaded: CTX
	 *	[RL0, RR0], [RL1, RR1]: src
	 * output:
	 *	[RR0, RL0], [RR1, RL1]: dst
	 */
	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
	mov RMASK, #(0xff << 2);
	add Rs2, Rs1, #(0x100 * 4);

	load_km(0);
	load_kr(0);
	enc_round2(0, F1, RL, RR, load_km, shift_kr, dummy);
	enc_round2(1, F2, RR, RL, load_km, shift_kr, dummy);
	enc_round2(2, F3, RL, RR, load_km, shift_kr, dummy);
	enc_round2(3, F1, RR, RL, load_km, dummy, load_kr);
	enc_round2(4, F2, RL, RR, load_km, shift_kr, dummy);
	enc_round2(5, F3, RR, RL, load_km, shift_kr, dummy);
	enc_round2(6, F1, RL, RR, load_km, shift_kr, dummy);
	enc_round2(7, F2, RR, RL, load_km, dummy, load_kr);
	enc_round2(8, F3, RL, RR, load_km, shift_kr, dummy);
	enc_round2(9, F1, RR, RL, load_km, shift_kr, dummy);
	enc_round2(10, F2, RL, RR, load_km, shift_kr, dummy);
	enc_round2(11, F3, RR, RL, load_km, dummy, load_kr);
	enc_round2(12, F1, RL, RR, load_km, shift_kr, dummy);
	enc_round2(13, F2, RR, RL, load_km, shift_kr, dummy);
	enc_round2(14, F3, RL, RR, load_km, shift_kr, dummy);
	enc_round2(15, F1, RR, RL, dummy, dummy, dummy);
	host_to_be(RR0, RT0);
	host_to_be(RL0, RT0);
	host_to_be(RR1, RT0);
	host_to_be(RL1, RT0);

	bx %lr;
.size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2;
.align 3
.globl _gcry_cast5_arm_cfb_dec;
.type _gcry_cast5_arm_cfb_dec,%function;

_gcry_cast5_arm_cfb_dec:
	/* input:
	 *	%r0: CTX
	 *	%r1: dst (2 blocks)
	 *	%r2: src (2 blocks)
	 *	%r3: iv (64bit)
	 */
	push {%r1, %r2, %r4-%r11, %ip, %lr};

	/* Load input (iv/%r3 is aligned, src/%r2 might not be) */
	ldm %r3, {RL0, RR0};
	host_to_be(RL0, RT1);
	host_to_be(RR0, RT1);
	read_block(%r2, 0, RL1, RR1, %ip);

	/* Update IV, load src[1] and save to iv[0] */
	read_block_host(%r2, 8, %r5, %r6, %r7);
	stm %r3, {%r5, %r6};

	bl _gcry_cast5_arm_enc_blk2;
	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
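
	/* Both ciphertext blocks (and the IV update) were consumed before
	 * the cipher ran, so overlapping or in-place buffers are handled
	 * correctly. */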
	/* %r0: dst, %r1: src */
	pop {%r0, %r1};

	/* dst = src ^ result */
	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
	eor %r5, %r4;
	eor %r6, %r3;
	eor %r7, %r10;
	eor %r8, %r9;
	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);

	pop {%r4-%r11, %ip, %pc};
.size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec;
.align 3
.globl _gcry_cast5_arm_ctr_enc;
.type _gcry_cast5_arm_ctr_enc,%function;

_gcry_cast5_arm_ctr_enc:
	/* input:
	 *	%r0: CTX
	 *	%r1: dst (2 blocks)
	 *	%r2: src (2 blocks)
	 *	%r3: iv (64bit, big-endian)
	 */
	push {%r1, %r2, %r4-%r11, %ip, %lr};

	mov %lr, %r3;

	/* Load IV (big => host endian) */
	read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1);

	/* Construct counters */
	adds RR1, RR0, #1; /* +1 */
	adc RL1, RL0, #0;
	adds %r6, RR1, #1; /* +2 */
	adc %r5, RL1, #0;

	/* Store new IV (host => big-endian) */
	write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT1);

	bl _gcry_cast5_arm_enc_blk2;
	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
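
	/* The adds/adc pairs above ripple the carry between the two words,
	 * giving a full 64-bit counter increment per block. */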
	/* %r0: dst, %r1: src */
	pop {%r0, %r1};

	/* XOR key-stream with plaintext */
	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
	eor %r5, %r4;
	eor %r6, %r3;
	eor %r7, %r10;
	eor %r8, %r9;
	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);

	pop {%r4-%r11, %ip, %pc};
.size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc;
.align 3
.type _gcry_cast5_arm_dec_blk2,%function;

_gcry_cast5_arm_dec_blk2:
	/* input:
	 *	preloaded: CTX
	 *	[RL0, RR0], [RL1, RR1]: src
	 * output:
	 *	[RR0, RL0], [RR1, RL1]: dst
	 */
	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
	mov RMASK, #(0xff << 2);
	add Rs2, Rs1, #(0x100 * 4);

	load_km(15);
	load_dec_kr(15);
	dec_round2(15, F1, RL, RR, load_km, shift_kr, dummy);
	dec_round2(14, F3, RR, RL, load_km, shift_kr, dummy);
	dec_round2(13, F2, RL, RR, load_km, shift_kr, dummy);
	dec_round2(12, F1, RR, RL, load_km, dummy, load_dec_kr);
	dec_round2(11, F3, RL, RR, load_km, shift_kr, dummy);
	dec_round2(10, F2, RR, RL, load_km, shift_kr, dummy);
	dec_round2(9, F1, RL, RR, load_km, shift_kr, dummy);
	dec_round2(8, F3, RR, RL, load_km, dummy, load_dec_kr);
	dec_round2(7, F2, RL, RR, load_km, shift_kr, dummy);
	dec_round2(6, F1, RR, RL, load_km, shift_kr, dummy);
	dec_round2(5, F3, RL, RR, load_km, shift_kr, dummy);
	dec_round2(4, F2, RR, RL, load_km, dummy, load_dec_kr);
	dec_round2(3, F1, RL, RR, load_km, shift_kr, dummy);
	dec_round2(2, F3, RR, RL, load_km, shift_kr, dummy);
	dec_round2(1, F2, RL, RR, load_km, shift_kr, dummy);
	dec_round2(0, F1, RR, RL, dummy, dummy, dummy);
	host_to_be(RR0, RT0);
	host_to_be(RL0, RT0);
	host_to_be(RR1, RT0);
	host_to_be(RL1, RT0);

	b .Ldec_cbc_tail;
.size _gcry_cast5_arm_dec_blk2,.-_gcry_cast5_arm_dec_blk2;
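
/* Note: dec_blk2 is entered and left with plain branches rather than
 * bl/bx, avoiding a return-address save/restore on its single call site
 * in cbc_dec below. */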
.align 3
.globl _gcry_cast5_arm_cbc_dec;
.type _gcry_cast5_arm_cbc_dec,%function;

_gcry_cast5_arm_cbc_dec:
	/* input:
	 *	%r0: CTX
	 *	%r1: dst (2 blocks)
	 *	%r2: src (2 blocks)
	 *	%r3: iv (64bit)
	 */
	push {%r1-%r11, %ip, %lr};

	read_block2(%r2, RL0, RR0, RL1, RR1, RT0);

	/* dec_blk2 is only used by cbc_dec, jump directly in/out instead
	 * of function call. */
	b _gcry_cast5_arm_dec_blk2;
.Ldec_cbc_tail:;
	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */

	/* %r0: dst, %r1: src, %r2: iv */
	pop {%r0, %r1, %r2};

	/* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
	read_block_host(%r1, 0, %r7, %r8, %r5);
	/* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
	ldm %r2, {%r5, %r6};

	/* out[1] ^= IV+1 (src[0]) */
	eor %r10, %r7;
	eor %r9, %r8;
	/* out[0] ^= IV */
	eor %r4, %r5;
	eor %r3, %r6;
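
	/* dec_blk2 already byte-swapped its results to big-endian, so the
	 * XORs above operate on words in stored (big-endian) byte order,
	 * matching the raw _host loads of the IV and ciphertext. */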
	/* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
	read_block_host(%r1, 8, %r7, %r8, %r5);
	/* store IV+2 to iv[0] (aligned). */
	stm %r2, {%r7, %r8};

	/* store result to dst[0-3]. Might be unaligned. */
	write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6);

	pop {%r4-%r11, %ip, %pc};
.size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec;
#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
#endif /*__ARMEL__*/