3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256 block procedure for ARMv4. May 2007.
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 16%
24 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process one
29 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
30 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
31 # code (meaning that latter performs sub-optimally, nothing was done
# Pop command-line arguments until one looks like an output file name
# (word characters/hyphens followed by an extension, e.g. "sha256-armv4.S"),
# then redirect STDOUT to it so all generated assembly is written there.
# Three-arg open with an explicit error check: the original unchecked
# 2-arg `open STDOUT,">$output"` would silently discard all output on
# failure (and is mode-injection prone).
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT, ">", $output or die "can't open $output: $!";
49 @V=($A,$B,$C,$D,$E,$F,$G,$H);
59 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
61 $code.=<<___ if ($i<16);
63 @ ldr $t1,[$inp],#4 @ $i
65 str $inp,[sp,#17*4] @ make room for $t4
67 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
68 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
69 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
72 @ ldrb $t1,[$inp,#3] @ $i
73 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
78 orr $t1,$t1,$t0,lsl#16
80 str $inp,[sp,#17*4] @ make room for $t4
82 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
83 orr $t1,$t1,$t2,lsl#24
84 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
88 ldr $t2,[$Ktbl],#4 @ *K256++
89 add $h,$h,$t1 @ h+=X[i]
90 str $t1,[sp,#`$i%16`*4]
92 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
94 add $h,$h,$t2 @ h+=K256[i]
95 eor $t1,$t1,$g @ Ch(e,f,g)
96 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
97 add $h,$h,$t1 @ h+=Ch(e,f,g)
100 cmp $t2,#0xf2 @ done?
104 ldr $t1,[$inp],#4 @ prefetch
108 eor $t2,$a,$b @ a^b, b^c in next round
110 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
111 eor $t2,$a,$b @ a^b, b^c in next round
112 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
114 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
115 and $t3,$t3,$t2 @ (b^c)&=(a^b)
117 eor $t3,$t3,$b @ Maj(a,b,c)
118 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
119 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
125 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
128 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
129 @ ldr $t4,[sp,#`($i+14)%16`*4]
130 mov $t0,$t1,ror#$sigma0[0]
131 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
132 mov $t2,$t4,ror#$sigma1[0]
133 eor $t0,$t0,$t1,ror#$sigma0[1]
134 eor $t2,$t2,$t4,ror#$sigma1[1]
135 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
136 ldr $t1,[sp,#`($i+0)%16`*4]
137 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
138 ldr $t4,[sp,#`($i+9)%16`*4]
141 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
143 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
144 add $t1,$t1,$t4 @ X[i]
151 #include "arm_arch.h"
159 .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
160 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
161 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
162 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
163 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
164 .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
165 .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
166 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
167 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
168 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
169 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
170 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
171 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
172 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
173 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
174 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
178 .word OPENSSL_armcap_P-sha256_block_data_order
181 .global sha256_block_data_order
182 .hidden sha256_block_data_order
183 .type sha256_block_data_order,%function
184 sha256_block_data_order:
185 sub r3,pc,#8 @ sha256_block_data_order
186 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
188 ldr r12,.LOPENSSL_armcap
189 ldr r12,[r3,r12] @ OPENSSL_armcap_P
193 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
194 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
195 sub $Ktbl,r3,#256+32 @ K256
196 sub sp,sp,#16*4 @ alloca(X[16])
203 eor $t3,$B,$C @ magic
# Emit 16 fully-unrolled BODY_00_15 rounds (i=0..15), rotating the
# working-variable register assignment after each round.
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
# Loop label: presumably the generated code branches back here to cover
# rounds 16..63 with the 16 bodies emitted below — confirm against the
# full file's branch instruction.
$code.=".Lrounds_16_xx:\n";
# NOTE: $i deliberately continues from 16 (shared global, no reinit) —
# emit 16 message-schedule rounds (i=16..31) via BODY_16_XX.
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
210 ldreq $t3,[sp,#16*4] @ pull ctx
213 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
228 ldr $inp,[sp,#17*4] @ pull inp
229 ldr $t2,[sp,#18*4] @ pull inp+len
232 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
234 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
237 add sp,sp,#`16+3`*4 @ destroy frame
239 ldmia sp!,{r4-r11,pc}
241 ldmia sp!,{r4-r11,lr}
243 moveq pc,lr @ be binary compatible with V4, yet
244 bx lr @ interoperable with Thumb ISA:-)
247 ######################################################################
my @X=map("q$_",(0..3));	# q0-q3: NEON quad registers holding message words
# Scratch registers for the message-schedule computation; note $T4/$T5
# are 64-bit d-registers ("d24","d25", i.e. the two halves of q12) while
# $T0..$T3 are full 128-bit q-registers.
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
256 sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
257 sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
259 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
260 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
262 $arg = "#$arg" if ($arg*1 eq $arg);
263 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
269 my @insns = (&$body,&$body,&$body,&$body);
270 my ($a,$b,$c,$d,$e,$f,$g,$h);
272 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
276 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
280 &vshr_u32 ($T2,$T0,$sigma0[0]);
283 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
286 &vshr_u32 ($T1,$T0,$sigma0[2]);
289 &vsli_32 ($T2,$T0,32-$sigma0[0]);
292 &vshr_u32 ($T3,$T0,$sigma0[1]);
298 &vsli_32 ($T3,$T0,32-$sigma0[1]);
301 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
304 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
307 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
310 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
313 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
319 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
322 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
325 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
328 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
331 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
334 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
337 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
343 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
346 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
349 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
352 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
355 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
358 &vadd_i32 ($T0,$T0,@X[0]);
359 while($#insns>=2) { eval(shift(@insns)); }
360 &vst1_32 ("{$T0}","[$Xfer,:128]!");
364 push(@X,shift(@X)); # "rotate" X[]
370 my @insns = (&$body,&$body,&$body,&$body);
371 my ($a,$b,$c,$d,$e,$f,$g,$h);
377 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
382 &vrev32_8 (@X[0],@X[0]);
387 &vadd_i32 ($T0,$T0,@X[0]);
388 foreach (@insns) { eval; } # remaining instructions
389 &vst1_32 ("{$T0}","[$Xfer,:128]!");
391 push(@X,shift(@X)); # "rotate" X[]
396 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
397 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
399 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
400 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
402 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
403 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
404 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
405 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
406 '&eor ($t2,$a,$b)', # a^b, b^c in next round
407 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
408 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
409 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
410 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
411 '&ldr ($t1,"[sp,#64]") if ($j==31)',
412 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
413 '&add ($d,$d,$h)', # d+=h
414 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
415 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
416 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
425 stmdb sp!,{r4-r12,lr}
428 sub sp,sp,#16*4+16 @ alloca
429 sub $Ktbl,r3,#256+32 @ K256
430 bic sp,sp,#15 @ align for 128-bit stores
432 vld1.8 {@X[0]},[$inp]!
433 vld1.8 {@X[1]},[$inp]!
434 vld1.8 {@X[2]},[$inp]!
435 vld1.8 {@X[3]},[$inp]!
436 vld1.32 {$T0},[$Ktbl,:128]!
437 vld1.32 {$T1},[$Ktbl,:128]!
438 vld1.32 {$T2},[$Ktbl,:128]!
439 vld1.32 {$T3},[$Ktbl,:128]!
440 vrev32.8 @X[0],@X[0] @ yes, even on
442 vrev32.8 @X[1],@X[1] @ big-endian
448 str $t2,[sp,#76] @ save original sp
449 vadd.i32 $T0,$T0,@X[0]
450 vadd.i32 $T1,$T1,@X[1]
451 vst1.32 {$T0},[$Xfer,:128]!
452 vadd.i32 $T2,$T2,@X[2]
453 vst1.32 {$T1},[$Xfer,:128]!
454 vadd.i32 $T3,$T3,@X[3]
455 vst1.32 {$T2},[$Xfer,:128]!
456 vst1.32 {$T3},[$Xfer,:128]!
468 &Xupdate(\&body_00_15);
469 &Xupdate(\&body_00_15);
470 &Xupdate(\&body_00_15);
471 &Xupdate(\&body_00_15);
473 teq $t1,#0 @ check for K256 terminator
480 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
482 subeq $inp,$inp,#64 @ avoid SEGV
483 vld1.8 {@X[0]},[$inp]! @ load next input block
484 vld1.8 {@X[1]},[$inp]!
485 vld1.8 {@X[2]},[$inp]!
486 vld1.8 {@X[3]},[$inp]!
490 &Xpreload(\&body_00_15);
491 &Xpreload(\&body_00_15);
492 &Xpreload(\&body_00_15);
493 &Xpreload(\&body_00_15);
496 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
500 add $A,$A,$t0 @ accumulate
521 ldreq sp,[sp,#76] @ restore original sp
525 ldmia sp!,{r4-r12,pc}
530 .size sha256_block_data_order,.-sha256_block_data_order
531 .asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
533 .comm OPENSSL_armcap_P,4,4
# Post-process the accumulated assembly: evaluate every `...` construct
# (Perl expressions embedded in the templates, e.g. rotate-count
# arithmetic) and splice the result into the text.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
# Checked close: buffered-write errors (e.g. disk full) only surface at
# close time; the original bare `close STDOUT;` would silently emit a
# truncated assembly file.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush