#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# absolute terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).
33
34 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
35 open STDOUT,">$output";
36
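# Per AAPCS the ctx/inp/len arguments arrive in r0-r2; those same
# registers are reused as scratch ($t0/$t4/$t1) once their values are
# spilled or consumed.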
$ctx="r0";      $t0="r0";
$inp="r1";      $t4="r1";
$len="r2";      $t1="r2";
$T1="r3";       $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
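# Rotate/shift amounts of the FIPS 180-4 SHA-256 functions:
#   Sigma0(x) = ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22)
#   Sigma1(x) = ROTR(x,6) ^ ROTR(x,11) ^ ROTR(x,25)
#   sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ SHR(x,3)
#   sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)
# The code below rotates by the differences first and folds the final
# common ROTR into the instruction that consumes the result.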

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
        @ ldr   $t1,[$inp],#4                   @ $i
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        rev     $t1,$t1
#else
        @ ldrb  $t1,[$inp,#3]                   @ $i
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        ldrb    $t2,[$inp,#2]
        ldrb    $t0,[$inp,#1]
        orr     $t1,$t1,$t2,lsl#8
        ldrb    $t2,[$inp],#4
        orr     $t1,$t1,$t0,lsl#16
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        orr     $t1,$t1,$t2,lsl#24
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
#endif
___
$code.=<<___;
        ldr     $t2,[$Ktbl],#4                  @ *K256++
        add     $h,$h,$t1                       @ h+=X[i]
        str     $t1,[sp,#`$i%16`*4]
        eor     $t1,$f,$g
        add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
        and     $t1,$t1,$e
        add     $h,$h,$t2                       @ h+=K256[i]
        eor     $t1,$t1,$g                      @ Ch(e,f,g)
        eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
        add     $h,$h,$t1                       @ h+=Ch(e,f,g)
#if $i==31
        and     $t2,$t2,#0xff
        cmp     $t2,#0xf2                       @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4                   @ prefetch
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t2,$a,$b                       @ a^b, b^c in next round
#else
        ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
        eor     $t2,$a,$b                       @ a^b, b^c in next round
        ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
#endif
        eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
        and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
        add     $d,$d,$h                        @ d+=h
        eor     $t3,$t3,$b                      @ Maj(a,b,c)
        add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
        @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
___
        ($t2,$t3)=($t3,$t2);
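        # Swap the names only: after the swap $t2 denotes the register
        # holding Maj(a,b,c), which the next round folds in via its
        # "h+=Maj(a,b,c) from the past", while $t3 denotes a^b, which
        # serves as that round's b^c.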
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
        @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
        @ ldr   $t4,[sp,#`($i+14)%16`*4]
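        @ X[i%16] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])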
        mov     $t0,$t1,ror#$sigma0[0]
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        mov     $t2,$t4,ror#$sigma1[0]
        eor     $t0,$t0,$t1,ror#$sigma0[1]
        eor     $t2,$t2,$t4,ror#$sigma1[1]
        eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
        ldr     $t1,[sp,#`($i+0)%16`*4]
        eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
        ldr     $t4,[sp,#`($i+9)%16`*4]

        add     $t2,$t2,$t0
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
        add     $t1,$t1,$t2
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        add     $t1,$t1,$t4                     @ X[i]
___
        &BODY_00_15(@_);
}

$code=<<___;
#if defined(__arm__)
#include "arm_arch.h"

.text
.code   32

.type   K256,%object
.align  5
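@ The usual SHA-256 round constants: the first 32 bits of the
@ fractional parts of the cube roots of the first 64 primes (FIPS 180-4)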
K256:
.word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size   K256,.-K256
.word   0                               @ terminator
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-sha256_block_data_order
.align  5

.global sha256_block_data_order
.hidden sha256_block_data_order
.type   sha256_block_data_order,%function
sha256_block_data_order:
        sub     r3,pc,#8                @ sha256_block_data_order
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
#if __ARM_ARCH__>=7
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
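        @ .LOPENSSL_armcap holds the distance from this code to
        @ OPENSSL_armcap_P, so the capability word is reached
        @ PC-relatively and the code stays position-independent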
        tst     r12,#1
        bne     .LNEON
#endif
        stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
        ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
        sub     $Ktbl,r3,#256+32        @ K256
        sub     sp,sp,#16*4             @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t3,$B,$C               @ magic
        eor     $t2,$t2,$t2
___
for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
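# unshift(@V,pop(@V)) rotates the eight variable names a..h, so the
# unrolled rounds need no register-to-register moves; rounds 16..31 are
# emitted once and executed three times via .Lrounds_16_xx (the "done?"
# test above matches only the last K256 byte, 0xf2 of 0xc67178f2).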
$code.=<<___;
        ldreq   $t3,[sp,#16*4]          @ pull ctx
        bne     .Lrounds_16_xx

        add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
        ldr     $t0,[$t3,#0]
        ldr     $t1,[$t3,#4]
        ldr     $t2,[$t3,#8]
        add     $A,$A,$t0
        ldr     $t0,[$t3,#12]
        add     $B,$B,$t1
        ldr     $t1,[$t3,#16]
        add     $C,$C,$t2
        ldr     $t2,[$t3,#20]
        add     $D,$D,$t0
        ldr     $t0,[$t3,#24]
        add     $E,$E,$t1
        ldr     $t1,[$t3,#28]
        add     $F,$F,$t2
        ldr     $inp,[sp,#17*4]         @ pull inp
        ldr     $t2,[sp,#18*4]          @ pull inp+len
        add     $G,$G,$t0
        add     $H,$H,$t1
        stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
        cmp     $inp,$t2
        sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
        bne     .Loop

        add     sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r11,pc}
#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
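# A NEON quad register qN overlaps the double registers d(2N) and
# d(2N+1); Dlo/Dhi map a "qN" name to its low/high 64-bit half.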

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
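# e.g. &vshr_u32($T2,$T0,7) appends "\tvshr.u32\tq10,q8,#7\n" to $code:
# the underscore becomes a dot and a bare numeric last argument gets a
# "#" prefix.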

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);
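  # @insns holds four scalar rounds' worth of instruction strings;
  # they are eval'ed a couple at a time between the NEON schedule ops
  # below so the scalar and vector pipelines run concurrently.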

        &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T2,$T0,$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T1,$T0,$sigma0[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T2,$T0,32-$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T3,$T0,$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T2);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T3,$T0,32-$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         while($#insns>=2) { eval(shift(@insns)); }
        &vst1_32        ("{$T0}","[$Xfer,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));

        push(@X,shift(@X));             # "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);
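  # Last 16 rounds: no more schedule to compute, so byte-swap the next
  # input block, pre-add K256 and park the sums in the transfer area
  # for the following iteration, all interleaved with scalar rounds.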

         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vrev32_8       (@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         foreach (@insns) { eval; }     # remaining instructions
        &vst1_32        ("{$T0}","[$Xfer,:128]!");

        push(@X,shift(@X));             # "rotate" X[]
}

sub body_00_15 () {
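        # One scalar round expressed as a list of Perl snippets; the
        # Xupdate/Xpreload callers eval() them one at a time so they
        # interleave with the NEON instructions above.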
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
        '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
        '&eor   ($t1,$f,$g)',
        '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
        '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
        '&and   ($t1,$t1,$e)',
        '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
        '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
        '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
        '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
        '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
        '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
        '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
        '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
        '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
        '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
        '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
        '&add   ($d,$d,$h)',                    # d+=h
        '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
        '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
        '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
        )
}

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu    neon
.align  4
.LNEON:
        stmdb   sp!,{r4-r12,lr}

        mov     $t2,sp
        sub     sp,sp,#16*4+16          @ alloca
        sub     $Ktbl,r3,#256+32        @ K256
        bic     sp,sp,#15               @ align for 128-bit stores
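        @ frame layout: X[]+K transfer area at [sp,#0-63], ctx at
        @ [sp,#64], inp at [sp,#68], end of input at [sp,#72],
        @ caller's sp at [sp,#76]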

        vld1.8          {@X[0]},[$inp]!
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        vld1.32         {$T0},[$Ktbl,:128]!
        vld1.32         {$T1},[$Ktbl,:128]!
        vld1.32         {$T2},[$Ktbl,:128]!
        vld1.32         {$T3},[$Ktbl,:128]!
        vrev32.8        @X[0],@X[0]             @ yes, even on
        str             $ctx,[sp,#64]
        vrev32.8        @X[1],@X[1]             @ big-endian
        str             $inp,[sp,#68]
        mov             $Xfer,sp
        vrev32.8        @X[2],@X[2]
        str             $len,[sp,#72]
        vrev32.8        @X[3],@X[3]
        str             $t2,[sp,#76]            @ save original sp
        vadd.i32        $T0,$T0,@X[0]
        vadd.i32        $T1,$T1,@X[1]
        vst1.32         {$T0},[$Xfer,:128]!
        vadd.i32        $T2,$T2,@X[2]
        vst1.32         {$T1},[$Xfer,:128]!
        vadd.i32        $T3,$T3,@X[3]
        vst1.32         {$T2},[$Xfer,:128]!
        vst1.32         {$T3},[$Xfer,:128]!

        ldmia           $ctx,{$A-$H}
        sub             $Xfer,$Xfer,#64
        ldr             $t1,[sp,#0]
        eor             $t2,$t2,$t2
        eor             $t3,$B,$C
        b               .L_00_48

.align  4
.L_00_48:
___
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
$code.=<<___;
        teq     $t1,#0                          @ check for K256 terminator
        ldr     $t1,[sp,#0]
        sub     $Xfer,$Xfer,#64
        bne     .L_00_48

        ldr             $inp,[sp,#68]
        ldr             $t0,[sp,#72]
        sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
        teq             $inp,$t0
        subeq           $inp,$inp,#64           @ avoid SEGV
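        @ if that was the final block, step back and reload it rather
        @ than touch memory past the input; the extra pre-computed
        @ schedule is simply never used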
        vld1.8          {@X[0]},[$inp]!         @ load next input block
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        strne           $inp,[sp,#68]
        mov             $Xfer,sp
___
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
$code.=<<___;
        ldr     $t0,[$t1,#0]
        add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
        ldr     $t2,[$t1,#4]
        ldr     $t3,[$t1,#8]
        ldr     $t4,[$t1,#12]
        add     $A,$A,$t0                       @ accumulate
        ldr     $t0,[$t1,#16]
        add     $B,$B,$t2
        ldr     $t2,[$t1,#20]
        add     $C,$C,$t3
        ldr     $t3,[$t1,#24]
        add     $D,$D,$t4
        ldr     $t4,[$t1,#28]
        add     $E,$E,$t0
        str     $A,[$t1],#4
        add     $F,$F,$t2
        str     $B,[$t1],#4
        add     $G,$G,$t3
        str     $C,[$t1],#4
        add     $H,$H,$t4
        str     $D,[$t1],#4
        stmia   $t1,{$E-$H}

        movne   $Xfer,sp
        ldrne   $t1,[sp,#0]
        eorne   $t2,$t2,$t2
        ldreq   sp,[sp,#76]                     @ restore original sp
        eorne   $t3,$B,$C
        bne     .L_00_48

        ldmia   sp!,{r4-r12,pc}
#endif
___
}}}
$code.=<<___;
.size   sha256_block_data_order,.-sha256_block_data_order
.asciz  "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
.comm   OPENSSL_armcap_P,4,4

#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
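# (0xe12fff1e is the machine encoding of "bx lr"; emitting it as a
# .word assembles even for pre-ARMv4T targets that lack BX)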
print $code;
close STDOUT; # enforce flush