# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, and hence delivers better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, lower is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#			AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge		5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge		5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell		4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake		2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer		5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# Ryzen(***)		2.71/-/3.71+2.05		2.74/-/3.73	+74%/-/54%
# Goldmont(***)		3.82/-/5.35+4.16		4.73/-/5.94	+69%/-/60%
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
#	Westmere is omitted from the loop: the gain was not
#	estimated to be high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;
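#
# The interleaving is data-driven: each SHA256 round body below evaluates
# the next element of an AES-CBC instruction queue, so that roughly one
# AESNI instruction is issued per SHA256 round and soaks up otherwise
# idle execution ports. A minimal sketch of the idea (illustrative only,
# not the driver actually used below):
#
#	for (my $i=0; $i<64; $i++) {		# 64 SHA256 rounds per block
#	    foreach (body_00_15()) { eval; }	# each round also evals one
#	}					# @aesni_cbc_block[] AES step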
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
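# Note: $avx==1 emits the XOP and AVX1 code paths, $avx>1 additionally
# emits the AVX2 path, and $shaext gates the SHA-NI path (that is what
# the "set to zero if compiling for 1.0.1" knob above disables).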
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *ivp,
#			const SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	'&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
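# For example, "&ror ($a0,14)" (with $a0 being %r13d per the assignment
# above) appends "\tror\t\$14,%r13d\n" to $code; an illustration only,
# not a call that appears verbatim below.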
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
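# Note that "unshift(@ROT,pop(@ROT))" rotates the register assignment by
# one position per round, so after eight rounds ($a..$h) are back in
# their original registers; $i selects X[i] and, via $aesni_cbc_idx,
# the next queued AES-CBC step.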
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);		# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);		# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");	# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
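# Altogether, Xupdate_256_AVX computes the SHA256 message schedule
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# four elements at a time: the "X[9..12]" addend is X[i-7], the
# vpalignr/shift/xor chain on X[1..4] is sigma0(X[i-15]), and the two
# vpshufd'd halves supply sigma1 of X[14..15] and X[16..17].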
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);		# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);		# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");	# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx,.-${func}_avx

######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one still has to do $a+=$a1
}
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13		# inp++, size optimization
	lea	(%rsi,%r13),%r12	# borrow $a0
	cmp	$len,%r13		# $_end
	cmove	%rsp,%r12		# next block or random data
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
$code.=<<___ if (!$win64);
# temporarily use %rsi as frame pointer
	lea	-$PUSH8(%rsp),%rsp
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
.cfi_cfa_expression	%rsp-8,deref,+8
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
.cfi_cfa_expression	%rsp-8,deref,+8
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea	("%r13","16(%r13)");		# inp++

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }

	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output

	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end

	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }

	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data

# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+7*8`,deref,+8
	mov	16*$SZ+4*8($Tbl),$ivp
	mov	16*$SZ+7*8($Tbl),%rsi
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`($Tbl),%xmm6
	movaps	`$framesz+16*1`($Tbl),%xmm7
	movaps	`$framesz+16*2`($Tbl),%xmm8
	movaps	`$framesz+16*3`($Tbl),%xmm9
	movaps	`$framesz+16*4`($Tbl),%xmm10
	movaps	`$framesz+16*5`($Tbl),%xmm11
	movaps	`$framesz+16*6`($Tbl),%xmm12
	movaps	`$framesz+16*7`($Tbl),%xmm13
	movaps	`$framesz+16*8`($Tbl),%xmm14
	movaps	`$framesz+16*9`($Tbl),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx2,.-${func}_avx2

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
my ($rounds,$Tbl)=("%r11d","%rbx");
my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
	movups	`16*$n`($in0),$in		# load input
$code.=<<___ if ($n);
	movups	$iv,`16*($n-1)`($out,$in0)	# write output
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	movups	`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	aesenclast	$rndkey[0],$iv
	movups	16-112($key),$rndkey[1]		# forward reference
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
    $r++;	unshift(@rndkey,pop(@rndkey));
};
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	($ivp),$iv		# load IV
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
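	# The shuffles above repack the state words, loaded as DCBA/HGFE,
	# into the ABEF/CDGH lane order that sha256rnds2 operates on; the
	# epilogue below applies the inverse shuffle before the state is
	# stored back.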
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	&$aesenc()	if ($r==19);
	sha256rnds2	$CDGH,$ABEF
	push(@MSG,shift(@MSG));
}
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF

	movups	$iv,48($out,$in0)	# write output

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler

	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04		if($dst>=8);
    $rex|=0x01		if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x38);
	    rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
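# For example, "sha256rnds2 %xmm0,%xmm1" is emitted as
# ".byte 0x0f,0x38,0xcb,0xc8" (0xc8 being the register-direct ModR/M for
# reg=1, rm=0), so the module still builds with assemblers that predate
# the SHA extensions.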
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";