crypto/aes/asm/aesni-sha256-x86_64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # January 2013
  18 #
  19 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
  20 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
  21 # that since AESNI-CBC encrypt exhibit *very* low instruction-level
  22 # parallelism, interleaving it with another algorithm would allow to
  23 # utilize processor resources better and achieve better performance.
  24 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
  25 # AESNI code is weaved into it. As SHA256 dominates execution time,
  26 # stitch performance does not depend on AES key length. Below are
  27 # performance numbers in cycles per processed byte, less is better,
  28 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
  29 # subroutine:
  30 #
  31 #                AES-128/-192/-256+SHA256   this(**)    gain
  32 # Sandy Bridge      5.05/6.05/7.05+11.6     13.0        +28%/36%/43%
  33 # Ivy Bridge        5.05/6.05/7.05+10.3     11.6        +32%/41%/50%
  34 # Haswell           4.43/5.29/6.19+7.80     8.79        +39%/49%/59%
  35 # Skylake           2.62/3.14/3.62+7.70     8.10        +27%/34%/40%
  36 # Bulldozer         5.77/6.89/8.00+13.7     13.7        +42%/50%/58%
  37 # Ryzen(***)        2.71/-/3.71+2.05        2.74/-/3.73 +74%/-/54%
  38 # Goldmont(***)     3.82/-/5.35+4.16        4.73/-/5.94 +69%/-/60%
  39 #
  40 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
  41 #       Westmere is omitted from loop, this is because gain was not
  42 #       estimated high enough to justify the effort;
  43 # (**)  these are EVP-free results, results obtained with 'speed
  44 #       -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
  45 # (***) these are SHAEXT results;
  46
  47 $flavour = shift;
  48 $output  = shift;
  49 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  50
  51 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  52
  53 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  54 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  55 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  56 die "can't locate x86_64-xlate.pl";
  57
  58 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  59                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  60         $avx = ($1>=2.19) + ($1>=2.22);
  61 }
  62
  63 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  64            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  65         $avx = ($1>=2.09) + ($1>=2.10);
  66 }
  67
  68 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  69            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  70         $avx = ($1>=10) + ($1>=12);
  71 }
  72
  73 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  74         $avx = ($2>=3.0) + ($2>3.0);
  75 }
  76
  77 $shaext=$avx;   ### set to zero if compiling for 1.0.1
  78 $avx=1          if (!$shaext && $avx);
  79
  80 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  81 *STDOUT=*OUT;
  82
  83 $func="aesni_cbc_sha256_enc";
  84 $TABLE="K256";
  85 $SZ=4;
  86 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  87                                 "%r8d","%r9d","%r10d","%r11d");
  88 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  89 @Sigma0=( 2,13,22);
  90 @Sigma1=( 6,11,25);
  91 @sigma0=( 7,18, 3);
  92 @sigma1=(17,19,10);
  93 $rounds=64;
  94
  95 ########################################################################
  96 # void aesni_cbc_sha256_enc(const void *inp,
  97 #                       void *out,
  98 #                       size_t length,
  99 #                       const AES_KEY *key,
 100 #                       unsigned char *iv,
 101 #                       SHA256_CTX *ctx,
 102 #                       const void *in0);
 103 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
 104 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
 105
 106 $Tbl="%rbp";
 107
 108 $_inp="16*$SZ+0*8(%rsp)";
 109 $_out="16*$SZ+1*8(%rsp)";
 110 $_end="16*$SZ+2*8(%rsp)";
 111 $_key="16*$SZ+3*8(%rsp)";
 112 $_ivp="16*$SZ+4*8(%rsp)";
 113 $_ctx="16*$SZ+5*8(%rsp)";
 114 $_in0="16*$SZ+6*8(%rsp)";
 115 $_rsp="`16*$SZ+7*8`(%rsp)";
 116 $framesz=16*$SZ+8*8;
 117
 118 $code=<<___;
 119 .text
 120
 121 .extern OPENSSL_ia32cap_P
 122 .globl  $func
 123 .type   $func,\@abi-omnipotent
 124 .align  16
 125 $func:
 126 .cfi_startproc
 127 ___
 128                                                 if ($avx) {
 129 $code.=<<___;
 130         lea     OPENSSL_ia32cap_P(%rip),%r11
 131         mov     \$1,%eax
 132         cmp     \$0,`$win64?"%rcx":"%rdi"`
 133         je      .Lprobe
 134         mov     0(%r11),%eax
 135         mov     4(%r11),%r10
 136 ___
 137 $code.=<<___ if ($shaext);
 138         bt      \$61,%r10                       # check for SHA
 139         jc      ${func}_shaext
 140 ___
 141 $code.=<<___;
 142         mov     %r10,%r11
 143         shr     \$32,%r11
 144
 145         test    \$`1<<11`,%r10d                 # check for XOP
 146         jnz     ${func}_xop
 147 ___
 148 $code.=<<___ if ($avx>1);
 149         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
 150         cmp     \$`1<<8|1<<5|1<<3`,%r11d
 151         je      ${func}_avx2
 152 ___
 153 $code.=<<___;
 154         and     \$`1<<28`,%r10d                 # check for AVX
 155         jnz     ${func}_avx
 156         ud2
 157 ___
 158                                                 }
 159 $code.=<<___;
 160         xor     %eax,%eax
 161         cmp     \$0,`$win64?"%rcx":"%rdi"`
 162         je      .Lprobe
 163         ud2
 164 .Lprobe:
 165         ret
 166 .cfi_endproc
 167 .size   $func,.-$func
 168
 169 .align  64
 170 .type   $TABLE,\@object
 171 $TABLE:
 172         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 173         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 174         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 175         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 176         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 177         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 178         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 179         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 180         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 181         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 182         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 183         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 184         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 185         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 186         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 187         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 188         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 189         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 190         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 191         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 192         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 193         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 194         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 195         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 196         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 197         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 198         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 199         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 200         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 201         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 202         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 203         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 204
 205         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 206         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 207         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
 208         .long   0,0,0,0,   0,0,0,0
 209         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 210 .align  64
 211 ___
 212
 213 ######################################################################
 214 # SIMD code paths
 215 #
 216 {{{
 217 ($iv,$inout,$roundkey,$temp,
 218  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
 219
 220 $aesni_cbc_idx=0;
 221 @aesni_cbc_block = (
 222 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
 223 ##      &vmovdqu        ($inout,($inp));
 224 ##      &mov            ($_inp,$inp);
 225
 226         '&vpxor         ($inout,$inout,$roundkey);'.
 227         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
 228
 229         '&vpxor         ($inout,$inout,$iv);',
 230
 231         '&vaesenc       ($inout,$inout,$roundkey);'.
 232         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
 233
 234         '&vaesenc       ($inout,$inout,$roundkey);'.
 235         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
 236
 237         '&vaesenc       ($inout,$inout,$roundkey);'.
 238         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
 239
 240         '&vaesenc       ($inout,$inout,$roundkey);'.
 241         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
 242
 243         '&vaesenc       ($inout,$inout,$roundkey);'.
 244         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
 245
 246         '&vaesenc       ($inout,$inout,$roundkey);'.
 247         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
 248
 249         '&vaesenc       ($inout,$inout,$roundkey);'.
 250         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
 251
 252         '&vaesenc       ($inout,$inout,$roundkey);'.
 253         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
 254
 255         '&vaesenc       ($inout,$inout,$roundkey);'.
 256         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
 257
 258         '&vaesenclast   ($temp,$inout,$roundkey);'.
 259         ' &vaesenc      ($inout,$inout,$roundkey);'.
 260         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
 261
 262         '&vpand         ($iv,$temp,$mask10);'.
 263         ' &vaesenc      ($inout,$inout,$roundkey);'.
 264         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
 265
 266         '&vaesenclast   ($temp,$inout,$roundkey);'.
 267         ' &vaesenc      ($inout,$inout,$roundkey);'.
 268         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
 269
 270         '&vpand         ($temp,$temp,$mask12);'.
 271         ' &vaesenc      ($inout,$inout,$roundkey);'.
 272          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
 273
 274         '&vpor          ($iv,$iv,$temp);'.
 275         ' &vaesenclast  ($temp,$inout,$roundkey);'.
 276         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
 277
 278 ##      &mov            ($inp,$_inp);
 279 ##      &mov            ($out,$_out);
 280 ##      &vpand          ($temp,$temp,$mask14);
 281 ##      &vpor           ($iv,$iv,$temp);
 282 ##      &vmovdqu        ($iv,($out,$inp);
 283 ##      &lea            (inp,16($inp));
 284 );
 285
 286 my $a4=$T1;
 287 my ($a,$b,$c,$d,$e,$f,$g,$h);
 288
 289 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
 290 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 291   my $arg = pop;
 292     $arg = "\$$arg" if ($arg*1 eq $arg);
 293     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 294 }
 295
 296 sub body_00_15 () {
 297         (
 298         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 299
 300         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
 301         '&mov   ($a,$a1)',
 302         '&mov   ($a4,$f)',
 303
 304         '&xor   ($a0,$e)',
 305         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
 306         '&xor   ($a4,$g)',                      # f^g
 307
 308         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
 309         '&xor   ($a1,$a)',
 310         '&and   ($a4,$e)',                      # (f^g)&e
 311
 312         @aesni_cbc_block[$aesni_cbc_idx++].
 313         '&xor   ($a0,$e)',
 314         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
 315         '&mov   ($a2,$a)',
 316
 317         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
 318         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
 319         '&xor   ($a2,$b)',                      # a^b, b^c in next round
 320
 321         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
 322         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
 323         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
 324
 325         '&xor   ($a1,$a)',
 326         '&add   ($h,$a0)',                      # h+=Sigma1(e)
 327         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
 328
 329         '&add   ($d,$h)',                       # d+=h
 330         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
 331         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
 332
 333         '&mov   ($a0,$d)',
 334         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
 335         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 336         );
 337 }
 338
 339 if ($avx) {{
 340 ######################################################################
 341 # XOP code path
 342 #
 343 $code.=<<___;
 344 .type   ${func}_xop,\@function,6
 345 .align  64
 346 ${func}_xop:
 347 .cfi_startproc
 348 .Lxop_shortcut:
 349         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 350         mov     %rsp,%rax               # copy %rsp
 351 .cfi_def_cfa_register   %rax
 352         push    %rbx
 353 .cfi_push       %rbx
 354         push    %rbp
 355 .cfi_push       %rbp
 356         push    %r12
 357 .cfi_push       %r12
 358         push    %r13
 359 .cfi_push       %r13
 360         push    %r14
 361 .cfi_push       %r14
 362         push    %r15
 363 .cfi_push       %r15
 364         sub     \$`$framesz+$win64*16*10`,%rsp
 365         and     \$-64,%rsp              # align stack frame
 366
 367         shl     \$6,$len
 368         sub     $inp,$out               # re-bias
 369         sub     $inp,$in0
 370         add     $inp,$len               # end of input
 371
 372         #mov    $inp,$_inp              # saved later
 373         mov     $out,$_out
 374         mov     $len,$_end
 375         #mov    $key,$_key              # remains resident in $inp register
 376         mov     $ivp,$_ivp
 377         mov     $ctx,$_ctx
 378         mov     $in0,$_in0
 379         mov     %rax,$_rsp
 380 .cfi_cfa_expression     $_rsp,deref,+8
 381 ___
 382 $code.=<<___ if ($win64);
 383         movaps  %xmm6,`$framesz+16*0`(%rsp)
 384         movaps  %xmm7,`$framesz+16*1`(%rsp)
 385         movaps  %xmm8,`$framesz+16*2`(%rsp)
 386         movaps  %xmm9,`$framesz+16*3`(%rsp)
 387         movaps  %xmm10,`$framesz+16*4`(%rsp)
 388         movaps  %xmm11,`$framesz+16*5`(%rsp)
 389         movaps  %xmm12,`$framesz+16*6`(%rsp)
 390         movaps  %xmm13,`$framesz+16*7`(%rsp)
 391         movaps  %xmm14,`$framesz+16*8`(%rsp)
 392         movaps  %xmm15,`$framesz+16*9`(%rsp)
 393 ___
 394 $code.=<<___;
 395 .Lprologue_xop:
 396         vzeroall
 397
 398         mov     $inp,%r12               # borrow $a4
 399         lea     0x80($key),$inp         # size optimization, reassign
 400         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 401         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 402         mov     $ctx,%r15               # borrow $a2
 403         mov     $in0,%rsi               # borrow $a3
 404         vmovdqu ($ivp),$iv              # load IV
 405         sub     \$9,%r14
 406
 407         mov     $SZ*0(%r15),$A
 408         mov     $SZ*1(%r15),$B
 409         mov     $SZ*2(%r15),$C
 410         mov     $SZ*3(%r15),$D
 411         mov     $SZ*4(%r15),$E
 412         mov     $SZ*5(%r15),$F
 413         mov     $SZ*6(%r15),$G
 414         mov     $SZ*7(%r15),$H
 415
 416         vmovdqa 0x00(%r13,%r14,8),$mask14
 417         vmovdqa 0x10(%r13,%r14,8),$mask12
 418         vmovdqa 0x20(%r13,%r14,8),$mask10
 419         vmovdqu 0x00-0x80($inp),$roundkey
 420         jmp     .Lloop_xop
 421 ___
 422                                         if ($SZ==4) {   # SHA256
 423     my @X = map("%xmm$_",(0..3));
 424     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 425
 426 $code.=<<___;
 427 .align  16
 428 .Lloop_xop:
 429         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 430         vmovdqu 0x00(%rsi,%r12),@X[0]
 431         vmovdqu 0x10(%rsi,%r12),@X[1]
 432         vmovdqu 0x20(%rsi,%r12),@X[2]
 433         vmovdqu 0x30(%rsi,%r12),@X[3]
 434         vpshufb $t3,@X[0],@X[0]
 435         lea     $TABLE(%rip),$Tbl
 436         vpshufb $t3,@X[1],@X[1]
 437         vpshufb $t3,@X[2],@X[2]
 438         vpaddd  0x00($Tbl),@X[0],$t0
 439         vpshufb $t3,@X[3],@X[3]
 440         vpaddd  0x20($Tbl),@X[1],$t1
 441         vpaddd  0x40($Tbl),@X[2],$t2
 442         vpaddd  0x60($Tbl),@X[3],$t3
 443         vmovdqa $t0,0x00(%rsp)
 444         mov     $A,$a1
 445         vmovdqa $t1,0x10(%rsp)
 446         mov     $B,$a3
 447         vmovdqa $t2,0x20(%rsp)
 448         xor     $C,$a3                  # magic
 449         vmovdqa $t3,0x30(%rsp)
 450         mov     $E,$a0
 451         jmp     .Lxop_00_47
 452
 453 .align  16
 454 .Lxop_00_47:
 455         sub     \$-16*2*$SZ,$Tbl        # size optimization
 456         vmovdqu (%r12),$inout           # $a4
 457         mov     %r12,$_inp              # $a4
 458 ___
 459 sub XOP_256_00_47 () {
 460 my $j = shift;
 461 my $body = shift;
 462 my @X = @_;
 463 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 464
 465         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
 466           eval(shift(@insns));
 467           eval(shift(@insns));
 468          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
 469           eval(shift(@insns));
 470           eval(shift(@insns));
 471         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
 472           eval(shift(@insns));
 473           eval(shift(@insns));
 474         &vpsrld         ($t0,$t0,$sigma0[2]);
 475           eval(shift(@insns));
 476           eval(shift(@insns));
 477          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
 478           eval(shift(@insns));
 479           eval(shift(@insns));
 480           eval(shift(@insns));
 481           eval(shift(@insns));
 482         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
 483           eval(shift(@insns));
 484           eval(shift(@insns));
 485         &vpxor          ($t0,$t0,$t1);
 486           eval(shift(@insns));
 487           eval(shift(@insns));
 488           eval(shift(@insns));
 489           eval(shift(@insns));
 490          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
 491           eval(shift(@insns));
 492           eval(shift(@insns));
 493         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
 494           eval(shift(@insns));
 495           eval(shift(@insns));
 496          &vpsrld        ($t2,@X[3],$sigma1[2]);
 497           eval(shift(@insns));
 498           eval(shift(@insns));
 499         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
 500           eval(shift(@insns));
 501           eval(shift(@insns));
 502          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 503           eval(shift(@insns));
 504           eval(shift(@insns));
 505          &vpxor         ($t3,$t3,$t2);
 506           eval(shift(@insns));
 507           eval(shift(@insns));
 508           eval(shift(@insns));
 509           eval(shift(@insns));
 510          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
 511           eval(shift(@insns));
 512           eval(shift(@insns));
 513           eval(shift(@insns));
 514           eval(shift(@insns));
 515         &vpsrldq        ($t3,$t3,8);
 516           eval(shift(@insns));
 517           eval(shift(@insns));
 518           eval(shift(@insns));
 519           eval(shift(@insns));
 520         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
 521           eval(shift(@insns));
 522           eval(shift(@insns));
 523           eval(shift(@insns));
 524           eval(shift(@insns));
 525          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
 526           eval(shift(@insns));
 527           eval(shift(@insns));
 528          &vpsrld        ($t2,@X[0],$sigma1[2]);
 529           eval(shift(@insns));
 530           eval(shift(@insns));
 531          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 532           eval(shift(@insns));
 533           eval(shift(@insns));
 534          &vpxor         ($t3,$t3,$t2);
 535           eval(shift(@insns));
 536           eval(shift(@insns));
 537           eval(shift(@insns));
 538           eval(shift(@insns));
 539          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
 540           eval(shift(@insns));
 541           eval(shift(@insns));
 542           eval(shift(@insns));
 543           eval(shift(@insns));
 544         &vpslldq        ($t3,$t3,8);            # 22 instructions
 545           eval(shift(@insns));
 546           eval(shift(@insns));
 547           eval(shift(@insns));
 548           eval(shift(@insns));
 549         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
 550           eval(shift(@insns));
 551           eval(shift(@insns));
 552           eval(shift(@insns));
 553           eval(shift(@insns));
 554         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 555           foreach (@insns) { eval; }            # remaining instructions
 556         &vmovdqa        (16*$j."(%rsp)",$t2);
 557 }
 558
 559     $aesni_cbc_idx=0;
 560     for ($i=0,$j=0; $j<4; $j++) {
 561         &XOP_256_00_47($j,\&body_00_15,@X);
 562         push(@X,shift(@X));                     # rotate(@X)
 563     }
 564         &mov            ("%r12",$_inp);         # borrow $a4
 565         &vpand          ($temp,$temp,$mask14);
 566         &mov            ("%r15",$_out);         # borrow $a2
 567         &vpor           ($iv,$iv,$temp);
 568         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 569         &lea            ("%r12","16(%r12)");    # inp++
 570
 571         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 572         &jne    (".Lxop_00_47");
 573
 574         &vmovdqu        ($inout,"(%r12)");
 575         &mov            ($_inp,"%r12");
 576
 577     $aesni_cbc_idx=0;
 578     for ($i=0; $i<16; ) {
 579         foreach(body_00_15()) { eval; }
 580     }
 581                                         }
 582 $code.=<<___;
 583         mov     $_inp,%r12              # borrow $a4
 584         mov     $_out,%r13              # borrow $a0
 585         mov     $_ctx,%r15              # borrow $a2
 586         mov     $_in0,%rsi              # borrow $a3
 587
 588         vpand   $mask14,$temp,$temp
 589         mov     $a1,$A
 590         vpor    $temp,$iv,$iv
 591         vmovdqu $iv,(%r13,%r12)         # write output
 592         lea     16(%r12),%r12           # inp++
 593
 594         add     $SZ*0(%r15),$A
 595         add     $SZ*1(%r15),$B
 596         add     $SZ*2(%r15),$C
 597         add     $SZ*3(%r15),$D
 598         add     $SZ*4(%r15),$E
 599         add     $SZ*5(%r15),$F
 600         add     $SZ*6(%r15),$G
 601         add     $SZ*7(%r15),$H
 602
 603         cmp     $_end,%r12
 604
 605         mov     $A,$SZ*0(%r15)
 606         mov     $B,$SZ*1(%r15)
 607         mov     $C,$SZ*2(%r15)
 608         mov     $D,$SZ*3(%r15)
 609         mov     $E,$SZ*4(%r15)
 610         mov     $F,$SZ*5(%r15)
 611         mov     $G,$SZ*6(%r15)
 612         mov     $H,$SZ*7(%r15)
 613
 614         jb      .Lloop_xop
 615
 616         mov     $_ivp,$ivp
 617         mov     $_rsp,%rsi
 618 .cfi_def_cfa    %rsi,8
 619         vmovdqu $iv,($ivp)              # output IV
 620         vzeroall
 621 ___
 622 $code.=<<___ if ($win64);
 623         movaps  `$framesz+16*0`(%rsp),%xmm6
 624         movaps  `$framesz+16*1`(%rsp),%xmm7
 625         movaps  `$framesz+16*2`(%rsp),%xmm8
 626         movaps  `$framesz+16*3`(%rsp),%xmm9
 627         movaps  `$framesz+16*4`(%rsp),%xmm10
 628         movaps  `$framesz+16*5`(%rsp),%xmm11
 629         movaps  `$framesz+16*6`(%rsp),%xmm12
 630         movaps  `$framesz+16*7`(%rsp),%xmm13
 631         movaps  `$framesz+16*8`(%rsp),%xmm14
 632         movaps  `$framesz+16*9`(%rsp),%xmm15
 633 ___
 634 $code.=<<___;
 635         mov     -48(%rsi),%r15
 636 .cfi_restore    %r15
 637         mov     -40(%rsi),%r14
 638 .cfi_restore    %r14
 639         mov     -32(%rsi),%r13
 640 .cfi_restore    %r13
 641         mov     -24(%rsi),%r12
 642 .cfi_restore    %r12
 643         mov     -16(%rsi),%rbp
 644 .cfi_restore    %rbp
 645         mov     -8(%rsi),%rbx
 646 .cfi_restore    %rbx
 647         lea     (%rsi),%rsp
 648 .cfi_def_cfa_register   %rsp
 649 .Lepilogue_xop:
 650         ret
 651 .cfi_endproc
 652 .size   ${func}_xop,.-${func}_xop
 653 ___
 654 ######################################################################
 655 # AVX+shrd code path
 656 #
 657 local *ror = sub { &shrd(@_[0],@_) };
 658
 659 $code.=<<___;
 660 .type   ${func}_avx,\@function,6
 661 .align  64
 662 ${func}_avx:
 663 .cfi_startproc
 664 .Lavx_shortcut:
 665         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 666         mov     %rsp,%rax               # copy %rsp
 667 .cfi_def_cfa_register   %rax
 668         push    %rbx
 669 .cfi_push       %rbx
 670         push    %rbp
 671 .cfi_push       %rbp
 672         push    %r12
 673 .cfi_push       %r12
 674         push    %r13
 675 .cfi_push       %r13
 676         push    %r14
 677 .cfi_push       %r14
 678         push    %r15
 679 .cfi_push       %r15
 680         sub     \$`$framesz+$win64*16*10`,%rsp
 681         and     \$-64,%rsp              # align stack frame
 682
 683         shl     \$6,$len
 684         sub     $inp,$out               # re-bias
 685         sub     $inp,$in0
 686         add     $inp,$len               # end of input
 687
 688         #mov    $inp,$_inp              # saved later
 689         mov     $out,$_out
 690         mov     $len,$_end
 691         #mov    $key,$_key              # remains resident in $inp register
 692         mov     $ivp,$_ivp
 693         mov     $ctx,$_ctx
 694         mov     $in0,$_in0
 695         mov     %rax,$_rsp
 696 .cfi_cfa_expression     $_rsp,deref,+8
 697 ___
 698 $code.=<<___ if ($win64);
 699         movaps  %xmm6,`$framesz+16*0`(%rsp)
 700         movaps  %xmm7,`$framesz+16*1`(%rsp)
 701         movaps  %xmm8,`$framesz+16*2`(%rsp)
 702         movaps  %xmm9,`$framesz+16*3`(%rsp)
 703         movaps  %xmm10,`$framesz+16*4`(%rsp)
 704         movaps  %xmm11,`$framesz+16*5`(%rsp)
 705         movaps  %xmm12,`$framesz+16*6`(%rsp)
 706         movaps  %xmm13,`$framesz+16*7`(%rsp)
 707         movaps  %xmm14,`$framesz+16*8`(%rsp)
 708         movaps  %xmm15,`$framesz+16*9`(%rsp)
 709 ___
 710 $code.=<<___;
 711 .Lprologue_avx:
 712         vzeroall
 713
 714         mov     $inp,%r12               # borrow $a4
 715         lea     0x80($key),$inp         # size optimization, reassign
 716         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 717         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 718         mov     $ctx,%r15               # borrow $a2
 719         mov     $in0,%rsi               # borrow $a3
 720         vmovdqu ($ivp),$iv              # load IV
 721         sub     \$9,%r14
 722
 723         mov     $SZ*0(%r15),$A
 724         mov     $SZ*1(%r15),$B
 725         mov     $SZ*2(%r15),$C
 726         mov     $SZ*3(%r15),$D
 727         mov     $SZ*4(%r15),$E
 728         mov     $SZ*5(%r15),$F
 729         mov     $SZ*6(%r15),$G
 730         mov     $SZ*7(%r15),$H
 731
 732         vmovdqa 0x00(%r13,%r14,8),$mask14
 733         vmovdqa 0x10(%r13,%r14,8),$mask12
 734         vmovdqa 0x20(%r13,%r14,8),$mask10
 735         vmovdqu 0x00-0x80($inp),$roundkey
 736 ___
 737                                         if ($SZ==4) {   # SHA256
 738     my @X = map("%xmm$_",(0..3));
 739     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 740
 741 $code.=<<___;
 742         jmp     .Lloop_avx
 743 .align  16
 744 .Lloop_avx:
 745         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 746         vmovdqu 0x00(%rsi,%r12),@X[0]
 747         vmovdqu 0x10(%rsi,%r12),@X[1]
 748         vmovdqu 0x20(%rsi,%r12),@X[2]
 749         vmovdqu 0x30(%rsi,%r12),@X[3]
 750         vpshufb $t3,@X[0],@X[0]
 751         lea     $TABLE(%rip),$Tbl
 752         vpshufb $t3,@X[1],@X[1]
 753         vpshufb $t3,@X[2],@X[2]
 754         vpaddd  0x00($Tbl),@X[0],$t0
 755         vpshufb $t3,@X[3],@X[3]
 756         vpaddd  0x20($Tbl),@X[1],$t1
 757         vpaddd  0x40($Tbl),@X[2],$t2
 758         vpaddd  0x60($Tbl),@X[3],$t3
 759         vmovdqa $t0,0x00(%rsp)
 760         mov     $A,$a1
 761         vmovdqa $t1,0x10(%rsp)
 762         mov     $B,$a3
 763         vmovdqa $t2,0x20(%rsp)
 764         xor     $C,$a3                  # magic
 765         vmovdqa $t3,0x30(%rsp)
 766         mov     $E,$a0
 767         jmp     .Lavx_00_47
 768
 769 .align  16
 770 .Lavx_00_47:
 771         sub     \$-16*2*$SZ,$Tbl        # size optimization
 772         vmovdqu (%r12),$inout           # $a4
 773         mov     %r12,$_inp              # $a4
 774 ___
 775 sub Xupdate_256_AVX () {
 776         (
 777         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
 778          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
 779         '&vpsrld        ($t2,$t0,$sigma0[0]);',
 780          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
 781         '&vpsrld        ($t3,$t0,$sigma0[2])',
 782         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
 783         '&vpxor         ($t0,$t3,$t2)',
 784          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
 785         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
 786         '&vpxor         ($t0,$t0,$t1)',
 787         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
 788         '&vpxor         ($t0,$t0,$t2)',
 789          '&vpsrld       ($t2,$t3,$sigma1[2]);',
 790         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
 791          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
 792         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
 793          '&vpxor        ($t2,$t2,$t3);',
 794          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 795          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
 796          '&vpshufd      ($t2,$t2,0b10000100)',
 797          '&vpsrldq      ($t2,$t2,8)',
 798         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
 799          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
 800          '&vpsrld       ($t2,$t3,$sigma1[2])',
 801          '&vpsrlq       ($t3,$t3,$sigma1[0])',
 802          '&vpxor        ($t2,$t2,$t3);',
 803          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 804          '&vpxor        ($t2,$t2,$t3)',
 805          '&vpshufd      ($t2,$t2,0b11101000)',
 806          '&vpslldq      ($t2,$t2,8)',
 807         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
 808         );
 809 }
 810
 811 sub AVX_256_00_47 () {
 812 my $j = shift;
 813 my $body = shift;
 814 my @X = @_;
 815 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 816
 817         foreach (Xupdate_256_AVX()) {           # 29 instructions
 818             eval;
 819             eval(shift(@insns));
 820             eval(shift(@insns));
 821             eval(shift(@insns));
 822         }
 823         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 824           foreach (@insns) { eval; }            # remaining instructions
 825         &vmovdqa        (16*$j."(%rsp)",$t2);
 826 }
 827
 828     $aesni_cbc_idx=0;
 829     for ($i=0,$j=0; $j<4; $j++) {
 830         &AVX_256_00_47($j,\&body_00_15,@X);
 831         push(@X,shift(@X));                     # rotate(@X)
 832     }
 833         &mov            ("%r12",$_inp);         # borrow $a4
 834         &vpand          ($temp,$temp,$mask14);
 835         &mov            ("%r15",$_out);         # borrow $a2
 836         &vpor           ($iv,$iv,$temp);
 837         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 838         &lea            ("%r12","16(%r12)");    # inp++
 839
 840         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 841         &jne    (".Lavx_00_47");
 842
 843         &vmovdqu        ($inout,"(%r12)");
 844         &mov            ($_inp,"%r12");
 845
 846     $aesni_cbc_idx=0;
 847     for ($i=0; $i<16; ) {
 848         foreach(body_00_15()) { eval; }
 849     }
 850
 851                                         }
 852 $code.=<<___;
 853         mov     $_inp,%r12              # borrow $a4
 854         mov     $_out,%r13              # borrow $a0
 855         mov     $_ctx,%r15              # borrow $a2
 856         mov     $_in0,%rsi              # borrow $a3
 857
 858         vpand   $mask14,$temp,$temp
 859         mov     $a1,$A
 860         vpor    $temp,$iv,$iv
 861         vmovdqu $iv,(%r13,%r12)         # write output
 862         lea     16(%r12),%r12           # inp++
 863
 864         add     $SZ*0(%r15),$A
 865         add     $SZ*1(%r15),$B
 866         add     $SZ*2(%r15),$C
 867         add     $SZ*3(%r15),$D
 868         add     $SZ*4(%r15),$E
 869         add     $SZ*5(%r15),$F
 870         add     $SZ*6(%r15),$G
 871         add     $SZ*7(%r15),$H
 872
 873         cmp     $_end,%r12
 874
 875         mov     $A,$SZ*0(%r15)
 876         mov     $B,$SZ*1(%r15)
 877         mov     $C,$SZ*2(%r15)
 878         mov     $D,$SZ*3(%r15)
 879         mov     $E,$SZ*4(%r15)
 880         mov     $F,$SZ*5(%r15)
 881         mov     $G,$SZ*6(%r15)
 882         mov     $H,$SZ*7(%r15)
 883         jb      .Lloop_avx
 884
 885         mov     $_ivp,$ivp
 886         mov     $_rsp,%rsi
 887 .cfi_def_cfa    %rsi,8
 888         vmovdqu $iv,($ivp)              # output IV
 889         vzeroall
 890 ___
 891 $code.=<<___ if ($win64);
 892         movaps  `$framesz+16*0`(%rsp),%xmm6
 893         movaps  `$framesz+16*1`(%rsp),%xmm7
 894         movaps  `$framesz+16*2`(%rsp),%xmm8
 895         movaps  `$framesz+16*3`(%rsp),%xmm9
 896         movaps  `$framesz+16*4`(%rsp),%xmm10
 897         movaps  `$framesz+16*5`(%rsp),%xmm11
 898         movaps  `$framesz+16*6`(%rsp),%xmm12
 899         movaps  `$framesz+16*7`(%rsp),%xmm13
 900         movaps  `$framesz+16*8`(%rsp),%xmm14
 901         movaps  `$framesz+16*9`(%rsp),%xmm15
 902 ___
 903 $code.=<<___;
 904         mov     -48(%rsi),%r15
 905 .cfi_restore    %r15
 906         mov     -40(%rsi),%r14
 907 .cfi_restore    %r14
 908         mov     -32(%rsi),%r13
 909 .cfi_restore    %r13
 910         mov     -24(%rsi),%r12
 911 .cfi_restore    %r12
 912         mov     -16(%rsi),%rbp
 913 .cfi_restore    %rbp
 914         mov     -8(%rsi),%rbx
 915 .cfi_restore    %rbx
 916         lea     (%rsi),%rsp
 917 .cfi_def_cfa_register   %rsp
 918 .Lepilogue_avx:
 919         ret
 920 .cfi_endproc
 921 .size   ${func}_avx,.-${func}_avx
 922 ___
 923
 924 if ($avx>1) {{
 925 ######################################################################
 926 # AVX2+BMI code path
 927 #
 928 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
 929 my $PUSH8=8*2*$SZ;
 930 use integer;
 931
 932 sub bodyx_00_15 () {
 933         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
 934         (
 935         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 936
 937         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
 938         '&and   ($a4,$e)',              # f&e
 939         '&rorx  ($a0,$e,$Sigma1[2])',
 940         '&rorx  ($a2,$e,$Sigma1[1])',
 941
 942         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
 943         '&lea   ($h,"($h,$a4)")',
 944         '&andn  ($a4,$e,$g)',           # ~e&g
 945         '&xor   ($a0,$a2)',
 946
 947         '&rorx  ($a1,$e,$Sigma1[0])',
 948         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
 949         '&xor   ($a0,$a1)',             # Sigma1(e)
 950         '&mov   ($a2,$a)',
 951
 952         '&rorx  ($a4,$a,$Sigma0[2])',
 953         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
 954         '&xor   ($a2,$b)',              # a^b, b^c in next round
 955         '&rorx  ($a1,$a,$Sigma0[1])',
 956
 957         '&rorx  ($a0,$a,$Sigma0[0])',
 958         '&lea   ($d,"($d,$h)")',        # d+=h
 959         '&and   ($a3,$a2)',             # (b^c)&(a^b)
 960         @aesni_cbc_block[$aesni_cbc_idx++].
 961         '&xor   ($a1,$a4)',
 962
 963         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
 964         '&xor   ($a1,$a0)',             # Sigma0(a)
 965         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
 966         '&mov   ($a4,$e)',              # copy of f in future
 967
 968         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 969         );
 970         # and at the finish one has to $a+=$a1
 971 }
 972
 973 $code.=<<___;
 974 .type   ${func}_avx2,\@function,6
 975 .align  64
 976 ${func}_avx2:
 977 .cfi_startproc
 978 .Lavx2_shortcut:
 979         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 980         mov     %rsp,%rax               # copy %rsp
 981 .cfi_def_cfa_register   %rax
 982         push    %rbx
 983 .cfi_push       %rbx
 984         push    %rbp
 985 .cfi_push       %rbp
 986         push    %r12
 987 .cfi_push       %r12
 988         push    %r13
 989 .cfi_push       %r13
 990         push    %r14
 991 .cfi_push       %r14
 992         push    %r15
 993 .cfi_push       %r15
 994         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
 995         and     \$-256*$SZ,%rsp         # align stack frame
 996         add     \$`2*$SZ*($rounds-8)`,%rsp
 997
 998         shl     \$6,$len
 999         sub     $inp,$out               # re-bias
1000         sub     $inp,$in0
1001         add     $inp,$len               # end of input
1002
1003         #mov    $inp,$_inp              # saved later
1004         #mov    $out,$_out              # kept in $offload
1005         mov     $len,$_end
1006         #mov    $key,$_key              # remains resident in $inp register
1007         mov     $ivp,$_ivp
1008         mov     $ctx,$_ctx
1009         mov     $in0,$_in0
1010         mov     %rax,$_rsp
1011 .cfi_cfa_expression     $_rsp,deref,+8
1012 ___
1013 $code.=<<___ if ($win64);
1014         movaps  %xmm6,`$framesz+16*0`(%rsp)
1015         movaps  %xmm7,`$framesz+16*1`(%rsp)
1016         movaps  %xmm8,`$framesz+16*2`(%rsp)
1017         movaps  %xmm9,`$framesz+16*3`(%rsp)
1018         movaps  %xmm10,`$framesz+16*4`(%rsp)
1019         movaps  %xmm11,`$framesz+16*5`(%rsp)
1020         movaps  %xmm12,`$framesz+16*6`(%rsp)
1021         movaps  %xmm13,`$framesz+16*7`(%rsp)
1022         movaps  %xmm14,`$framesz+16*8`(%rsp)
1023         movaps  %xmm15,`$framesz+16*9`(%rsp)
1024 ___
1025 $code.=<<___;
1026 .Lprologue_avx2:
1027         vzeroall
1028
1029         mov     $inp,%r13               # borrow $a0
1030         vpinsrq \$1,$out,$offload,$offload
1031         lea     0x80($key),$inp         # size optimization, reassign
1032         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
1033         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
1034         mov     $ctx,%r15               # borrow $a2
1035         mov     $in0,%rsi               # borrow $a3
1036         vmovdqu ($ivp),$iv              # load IV
1037         lea     -9(%r14),%r14
1038
1039         vmovdqa 0x00(%r12,%r14,8),$mask14
1040         vmovdqa 0x10(%r12,%r14,8),$mask12
1041         vmovdqa 0x20(%r12,%r14,8),$mask10
1042
1043         sub     \$-16*$SZ,%r13          # inp++, size optimization
1044         mov     $SZ*0(%r15),$A
1045         lea     (%rsi,%r13),%r12        # borrow $a0
1046         mov     $SZ*1(%r15),$B
1047         cmp     $len,%r13               # $_end
1048         mov     $SZ*2(%r15),$C
1049         cmove   %rsp,%r12               # next block or random data
1050         mov     $SZ*3(%r15),$D
1051         mov     $SZ*4(%r15),$E
1052         mov     $SZ*5(%r15),$F
1053         mov     $SZ*6(%r15),$G
1054         mov     $SZ*7(%r15),$H
1055         vmovdqu 0x00-0x80($inp),$roundkey
1056 ___
1057                                         if ($SZ==4) {   # SHA256
1058     my @X = map("%ymm$_",(0..3));
1059     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1060
1061 $code.=<<___;
1062         jmp     .Loop_avx2
1063 .align  16
1064 .Loop_avx2:
1065         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1066         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1067         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1068         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1069         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1070
1071         vinserti128     \$1,(%r12),@X[0],@X[0]
1072         vinserti128     \$1,16(%r12),@X[1],@X[1]
1073          vpshufb        $t3,@X[0],@X[0]
1074         vinserti128     \$1,32(%r12),@X[2],@X[2]
1075          vpshufb        $t3,@X[1],@X[1]
1076         vinserti128     \$1,48(%r12),@X[3],@X[3]
1077
1078         lea     $TABLE(%rip),$Tbl
1079         vpshufb $t3,@X[2],@X[2]
1080         lea     -16*$SZ(%r13),%r13
1081         vpaddd  0x00($Tbl),@X[0],$t0
1082         vpshufb $t3,@X[3],@X[3]
1083         vpaddd  0x20($Tbl),@X[1],$t1
1084         vpaddd  0x40($Tbl),@X[2],$t2
1085         vpaddd  0x60($Tbl),@X[3],$t3
1086         vmovdqa $t0,0x00(%rsp)
1087         xor     $a1,$a1
1088         vmovdqa $t1,0x20(%rsp)
1089 ___
1090 $code.=<<___ if (!$win64);
1091 # temporarily use %rsi as frame pointer
1092         mov     $_rsp,%rsi
1093 .cfi_def_cfa    %rsi,8
1094 ___
1095 $code.=<<___;
1096         lea     -$PUSH8(%rsp),%rsp
1097 ___
1098 $code.=<<___ if (!$win64);
1099 # the frame info is at $_rsp, but the stack is moving...
1100 # so a second frame pointer is saved at -8(%rsp)
1101 # that is in the red zone
1102         mov     %rsi,-8(%rsp)
1103 .cfi_cfa_expression     %rsp-8,deref,+8
1104 ___
1105 $code.=<<___;
1106         mov     $B,$a3
1107         vmovdqa $t2,0x00(%rsp)
1108         xor     $C,$a3                  # magic
1109         vmovdqa $t3,0x20(%rsp)
1110         mov     $F,$a4
1111         sub     \$-16*2*$SZ,$Tbl        # size optimization
1112         jmp     .Lavx2_00_47
1113
1114 .align  16
1115 .Lavx2_00_47:
1116         vmovdqu (%r13),$inout
1117         vpinsrq \$0,%r13,$offload,$offload
1118 ___
1119
1120 sub AVX2_256_00_47 () {
1121 my $j = shift;
1122 my $body = shift;
1123 my @X = @_;
1124 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1125 my $base = "+2*$PUSH8(%rsp)";
1126
1127         if (($j%2)==0) {
1128         &lea    ("%rsp","-$PUSH8(%rsp)");
1129 $code.=<<___ if (!$win64);
1130 .cfi_cfa_expression     %rsp+`$PUSH8-8`,deref,+8
1131 # copy secondary frame pointer to new location again at -8(%rsp)
1132         pushq   $PUSH8-8(%rsp)
1133 .cfi_cfa_expression     %rsp,deref,+8
1134         lea     8(%rsp),%rsp
1135 .cfi_cfa_expression     %rsp-8,deref,+8
1136 ___
1137         }
1138         foreach (Xupdate_256_AVX()) {           # 29 instructions
1139             eval;
1140             eval(shift(@insns));
1141             eval(shift(@insns));
1142             eval(shift(@insns));
1143         }
1144         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1145           foreach (@insns) { eval; }            # remaining instructions
1146         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1147 }
1148     $aesni_cbc_idx=0;
1149     for ($i=0,$j=0; $j<4; $j++) {
1150         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1151         push(@X,shift(@X));                     # rotate(@X)
1152     }
1153         &vmovq          ("%r13",$offload);      # borrow $a0
1154         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1155         &vpand          ($temp,$temp,$mask14);
1156         &vpor           ($iv,$iv,$temp);
1157         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1158         &lea            ("%r13","16(%r13)");    # inp++
1159
1160         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1161         &cmpb   (($SZ-1)."($Tbl)",0);
1162         &jne    (".Lavx2_00_47");
1163
1164         &vmovdqu        ($inout,"(%r13)");
1165         &vpinsrq        ($offload,$offload,"%r13",0);
1166
1167     $aesni_cbc_idx=0;
1168     for ($i=0; $i<16; ) {
1169         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1170         foreach(bodyx_00_15()) { eval; }
1171     }
1172                                         }
1173 $code.=<<___;
1174         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1175         vmovq   $offload,%r13                   # $_inp, borrow $a0
1176         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1177         add     $a1,$A
1178         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1179
1180         vpand   $mask14,$temp,$temp
1181         vpor    $temp,$iv,$iv
1182         vmovdqu $iv,(%r12,%r13)                 # write output
1183         lea     16(%r13),%r13
1184
1185         add     $SZ*0(%r15),$A
1186         add     $SZ*1(%r15),$B
1187         add     $SZ*2(%r15),$C
1188         add     $SZ*3(%r15),$D
1189         add     $SZ*4(%r15),$E
1190         add     $SZ*5(%r15),$F
1191         add     $SZ*6(%r15),$G
1192         add     $SZ*7(%r15),$H
1193
1194         mov     $A,$SZ*0(%r15)
1195         mov     $B,$SZ*1(%r15)
1196         mov     $C,$SZ*2(%r15)
1197         mov     $D,$SZ*3(%r15)
1198         mov     $E,$SZ*4(%r15)
1199         mov     $F,$SZ*5(%r15)
1200         mov     $G,$SZ*6(%r15)
1201         mov     $H,$SZ*7(%r15)
1202
1203         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1204         je      .Ldone_avx2
1205
1206         xor     $a1,$a1
1207         mov     $B,$a3
1208         mov     $F,$a4
1209         xor     $C,$a3                  # magic
1210         jmp     .Lower_avx2
1211 .align  16
1212 .Lower_avx2:
1213         vmovdqu (%r13),$inout
1214         vpinsrq \$0,%r13,$offload,$offload
1215 ___
1216     $aesni_cbc_idx=0;
1217     for ($i=0; $i<16; ) {
1218         my $base="+16($Tbl)";
1219         foreach(bodyx_00_15()) { eval; }
1220         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1221     }
1222 $code.=<<___;
1223         vmovq   $offload,%r13                   # borrow $a0
1224         vpextrq \$1,$offload,%r15               # borrow $a2
1225         vpand   $mask14,$temp,$temp
1226         vpor    $temp,$iv,$iv
1227         lea     -$PUSH8($Tbl),$Tbl
1228         vmovdqu $iv,(%r15,%r13)                 # write output
1229         lea     16(%r13),%r13                   # inp++
1230         cmp     %rsp,$Tbl
1231         jae     .Lower_avx2
1232
1233         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1234         lea     16*$SZ(%r13),%r13
1235         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1236         add     $a1,$A
1237         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1238
1239         add     $SZ*0(%r15),$A
1240         add     $SZ*1(%r15),$B
1241         add     $SZ*2(%r15),$C
1242         add     $SZ*3(%r15),$D
1243         add     $SZ*4(%r15),$E
1244         add     $SZ*5(%r15),$F
1245         add     $SZ*6(%r15),$G
1246         lea     (%rsi,%r13),%r12
1247         add     $SZ*7(%r15),$H
1248
1249         cmp     $_end,%r13
1250
1251         mov     $A,$SZ*0(%r15)
1252         cmove   %rsp,%r12               # next block or stale data
1253         mov     $B,$SZ*1(%r15)
1254         mov     $C,$SZ*2(%r15)
1255         mov     $D,$SZ*3(%r15)
1256         mov     $E,$SZ*4(%r15)
1257         mov     $F,$SZ*5(%r15)
1258         mov     $G,$SZ*6(%r15)
1259         mov     $H,$SZ*7(%r15)
1260
1261         jbe     .Loop_avx2
1262         lea     (%rsp),$Tbl
1263 # temporarily use $Tbl as index to $_rsp
1264 # this avoids the need to save a secondary frame pointer at -8(%rsp)
1265 .cfi_cfa_expression     $Tbl+`16*$SZ+7*8`,deref,+8
1266
1267 .Ldone_avx2:
1268         mov     16*$SZ+4*8($Tbl),$ivp
1269         mov     16*$SZ+7*8($Tbl),%rsi
1270 .cfi_def_cfa    %rsi,8
1271         vmovdqu $iv,($ivp)              # output IV
1272         vzeroall
1273 ___
1274 $code.=<<___ if ($win64);
1275         movaps  `$framesz+16*0`($Tbl),%xmm6
1276         movaps  `$framesz+16*1`($Tbl),%xmm7
1277         movaps  `$framesz+16*2`($Tbl),%xmm8
1278         movaps  `$framesz+16*3`($Tbl),%xmm9
1279         movaps  `$framesz+16*4`($Tbl),%xmm10
1280         movaps  `$framesz+16*5`($Tbl),%xmm11
1281         movaps  `$framesz+16*6`($Tbl),%xmm12
1282         movaps  `$framesz+16*7`($Tbl),%xmm13
1283         movaps  `$framesz+16*8`($Tbl),%xmm14
1284         movaps  `$framesz+16*9`($Tbl),%xmm15
1285 ___
1286 $code.=<<___;
1287         mov     -48(%rsi),%r15
1288 .cfi_restore    %r15
1289         mov     -40(%rsi),%r14
1290 .cfi_restore    %r14
1291         mov     -32(%rsi),%r13
1292 .cfi_restore    %r13
1293         mov     -24(%rsi),%r12
1294 .cfi_restore    %r12
1295         mov     -16(%rsi),%rbp
1296 .cfi_restore    %rbp
1297         mov     -8(%rsi),%rbx
1298 .cfi_restore    %rbx
1299         lea     (%rsi),%rsp
1300 .cfi_def_cfa_register   %rsp
1301 .Lepilogue_avx2:
1302         ret
1303 .cfi_endproc
1304 .size   ${func}_avx2,.-${func}_avx2
1305 ___
1306 }}
1307 }}
1308 {{
1309 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1310
1311 my ($rounds,$Tbl)=("%r11d","%rbx");
1312
1313 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1314 my @rndkey=("%xmm4","%xmm5");
1315 my $r=0;
1316 my $sn=0;
1317
1318 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1319 my @MSG=map("%xmm$_",(10..13));
1320
1321 my $aesenc=sub {
1322   use integer;
1323   my ($n,$k)=($r/10,$r%10);
1324     if ($k==0) {
1325       $code.=<<___;
1326         movups          `16*$n`($in0),$in               # load input
1327         xorps           $rndkey0,$in
1328 ___
1329       $code.=<<___ if ($n);
1330         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1331 ___
1332       $code.=<<___;
1333         xorps           $in,$iv
1334         movups          `32+16*$k-112`($key),$rndkey[1]
1335         aesenc          $rndkey[0],$iv
1336 ___
1337     } elsif ($k==9) {
1338       $sn++;
1339       $code.=<<___;
1340         cmp             \$11,$rounds
1341         jb              .Laesenclast$sn
1342         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1343         aesenc          $rndkey[0],$iv
1344         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1345         aesenc          $rndkey[1],$iv
1346         je              .Laesenclast$sn
1347         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1348         aesenc          $rndkey[0],$iv
1349         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1350         aesenc          $rndkey[1],$iv
1351 .Laesenclast$sn:
1352         aesenclast      $rndkey[0],$iv
1353         movups          16-112($key),$rndkey[1]         # forward reference
1354         nop
1355 ___
1356     } else {
1357       $code.=<<___;
1358         movups          `32+16*$k-112`($key),$rndkey[1]
1359         aesenc          $rndkey[0],$iv
1360 ___
1361     }
1362     $r++;       unshift(@rndkey,pop(@rndkey));
1363 };
1364
1365 if ($shaext) {
1366 my $Tbl="%rax";
1367
1368 $code.=<<___;
1369 .type   ${func}_shaext,\@function,6
1370 .align  32
1371 ${func}_shaext:
1372 .cfi_startproc
1373         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1374 ___
1375 $code.=<<___ if ($win64);
1376         lea     `-8-10*16`(%rsp),%rsp
1377         movaps  %xmm6,-8-10*16(%rax)
1378         movaps  %xmm7,-8-9*16(%rax)
1379         movaps  %xmm8,-8-8*16(%rax)
1380         movaps  %xmm9,-8-7*16(%rax)
1381         movaps  %xmm10,-8-6*16(%rax)
1382         movaps  %xmm11,-8-5*16(%rax)
1383         movaps  %xmm12,-8-4*16(%rax)
1384         movaps  %xmm13,-8-3*16(%rax)
1385         movaps  %xmm14,-8-2*16(%rax)
1386         movaps  %xmm15,-8-1*16(%rax)
1387 .Lprologue_shaext:
1388 ___
1389 $code.=<<___;
1390         lea             K256+0x80(%rip),$Tbl
1391         movdqu          ($ctx),$ABEF            # DCBA
1392         movdqu          16($ctx),$CDGH          # HGFE
1393         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1394
1395         mov             240($key),$rounds
1396         sub             $in0,$out
1397         movups          ($key),$rndkey0         # $key[0]
1398         movups          ($ivp),$iv              # load IV
1399         movups          16($key),$rndkey[0]     # forward reference
1400         lea             112($key),$key          # size optimization
1401
1402         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1403         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1404         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1405         movdqa          $TMP,$BSWAP             # offload
1406         palignr         \$8,$CDGH,$ABEF         # ABEF
1407         punpcklqdq      $Wi,$CDGH               # CDGH
1408
1409         jmp     .Loop_shaext
1410
1411 .align  16
1412 .Loop_shaext:
1413         movdqu          ($inp),@MSG[0]
1414         movdqu          0x10($inp),@MSG[1]
1415         movdqu          0x20($inp),@MSG[2]
1416         pshufb          $TMP,@MSG[0]
1417         movdqu          0x30($inp),@MSG[3]
1418
1419         movdqa          0*32-0x80($Tbl),$Wi
1420         paddd           @MSG[0],$Wi
1421         pshufb          $TMP,@MSG[1]
1422         movdqa          $CDGH,$CDGH_SAVE        # offload
1423         movdqa          $ABEF,$ABEF_SAVE        # offload
1424 ___
1425         &$aesenc();
1426 $code.=<<___;
1427         sha256rnds2     $ABEF,$CDGH             # 0-3
1428         pshufd          \$0x0e,$Wi,$Wi
1429 ___
1430         &$aesenc();
1431 $code.=<<___;
1432         sha256rnds2     $CDGH,$ABEF
1433
1434         movdqa          1*32-0x80($Tbl),$Wi
1435         paddd           @MSG[1],$Wi
1436         pshufb          $TMP,@MSG[2]
1437         lea             0x40($inp),$inp
1438 ___
1439         &$aesenc();
1440 $code.=<<___;
1441         sha256rnds2     $ABEF,$CDGH             # 4-7
1442         pshufd          \$0x0e,$Wi,$Wi
1443 ___
1444         &$aesenc();
1445 $code.=<<___;
1446         sha256rnds2     $CDGH,$ABEF
1447
1448         movdqa          2*32-0x80($Tbl),$Wi
1449         paddd           @MSG[2],$Wi
1450         pshufb          $TMP,@MSG[3]
1451         sha256msg1      @MSG[1],@MSG[0]
1452 ___
1453         &$aesenc();
1454 $code.=<<___;
1455         sha256rnds2     $ABEF,$CDGH             # 8-11
1456         pshufd          \$0x0e,$Wi,$Wi
1457         movdqa          @MSG[3],$TMP
1458         palignr         \$4,@MSG[2],$TMP
1459         paddd           $TMP,@MSG[0]
1460 ___
1461         &$aesenc();
1462 $code.=<<___;
1463         sha256rnds2     $CDGH,$ABEF
1464
1465         movdqa          3*32-0x80($Tbl),$Wi
1466         paddd           @MSG[3],$Wi
1467         sha256msg2      @MSG[3],@MSG[0]
1468         sha256msg1      @MSG[2],@MSG[1]
1469 ___
1470         &$aesenc();
1471 $code.=<<___;
1472         sha256rnds2     $ABEF,$CDGH             # 12-15
1473         pshufd          \$0x0e,$Wi,$Wi
1474 ___
1475         &$aesenc();
1476 $code.=<<___;
1477         movdqa          @MSG[0],$TMP
1478         palignr         \$4,@MSG[3],$TMP
1479         paddd           $TMP,@MSG[1]
1480         sha256rnds2     $CDGH,$ABEF
1481 ___
1482 for($i=4;$i<16-3;$i++) {
1483         &$aesenc()      if (($r%10)==0);
1484 $code.=<<___;
1485         movdqa          $i*32-0x80($Tbl),$Wi
1486         paddd           @MSG[0],$Wi
1487         sha256msg2      @MSG[0],@MSG[1]
1488         sha256msg1      @MSG[3],@MSG[2]
1489 ___
1490         &$aesenc();
1491 $code.=<<___;
1492         sha256rnds2     $ABEF,$CDGH             # 16-19...
1493         pshufd          \$0x0e,$Wi,$Wi
1494         movdqa          @MSG[1],$TMP
1495         palignr         \$4,@MSG[0],$TMP
1496         paddd           $TMP,@MSG[2]
1497 ___
1498         &$aesenc();
1499         &$aesenc()      if ($r==19);
1500 $code.=<<___;
1501         sha256rnds2     $CDGH,$ABEF
1502 ___
1503         push(@MSG,shift(@MSG));
1504 }
1505 $code.=<<___;
1506         movdqa          13*32-0x80($Tbl),$Wi
1507         paddd           @MSG[0],$Wi
1508         sha256msg2      @MSG[0],@MSG[1]
1509         sha256msg1      @MSG[3],@MSG[2]
1510 ___
1511         &$aesenc();
1512 $code.=<<___;
1513         sha256rnds2     $ABEF,$CDGH             # 52-55
1514         pshufd          \$0x0e,$Wi,$Wi
1515         movdqa          @MSG[1],$TMP
1516         palignr         \$4,@MSG[0],$TMP
1517         paddd           $TMP,@MSG[2]
1518 ___
1519         &$aesenc();
1520         &$aesenc();
1521 $code.=<<___;
1522         sha256rnds2     $CDGH,$ABEF
1523
1524         movdqa          14*32-0x80($Tbl),$Wi
1525         paddd           @MSG[1],$Wi
1526         sha256msg2      @MSG[1],@MSG[2]
1527         movdqa          $BSWAP,$TMP
1528 ___
1529         &$aesenc();
1530 $code.=<<___;
1531         sha256rnds2     $ABEF,$CDGH             # 56-59
1532         pshufd          \$0x0e,$Wi,$Wi
1533 ___
1534         &$aesenc();
1535 $code.=<<___;
1536         sha256rnds2     $CDGH,$ABEF
1537
1538         movdqa          15*32-0x80($Tbl),$Wi
1539         paddd           @MSG[2],$Wi
1540 ___
1541         &$aesenc();
1542         &$aesenc();
1543 $code.=<<___;
1544         sha256rnds2     $ABEF,$CDGH             # 60-63
1545         pshufd          \$0x0e,$Wi,$Wi
1546 ___
1547         &$aesenc();
1548 $code.=<<___;
1549         sha256rnds2     $CDGH,$ABEF
1550         #pxor           $CDGH,$rndkey0          # black magic
1551 ___
1552         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1553 $code.=<<___;
1554         #xorps          $CDGH,$rndkey0          # black magic
1555         paddd           $CDGH_SAVE,$CDGH
1556         paddd           $ABEF_SAVE,$ABEF
1557
1558         dec             $len
1559         movups          $iv,48($out,$in0)       # write output
1560         lea             64($in0),$in0
1561         jnz             .Loop_shaext
1562
1563         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1564         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1565         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1566         punpckhqdq      $CDGH,$ABEF             # DCBA
1567         palignr         \$8,$TMP,$CDGH          # HGFE
1568
1569         movups          $iv,($ivp)              # write IV
1570         movdqu          $ABEF,($ctx)
1571         movdqu          $CDGH,16($ctx)
1572 ___
1573 $code.=<<___ if ($win64);
1574         movaps  0*16(%rsp),%xmm6
1575         movaps  1*16(%rsp),%xmm7
1576         movaps  2*16(%rsp),%xmm8
1577         movaps  3*16(%rsp),%xmm9
1578         movaps  4*16(%rsp),%xmm10
1579         movaps  5*16(%rsp),%xmm11
1580         movaps  6*16(%rsp),%xmm12
1581         movaps  7*16(%rsp),%xmm13
1582         movaps  8*16(%rsp),%xmm14
1583         movaps  9*16(%rsp),%xmm15
1584         lea     8+10*16(%rsp),%rsp
1585 .Lepilogue_shaext:
1586 ___
1587 $code.=<<___;
1588         ret
1589 .cfi_endproc
1590 .size   ${func}_shaext,.-${func}_shaext
1591 ___
1592 }
1593 }}}}}
1594
1595 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1596 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1597 if ($win64 && $avx) {
1598 $rec="%rcx";
1599 $frame="%rdx";
1600 $context="%r8";
1601 $disp="%r9";
1602
1603 $code.=<<___;
1604 .extern __imp_RtlVirtualUnwind
1605 .type   se_handler,\@abi-omnipotent
1606 .align  16
1607 se_handler:
1608         push    %rsi
1609         push    %rdi
1610         push    %rbx
1611         push    %rbp
1612         push    %r12
1613         push    %r13
1614         push    %r14
1615         push    %r15
1616         pushfq
1617         sub     \$64,%rsp
1618
1619         mov     120($context),%rax      # pull context->Rax
1620         mov     248($context),%rbx      # pull context->Rip
1621
1622         mov     8($disp),%rsi           # disp->ImageBase
1623         mov     56($disp),%r11          # disp->HanderlData
1624
1625         mov     0(%r11),%r10d           # HandlerData[0]
1626         lea     (%rsi,%r10),%r10        # prologue label
1627         cmp     %r10,%rbx               # context->Rip<prologue label
1628         jb      .Lin_prologue
1629
1630         mov     152($context),%rax      # pull context->Rsp
1631
1632         mov     4(%r11),%r10d           # HandlerData[1]
1633         lea     (%rsi,%r10),%r10        # epilogue label
1634         cmp     %r10,%rbx               # context->Rip>=epilogue label
1635         jae     .Lin_prologue
1636 ___
1637 $code.=<<___ if ($shaext);
1638         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1639         cmp     %r10,%rbx
1640         jb      .Lnot_in_shaext
1641
1642         lea     (%rax),%rsi
1643         lea     512($context),%rdi      # &context.Xmm6
1644         mov     \$20,%ecx
1645         .long   0xa548f3fc              # cld; rep movsq
1646         lea     168(%rax),%rax          # adjust stack pointer
1647         jmp     .Lin_prologue
1648 .Lnot_in_shaext:
1649 ___
1650 $code.=<<___ if ($avx>1);
1651         lea     .Lavx2_shortcut(%rip),%r10
1652         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1653         jb      .Lnot_in_avx2
1654
1655         and     \$-256*$SZ,%rax
1656         add     \$`2*$SZ*($rounds-8)`,%rax
1657 .Lnot_in_avx2:
1658 ___
1659 $code.=<<___;
1660         mov     %rax,%rsi               # put aside Rsp
1661         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1662
1663         mov     -8(%rax),%rbx
1664         mov     -16(%rax),%rbp
1665         mov     -24(%rax),%r12
1666         mov     -32(%rax),%r13
1667         mov     -40(%rax),%r14
1668         mov     -48(%rax),%r15
1669         mov     %rbx,144($context)      # restore context->Rbx
1670         mov     %rbp,160($context)      # restore context->Rbp
1671         mov     %r12,216($context)      # restore context->R12
1672         mov     %r13,224($context)      # restore context->R13
1673         mov     %r14,232($context)      # restore context->R14
1674         mov     %r15,240($context)      # restore context->R15
1675
1676         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1677         lea     512($context),%rdi      # &context.Xmm6
1678         mov     \$20,%ecx
1679         .long   0xa548f3fc              # cld; rep movsq
1680
1681 .Lin_prologue:
1682         mov     8(%rax),%rdi
1683         mov     16(%rax),%rsi
1684         mov     %rax,152($context)      # restore context->Rsp
1685         mov     %rsi,168($context)      # restore context->Rsi
1686         mov     %rdi,176($context)      # restore context->Rdi
1687
1688         mov     40($disp),%rdi          # disp->ContextRecord
1689         mov     $context,%rsi           # context
1690         mov     \$154,%ecx              # sizeof(CONTEXT)
1691         .long   0xa548f3fc              # cld; rep movsq
1692
1693         mov     $disp,%rsi
1694         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1695         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1696         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1697         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1698         mov     40(%rsi),%r10           # disp->ContextRecord
1699         lea     56(%rsi),%r11           # &disp->HandlerData
1700         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1701         mov     %r10,32(%rsp)           # arg5
1702         mov     %r11,40(%rsp)           # arg6
1703         mov     %r12,48(%rsp)           # arg7
1704         mov     %rcx,56(%rsp)           # arg8, (NULL)
1705         call    *__imp_RtlVirtualUnwind(%rip)
1706
1707         mov     \$1,%eax                # ExceptionContinueSearch
1708         add     \$64,%rsp
1709         popfq
1710         pop     %r15
1711         pop     %r14
1712         pop     %r13
1713         pop     %r12
1714         pop     %rbp
1715         pop     %rbx
1716         pop     %rdi
1717         pop     %rsi
1718         ret
1719 .size   se_handler,.-se_handler
1720
1721 .section        .pdata
1722         .rva    .LSEH_begin_${func}_xop
1723         .rva    .LSEH_end_${func}_xop
1724         .rva    .LSEH_info_${func}_xop
1725
1726         .rva    .LSEH_begin_${func}_avx
1727         .rva    .LSEH_end_${func}_avx
1728         .rva    .LSEH_info_${func}_avx
1729 ___
1730 $code.=<<___ if ($avx>1);
1731         .rva    .LSEH_begin_${func}_avx2
1732         .rva    .LSEH_end_${func}_avx2
1733         .rva    .LSEH_info_${func}_avx2
1734 ___
1735 $code.=<<___ if ($shaext);
1736         .rva    .LSEH_begin_${func}_shaext
1737         .rva    .LSEH_end_${func}_shaext
1738         .rva    .LSEH_info_${func}_shaext
1739 ___
1740 $code.=<<___;
1741 .section        .xdata
1742 .align  8
1743 .LSEH_info_${func}_xop:
1744         .byte   9,0,0,0
1745         .rva    se_handler
1746         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1747
1748 .LSEH_info_${func}_avx:
1749         .byte   9,0,0,0
1750         .rva    se_handler
1751         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1752 ___
1753 $code.=<<___ if ($avx>1);
1754 .LSEH_info_${func}_avx2:
1755         .byte   9,0,0,0
1756         .rva    se_handler
1757         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1758 ___
1759 $code.=<<___ if ($shaext);
1760 .LSEH_info_${func}_shaext:
1761         .byte   9,0,0,0
1762         .rva    se_handler
1763         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1764 ___
1765 }
1766
1767 ####################################################################
1768 sub rex {
1769   local *opcode=shift;
1770   my ($dst,$src)=@_;
1771   my $rex=0;
1772
1773     $rex|=0x04                  if($dst>=8);
1774     $rex|=0x01                  if($src>=8);
1775     unshift @opcode,$rex|0x40   if($rex);
1776 }
1777
1778 {
1779   my %opcodelet = (
1780                 "sha256rnds2" => 0xcb,
1781                 "sha256msg1"  => 0xcc,
1782                 "sha256msg2"  => 0xcd   );
1783
1784   sub sha256op38 {
1785     my $instr = shift;
1786
1787     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1788       my @opcode=(0x0f,0x38);
1789         rex(\@opcode,$2,$1);
1790         push @opcode,$opcodelet{$instr};
1791         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1792         return ".byte\t".join(',',@opcode);
1793     } else {
1794         return $instr."\t".@_[0];
1795     }
1796   }
1797 }
1798
1799 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1800 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1801 print $code;
1802 close STDOUT or die "error closing STDOUT: $!";