openssl: add assembly for aes and gcm
author     Fedor Indutny <fedor.indutny@gmail.com>
           Wed, 27 Nov 2013 22:14:00 +0000 (02:14 +0400)
committer  Fedor Indutny <fedor.indutny@gmail.com>
           Fri, 6 Dec 2013 07:10:24 +0000 (11:10 +0400)
18 files changed:
deps/openssl/asm/Makefile
deps/openssl/asm/x64-elf-gas/aes/bsaes-x86_64.s [new file with mode: 0644]
deps/openssl/asm/x64-elf-gas/aes/vpaes-x86_64.s [new file with mode: 0644]
deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s [new file with mode: 0644]
deps/openssl/asm/x64-macosx-gas/aes/bsaes-x86_64.s [new file with mode: 0644]
deps/openssl/asm/x64-macosx-gas/aes/vpaes-x86_64.s [new file with mode: 0644]
deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s [new file with mode: 0644]
deps/openssl/asm/x64-win32-masm/aes/bsaes-x86_64.asm [new file with mode: 0644]
deps/openssl/asm/x64-win32-masm/aes/vpaes-x86_64.asm [new file with mode: 0644]
deps/openssl/asm/x64-win32-masm/aes/vpaesni-x86_64.asm [new file with mode: 0644]
deps/openssl/asm/x64-win32-masm/modes/ghash-x86_64.asm [new file with mode: 0644]
deps/openssl/asm/x86-elf-gas/aes/vpaes-x86.s [new file with mode: 0644]
deps/openssl/asm/x86-elf-gas/modes/ghash-x86.s [new file with mode: 0644]
deps/openssl/asm/x86-macosx-gas/aes/vpaes-x86.s [new file with mode: 0644]
deps/openssl/asm/x86-macosx-gas/modes/ghash-x86.s [new file with mode: 0644]
deps/openssl/asm/x86-win32-masm/aes/vpaes-x86.asm [new file with mode: 0644]
deps/openssl/asm/x86-win32-masm/modes/ghash-x86.asm [new file with mode: 0644]
deps/openssl/openssl.gyp
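
The new files are the checked-in perlasm outputs for three accelerated primitives: bsaes-* (bit-sliced AES, used for CBC decrypt, CTR and XTS), vpaes-* (constant-time SSSE3 vector-permutation AES) and ghash-* (the GF(2^128) multiplication behind GCM). Applications never call these routines directly; OpenSSL's EVP layer selects them at run time based on CPU capability once they are built in. As a minimal illustration of the code path that benefits (not part of this commit; key and IV are zero placeholders and error handling is trimmed), the sketch below does a one-shot AES-128-GCM encryption through the public EVP API:

    #include <stdio.h>
    #include <string.h>
    #include <openssl/evp.h>

    int main(void) {
        unsigned char key[16] = {0};          /* placeholder key */
        unsigned char iv[12]  = {0};          /* placeholder 96-bit GCM IV */
        unsigned char pt[]    = "hello, gcm";
        unsigned char ct[sizeof(pt)], tag[16];
        int len = 0, ctlen = 0;

        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
        EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, key, iv);
        EVP_EncryptUpdate(ctx, ct, &len, pt, sizeof(pt));   /* CTR part: aesni/vpaes/bsaes */
        ctlen = len;
        EVP_EncryptFinal_ex(ctx, ct + len, &len);
        ctlen += len;
        EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, sizeof(tag), tag);  /* GHASH part */
        EVP_CIPHER_CTX_free(ctx);

        printf("%d ciphertext bytes\n", ctlen);
        return 0;
    }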

diff --git a/deps/openssl/asm/Makefile b/deps/openssl/asm/Makefile
index eb6c646..2218859 100644
@@ -5,6 +5,7 @@ PERL    += -I../openssl/crypto/perlasm -I../openssl/crypto/bn/asm
 OUTPUTS        = \
        x86-elf-gas/aes/aes-586.s \
        x86-elf-gas/aes/aesni-x86.s \
+       x86-elf-gas/aes/vpaes-x86.s \
        x86-elf-gas/bf/bf-686.s \
        x86-elf-gas/bn/x86-mont.s \
        x86-elf-gas/bn/x86.s \
@@ -20,9 +21,12 @@ OUTPUTS      = \
        x86-elf-gas/sha/sha256-586.s \
        x86-elf-gas/sha/sha512-586.s \
        x86-elf-gas/whrlpool/wp-mmx.s \
+       x86-elf-gas/modes/ghash-x86.s \
        x86-elf-gas/x86cpuid.s \
        x64-elf-gas/aes/aes-x86_64.s \
        x64-elf-gas/aes/aesni-x86_64.s \
+       x64-elf-gas/aes/vpaes-x86_64.s \
+       x64-elf-gas/aes/bsaes-x86_64.s \
        x64-elf-gas/aes/aesni-sha1-x86_64.s \
        x64-elf-gas/bn/modexp512-x86_64.s \
        x64-elf-gas/bn/x86_64-mont.s \
@@ -36,9 +40,11 @@ OUTPUTS      = \
        x64-elf-gas/sha/sha256-x86_64.s \
        x64-elf-gas/sha/sha512-x86_64.s \
        x64-elf-gas/whrlpool/wp-x86_64.s \
+       x64-elf-gas/modes/ghash-x86_64.s \
        x64-elf-gas/x86_64cpuid.s \
        x86-macosx-gas/aes/aes-586.s \
        x86-macosx-gas/aes/aesni-x86.s \
+       x86-macosx-gas/aes/vpaes-x86.s \
        x86-macosx-gas/bf/bf-686.s \
        x86-macosx-gas/bn/x86-mont.s \
        x86-macosx-gas/bn/x86.s \
@@ -54,9 +60,12 @@ OUTPUTS      = \
        x86-macosx-gas/sha/sha256-586.s \
        x86-macosx-gas/sha/sha512-586.s \
        x86-macosx-gas/whrlpool/wp-mmx.s \
+       x86-macosx-gas/modes/ghash-x86.s \
        x86-macosx-gas/x86cpuid.s \
        x64-macosx-gas/aes/aes-x86_64.s \
        x64-macosx-gas/aes/aesni-x86_64.s \
+       x64-macosx-gas/aes/vpaes-x86_64.s \
+       x64-macosx-gas/aes/bsaes-x86_64.s \
        x64-macosx-gas/aes/aesni-sha1-x86_64.s \
        x64-macosx-gas/bn/modexp512-x86_64.s \
        x64-macosx-gas/bn/x86_64-mont.s \
@@ -70,9 +79,11 @@ OUTPUTS      = \
        x64-macosx-gas/sha/sha256-x86_64.s \
        x64-macosx-gas/sha/sha512-x86_64.s \
        x64-macosx-gas/whrlpool/wp-x86_64.s \
+       x64-macosx-gas/modes/ghash-x86_64.s \
        x64-macosx-gas/x86_64cpuid.s \
        x86-win32-masm/aes/aes-586.asm \
        x86-win32-masm/aes/aesni-x86.asm \
+       x86-win32-masm/aes/vpaes-x86.asm \
        x86-win32-masm/bf/bf-686.asm \
        x86-win32-masm/bn/x86-mont.asm \
        x86-win32-masm/bn/x86.asm \
@@ -88,9 +99,12 @@ OUTPUTS      = \
        x86-win32-masm/sha/sha256-586.asm \
        x86-win32-masm/sha/sha512-586.asm \
        x86-win32-masm/whrlpool/wp-mmx.asm \
+       x86-win32-masm/modes/ghash-x86.asm \
        x86-win32-masm/x86cpuid.asm \
        x64-win32-masm/aes/aes-x86_64.asm \
        x64-win32-masm/aes/aesni-x86_64.asm \
+       x64-win32-masm/aes/vpaes-x86_64.asm \
+       x64-win32-masm/aes/bsaes-x86_64.asm \
        x64-win32-masm/aes/aesni-sha1-x86_64.asm \
        x64-win32-masm/bn/modexp512-x86_64.asm \
        x64-win32-masm/bn/x86_64-mont.asm \
@@ -104,6 +118,7 @@ OUTPUTS     = \
        x64-win32-masm/sha/sha256-x86_64.asm \
        x64-win32-masm/sha/sha512-x86_64.asm \
        x64-win32-masm/whrlpool/wp-x86_64.asm \
+       x64-win32-masm/modes/ghash-x86_64.asm \
        x64-win32-masm/x86_64cpuid.asm \
 
 x64-elf-gas/%.s x86-elf-gas/%.s:
@@ -132,6 +147,8 @@ clean:
 
 x64-elf-gas/aes/aes-x86_64.s: ../openssl/crypto/aes/asm/aes-x86_64.pl
 x64-elf-gas/aes/aesni-x86_64.s: ../openssl/crypto/aes/asm/aesni-x86_64.pl
+x64-elf-gas/aes/vpaes-x86_64.s: ../openssl/crypto/aes/asm/vpaes-x86_64.pl
+x64-elf-gas/aes/bsaes-x86_64.s: ../openssl/crypto/aes/asm/bsaes-x86_64.pl
 x64-elf-gas/aes/aesni-sha1-x86_64.s: ../openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
 x64-elf-gas/bn/modexp512-x86_64.s: ../openssl/crypto/bn/asm/modexp512-x86_64.pl
 x64-elf-gas/bn/x86_64-mont.s: ../openssl/crypto/bn/asm/x86_64-mont.pl
@@ -145,9 +162,12 @@ x64-elf-gas/sha/sha1-x86_64.s: ../openssl/crypto/sha/asm/sha1-x86_64.pl
 x64-elf-gas/sha/sha512-x86_64.s: ../openssl/crypto/sha/asm/sha512-x86_64.pl
 x64-elf-gas/sha/sha256-x86_64.s: ../openssl/crypto/sha/asm/sha256-x86_64.pl
 x64-elf-gas/whrlpool/wp-x86_64.s: ../openssl/crypto/whrlpool/asm/wp-x86_64.pl
+x64-elf-gas/modes/ghash-x86_64.s: ../openssl/crypto/modes/asm/ghash-x86_64.pl
 x64-elf-gas/x86_64cpuid.s: ../openssl/crypto/x86_64cpuid.pl
 x64-macosx-gas/aes/aes-x86_64.s: ../openssl/crypto/aes/asm/aes-x86_64.pl
 x64-macosx-gas/aes/aesni-x86_64.s: ../openssl/crypto/aes/asm/aesni-x86_64.pl
+x64-macosx-gas/aes/vpaes-x86_64.s: ../openssl/crypto/aes/asm/vpaes-x86_64.pl
+x64-macosx-gas/aes/bsaes-x86_64.s: ../openssl/crypto/aes/asm/bsaes-x86_64.pl
 x64-macosx-gas/aes/aesni-sha1-x86_64.s: ../openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
 x64-macosx-gas/bn/modexp512-x86_64.s: ../openssl/crypto/bn/asm/modexp512-x86_64.pl
 x64-macosx-gas/bn/x86_64-mont.s: ../openssl/crypto/bn/asm/x86_64-mont.pl
@@ -161,9 +181,12 @@ x64-macosx-gas/sha/sha1-x86_64.s: ../openssl/crypto/sha/asm/sha1-x86_64.pl
 x64-macosx-gas/sha/sha256-x86_64.s: ../openssl/crypto/sha/asm/sha256-x86_64.pl
 x64-macosx-gas/sha/sha512-x86_64.s: ../openssl/crypto/sha/asm/sha512-x86_64.pl
 x64-macosx-gas/whrlpool/wp-x86_64.s: ../openssl/crypto/whrlpool/asm/wp-x86_64.pl
+x64-macosx-gas/modes/ghash-x86_64.s: ../openssl/crypto/modes/asm/ghash-x86_64.pl
 x64-macosx-gas/x86_64cpuid.s: ../openssl/crypto/x86_64cpuid.pl
 x64-win32-masm/aes/aes-x86_64.asm: ../openssl/crypto/aes/asm/aes-x86_64.pl
 x64-win32-masm/aes/aesni-x86_64.asm: ../openssl/crypto/aes/asm/aesni-x86_64.pl
+x64-win32-masm/aes/vpaes-x86_64.asm: ../openssl/crypto/aes/asm/vpaes-x86_64.pl
+x64-win32-masm/aes/bsaes-x86_64.asm: ../openssl/crypto/aes/asm/bsaes-x86_64.pl
 x64-win32-masm/aes/aesni-sha1-x86_64.asm: ../openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
 x64-win32-masm/bn/modexp512-x86_64.asm: ../openssl/crypto/bn/asm/modexp512-x86_64.pl
 x64-win32-masm/bn/x86_64-mont.asm: ../openssl/crypto/bn/asm/x86_64-mont.pl
@@ -177,9 +200,11 @@ x64-win32-masm/sha/sha1-x86_64.asm: ../openssl/crypto/sha/asm/sha1-x86_64.pl
 x64-win32-masm/sha/sha256-x86_64.asm: ../openssl/crypto/sha/asm/sha256-x86_64.pl
 x64-win32-masm/sha/sha512-x86_64.asm: ../openssl/crypto/sha/asm/sha512-x86_64.pl
 x64-win32-masm/whrlpool/wp-x86_64.asm: ../openssl/crypto/whrlpool/asm/wp-x86_64.pl
+x64-win32-masm/modes/ghash-x86_64.asm: ../openssl/crypto/modes/asm/ghash-x86_64.pl
 x64-win32-masm/x86_64cpuid.asm: ../openssl/crypto/x86_64cpuid.pl
 x86-elf-gas/aes/aes-586.s: ../openssl/crypto/aes/asm/aes-586.pl
 x86-elf-gas/aes/aesni-x86.s: ../openssl/crypto/aes/asm/aesni-x86.pl
+x86-elf-gas/aes/vpaes-x86.s: ../openssl/crypto/aes/asm/vpaes-x86.pl
 x86-elf-gas/bf/bf-686.s: ../openssl/crypto/bf/asm/bf-686.pl
 x86-elf-gas/bn/x86-mont.s: ../openssl/crypto/bn/asm/x86-mont.pl
 x86-elf-gas/bn/x86.s: ../openssl/crypto/bn/asm/x86.pl
@@ -195,9 +220,11 @@ x86-elf-gas/sha/sha1-586.s: ../openssl/crypto/sha/asm/sha1-586.pl
 x86-elf-gas/sha/sha256-586.s: ../openssl/crypto/sha/asm/sha256-586.pl
 x86-elf-gas/sha/sha512-586.s: ../openssl/crypto/sha/asm/sha512-586.pl
 x86-elf-gas/whrlpool/wp-mmx.s: ../openssl/crypto/whrlpool/asm/wp-mmx.pl
+x86-elf-gas/modes/ghash-x86.s: ../openssl/crypto/modes/asm/ghash-x86.pl
 x86-elf-gas/x86cpuid.s: ../openssl/crypto/x86cpuid.pl
 x86-macosx-gas/aes/aes-586.s: ../openssl/crypto/aes/asm/aes-586.pl
 x86-macosx-gas/aes/aesni-x86.s: ../openssl/crypto/aes/asm/aesni-x86.pl
+x86-macosx-gas/aes/vpaes-x86.s: ../openssl/crypto/aes/asm/vpaes-x86.pl
 x86-macosx-gas/bf/bf-686.s: ../openssl/crypto/bf/asm/bf-686.pl
 x86-macosx-gas/bn/x86-mont.s: ../openssl/crypto/bn/asm/x86-mont.pl
 x86-macosx-gas/bn/x86.s: ../openssl/crypto/bn/asm/x86.pl
@@ -213,9 +240,11 @@ x86-macosx-gas/sha/sha1-586.s: ../openssl/crypto/sha/asm/sha1-586.pl
 x86-macosx-gas/sha/sha256-586.s: ../openssl/crypto/sha/asm/sha256-586.pl
 x86-macosx-gas/sha/sha512-586.s: ../openssl/crypto/sha/asm/sha512-586.pl
 x86-macosx-gas/whrlpool/wp-mmx.s: ../openssl/crypto/whrlpool/asm/wp-mmx.pl
+x86-macosx-gas/modes/ghash-x86.s: ../openssl/crypto/modes/asm/ghash-x86.pl
 x86-macosx-gas/x86cpuid.s: ../openssl/crypto/x86cpuid.pl
 x86-win32-masm/aes/aes-586.asm: ../openssl/crypto/aes/asm/aes-586.pl
 x86-win32-masm/aes/aesni-x86.asm: ../openssl/crypto/aes/asm/aesni-x86.pl
+x86-win32-masm/aes/vpaes-x86.asm: ../openssl/crypto/aes/asm/vpaes-x86.pl
 x86-win32-masm/bf/bf-686.asm: ../openssl/crypto/bf/asm/bf-686.pl
 x86-win32-masm/bn/x86.asm: ../openssl/crypto/bn/asm/x86.pl
 x86-win32-masm/bn/x86-mont.asm: ../openssl/crypto/bn/asm/x86-mont.pl
@@ -231,4 +260,5 @@ x86-win32-masm/sha/sha1-586.asm: ../openssl/crypto/sha/asm/sha1-586.pl
 x86-win32-masm/sha/sha256-586.asm: ../openssl/crypto/sha/asm/sha256-586.pl
 x86-win32-masm/sha/sha512-586.asm: ../openssl/crypto/sha/asm/sha512-586.pl
 x86-win32-masm/whrlpool/wp-mmx.asm: ../openssl/crypto/whrlpool/asm/wp-mmx.pl
+x86-win32-masm/modes/ghash-x86.asm: ../openssl/crypto/modes/asm/ghash-x86.pl
 x86-win32-masm/x86cpuid.asm: ../openssl/crypto/x86cpuid.pl
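
Everything from here down is generated assembly checked in verbatim. For orientation, these are the entry points the new bsaes and vpaes files export, as they are declared on the C side of OpenSSL; the declarations below are reproduced from memory rather than taken from this commit, so treat the exact prototypes as an assumption (the ghash-* files similarly export the gcm_gmult/gcm_ghash routines, omitted here):

    #include <stddef.h>
    #include <openssl/aes.h>

    /* vpaes-x86_64.s / vpaes-x86.s: SSSE3 vector-permutation AES */
    int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
    int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
    void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
    void vpaes_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
    void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t length,
                           const AES_KEY *key, unsigned char *ivec, int enc);

    /* bsaes-x86_64.s: bit-sliced AES, eight blocks at a time */
    void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t length,
                           const AES_KEY *key, unsigned char ivec[16], int enc);
    void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t len,
                                    const AES_KEY *key, const unsigned char ivec[16]);
    void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out, size_t len,
                           const AES_KEY *key1, const AES_KEY *key2, const unsigned char iv[16]);
    void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
                           const AES_KEY *key1, const AES_KEY *key2, const unsigned char iv[16]);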
diff --git a/deps/openssl/asm/x64-elf-gas/aes/bsaes-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/bsaes-x86_64.s
new file mode 100644
index 0000000..f827690
--- /dev/null
@@ -0,0 +1,2570 @@
+.text
+
+
+
+
+
+.type  _bsaes_encrypt8,@function
+.align 64
+_bsaes_encrypt8:
+       leaq    .LBS0(%rip),%r11
+
+       movdqa  (%rax),%xmm8
+       leaq    16(%rax),%rax
+       movdqa  80(%r11),%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+.byte  102,68,15,56,0,255
+       pxor    %xmm8,%xmm1
+.byte  102,15,56,0,199
+       pxor    %xmm8,%xmm2
+.byte  102,15,56,0,207
+       pxor    %xmm8,%xmm3
+.byte  102,15,56,0,215
+       pxor    %xmm8,%xmm4
+.byte  102,15,56,0,223
+       pxor    %xmm8,%xmm5
+.byte  102,15,56,0,231
+       pxor    %xmm8,%xmm6
+.byte  102,15,56,0,239
+.byte  102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm5,%xmm9
+       psrlq   $1,%xmm5
+       movdqa  %xmm3,%xmm10
+       psrlq   $1,%xmm3
+       pxor    %xmm6,%xmm5
+       pxor    %xmm4,%xmm3
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm3
+       pxor    %xmm5,%xmm6
+       psllq   $1,%xmm5
+       pxor    %xmm3,%xmm4
+       psllq   $1,%xmm3
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm1,%xmm9
+       psrlq   $1,%xmm1
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm1
+       pand    %xmm7,%xmm15
+       pxor    %xmm1,%xmm2
+       psllq   $1,%xmm1
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm1
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm4,%xmm9
+       psrlq   $2,%xmm4
+       movdqa  %xmm3,%xmm10
+       psrlq   $2,%xmm3
+       pxor    %xmm6,%xmm4
+       pxor    %xmm5,%xmm3
+       pand    %xmm8,%xmm4
+       pand    %xmm8,%xmm3
+       pxor    %xmm4,%xmm6
+       psllq   $2,%xmm4
+       pxor    %xmm3,%xmm5
+       psllq   $2,%xmm3
+       pxor    %xmm9,%xmm4
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm2,%xmm0
+       pxor    %xmm1,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm2
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm1
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm2,%xmm9
+       psrlq   $4,%xmm2
+       movdqa  %xmm1,%xmm10
+       psrlq   $4,%xmm1
+       pxor    %xmm6,%xmm2
+       pxor    %xmm5,%xmm1
+       pand    %xmm7,%xmm2
+       pand    %xmm7,%xmm1
+       pxor    %xmm2,%xmm6
+       psllq   $4,%xmm2
+       pxor    %xmm1,%xmm5
+       psllq   $4,%xmm1
+       pxor    %xmm9,%xmm2
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm4,%xmm0
+       pxor    %xmm3,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm4
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm3
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       decl    %r10d
+       jmp     .Lenc_sbox
+.align 16
+.Lenc_loop:
+       pxor    0(%rax),%xmm15
+       pxor    16(%rax),%xmm0
+.byte  102,68,15,56,0,255
+       pxor    32(%rax),%xmm1
+.byte  102,15,56,0,199
+       pxor    48(%rax),%xmm2
+.byte  102,15,56,0,207
+       pxor    64(%rax),%xmm3
+.byte  102,15,56,0,215
+       pxor    80(%rax),%xmm4
+.byte  102,15,56,0,223
+       pxor    96(%rax),%xmm5
+.byte  102,15,56,0,231
+       pxor    112(%rax),%xmm6
+.byte  102,15,56,0,239
+       leaq    128(%rax),%rax
+.byte  102,15,56,0,247
+.Lenc_sbox:
+       pxor    %xmm5,%xmm4
+       pxor    %xmm0,%xmm1
+       pxor    %xmm15,%xmm2
+       pxor    %xmm1,%xmm5
+       pxor    %xmm15,%xmm4
+
+       pxor    %xmm2,%xmm5
+       pxor    %xmm6,%xmm2
+       pxor    %xmm4,%xmm6
+       pxor    %xmm3,%xmm2
+       pxor    %xmm4,%xmm3
+       pxor    %xmm0,%xmm2
+
+       pxor    %xmm6,%xmm1
+       pxor    %xmm4,%xmm0
+       movdqa  %xmm6,%xmm10
+       movdqa  %xmm0,%xmm9
+       movdqa  %xmm4,%xmm8
+       movdqa  %xmm1,%xmm12
+       movdqa  %xmm5,%xmm11
+
+       pxor    %xmm3,%xmm10
+       pxor    %xmm1,%xmm9
+       pxor    %xmm2,%xmm8
+       movdqa  %xmm10,%xmm13
+       pxor    %xmm3,%xmm12
+       movdqa  %xmm9,%xmm7
+       pxor    %xmm15,%xmm11
+       movdqa  %xmm10,%xmm14
+
+       por     %xmm8,%xmm9
+       por     %xmm11,%xmm10
+       pxor    %xmm7,%xmm14
+       pand    %xmm11,%xmm13
+       pxor    %xmm8,%xmm11
+       pand    %xmm8,%xmm7
+       pand    %xmm11,%xmm14
+       movdqa  %xmm2,%xmm11
+       pxor    %xmm15,%xmm11
+       pand    %xmm11,%xmm12
+       pxor    %xmm12,%xmm10
+       pxor    %xmm12,%xmm9
+       movdqa  %xmm6,%xmm12
+       movdqa  %xmm4,%xmm11
+       pxor    %xmm0,%xmm12
+       pxor    %xmm5,%xmm11
+       movdqa  %xmm12,%xmm8
+       pand    %xmm11,%xmm12
+       por     %xmm11,%xmm8
+       pxor    %xmm12,%xmm7
+       pxor    %xmm14,%xmm10
+       pxor    %xmm13,%xmm9
+       pxor    %xmm14,%xmm8
+       movdqa  %xmm1,%xmm11
+       pxor    %xmm13,%xmm7
+       movdqa  %xmm3,%xmm12
+       pxor    %xmm13,%xmm8
+       movdqa  %xmm0,%xmm13
+       pand    %xmm2,%xmm11
+       movdqa  %xmm6,%xmm14
+       pand    %xmm15,%xmm12
+       pand    %xmm4,%xmm13
+       por     %xmm5,%xmm14
+       pxor    %xmm11,%xmm10
+       pxor    %xmm12,%xmm9
+       pxor    %xmm13,%xmm8
+       pxor    %xmm14,%xmm7
+
+
+
+
+
+       movdqa  %xmm10,%xmm11
+       pand    %xmm8,%xmm10
+       pxor    %xmm9,%xmm11
+
+       movdqa  %xmm7,%xmm13
+       movdqa  %xmm11,%xmm14
+       pxor    %xmm10,%xmm13
+       pand    %xmm13,%xmm14
+
+       movdqa  %xmm8,%xmm12
+       pxor    %xmm9,%xmm14
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm9,%xmm10
+
+       pand    %xmm10,%xmm12
+
+       movdqa  %xmm13,%xmm9
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm12,%xmm9
+       pxor    %xmm12,%xmm8
+
+       pand    %xmm7,%xmm9
+
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm8
+
+       pand    %xmm14,%xmm13
+
+       pxor    %xmm11,%xmm13
+       movdqa  %xmm5,%xmm11
+       movdqa  %xmm4,%xmm7
+       movdqa  %xmm14,%xmm9
+       pxor    %xmm13,%xmm9
+       pand    %xmm5,%xmm9
+       pxor    %xmm4,%xmm5
+       pand    %xmm14,%xmm4
+       pand    %xmm13,%xmm5
+       pxor    %xmm4,%xmm5
+       pxor    %xmm9,%xmm4
+       pxor    %xmm15,%xmm11
+       pxor    %xmm2,%xmm7
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm15,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm2,%xmm15
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm2
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm15
+       pxor    %xmm11,%xmm7
+       pxor    %xmm2,%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm2
+       pxor    %xmm11,%xmm5
+       pxor    %xmm11,%xmm15
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm2
+
+       movdqa  %xmm6,%xmm11
+       movdqa  %xmm0,%xmm7
+       pxor    %xmm3,%xmm11
+       pxor    %xmm1,%xmm7
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm3,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm1,%xmm3
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm1
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm3
+       pxor    %xmm11,%xmm7
+       pxor    %xmm1,%xmm3
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm1
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       pxor    %xmm13,%xmm10
+       pand    %xmm6,%xmm10
+       pxor    %xmm0,%xmm6
+       pand    %xmm14,%xmm0
+       pand    %xmm13,%xmm6
+       pxor    %xmm0,%xmm6
+       pxor    %xmm10,%xmm0
+       pxor    %xmm11,%xmm6
+       pxor    %xmm11,%xmm3
+       pxor    %xmm7,%xmm0
+       pxor    %xmm7,%xmm1
+       pxor    %xmm15,%xmm6
+       pxor    %xmm5,%xmm0
+       pxor    %xmm6,%xmm3
+       pxor    %xmm15,%xmm5
+       pxor    %xmm0,%xmm15
+
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       pxor    %xmm2,%xmm1
+       pxor    %xmm4,%xmm2
+       pxor    %xmm4,%xmm3
+
+       pxor    %xmm2,%xmm5
+       decl    %r10d
+       jl      .Lenc_done
+       pshufd  $147,%xmm15,%xmm7
+       pshufd  $147,%xmm0,%xmm8
+       pxor    %xmm7,%xmm15
+       pshufd  $147,%xmm3,%xmm9
+       pxor    %xmm8,%xmm0
+       pshufd  $147,%xmm5,%xmm10
+       pxor    %xmm9,%xmm3
+       pshufd  $147,%xmm2,%xmm11
+       pxor    %xmm10,%xmm5
+       pshufd  $147,%xmm6,%xmm12
+       pxor    %xmm11,%xmm2
+       pshufd  $147,%xmm1,%xmm13
+       pxor    %xmm12,%xmm6
+       pshufd  $147,%xmm4,%xmm14
+       pxor    %xmm13,%xmm1
+       pxor    %xmm14,%xmm4
+
+       pxor    %xmm15,%xmm8
+       pxor    %xmm4,%xmm7
+       pxor    %xmm4,%xmm8
+       pshufd  $78,%xmm15,%xmm15
+       pxor    %xmm0,%xmm9
+       pshufd  $78,%xmm0,%xmm0
+       pxor    %xmm2,%xmm12
+       pxor    %xmm7,%xmm15
+       pxor    %xmm6,%xmm13
+       pxor    %xmm8,%xmm0
+       pxor    %xmm5,%xmm11
+       pshufd  $78,%xmm2,%xmm7
+       pxor    %xmm1,%xmm14
+       pshufd  $78,%xmm6,%xmm8
+       pxor    %xmm3,%xmm10
+       pshufd  $78,%xmm5,%xmm2
+       pxor    %xmm4,%xmm10
+       pshufd  $78,%xmm4,%xmm6
+       pxor    %xmm4,%xmm11
+       pshufd  $78,%xmm1,%xmm5
+       pxor    %xmm11,%xmm7
+       pshufd  $78,%xmm3,%xmm1
+       pxor    %xmm12,%xmm8
+
+       pxor    %xmm10,%xmm2
+       pxor    %xmm14,%xmm6
+       pxor    %xmm13,%xmm5
+       movdqa  %xmm7,%xmm3
+       pxor    %xmm9,%xmm1
+       movdqa  %xmm8,%xmm4
+       movdqa  48(%r11),%xmm7
+       jnz     .Lenc_loop
+       movdqa  64(%r11),%xmm7
+       jmp     .Lenc_loop
+.align 16
+.Lenc_done:
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm1,%xmm9
+       psrlq   $1,%xmm1
+       movdqa  %xmm2,%xmm10
+       psrlq   $1,%xmm2
+       pxor    %xmm4,%xmm1
+       pxor    %xmm6,%xmm2
+       pand    %xmm7,%xmm1
+       pand    %xmm7,%xmm2
+       pxor    %xmm1,%xmm4
+       psllq   $1,%xmm1
+       pxor    %xmm2,%xmm6
+       psllq   $1,%xmm2
+       pxor    %xmm9,%xmm1
+       pxor    %xmm10,%xmm2
+       movdqa  %xmm3,%xmm9
+       psrlq   $1,%xmm3
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm5,%xmm3
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm3
+       pand    %xmm7,%xmm15
+       pxor    %xmm3,%xmm5
+       psllq   $1,%xmm3
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm3
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm6,%xmm9
+       psrlq   $2,%xmm6
+       movdqa  %xmm2,%xmm10
+       psrlq   $2,%xmm2
+       pxor    %xmm4,%xmm6
+       pxor    %xmm1,%xmm2
+       pand    %xmm8,%xmm6
+       pand    %xmm8,%xmm2
+       pxor    %xmm6,%xmm4
+       psllq   $2,%xmm6
+       pxor    %xmm2,%xmm1
+       psllq   $2,%xmm2
+       pxor    %xmm9,%xmm6
+       pxor    %xmm10,%xmm2
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm5,%xmm0
+       pxor    %xmm3,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm5
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm3
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm5,%xmm9
+       psrlq   $4,%xmm5
+       movdqa  %xmm3,%xmm10
+       psrlq   $4,%xmm3
+       pxor    %xmm4,%xmm5
+       pxor    %xmm1,%xmm3
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm3
+       pxor    %xmm5,%xmm4
+       psllq   $4,%xmm5
+       pxor    %xmm3,%xmm1
+       psllq   $4,%xmm3
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm6,%xmm0
+       pxor    %xmm2,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm6
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm2
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  (%rax),%xmm7
+       pxor    %xmm7,%xmm3
+       pxor    %xmm7,%xmm5
+       pxor    %xmm7,%xmm2
+       pxor    %xmm7,%xmm6
+       pxor    %xmm7,%xmm1
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm15
+       pxor    %xmm7,%xmm0
+       .byte   0xf3,0xc3
+.size  _bsaes_encrypt8,.-_bsaes_encrypt8
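
The long psrlq/pxor/pand/psllq runs at the top and bottom of _bsaes_encrypt8 are the bit-interleaving ("swapmove") step: they transpose eight 128-bit blocks into eight bit-planes and back, so the AES S-box can be evaluated with plain boolean instructions on all blocks at once. A scalar paraphrase of one swapmove group (my reading of the code above, not code from this commit):

    #include <stdint.h>

    /* Swap the bits of a (shifted right by n) selected by mask with the
     * corresponding bits of b.  Each movdqa/psrlq/pxor/pand/pxor/psllq/pxor
     * group in _bsaes_encrypt8 performs this on 128-bit registers,
     * with n = 1, 2 or 4. */
    static inline void swapmove(uint64_t *a, uint64_t *b, uint64_t mask, int n) {
        uint64_t t = ((*a >> n) ^ *b) & mask;
        *b ^= t;
        *a ^= t << n;
    }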
+
+.type  _bsaes_decrypt8,@function
+.align 64
+_bsaes_decrypt8:
+       leaq    .LBS0(%rip),%r11
+
+       movdqa  (%rax),%xmm8
+       leaq    16(%rax),%rax
+       movdqa  -48(%r11),%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+.byte  102,68,15,56,0,255
+       pxor    %xmm8,%xmm1
+.byte  102,15,56,0,199
+       pxor    %xmm8,%xmm2
+.byte  102,15,56,0,207
+       pxor    %xmm8,%xmm3
+.byte  102,15,56,0,215
+       pxor    %xmm8,%xmm4
+.byte  102,15,56,0,223
+       pxor    %xmm8,%xmm5
+.byte  102,15,56,0,231
+       pxor    %xmm8,%xmm6
+.byte  102,15,56,0,239
+.byte  102,15,56,0,247
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm5,%xmm9
+       psrlq   $1,%xmm5
+       movdqa  %xmm3,%xmm10
+       psrlq   $1,%xmm3
+       pxor    %xmm6,%xmm5
+       pxor    %xmm4,%xmm3
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm3
+       pxor    %xmm5,%xmm6
+       psllq   $1,%xmm5
+       pxor    %xmm3,%xmm4
+       psllq   $1,%xmm3
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm1,%xmm9
+       psrlq   $1,%xmm1
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm1
+       pand    %xmm7,%xmm15
+       pxor    %xmm1,%xmm2
+       psllq   $1,%xmm1
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm1
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm4,%xmm9
+       psrlq   $2,%xmm4
+       movdqa  %xmm3,%xmm10
+       psrlq   $2,%xmm3
+       pxor    %xmm6,%xmm4
+       pxor    %xmm5,%xmm3
+       pand    %xmm8,%xmm4
+       pand    %xmm8,%xmm3
+       pxor    %xmm4,%xmm6
+       psllq   $2,%xmm4
+       pxor    %xmm3,%xmm5
+       psllq   $2,%xmm3
+       pxor    %xmm9,%xmm4
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm2,%xmm0
+       pxor    %xmm1,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm2
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm1
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm2,%xmm9
+       psrlq   $4,%xmm2
+       movdqa  %xmm1,%xmm10
+       psrlq   $4,%xmm1
+       pxor    %xmm6,%xmm2
+       pxor    %xmm5,%xmm1
+       pand    %xmm7,%xmm2
+       pand    %xmm7,%xmm1
+       pxor    %xmm2,%xmm6
+       psllq   $4,%xmm2
+       pxor    %xmm1,%xmm5
+       psllq   $4,%xmm1
+       pxor    %xmm9,%xmm2
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm4,%xmm0
+       pxor    %xmm3,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm4
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm3
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       decl    %r10d
+       jmp     .Ldec_sbox
+.align 16
+.Ldec_loop:
+       pxor    0(%rax),%xmm15
+       pxor    16(%rax),%xmm0
+.byte  102,68,15,56,0,255
+       pxor    32(%rax),%xmm1
+.byte  102,15,56,0,199
+       pxor    48(%rax),%xmm2
+.byte  102,15,56,0,207
+       pxor    64(%rax),%xmm3
+.byte  102,15,56,0,215
+       pxor    80(%rax),%xmm4
+.byte  102,15,56,0,223
+       pxor    96(%rax),%xmm5
+.byte  102,15,56,0,231
+       pxor    112(%rax),%xmm6
+.byte  102,15,56,0,239
+       leaq    128(%rax),%rax
+.byte  102,15,56,0,247
+.Ldec_sbox:
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm6,%xmm3
+       pxor    %xmm6,%xmm1
+       pxor    %xmm3,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm0
+
+       pxor    %xmm0,%xmm15
+       pxor    %xmm4,%xmm1
+       pxor    %xmm15,%xmm2
+       pxor    %xmm15,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqa  %xmm2,%xmm10
+       movdqa  %xmm6,%xmm9
+       movdqa  %xmm0,%xmm8
+       movdqa  %xmm3,%xmm12
+       movdqa  %xmm4,%xmm11
+
+       pxor    %xmm15,%xmm10
+       pxor    %xmm3,%xmm9
+       pxor    %xmm5,%xmm8
+       movdqa  %xmm10,%xmm13
+       pxor    %xmm15,%xmm12
+       movdqa  %xmm9,%xmm7
+       pxor    %xmm1,%xmm11
+       movdqa  %xmm10,%xmm14
+
+       por     %xmm8,%xmm9
+       por     %xmm11,%xmm10
+       pxor    %xmm7,%xmm14
+       pand    %xmm11,%xmm13
+       pxor    %xmm8,%xmm11
+       pand    %xmm8,%xmm7
+       pand    %xmm11,%xmm14
+       movdqa  %xmm5,%xmm11
+       pxor    %xmm1,%xmm11
+       pand    %xmm11,%xmm12
+       pxor    %xmm12,%xmm10
+       pxor    %xmm12,%xmm9
+       movdqa  %xmm2,%xmm12
+       movdqa  %xmm0,%xmm11
+       pxor    %xmm6,%xmm12
+       pxor    %xmm4,%xmm11
+       movdqa  %xmm12,%xmm8
+       pand    %xmm11,%xmm12
+       por     %xmm11,%xmm8
+       pxor    %xmm12,%xmm7
+       pxor    %xmm14,%xmm10
+       pxor    %xmm13,%xmm9
+       pxor    %xmm14,%xmm8
+       movdqa  %xmm3,%xmm11
+       pxor    %xmm13,%xmm7
+       movdqa  %xmm15,%xmm12
+       pxor    %xmm13,%xmm8
+       movdqa  %xmm6,%xmm13
+       pand    %xmm5,%xmm11
+       movdqa  %xmm2,%xmm14
+       pand    %xmm1,%xmm12
+       pand    %xmm0,%xmm13
+       por     %xmm4,%xmm14
+       pxor    %xmm11,%xmm10
+       pxor    %xmm12,%xmm9
+       pxor    %xmm13,%xmm8
+       pxor    %xmm14,%xmm7
+
+
+
+
+
+       movdqa  %xmm10,%xmm11
+       pand    %xmm8,%xmm10
+       pxor    %xmm9,%xmm11
+
+       movdqa  %xmm7,%xmm13
+       movdqa  %xmm11,%xmm14
+       pxor    %xmm10,%xmm13
+       pand    %xmm13,%xmm14
+
+       movdqa  %xmm8,%xmm12
+       pxor    %xmm9,%xmm14
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm9,%xmm10
+
+       pand    %xmm10,%xmm12
+
+       movdqa  %xmm13,%xmm9
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm12,%xmm9
+       pxor    %xmm12,%xmm8
+
+       pand    %xmm7,%xmm9
+
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm8
+
+       pand    %xmm14,%xmm13
+
+       pxor    %xmm11,%xmm13
+       movdqa  %xmm4,%xmm11
+       movdqa  %xmm0,%xmm7
+       movdqa  %xmm14,%xmm9
+       pxor    %xmm13,%xmm9
+       pand    %xmm4,%xmm9
+       pxor    %xmm0,%xmm4
+       pand    %xmm14,%xmm0
+       pand    %xmm13,%xmm4
+       pxor    %xmm0,%xmm4
+       pxor    %xmm9,%xmm0
+       pxor    %xmm1,%xmm11
+       pxor    %xmm5,%xmm7
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm1,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm5,%xmm1
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm5
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm1
+       pxor    %xmm11,%xmm7
+       pxor    %xmm5,%xmm1
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm5
+       pxor    %xmm11,%xmm4
+       pxor    %xmm11,%xmm1
+       pxor    %xmm7,%xmm0
+       pxor    %xmm7,%xmm5
+
+       movdqa  %xmm2,%xmm11
+       movdqa  %xmm6,%xmm7
+       pxor    %xmm15,%xmm11
+       pxor    %xmm3,%xmm7
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm15,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm3,%xmm15
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm3
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm15
+       pxor    %xmm11,%xmm7
+       pxor    %xmm3,%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm3
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       pxor    %xmm13,%xmm10
+       pand    %xmm2,%xmm10
+       pxor    %xmm6,%xmm2
+       pand    %xmm14,%xmm6
+       pand    %xmm13,%xmm2
+       pxor    %xmm6,%xmm2
+       pxor    %xmm10,%xmm6
+       pxor    %xmm11,%xmm2
+       pxor    %xmm11,%xmm15
+       pxor    %xmm7,%xmm6
+       pxor    %xmm7,%xmm3
+       pxor    %xmm6,%xmm0
+       pxor    %xmm4,%xmm5
+
+       pxor    %xmm0,%xmm3
+       pxor    %xmm6,%xmm1
+       pxor    %xmm6,%xmm4
+       pxor    %xmm1,%xmm3
+       pxor    %xmm15,%xmm6
+       pxor    %xmm4,%xmm3
+       pxor    %xmm5,%xmm2
+       pxor    %xmm0,%xmm5
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm15,%xmm3
+       pxor    %xmm2,%xmm6
+       decl    %r10d
+       jl      .Ldec_done
+
+       pshufd  $147,%xmm4,%xmm14
+       movdqa  %xmm5,%xmm9
+       pxor    %xmm6,%xmm4
+       pxor    %xmm6,%xmm5
+       pshufd  $147,%xmm15,%xmm7
+       movdqa  %xmm6,%xmm12
+       pxor    %xmm15,%xmm6
+       pxor    %xmm0,%xmm15
+       pshufd  $147,%xmm0,%xmm8
+       pxor    %xmm5,%xmm0
+       pxor    %xmm2,%xmm15
+       pxor    %xmm3,%xmm0
+       pshufd  $147,%xmm3,%xmm10
+       pxor    %xmm15,%xmm5
+       pxor    %xmm4,%xmm3
+       pxor    %xmm2,%xmm4
+       pshufd  $147,%xmm2,%xmm13
+       movdqa  %xmm1,%xmm11
+       pxor    %xmm1,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm3
+       pxor    %xmm12,%xmm2
+       pxor    %xmm9,%xmm3
+       pxor    %xmm11,%xmm3
+       pshufd  $147,%xmm12,%xmm12
+
+       pxor    %xmm4,%xmm6
+       pxor    %xmm7,%xmm4
+       pxor    %xmm8,%xmm6
+       pshufd  $147,%xmm9,%xmm9
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm6
+       pxor    %xmm14,%xmm4
+       pshufd  $147,%xmm11,%xmm11
+       pxor    %xmm13,%xmm14
+       pxor    %xmm4,%xmm6
+
+       pxor    %xmm7,%xmm5
+       pshufd  $147,%xmm7,%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+       pxor    %xmm9,%xmm15
+       pshufd  $147,%xmm8,%xmm8
+       pxor    %xmm9,%xmm5
+       pxor    %xmm9,%xmm3
+       pxor    %xmm14,%xmm15
+       pshufd  $147,%xmm9,%xmm9
+       pxor    %xmm10,%xmm5
+       pxor    %xmm10,%xmm1
+       pxor    %xmm10,%xmm0
+       pshufd  $147,%xmm10,%xmm10
+       pxor    %xmm11,%xmm2
+       pxor    %xmm11,%xmm3
+       pxor    %xmm14,%xmm2
+       pxor    %xmm12,%xmm5
+       pxor    %xmm11,%xmm0
+       pxor    %xmm12,%xmm14
+
+       pxor    %xmm14,%xmm3
+       pshufd  $147,%xmm11,%xmm11
+       pxor    %xmm14,%xmm1
+       pxor    %xmm14,%xmm0
+
+       pxor    %xmm12,%xmm14
+       pshufd  $147,%xmm12,%xmm12
+       pxor    %xmm13,%xmm14
+
+
+       pxor    %xmm2,%xmm0
+       pxor    %xmm11,%xmm2
+       pshufd  $147,%xmm13,%xmm13
+       pxor    %xmm7,%xmm15
+       pxor    %xmm12,%xmm2
+       pxor    %xmm9,%xmm15
+       pshufd  $147,%xmm14,%xmm14
+
+       pxor    %xmm6,%xmm5
+       pxor    %xmm8,%xmm6
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm5
+       pxor    %xmm12,%xmm6
+       pxor    %xmm12,%xmm4
+       pxor    %xmm14,%xmm6
+       pshufd  $147,%xmm7,%xmm7
+       pxor    %xmm13,%xmm4
+       pxor    %xmm6,%xmm5
+       pxor    %xmm8,%xmm0
+       pshufd  $147,%xmm8,%xmm8
+
+       pxor    %xmm14,%xmm2
+       pxor    %xmm9,%xmm0
+       pxor    %xmm9,%xmm3
+       pshufd  $147,%xmm9,%xmm9
+       pxor    %xmm13,%xmm15
+       pxor    %xmm10,%xmm13
+       pxor    %xmm2,%xmm0
+       pxor    %xmm13,%xmm5
+
+       pxor    %xmm13,%xmm1
+       pxor    %xmm12,%xmm3
+       pxor    %xmm11,%xmm1
+       pshufd  $147,%xmm11,%xmm11
+       pxor    %xmm13,%xmm3
+       pxor    %xmm14,%xmm1
+       pxor    %xmm10,%xmm13
+
+       pshufd  $147,%xmm12,%xmm12
+       pshufd  $147,%xmm13,%xmm13
+       pshufd  $147,%xmm14,%xmm14
+       pshufd  $147,%xmm10,%xmm10
+
+
+       pxor    %xmm6,%xmm0
+       pxor    %xmm6,%xmm8
+       pxor    %xmm12,%xmm7
+       pxor    %xmm12,%xmm8
+       pxor    %xmm7,%xmm5
+       pxor    %xmm4,%xmm7
+       pxor    %xmm13,%xmm8
+       pxor    %xmm14,%xmm13
+       pxor    %xmm8,%xmm0
+       pxor    %xmm11,%xmm2
+       pxor    %xmm0,%xmm11
+       pxor    %xmm10,%xmm1
+       pxor    %xmm5,%xmm10
+       pxor    %xmm9,%xmm3
+       pxor    %xmm15,%xmm9
+       pxor    %xmm14,%xmm10
+       pxor    %xmm3,%xmm12
+       pxor    %xmm13,%xmm9
+       pxor    %xmm13,%xmm12
+       pxor    %xmm1,%xmm13
+       pxor    %xmm2,%xmm14
+
+       movdqa  %xmm7,%xmm15
+       movdqa  %xmm8,%xmm0
+       movdqa  %xmm9,%xmm1
+       movdqa  %xmm10,%xmm2
+       movdqa  %xmm11,%xmm3
+       movdqa  %xmm12,%xmm4
+       movdqa  %xmm13,%xmm5
+       movdqa  %xmm14,%xmm6
+       movdqa  -16(%r11),%xmm7
+       jnz     .Ldec_loop
+       movdqa  -32(%r11),%xmm7
+       jmp     .Ldec_loop
+.align 16
+.Ldec_done:
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm2,%xmm9
+       psrlq   $1,%xmm2
+       movdqa  %xmm1,%xmm10
+       psrlq   $1,%xmm1
+       pxor    %xmm4,%xmm2
+       pxor    %xmm6,%xmm1
+       pand    %xmm7,%xmm2
+       pand    %xmm7,%xmm1
+       pxor    %xmm2,%xmm4
+       psllq   $1,%xmm2
+       pxor    %xmm1,%xmm6
+       psllq   $1,%xmm1
+       pxor    %xmm9,%xmm2
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm5,%xmm9
+       psrlq   $1,%xmm5
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm3,%xmm5
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm15
+       pxor    %xmm5,%xmm3
+       psllq   $1,%xmm5
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm6,%xmm9
+       psrlq   $2,%xmm6
+       movdqa  %xmm1,%xmm10
+       psrlq   $2,%xmm1
+       pxor    %xmm4,%xmm6
+       pxor    %xmm2,%xmm1
+       pand    %xmm8,%xmm6
+       pand    %xmm8,%xmm1
+       pxor    %xmm6,%xmm4
+       psllq   $2,%xmm6
+       pxor    %xmm1,%xmm2
+       psllq   $2,%xmm1
+       pxor    %xmm9,%xmm6
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm3,%xmm0
+       pxor    %xmm5,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm3
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm5
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm3,%xmm9
+       psrlq   $4,%xmm3
+       movdqa  %xmm5,%xmm10
+       psrlq   $4,%xmm5
+       pxor    %xmm4,%xmm3
+       pxor    %xmm2,%xmm5
+       pand    %xmm7,%xmm3
+       pand    %xmm7,%xmm5
+       pxor    %xmm3,%xmm4
+       psllq   $4,%xmm3
+       pxor    %xmm5,%xmm2
+       psllq   $4,%xmm5
+       pxor    %xmm9,%xmm3
+       pxor    %xmm10,%xmm5
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm6,%xmm0
+       pxor    %xmm1,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm6
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm1
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  (%rax),%xmm7
+       pxor    %xmm7,%xmm5
+       pxor    %xmm7,%xmm3
+       pxor    %xmm7,%xmm1
+       pxor    %xmm7,%xmm6
+       pxor    %xmm7,%xmm2
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm15
+       pxor    %xmm7,%xmm0
+       .byte   0xf3,0xc3
+.size  _bsaes_decrypt8,.-_bsaes_decrypt8
+.type  _bsaes_key_convert,@function
+.align 16
+_bsaes_key_convert:
+       leaq    .Lmasks(%rip),%r11
+       movdqu  (%rcx),%xmm7
+       leaq    16(%rcx),%rcx
+       movdqa  0(%r11),%xmm0
+       movdqa  16(%r11),%xmm1
+       movdqa  32(%r11),%xmm2
+       movdqa  48(%r11),%xmm3
+       movdqa  64(%r11),%xmm4
+       pcmpeqd %xmm5,%xmm5
+
+       movdqu  (%rcx),%xmm6
+       movdqa  %xmm7,(%rax)
+       leaq    16(%rax),%rax
+       decl    %r10d
+       jmp     .Lkey_loop
+.align 16
+.Lkey_loop:
+.byte  102,15,56,0,244
+
+       movdqa  %xmm0,%xmm8
+       movdqa  %xmm1,%xmm9
+
+       pand    %xmm6,%xmm8
+       pand    %xmm6,%xmm9
+       movdqa  %xmm2,%xmm10
+       pcmpeqb %xmm0,%xmm8
+       psllq   $4,%xmm0
+       movdqa  %xmm3,%xmm11
+       pcmpeqb %xmm1,%xmm9
+       psllq   $4,%xmm1
+
+       pand    %xmm6,%xmm10
+       pand    %xmm6,%xmm11
+       movdqa  %xmm0,%xmm12
+       pcmpeqb %xmm2,%xmm10
+       psllq   $4,%xmm2
+       movdqa  %xmm1,%xmm13
+       pcmpeqb %xmm3,%xmm11
+       psllq   $4,%xmm3
+
+       movdqa  %xmm2,%xmm14
+       movdqa  %xmm3,%xmm15
+       pxor    %xmm5,%xmm8
+       pxor    %xmm5,%xmm9
+
+       pand    %xmm6,%xmm12
+       pand    %xmm6,%xmm13
+       movdqa  %xmm8,0(%rax)
+       pcmpeqb %xmm0,%xmm12
+       psrlq   $4,%xmm0
+       movdqa  %xmm9,16(%rax)
+       pcmpeqb %xmm1,%xmm13
+       psrlq   $4,%xmm1
+       leaq    16(%rcx),%rcx
+
+       pand    %xmm6,%xmm14
+       pand    %xmm6,%xmm15
+       movdqa  %xmm10,32(%rax)
+       pcmpeqb %xmm2,%xmm14
+       psrlq   $4,%xmm2
+       movdqa  %xmm11,48(%rax)
+       pcmpeqb %xmm3,%xmm15
+       psrlq   $4,%xmm3
+       movdqu  (%rcx),%xmm6
+
+       pxor    %xmm5,%xmm13
+       pxor    %xmm5,%xmm14
+       movdqa  %xmm12,64(%rax)
+       movdqa  %xmm13,80(%rax)
+       movdqa  %xmm14,96(%rax)
+       movdqa  %xmm15,112(%rax)
+       leaq    128(%rax),%rax
+       decl    %r10d
+       jnz     .Lkey_loop
+
+       movdqa  80(%r11),%xmm7
+
+       .byte   0xf3,0xc3
+.size  _bsaes_key_convert,.-_bsaes_key_convert
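
_bsaes_key_convert rewrites each 16-byte round key into the bit-sliced form the round functions above expect: after a byte shuffle, every key bit is expanded with pcmpeqb against the .Lmasks constants into a full byte of 0x00 or 0xFF, one 128-bit register per bit position, and selected planes are inverted. Ignoring the byte reordering and the inversion, the representation is roughly the following (an illustrative sketch only, not the exact layout the code produces):

    /* Expand one AES round key into eight bit-plane masks: plane[b][i] is
     * 0xFF when bit b of round-key byte i is set, else 0x00.  The real code
     * additionally shuffles the byte order and inverts some planes, folding
     * a constant of the S-box into the key material. */
    static void roundkey_to_bitplanes(const unsigned char rk[16],
                                      unsigned char plane[8][16]) {
        for (int b = 0; b < 8; b++)
            for (int i = 0; i < 16; i++)
                plane[b][i] = (rk[i] >> b) & 1 ? 0xFF : 0x00;
    }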
+
+.globl bsaes_cbc_encrypt
+.type  bsaes_cbc_encrypt,@function
+.align 16
+bsaes_cbc_encrypt:
+       cmpl    $0,%r9d
+       jne     asm_AES_cbc_encrypt
+       cmpq    $128,%rdx
+       jb      asm_AES_cbc_encrypt
+
+       movq    %rsp,%rax
+.Lcbc_dec_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movl    240(%rcx),%eax
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+       movq    %r8,%rbx
+       shrq    $4,%r14
+
+       movl    %eax,%edx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %edx,%r10d
+       call    _bsaes_key_convert
+       pxor    (%rsp),%xmm7
+       movdqa  %xmm6,(%rax)
+       movdqa  %xmm7,(%rsp)
+
+       movdqu  (%rbx),%xmm14
+       subq    $8,%r14
+.Lcbc_dec_loop:
+       movdqu  0(%r12),%xmm15
+       movdqu  16(%r12),%xmm0
+       movdqu  32(%r12),%xmm1
+       movdqu  48(%r12),%xmm2
+       movdqu  64(%r12),%xmm3
+       movdqu  80(%r12),%xmm4
+       movq    %rsp,%rax
+       movdqu  96(%r12),%xmm5
+       movl    %edx,%r10d
+       movdqu  112(%r12),%xmm6
+       movdqa  %xmm14,32(%rbp)
+
+       call    _bsaes_decrypt8
+
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm1
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm6
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm2
+       movdqu  112(%r12),%xmm14
+       pxor    %xmm13,%xmm4
+       movdqu  %xmm15,0(%r13)
+       leaq    128(%r12),%r12
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm2,96(%r13)
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+       subq    $8,%r14
+       jnc     .Lcbc_dec_loop
+
+       addq    $8,%r14
+       jz      .Lcbc_dec_done
+
+       movdqu  0(%r12),%xmm15
+       movq    %rsp,%rax
+       movl    %edx,%r10d
+       cmpq    $2,%r14
+       jb      .Lcbc_dec_one
+       movdqu  16(%r12),%xmm0
+       je      .Lcbc_dec_two
+       movdqu  32(%r12),%xmm1
+       cmpq    $4,%r14
+       jb      .Lcbc_dec_three
+       movdqu  48(%r12),%xmm2
+       je      .Lcbc_dec_four
+       movdqu  64(%r12),%xmm3
+       cmpq    $6,%r14
+       jb      .Lcbc_dec_five
+       movdqu  80(%r12),%xmm4
+       je      .Lcbc_dec_six
+       movdqu  96(%r12),%xmm5
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm1
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm6
+       movdqu  96(%r12),%xmm14
+       pxor    %xmm12,%xmm2
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm2,96(%r13)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm1
+       movdqu  80(%r12),%xmm14
+       pxor    %xmm11,%xmm6
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm14
+       pxor    %xmm10,%xmm1
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm14
+       pxor    %xmm9,%xmm3
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm14
+       pxor    %xmm8,%xmm5
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm14
+       pxor    %xmm7,%xmm0
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+       leaq    (%r12),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm14
+       movdqu  %xmm14,(%r13)
+       movdqa  %xmm15,%xmm14
+
+.Lcbc_dec_done:
+       movdqu  %xmm14,(%rbx)
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+.Lcbc_dec_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      .Lcbc_dec_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+.Lcbc_dec_epilogue:
+       .byte   0xf3,0xc3
+.size  bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
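
Despite its name, bsaes_cbc_encrypt only takes the bit-sliced path for decryption of at least eight blocks: the first four instructions send enc != 0, or anything shorter than 128 bytes, to the regular table-based asm_AES_cbc_encrypt. In C the dispatch is roughly the following (a paraphrase; the public AES_cbc_encrypt stands in for the internal asm_AES_cbc_encrypt alias):

    #include <stddef.h>
    #include <openssl/aes.h>

    void bsaes_cbc_encrypt_sketch(const unsigned char *in, unsigned char *out,
                                  size_t length, const AES_KEY *key,
                                  unsigned char ivec[16], int enc) {
        if (enc != 0 || length < 128) {        /* cmpl $0,%r9d / cmpq $128,%rdx */
            AES_cbc_encrypt(in, out, length, key, ivec, enc);
            return;
        }
        /* ...otherwise: convert the key schedule once (_bsaes_key_convert),
         * decrypt eight blocks per iteration with _bsaes_decrypt8, and XOR
         * each result with the previous ciphertext block to finish CBC... */
    }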
+
+.globl bsaes_ctr32_encrypt_blocks
+.type  bsaes_ctr32_encrypt_blocks,@function
+.align 16
+bsaes_ctr32_encrypt_blocks:
+       movq    %rsp,%rax
+.Lctr_enc_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movdqu  (%r8),%xmm0
+       movl    240(%rcx),%eax
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+       movdqa  %xmm0,32(%rbp)
+       cmpq    $8,%rdx
+       jb      .Lctr_enc_short
+
+       movl    %eax,%ebx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %ebx,%r10d
+       call    _bsaes_key_convert
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm7,(%rax)
+
+       movdqa  (%rsp),%xmm8
+       leaq    .LADD1(%rip),%r11
+       movdqa  32(%rbp),%xmm15
+       movdqa  -32(%r11),%xmm7
+.byte  102,68,15,56,0,199
+.byte  102,68,15,56,0,255
+       movdqa  %xmm8,(%rsp)
+       jmp     .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+       movdqa  %xmm15,32(%rbp)
+       movdqa  %xmm15,%xmm0
+       movdqa  %xmm15,%xmm1
+       paddd   0(%r11),%xmm0
+       movdqa  %xmm15,%xmm2
+       paddd   16(%r11),%xmm1
+       movdqa  %xmm15,%xmm3
+       paddd   32(%r11),%xmm2
+       movdqa  %xmm15,%xmm4
+       paddd   48(%r11),%xmm3
+       movdqa  %xmm15,%xmm5
+       paddd   64(%r11),%xmm4
+       movdqa  %xmm15,%xmm6
+       paddd   80(%r11),%xmm5
+       paddd   96(%r11),%xmm6
+
+
+
+       movdqa  (%rsp),%xmm8
+       leaq    16(%rsp),%rax
+       movdqa  -16(%r11),%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+.byte  102,68,15,56,0,255
+       pxor    %xmm8,%xmm1
+.byte  102,15,56,0,199
+       pxor    %xmm8,%xmm2
+.byte  102,15,56,0,207
+       pxor    %xmm8,%xmm3
+.byte  102,15,56,0,215
+       pxor    %xmm8,%xmm4
+.byte  102,15,56,0,223
+       pxor    %xmm8,%xmm5
+.byte  102,15,56,0,231
+       pxor    %xmm8,%xmm6
+.byte  102,15,56,0,239
+       leaq    .LBS0(%rip),%r11
+.byte  102,15,56,0,247
+       movl    %ebx,%r10d
+
+       call    _bsaes_encrypt8_bitslice
+
+       subq    $8,%r14
+       jc      .Lctr_enc_loop_done
+
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       movdqu  32(%r12),%xmm9
+       movdqu  48(%r12),%xmm10
+       movdqu  64(%r12),%xmm11
+       movdqu  80(%r12),%xmm12
+       movdqu  96(%r12),%xmm13
+       movdqu  112(%r12),%xmm14
+       leaq    128(%r12),%r12
+       pxor    %xmm15,%xmm7
+       movdqa  32(%rbp),%xmm15
+       pxor    %xmm8,%xmm0
+       movdqu  %xmm7,0(%r13)
+       pxor    %xmm9,%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    %xmm10,%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    %xmm11,%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    %xmm12,%xmm6
+       movdqu  %xmm2,64(%r13)
+       pxor    %xmm13,%xmm1
+       movdqu  %xmm6,80(%r13)
+       pxor    %xmm14,%xmm4
+       movdqu  %xmm1,96(%r13)
+       leaq    .LADD1(%rip),%r11
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+       paddd   112(%r11),%xmm15
+       jnz     .Lctr_enc_loop
+
+       jmp     .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+       addq    $8,%r14
+       movdqu  0(%r12),%xmm7
+       pxor    %xmm7,%xmm15
+       movdqu  %xmm15,0(%r13)
+       cmpq    $2,%r14
+       jb      .Lctr_enc_done
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm8,%xmm0
+       movdqu  %xmm0,16(%r13)
+       je      .Lctr_enc_done
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm9,%xmm3
+       movdqu  %xmm3,32(%r13)
+       cmpq    $4,%r14
+       jb      .Lctr_enc_done
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm10,%xmm5
+       movdqu  %xmm5,48(%r13)
+       je      .Lctr_enc_done
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm11,%xmm2
+       movdqu  %xmm2,64(%r13)
+       cmpq    $6,%r14
+       jb      .Lctr_enc_done
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm12,%xmm6
+       movdqu  %xmm6,80(%r13)
+       je      .Lctr_enc_done
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm13,%xmm1
+       movdqu  %xmm1,96(%r13)
+       jmp     .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+       leaq    32(%rbp),%rdi
+       leaq    48(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    asm_AES_encrypt
+       movdqu  (%r12),%xmm0
+       leaq    16(%r12),%r12
+       movl    44(%rbp),%eax
+       bswapl  %eax
+       pxor    48(%rbp),%xmm0
+       incl    %eax
+       movdqu  %xmm0,(%r13)
+       bswapl  %eax
+       leaq    16(%r13),%r13
+       movl    %eax,44(%rsp)
+       decq    %r14
+       jnz     .Lctr_enc_short
+
+.Lctr_enc_done:
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+.Lctr_enc_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      .Lctr_enc_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+.Lctr_enc_epilogue:
+       .byte   0xf3,0xc3
+.size  bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
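
bsaes_ctr32_encrypt_blocks processes whole 16-byte blocks and, as the "32" suggests, increments only the last four counter bytes (big-endian), which is exactly the counter behaviour GCM requires; inputs shorter than eight blocks take the one-block path at .Lctr_enc_short instead. A plain-C reference of the same semantics, useful for checking the fast path (assumed prototype as above, with the public AES_encrypt as the single-block primitive):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>
    #include <openssl/aes.h>

    /* Reference for the bsaes_ctr32_encrypt_blocks contract: encrypt `blocks`
     * full 16-byte blocks in CTR mode, wrapping only the low (last,
     * big-endian) 32 bits of the counter. */
    static void ctr32_reference(const unsigned char *in, unsigned char *out,
                                size_t blocks, const AES_KEY *key,
                                const unsigned char ivec[16]) {
        unsigned char ctr[16], ks[16];
        memcpy(ctr, ivec, 16);
        while (blocks--) {
            AES_encrypt(ctr, ks, key);
            for (int i = 0; i < 16; i++)
                out[i] = in[i] ^ ks[i];
            uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
                         ((uint32_t)ctr[14] << 8) | ctr[15];
            c++;
            ctr[12] = (unsigned char)(c >> 24);
            ctr[13] = (unsigned char)(c >> 16);
            ctr[14] = (unsigned char)(c >> 8);
            ctr[15] = (unsigned char)c;
            in += 16;
            out += 16;
        }
    }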
+.globl bsaes_xts_encrypt
+.type  bsaes_xts_encrypt,@function
+.align 16
+bsaes_xts_encrypt:
+       movq    %rsp,%rax
+.Lxts_enc_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+
+       leaq    (%r9),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r8),%rdx
+       call    asm_AES_encrypt
+
+
+       movl    240(%r15),%eax
+       movq    %r14,%rbx
+
+       movl    %eax,%edx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %edx,%r10d
+       call    _bsaes_key_convert
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm7,(%rax)
+
+       andq    $-16,%r14
+       subq    $128,%rsp
+       movdqa  32(%rbp),%xmm6
+
+       pxor    %xmm14,%xmm14
+       movdqa  .Lxts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+
+       subq    $128,%r14
+       jc      .Lxts_enc_short
+       jmp     .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqu  112(%r12),%xmm14
+       leaq    128(%r12),%r12
+       movdqa  %xmm6,112(%rsp)
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       pxor    %xmm14,%xmm6
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm2,64(%r13)
+       pxor    96(%rsp),%xmm1
+       movdqu  %xmm6,80(%r13)
+       pxor    112(%rsp),%xmm4
+       movdqu  %xmm1,96(%r13)
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       pxor    %xmm14,%xmm14
+       movdqa  .Lxts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+
+       subq    $128,%r14
+       jnc     .Lxts_enc_loop
+
+.Lxts_enc_short:
+       addq    $128,%r14
+       jz      .Lxts_enc_done
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       cmpq    $16,%r14
+       je      .Lxts_enc_1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       cmpq    $32,%r14
+       je      .Lxts_enc_2
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       cmpq    $48,%r14
+       je      .Lxts_enc_3
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       cmpq    $64,%r14
+       je      .Lxts_enc_4
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       cmpq    $80,%r14
+       je      .Lxts_enc_5
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       cmpq    $96,%r14
+       je      .Lxts_enc_6
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqa  %xmm6,112(%rsp)
+       leaq    112(%r12),%r12
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm2,64(%r13)
+       pxor    96(%rsp),%xmm1
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm1,96(%r13)
+       leaq    112(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+       pxor    %xmm11,%xmm3
+       leaq    96(%r12),%r12
+       pxor    %xmm12,%xmm4
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm2,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       leaq    96(%r13),%r13
+
+       movdqa  96(%rsp),%xmm6
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+       pxor    %xmm10,%xmm2
+       leaq    80(%r12),%r12
+       pxor    %xmm11,%xmm3
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       movdqu  %xmm2,64(%r13)
+       leaq    80(%r13),%r13
+
+       movdqa  80(%rsp),%xmm6
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+       pxor    %xmm9,%xmm1
+       leaq    64(%r12),%r12
+       pxor    %xmm10,%xmm2
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       movdqu  %xmm5,48(%r13)
+       leaq    64(%r13),%r13
+
+       movdqa  64(%rsp),%xmm6
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+       pxor    %xmm8,%xmm0
+       leaq    48(%r12),%r12
+       pxor    %xmm9,%xmm1
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm3,32(%r13)
+       leaq    48(%r13),%r13
+
+       movdqa  48(%rsp),%xmm6
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+       pxor    %xmm7,%xmm15
+       leaq    32(%r12),%r12
+       pxor    %xmm8,%xmm0
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       leaq    32(%r13),%r13
+
+       movdqa  32(%rsp),%xmm6
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+       pxor    %xmm15,%xmm7
+       leaq    16(%r12),%r12
+       movdqa  %xmm7,32(%rbp)
+       leaq    32(%rbp),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    asm_AES_encrypt
+
+       pxor    32(%rbp),%xmm15
+
+
+
+
+
+       movdqu  %xmm15,0(%r13)
+       leaq    16(%r13),%r13
+
+       movdqa  16(%rsp),%xmm6
+
+.Lxts_enc_done:
+       andl    $15,%ebx
+       jz      .Lxts_enc_ret
+       movq    %r13,%rdx
+
+.Lxts_enc_steal:
+       movzbl  (%r12),%eax
+       movzbl  -16(%rdx),%ecx
+       leaq    1(%r12),%r12
+       movb    %al,-16(%rdx)
+       movb    %cl,0(%rdx)
+       leaq    1(%rdx),%rdx
+       subl    $1,%ebx
+       jnz     .Lxts_enc_steal
+
+       movdqu  -16(%r13),%xmm15
+       leaq    32(%rbp),%rdi
+       pxor    %xmm6,%xmm15
+       leaq    32(%rbp),%rsi
+       movdqa  %xmm15,32(%rbp)
+       leaq    (%r15),%rdx
+       call    asm_AES_encrypt
+
+       pxor    32(%rbp),%xmm6
+       movdqu  %xmm6,-16(%r13)
+
+.Lxts_enc_ret:
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+.Lxts_enc_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      .Lxts_enc_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+.Lxts_enc_epilogue:
+       .byte   0xf3,0xc3
+.size  bsaes_xts_encrypt,.-bsaes_xts_encrypt
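
The .Lxts_magic constant and the recurring pshufd $19 / pcmpgtd / pand / paddq / pxor sequence above implement the per-block XTS tweak update: multiplication of the 128-bit tweak by x in GF(2^128), reduced by x^128 + x^7 + x^2 + x + 1. A byte-wise C sketch of the same update, assuming the little-endian tweak layout of IEEE P1619 (xts_double_tweak is an illustrative helper, not part of this patch):

    /* Multiply the 128-bit little-endian tweak by x; one pass of the
     * .Lxts_magic block above. */
    static void xts_double_tweak(unsigned char t[16])
    {
        unsigned int carry = 0, i;

        for (i = 0; i < 16; i++) {
            unsigned int msb = t[i] >> 7;
            t[i] = (unsigned char)((t[i] << 1) | carry);
            carry = msb;
        }
        if (carry)
            t[0] ^= 0x87;               /* the 0x87 stored in .Lxts_magic */
    }
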
+
+.globl bsaes_xts_decrypt
+.type  bsaes_xts_decrypt,@function
+.align 16
+bsaes_xts_decrypt:
+       movq    %rsp,%rax
+.Lxts_dec_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+
+       leaq    (%r9),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r8),%rdx
+       call    asm_AES_encrypt
+
+
+       movl    240(%r15),%eax
+       movq    %r14,%rbx
+
+       movl    %eax,%edx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %edx,%r10d
+       call    _bsaes_key_convert
+       pxor    (%rsp),%xmm7
+       movdqa  %xmm6,(%rax)
+       movdqa  %xmm7,(%rsp)
+
+       xorl    %eax,%eax
+       andq    $-16,%r14
+       testl   $15,%ebx
+       setnz   %al
+       shlq    $4,%rax
+       subq    %rax,%r14
+
+       subq    $128,%rsp
+       movdqa  32(%rbp),%xmm6
+
+       pxor    %xmm14,%xmm14
+       movdqa  .Lxts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+
+       subq    $128,%r14
+       jc      .Lxts_dec_short
+       jmp     .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqu  112(%r12),%xmm14
+       leaq    128(%r12),%r12
+       movdqa  %xmm6,112(%rsp)
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       pxor    %xmm14,%xmm6
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm1,64(%r13)
+       pxor    96(%rsp),%xmm2
+       movdqu  %xmm6,80(%r13)
+       pxor    112(%rsp),%xmm4
+       movdqu  %xmm2,96(%r13)
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       pxor    %xmm14,%xmm14
+       movdqa  .Lxts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+
+       subq    $128,%r14
+       jnc     .Lxts_dec_loop
+
+.Lxts_dec_short:
+       addq    $128,%r14
+       jz      .Lxts_dec_done
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       cmpq    $16,%r14
+       je      .Lxts_dec_1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       cmpq    $32,%r14
+       je      .Lxts_dec_2
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       cmpq    $48,%r14
+       je      .Lxts_dec_3
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       cmpq    $64,%r14
+       je      .Lxts_dec_4
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       cmpq    $80,%r14
+       je      .Lxts_dec_5
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       cmpq    $96,%r14
+       je      .Lxts_dec_6
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqa  %xmm6,112(%rsp)
+       leaq    112(%r12),%r12
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm1,64(%r13)
+       pxor    96(%rsp),%xmm2
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm2,96(%r13)
+       leaq    112(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+       pxor    %xmm11,%xmm3
+       leaq    96(%r12),%r12
+       pxor    %xmm12,%xmm4
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       leaq    96(%r13),%r13
+
+       movdqa  96(%rsp),%xmm6
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+       pxor    %xmm10,%xmm2
+       leaq    80(%r12),%r12
+       pxor    %xmm11,%xmm3
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       leaq    80(%r13),%r13
+
+       movdqa  80(%rsp),%xmm6
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+       pxor    %xmm9,%xmm1
+       leaq    64(%r12),%r12
+       pxor    %xmm10,%xmm2
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       leaq    64(%r13),%r13
+
+       movdqa  64(%rsp),%xmm6
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+       pxor    %xmm8,%xmm0
+       leaq    48(%r12),%r12
+       pxor    %xmm9,%xmm1
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       leaq    48(%r13),%r13
+
+       movdqa  48(%rsp),%xmm6
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+       pxor    %xmm7,%xmm15
+       leaq    32(%r12),%r12
+       pxor    %xmm8,%xmm0
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       leaq    32(%r13),%r13
+
+       movdqa  32(%rsp),%xmm6
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+       pxor    %xmm15,%xmm7
+       leaq    16(%r12),%r12
+       movdqa  %xmm7,32(%rbp)
+       leaq    32(%rbp),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm15
+
+
+
+
+
+       movdqu  %xmm15,0(%r13)
+       leaq    16(%r13),%r13
+
+       movdqa  16(%rsp),%xmm6
+
+.Lxts_dec_done:
+       andl    $15,%ebx
+       jz      .Lxts_dec_ret
+
+       pxor    %xmm14,%xmm14
+       movdqa  .Lxts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+       pshufd  $19,%xmm14,%xmm13
+       movdqa  %xmm6,%xmm5
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       movdqu  (%r12),%xmm15
+       pxor    %xmm13,%xmm6
+
+       leaq    32(%rbp),%rdi
+       pxor    %xmm6,%xmm15
+       leaq    32(%rbp),%rsi
+       movdqa  %xmm15,32(%rbp)
+       leaq    (%r15),%rdx
+       call    asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm6
+       movq    %r13,%rdx
+       movdqu  %xmm6,(%r13)
+
+.Lxts_dec_steal:
+       movzbl  16(%r12),%eax
+       movzbl  (%rdx),%ecx
+       leaq    1(%r12),%r12
+       movb    %al,(%rdx)
+       movb    %cl,16(%rdx)
+       leaq    1(%rdx),%rdx
+       subl    $1,%ebx
+       jnz     .Lxts_dec_steal
+
+       movdqu  (%r13),%xmm15
+       leaq    32(%rbp),%rdi
+       pxor    %xmm5,%xmm15
+       leaq    32(%rbp),%rsi
+       movdqa  %xmm15,32(%rbp)
+       leaq    (%r15),%rdx
+       call    asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm5
+       movdqu  %xmm5,(%r13)
+
+.Lxts_dec_ret:
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+.Lxts_dec_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      .Lxts_dec_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+.Lxts_dec_epilogue:
+       .byte   0xf3,0xc3
+.size  bsaes_xts_decrypt,.-bsaes_xts_decrypt
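
The tail handling at .Lxts_dec_done / .Lxts_dec_steal is standard XTS ciphertext stealing: the last full ciphertext block is decrypted with the next tweak, its leading bytes become the final partial plaintext, and the displaced ciphertext bytes are spliced back in before a second decryption with the original tweak. A hedged C sketch of that tail, with AES_decrypt from <openssl/aes.h> standing in for asm_AES_decrypt (xts_dec_tail and its parameters are hypothetical; tweak1/tweak2 correspond to %xmm5/%xmm6):

    #include <stddef.h>
    #include <openssl/aes.h>

    static void xts_dec_tail(const unsigned char *in, unsigned char *out,
                             size_t tail, const unsigned char tweak1[16],
                             const unsigned char tweak2[16], const AES_KEY *key)
    {
        unsigned char block[16];
        size_t i;

        /* Decrypt the last full ciphertext block with the *next* tweak. */
        for (i = 0; i < 16; i++)
            block[i] = in[i] ^ tweak2[i];
        AES_decrypt(block, block, key);
        for (i = 0; i < 16; i++)
            block[i] ^= tweak2[i];

        /* Steal: the first "tail" bytes are the final partial plaintext;
         * the leftover ciphertext bytes take their place (.Lxts_dec_steal). */
        for (i = 0; i < tail; i++) {
            out[16 + i] = block[i];
            block[i] = in[16 + i];
        }

        /* Decrypt the reassembled block with the original tweak. */
        for (i = 0; i < 16; i++)
            block[i] ^= tweak1[i];
        AES_decrypt(block, block, key);
        for (i = 0; i < 16; i++)
            out[i] = block[i] ^ tweak1[i];
    }
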
+.type  _bsaes_const,@object
+.align 64
+_bsaes_const:
+.LM0ISR:
+.quad  0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+.quad  0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+.quad  0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:
+.quad  0x5555555555555555, 0x5555555555555555
+.LBS1:
+.quad  0x3333333333333333, 0x3333333333333333
+.LBS2:
+.quad  0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:
+.quad  0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad  0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+.quad  0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP:
+.quad  0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+.quad  0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:
+.quad  0x0000000000000000, 0x0000000100000000
+.LADD2:
+.quad  0x0000000000000000, 0x0000000200000000
+.LADD3:
+.quad  0x0000000000000000, 0x0000000300000000
+.LADD4:
+.quad  0x0000000000000000, 0x0000000400000000
+.LADD5:
+.quad  0x0000000000000000, 0x0000000500000000
+.LADD6:
+.quad  0x0000000000000000, 0x0000000600000000
+.LADD7:
+.quad  0x0000000000000000, 0x0000000700000000
+.LADD8:
+.quad  0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+.long  0x87,0,1,0
+.Lmasks:
+.quad  0x0101010101010101, 0x0101010101010101
+.quad  0x0202020202020202, 0x0202020202020202
+.quad  0x0404040404040404, 0x0404040404040404
+.quad  0x0808080808080808, 0x0808080808080808
+.LM0:
+.quad  0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+.quad  0x6363636363636363, 0x6363636363636363
+.byte  66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.align 64
+.size  _bsaes_const,.-_bsaes_const
diff --git a/deps/openssl/asm/x64-elf-gas/aes/vpaes-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/vpaes-x86_64.s
new file mode 100644 (file)
index 0000000..87acfb3
--- /dev/null
@@ -0,0 +1,839 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type  _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+       movq    %rdx,%r9
+       movq    $16,%r11
+       movl    240(%rdx),%eax
+       movdqa  %xmm9,%xmm1
+       movdqa  .Lk_ipt(%rip),%xmm2
+       pandn   %xmm0,%xmm1
+       movdqu  (%r9),%xmm5
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+.byte  102,15,56,0,208
+       movdqa  .Lk_ipt+16(%rip),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm5,%xmm2
+       pxor    %xmm2,%xmm0
+       addq    $16,%r9
+       leaq    .Lk_mc_backward(%rip),%r10
+       jmp     .Lenc_entry
+
+.align 16
+.Lenc_loop:
+
+       movdqa  %xmm13,%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+       movdqa  %xmm12,%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       movdqa  %xmm15,%xmm5
+.byte  102,15,56,0,234
+       movdqa  -64(%r11,%r10,1),%xmm1
+       movdqa  %xmm14,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm5,%xmm2
+       movdqa  (%r11,%r10,1),%xmm4
+       movdqa  %xmm0,%xmm3
+.byte  102,15,56,0,193
+       addq    $16,%r9
+       pxor    %xmm2,%xmm0
+.byte  102,15,56,0,220
+       addq    $16,%r11
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,193
+       andq    $48,%r11
+       pxor    %xmm3,%xmm0
+       subq    $1,%rax
+
+.Lenc_entry:
+
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  %xmm11,%xmm5
+.byte  102,15,56,0,232
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm5,%xmm3
+       movdqa  %xmm10,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm5,%xmm4
+       movdqa  %xmm10,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm10,%xmm3
+       movdqu  (%r9),%xmm5
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       jnz     .Lenc_loop
+
+
+       movdqa  -96(%r10),%xmm4
+       movdqa  -80(%r10),%xmm0
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+.byte  102,15,56,0,195
+       movdqa  64(%r11,%r10,1),%xmm1
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,193
+       .byte   0xf3,0xc3
+.size  _vpaes_encrypt_core,.-_vpaes_encrypt_core
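
The pattern repeated throughout _vpaes_encrypt_core (and the other vpaes routines below) is a nibble-split table lookup: pand %xmm9 keeps the low nibble of every byte, pandn plus psrld $4 isolates the high nibble, and both are used as pshufb indices into 16-entry tables whose results are combined with pxor. A minimal intrinsics sketch of that building block, where lo_tbl and hi_tbl are illustrative stand-ins rather than the actual .Lk_* constants:

    #include <tmmintrin.h>                 /* SSSE3: _mm_shuffle_epi8 (pshufb) */

    static __m128i vpaes_nibble_lookup(__m128i x, __m128i lo_tbl, __m128i hi_tbl)
    {
        const __m128i k_s0F = _mm_set1_epi8(0x0f);                  /* .Lk_s0F */
        __m128i lo = _mm_and_si128(x, k_s0F);                       /* pand    */
        __m128i hi = _mm_srli_epi32(_mm_andnot_si128(k_s0F, x), 4); /* pandn + psrld $4 */

        return _mm_xor_si128(_mm_shuffle_epi8(lo_tbl, lo),          /* pshufb  */
                             _mm_shuffle_epi8(hi_tbl, hi));         /* pxor    */
    }
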
+
+
+
+
+
+
+.type  _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+       movq    %rdx,%r9
+       movl    240(%rdx),%eax
+       movdqa  %xmm9,%xmm1
+       movdqa  .Lk_dipt(%rip),%xmm2
+       pandn   %xmm0,%xmm1
+       movq    %rax,%r11
+       psrld   $4,%xmm1
+       movdqu  (%r9),%xmm5
+       shlq    $4,%r11
+       pand    %xmm9,%xmm0
+.byte  102,15,56,0,208
+       movdqa  .Lk_dipt+16(%rip),%xmm0
+       xorq    $48,%r11
+       leaq    .Lk_dsbd(%rip),%r10
+.byte  102,15,56,0,193
+       andq    $48,%r11
+       pxor    %xmm5,%xmm2
+       movdqa  .Lk_mc_forward+48(%rip),%xmm5
+       pxor    %xmm2,%xmm0
+       addq    $16,%r9
+       addq    %r10,%r11
+       jmp     .Ldec_entry
+
+.align 16
+.Ldec_loop:
+
+
+
+       movdqa  -32(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  -16(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       addq    $16,%r9
+
+.byte  102,15,56,0,197
+       movdqa  0(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  16(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       subq    $1,%rax
+
+.byte  102,15,56,0,197
+       movdqa  32(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  48(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+
+.byte  102,15,56,0,197
+       movdqa  64(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  80(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+
+.byte  102,15,58,15,237,12
+
+.Ldec_entry:
+
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  %xmm11,%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm10,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm10,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqu  (%r9),%xmm0
+       jnz     .Ldec_loop
+
+
+       movdqa  96(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  112(%r10),%xmm0
+       movdqa  -352(%r11),%xmm2
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,194
+       .byte   0xf3,0xc3
+.size  _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type  _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+
+
+
+
+
+       call    _vpaes_preheat
+
+       movdqa  .Lk_rcon(%rip),%xmm8
+       movdqu  (%rdi),%xmm0
+
+
+       movdqa  %xmm0,%xmm3
+       leaq    .Lk_ipt(%rip),%r11
+       call    _vpaes_schedule_transform
+       movdqa  %xmm0,%xmm7
+
+       leaq    .Lk_sr(%rip),%r10
+       testq   %rcx,%rcx
+       jnz     .Lschedule_am_decrypting
+
+
+       movdqu  %xmm0,(%rdx)
+       jmp     .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+       movdqa  (%r8,%r10,1),%xmm1
+.byte  102,15,56,0,217
+       movdqu  %xmm3,(%rdx)
+       xorq    $48,%r8
+
+.Lschedule_go:
+       cmpl    $192,%esi
+       ja      .Lschedule_256
+       je      .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+       movl    $10,%esi
+
+.Loop_schedule_128:
+       call    _vpaes_schedule_round
+       decq    %rsi
+       jz      .Lschedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+       jmp     .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_192:
+       movdqu  8(%rdi),%xmm0
+       call    _vpaes_schedule_transform
+
+       movdqa  %xmm0,%xmm6
+       pxor    %xmm4,%xmm4
+       movhlps %xmm4,%xmm6
+       movl    $4,%esi
+
+.Loop_schedule_192:
+       call    _vpaes_schedule_round
+.byte  102,15,58,15,198,8
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_192_smear
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_round
+       decq    %rsi
+       jz      .Lschedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_192_smear
+       jmp     .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_256:
+       movdqu  16(%rdi),%xmm0
+       call    _vpaes_schedule_transform
+
+       movl    $7,%esi
+
+.Loop_schedule_256:
+       call    _vpaes_schedule_mangle
+
+       movdqa  %xmm0,%xmm6
+
+
+       call    _vpaes_schedule_round
+       decq    %rsi
+       jz      .Lschedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+
+
+       pshufd  $255,%xmm0,%xmm0
+       movdqa  %xmm7,%xmm5
+       movdqa  %xmm6,%xmm7
+       call    _vpaes_schedule_low_round
+       movdqa  %xmm5,%xmm7
+
+       jmp     .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_mangle_last:
+
+       leaq    .Lk_deskew(%rip),%r11
+       testq   %rcx,%rcx
+       jnz     .Lschedule_mangle_last_dec
+
+
+       movdqa  (%r8,%r10,1),%xmm1
+.byte  102,15,56,0,193
+       leaq    .Lk_opt(%rip),%r11
+       addq    $32,%rdx
+
+.Lschedule_mangle_last_dec:
+       addq    $-16,%rdx
+       pxor    .Lk_s63(%rip),%xmm0
+       call    _vpaes_schedule_transform
+
+       movdqu  %xmm0,(%rdx)
+
+
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       .byte   0xf3,0xc3
+.size  _vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type  _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+       pshufd  $128,%xmm6,%xmm0
+       pxor    %xmm0,%xmm6
+       pshufd  $254,%xmm7,%xmm0
+       pxor    %xmm0,%xmm6
+       movdqa  %xmm6,%xmm0
+       pxor    %xmm1,%xmm1
+       movhlps %xmm1,%xmm6
+       .byte   0xf3,0xc3
+.size  _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type  _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+
+       pxor    %xmm1,%xmm1
+.byte  102,65,15,58,15,200,15
+.byte  102,69,15,58,15,192,15
+       pxor    %xmm1,%xmm7
+
+
+       pshufd  $255,%xmm0,%xmm0
+.byte  102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+       movdqa  %xmm7,%xmm1
+       pslldq  $4,%xmm7
+       pxor    %xmm1,%xmm7
+       movdqa  %xmm7,%xmm1
+       pslldq  $8,%xmm7
+       pxor    %xmm1,%xmm7
+       pxor    .Lk_s63(%rip),%xmm7
+
+
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  %xmm11,%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm10,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm10,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqa  %xmm13,%xmm4
+.byte  102,15,56,0,226
+       movdqa  %xmm12,%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+
+
+       pxor    %xmm7,%xmm0
+       movdqa  %xmm0,%xmm7
+       .byte   0xf3,0xc3
+.size  _vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type  _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  (%r11),%xmm2
+.byte  102,15,56,0,208
+       movdqa  16(%r11),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm2,%xmm0
+       .byte   0xf3,0xc3
+.size  _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type  _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+       movdqa  %xmm0,%xmm4
+       movdqa  .Lk_mc_forward(%rip),%xmm5
+       testq   %rcx,%rcx
+       jnz     .Lschedule_mangle_dec
+
+
+       addq    $16,%rdx
+       pxor    .Lk_s63(%rip),%xmm4
+.byte  102,15,56,0,229
+       movdqa  %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+
+       jmp     .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+
+       leaq    .Lk_dksd(%rip),%r11
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm4,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm4
+
+       movdqa  0(%r11),%xmm2
+.byte  102,15,56,0,212
+       movdqa  16(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+
+       movdqa  32(%r11),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  48(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+
+       movdqa  64(%r11),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  80(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+
+       movdqa  96(%r11),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  112(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+
+       addq    $-16,%rdx
+
+.Lschedule_mangle_both:
+       movdqa  (%r8,%r10,1),%xmm1
+.byte  102,15,56,0,217
+       addq    $-16,%r8
+       andq    $48,%r8
+       movdqu  %xmm3,(%rdx)
+       .byte   0xf3,0xc3
+.size  _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl vpaes_set_encrypt_key
+.type  vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+       movl    %esi,%eax
+       shrl    $5,%eax
+       addl    $5,%eax
+       movl    %eax,240(%rdx)
+
+       movl    $0,%ecx
+       movl    $48,%r8d
+       call    _vpaes_schedule_core
+       xorl    %eax,%eax
+       .byte   0xf3,0xc3
+.size  vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.type  vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+       movl    %esi,%eax
+       shrl    $5,%eax
+       addl    $5,%eax
+       movl    %eax,240(%rdx)
+       shll    $4,%eax
+       leaq    16(%rdx,%rax,1),%rdx
+
+       movl    $1,%ecx
+       movl    %esi,%r8d
+       shrl    $1,%r8d
+       andl    $32,%r8d
+       xorl    $32,%r8d
+       call    _vpaes_schedule_core
+       xorl    %eax,%eax
+       .byte   0xf3,0xc3
+.size  vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl vpaes_encrypt
+.type  vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+       movdqu  (%rdi),%xmm0
+       call    _vpaes_preheat
+       call    _vpaes_encrypt_core
+       movdqu  %xmm0,(%rsi)
+       .byte   0xf3,0xc3
+.size  vpaes_encrypt,.-vpaes_encrypt
+
+.globl vpaes_decrypt
+.type  vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+       movdqu  (%rdi),%xmm0
+       call    _vpaes_preheat
+       call    _vpaes_decrypt_core
+       movdqu  %xmm0,(%rsi)
+       .byte   0xf3,0xc3
+.size  vpaes_decrypt,.-vpaes_decrypt
+.globl vpaes_cbc_encrypt
+.type  vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+       xchgq   %rcx,%rdx
+       subq    $16,%rcx
+       jc      .Lcbc_abort
+       movdqu  (%r8),%xmm6
+       subq    %rdi,%rsi
+       call    _vpaes_preheat
+       cmpl    $0,%r9d
+       je      .Lcbc_dec_loop
+       jmp     .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+       movdqu  (%rdi),%xmm0
+       pxor    %xmm6,%xmm0
+       call    _vpaes_encrypt_core
+       movdqa  %xmm0,%xmm6
+       movdqu  %xmm0,(%rsi,%rdi,1)
+       leaq    16(%rdi),%rdi
+       subq    $16,%rcx
+       jnc     .Lcbc_enc_loop
+       jmp     .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+       movdqu  (%rdi),%xmm0
+       movdqa  %xmm0,%xmm7
+       call    _vpaes_decrypt_core
+       pxor    %xmm6,%xmm0
+       movdqa  %xmm7,%xmm6
+       movdqu  %xmm0,(%rsi,%rdi,1)
+       leaq    16(%rdi),%rdi
+       subq    $16,%rcx
+       jnc     .Lcbc_dec_loop
+.Lcbc_done:
+       movdqu  %xmm6,(%r8)
+.Lcbc_abort:
+       .byte   0xf3,0xc3
+.size  vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
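
vpaes_cbc_encrypt keeps the running IV in %xmm6 and chains one 16-byte block at a time through _vpaes_encrypt_core or _vpaes_decrypt_core. The encryption loop corresponds to the following C sketch, with AES_encrypt from <openssl/aes.h> standing in for _vpaes_encrypt_core (cbc_enc_sketch is hypothetical):

    #include <string.h>
    #include <openssl/aes.h>

    static void cbc_enc_sketch(const unsigned char *in, unsigned char *out,
                               size_t len, const AES_KEY *key,
                               unsigned char ivec[16])
    {
        unsigned char iv[16];
        size_t i;

        memcpy(iv, ivec, 16);
        while (len >= 16) {                  /* subq $16,%rcx / jnc .Lcbc_enc_loop */
            for (i = 0; i < 16; i++)
                iv[i] ^= in[i];              /* pxor   %xmm6,%xmm0                 */
            AES_encrypt(iv, iv, key);        /* call   _vpaes_encrypt_core         */
            memcpy(out, iv, 16);             /* movdqu %xmm0,(%rsi,%rdi,1)         */
            in += 16; out += 16; len -= 16;
        }
        memcpy(ivec, iv, 16);                /* movdqu %xmm6,(%r8)                 */
    }

.Lcbc_dec_loop is the mirror image: it saves the input block in %xmm7 before decrypting so that the original ciphertext can become the next IV.
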
+
+
+
+
+
+
+.type  _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+       leaq    .Lk_s0F(%rip),%r10
+       movdqa  -32(%r10),%xmm10
+       movdqa  -16(%r10),%xmm11
+       movdqa  0(%r10),%xmm9
+       movdqa  48(%r10),%xmm13
+       movdqa  64(%r10),%xmm12
+       movdqa  80(%r10),%xmm15
+       movdqa  96(%r10),%xmm14
+       .byte   0xf3,0xc3
+.size  _vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type  _vpaes_consts,@object
+.align 64
+_vpaes_consts:
+.Lk_inv:
+.quad  0x0E05060F0D080180, 0x040703090A0B0C02
+.quad  0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad  0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad  0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad  0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad  0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad  0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad  0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad  0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad  0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad  0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad  0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad  0x080B0A0904070605, 0x000302010C0F0E0D
+.quad  0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad  0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad  0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad  0x020100030E0D0C0F, 0x0A09080B06050407
+.quad  0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad  0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad  0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad  0x030E09040F0A0500, 0x0B06010C07020D08
+.quad  0x0F060D040B020900, 0x070E050C030A0108
+.quad  0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad  0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad  0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad  0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad  0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad  0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad  0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad  0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad  0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad  0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad  0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad  0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad  0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad  0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad  0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad  0x0F505B040B545F00, 0x154A411E114E451A
+.quad  0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad  0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad  0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad  0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad  0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad  0xD022649296B44200, 0x602646F6B0F2D404
+.quad  0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad  0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad  0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad  0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad  0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte  86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 64
+.size  _vpaes_consts,.-_vpaes_consts
diff --git a/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s b/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s
new file mode 100644 (file)
index 0000000..f5411f3
--- /dev/null
@@ -0,0 +1,1027 @@
+.text
+
+
+.globl gcm_gmult_4bit
+.type  gcm_gmult_4bit,@function
+.align 16
+gcm_gmult_4bit:
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+.Lgmult_prologue:
+
+       movzbq  15(%rdi),%r8
+       leaq    .Lrem_4bit(%rip),%r11
+       xorq    %rax,%rax
+       xorq    %rbx,%rbx
+       movb    %r8b,%al
+       movb    %r8b,%bl
+       shlb    $4,%al
+       movq    $14,%rcx
+       movq    8(%rsi,%rax,1),%r8
+       movq    (%rsi,%rax,1),%r9
+       andb    $240,%bl
+       movq    %r8,%rdx
+       jmp     .Loop1
+
+.align 16
+.Loop1:
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       movb    (%rdi,%rcx,1),%al
+       shrq    $4,%r9
+       xorq    8(%rsi,%rbx,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rbx,1),%r9
+       movb    %al,%bl
+       xorq    (%r11,%rdx,8),%r9
+       movq    %r8,%rdx
+       shlb    $4,%al
+       xorq    %r10,%r8
+       decq    %rcx
+       js      .Lbreak1
+
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       xorq    8(%rsi,%rax,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rax,1),%r9
+       andb    $240,%bl
+       xorq    (%r11,%rdx,8),%r9
+       movq    %r8,%rdx
+       xorq    %r10,%r8
+       jmp     .Loop1
+
+.align 16
+.Lbreak1:
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       xorq    8(%rsi,%rax,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rax,1),%r9
+       andb    $240,%bl
+       xorq    (%r11,%rdx,8),%r9
+       movq    %r8,%rdx
+       xorq    %r10,%r8
+
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       xorq    8(%rsi,%rbx,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rbx,1),%r9
+       xorq    %r10,%r8
+       xorq    (%r11,%rdx,8),%r9
+
+       bswapq  %r8
+       bswapq  %r9
+       movq    %r8,8(%rdi)
+       movq    %r9,(%rdi)
+
+       movq    16(%rsp),%rbx
+       leaq    24(%rsp),%rsp
+.Lgmult_epilogue:
+       .byte   0xf3,0xc3
+.size  gcm_gmult_4bit,.-gcm_gmult_4bit
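
gcm_gmult_4bit computes Xi <- Xi * H in GF(2^128), walking Xi a nibble at a time through the precomputed Htable in (%rsi) and folding the overflow back in via .Lrem_4bit. For reference, the same multiplication in its plain bit-serial form (NIST SP 800-38D bit ordering, reduction constant 0xE1); gf128_mul_ref is an illustrative sketch, not the table-driven algorithm the assembly uses:

    #include <string.h>

    static void gf128_mul_ref(unsigned char Xi[16], const unsigned char H[16])
    {
        unsigned char Z[16] = {0}, V[16];
        int i, j, lsb;

        memcpy(V, H, 16);
        for (i = 0; i < 128; i++) {
            if (Xi[i / 8] & (0x80 >> (i % 8)))       /* bit i of Xi, MSB first */
                for (j = 0; j < 16; j++)
                    Z[j] ^= V[j];
            lsb = V[15] & 1;
            for (j = 15; j > 0; j--)                 /* V >>= 1                */
                V[j] = (unsigned char)((V[j] >> 1) | (V[j - 1] << 7));
            V[0] >>= 1;
            if (lsb)
                V[0] ^= 0xE1;                        /* reduction              */
        }
        memcpy(Xi, Z, 16);
    }
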
+.globl gcm_ghash_4bit
+.type  gcm_ghash_4bit,@function
+.align 16
+gcm_ghash_4bit:
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       subq    $280,%rsp
+.Lghash_prologue:
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+       subq    $-128,%rsi
+       leaq    16+128(%rsp),%rbp
+       xorl    %edx,%edx
+       movq    0+0-128(%rsi),%r8
+       movq    0+8-128(%rsi),%rax
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    16+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    16+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,0(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,0(%rbp)
+       movq    32+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,0-128(%rbp)
+       movq    32+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,1(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,8(%rbp)
+       movq    48+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,8-128(%rbp)
+       movq    48+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,2(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,16(%rbp)
+       movq    64+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,16-128(%rbp)
+       movq    64+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,3(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,24(%rbp)
+       movq    80+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,24-128(%rbp)
+       movq    80+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,4(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,32(%rbp)
+       movq    96+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,32-128(%rbp)
+       movq    96+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,5(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,40(%rbp)
+       movq    112+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,40-128(%rbp)
+       movq    112+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,6(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,48(%rbp)
+       movq    128+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,48-128(%rbp)
+       movq    128+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,7(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,56(%rbp)
+       movq    144+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,56-128(%rbp)
+       movq    144+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,8(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,64(%rbp)
+       movq    160+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,64-128(%rbp)
+       movq    160+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,9(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,72(%rbp)
+       movq    176+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,72-128(%rbp)
+       movq    176+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,10(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,80(%rbp)
+       movq    192+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,80-128(%rbp)
+       movq    192+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,11(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,88(%rbp)
+       movq    208+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,88-128(%rbp)
+       movq    208+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,12(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,96(%rbp)
+       movq    224+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,96-128(%rbp)
+       movq    224+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,13(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,104(%rbp)
+       movq    240+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,104-128(%rbp)
+       movq    240+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,14(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,112(%rbp)
+       shlb    $4,%dl
+       movq    %rax,112-128(%rbp)
+       shlq    $60,%r10
+       movb    %dl,15(%rsp)
+       orq     %r10,%rbx
+       movq    %r9,120(%rbp)
+       movq    %rbx,120-128(%rbp)
+       addq    $-128,%rsi
+       movq    8(%rdi),%r8
+       movq    0(%rdi),%r9
+       addq    %r14,%r15
+       leaq    .Lrem_8bit(%rip),%r11
+       jmp     .Louter_loop
+.align 16
+.Louter_loop:
+       xorq    (%r14),%r9
+       movq    8(%r14),%rdx
+       leaq    16(%r14),%r14
+       xorq    %r8,%rdx
+       movq    %r9,(%rdi)
+       movq    %rdx,8(%rdi)
+       shrq    $32,%rdx
+       xorq    %rax,%rax
+       roll    $8,%edx
+       movb    %dl,%al
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       shrl    $4,%ebx
+       roll    $8,%edx
+       movq    8(%rsi,%rax,1),%r8
+       movq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    8(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    4(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    0(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       andl    $240,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    -4(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       movzwq  (%r11,%r12,2),%r12
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       shlq    $48,%r12
+       xorq    %r10,%r8
+       xorq    %r12,%r9
+       movzbq  %r8b,%r13
+       shrq    $4,%r8
+       movq    %r9,%r10
+       shlb    $4,%r13b
+       shrq    $4,%r9
+       xorq    8(%rsi,%rcx,1),%r8
+       movzwq  (%r11,%r13,2),%r13
+       shlq    $60,%r10
+       xorq    (%rsi,%rcx,1),%r9
+       xorq    %r10,%r8
+       shlq    $48,%r13
+       bswapq  %r8
+       xorq    %r13,%r9
+       bswapq  %r9
+       cmpq    %r15,%r14
+       jb      .Louter_loop
+       movq    %r8,8(%rdi)
+       movq    %r9,(%rdi)
+
+       leaq    280(%rsp),%rsi
+       movq    0(%rsi),%r15
+       movq    8(%rsi),%r14
+       movq    16(%rsi),%r13
+       movq    24(%rsi),%r12
+       movq    32(%rsi),%rbp
+       movq    40(%rsi),%rbx
+       leaq    48(%rsi),%rsp
+.Lghash_epilogue:
+       .byte   0xf3,0xc3
+.size  gcm_ghash_4bit,.-gcm_ghash_4bit
+.globl gcm_init_clmul
+.type  gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+       movdqu  (%rsi),%xmm2
+       pshufd  $78,%xmm2,%xmm2
+
+
+       pshufd  $255,%xmm2,%xmm4
+       movdqa  %xmm2,%xmm3
+       psllq   $1,%xmm2
+       pxor    %xmm5,%xmm5
+       psrlq   $63,%xmm3
+       pcmpgtd %xmm4,%xmm5
+       pslldq  $8,%xmm3
+       por     %xmm3,%xmm2
+
+
+       pand    .L0x1c2_polynomial(%rip),%xmm5
+       pxor    %xmm5,%xmm2
+
+
+       movdqa  %xmm2,%xmm0
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,194,0
+.byte  102,15,58,68,202,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       movdqu  %xmm2,(%rdi)
+       movdqu  %xmm0,16(%rdi)
+       .byte   0xf3,0xc3
+.size  gcm_init_clmul,.-gcm_init_clmul
+.globl gcm_gmult_clmul
+.type  gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+       movdqu  (%rdi),%xmm0
+       movdqa  .Lbswap_mask(%rip),%xmm5
+       movdqu  (%rsi),%xmm2
+.byte  102,15,56,0,197
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,194,0
+.byte  102,15,58,68,202,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,197
+       movdqu  %xmm0,(%rdi)
+       .byte   0xf3,0xc3
+.size  gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl gcm_ghash_clmul
+.type  gcm_ghash_clmul,@function
+.align 16
+gcm_ghash_clmul:
+       movdqa  .Lbswap_mask(%rip),%xmm5
+
+       movdqu  (%rdi),%xmm0
+       movdqu  (%rsi),%xmm2
+.byte  102,15,56,0,197
+
+       subq    $16,%rcx
+       jz      .Lodd_tail
+
+       movdqu  16(%rsi),%xmm8
+
+
+
+
+
+       movdqu  (%rdx),%xmm3
+       movdqu  16(%rdx),%xmm6
+.byte  102,15,56,0,221
+.byte  102,15,56,0,245
+       pxor    %xmm3,%xmm0
+       movdqa  %xmm6,%xmm7
+       pshufd  $78,%xmm6,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm6,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,242,0
+.byte  102,15,58,68,250,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm6,%xmm3
+       pxor    %xmm7,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm7
+       pxor    %xmm4,%xmm6
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm8,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm8,%xmm4
+
+       leaq    32(%rdx),%rdx
+       subq    $32,%rcx
+       jbe     .Leven_tail
+
+.Lmod_loop:
+.byte  102,65,15,58,68,192,0
+.byte  102,65,15,58,68,200,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+       movdqu  (%rdx),%xmm3
+       pxor    %xmm6,%xmm0
+       pxor    %xmm7,%xmm1
+
+       movdqu  16(%rdx),%xmm6
+.byte  102,15,56,0,221
+.byte  102,15,56,0,245
+
+       movdqa  %xmm6,%xmm7
+       pshufd  $78,%xmm6,%xmm9
+       pshufd  $78,%xmm2,%xmm10
+       pxor    %xmm6,%xmm9
+       pxor    %xmm2,%xmm10
+       pxor    %xmm3,%xmm1
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+.byte  102,15,58,68,242,0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+.byte  102,15,58,68,250,17
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+
+.byte  102,69,15,58,68,202,0
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm8,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm8,%xmm4
+
+       pxor    %xmm6,%xmm9
+       pxor    %xmm7,%xmm9
+       movdqa  %xmm9,%xmm10
+       psrldq  $8,%xmm9
+       pslldq  $8,%xmm10
+       pxor    %xmm9,%xmm7
+       pxor    %xmm10,%xmm6
+
+       leaq    32(%rdx),%rdx
+       subq    $32,%rcx
+       ja      .Lmod_loop
+
+.Leven_tail:
+.byte  102,65,15,58,68,192,0
+.byte  102,65,15,58,68,200,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+       pxor    %xmm6,%xmm0
+       pxor    %xmm7,%xmm1
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       testq   %rcx,%rcx
+       jnz     .Ldone
+
+.Lodd_tail:
+       movdqu  (%rdx),%xmm3
+.byte  102,15,56,0,221
+       pxor    %xmm3,%xmm0
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,194,0
+.byte  102,15,58,68,202,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+.Ldone:
+.byte  102,15,56,0,197
+       movdqu  %xmm0,(%rdi)
+       .byte   0xf3,0xc3
+.LSEH_end_gcm_ghash_clmul:
+.size  gcm_ghash_clmul,.-gcm_ghash_clmul
+.align 64
+.Lbswap_mask:
+.byte  15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align 64
+.type  .Lrem_4bit,@object
+.Lrem_4bit:
+.long  0,0,0,471859200,0,943718400,0,610271232
+.long  0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long  0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long  0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type  .Lrem_8bit,@object
+.Lrem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte  71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/deps/openssl/asm/x64-macosx-gas/aes/bsaes-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/bsaes-x86_64.s
new file mode 100644
index 0000000..e2911ba
--- /dev/null
@@ -0,0 +1,2569 @@
+.text
+
+
+
+
+
+
+.p2align       6
+_bsaes_encrypt8:
+       leaq    L$BS0(%rip),%r11
+
+       movdqa  (%rax),%xmm8
+       leaq    16(%rax),%rax
+       movdqa  80(%r11),%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+.byte  102,68,15,56,0,255
+       pxor    %xmm8,%xmm1
+.byte  102,15,56,0,199
+       pxor    %xmm8,%xmm2
+.byte  102,15,56,0,207
+       pxor    %xmm8,%xmm3
+.byte  102,15,56,0,215
+       pxor    %xmm8,%xmm4
+.byte  102,15,56,0,223
+       pxor    %xmm8,%xmm5
+.byte  102,15,56,0,231
+       pxor    %xmm8,%xmm6
+.byte  102,15,56,0,239
+.byte  102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm5,%xmm9
+       psrlq   $1,%xmm5
+       movdqa  %xmm3,%xmm10
+       psrlq   $1,%xmm3
+       pxor    %xmm6,%xmm5
+       pxor    %xmm4,%xmm3
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm3
+       pxor    %xmm5,%xmm6
+       psllq   $1,%xmm5
+       pxor    %xmm3,%xmm4
+       psllq   $1,%xmm3
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm1,%xmm9
+       psrlq   $1,%xmm1
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm1
+       pand    %xmm7,%xmm15
+       pxor    %xmm1,%xmm2
+       psllq   $1,%xmm1
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm1
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm4,%xmm9
+       psrlq   $2,%xmm4
+       movdqa  %xmm3,%xmm10
+       psrlq   $2,%xmm3
+       pxor    %xmm6,%xmm4
+       pxor    %xmm5,%xmm3
+       pand    %xmm8,%xmm4
+       pand    %xmm8,%xmm3
+       pxor    %xmm4,%xmm6
+       psllq   $2,%xmm4
+       pxor    %xmm3,%xmm5
+       psllq   $2,%xmm3
+       pxor    %xmm9,%xmm4
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm2,%xmm0
+       pxor    %xmm1,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm2
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm1
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm2,%xmm9
+       psrlq   $4,%xmm2
+       movdqa  %xmm1,%xmm10
+       psrlq   $4,%xmm1
+       pxor    %xmm6,%xmm2
+       pxor    %xmm5,%xmm1
+       pand    %xmm7,%xmm2
+       pand    %xmm7,%xmm1
+       pxor    %xmm2,%xmm6
+       psllq   $4,%xmm2
+       pxor    %xmm1,%xmm5
+       psllq   $4,%xmm1
+       pxor    %xmm9,%xmm2
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm4,%xmm0
+       pxor    %xmm3,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm4
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm3
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       decl    %r10d
+       jmp     L$enc_sbox
+.p2align       4
+L$enc_loop:
+       pxor    0(%rax),%xmm15
+       pxor    16(%rax),%xmm0
+.byte  102,68,15,56,0,255
+       pxor    32(%rax),%xmm1
+.byte  102,15,56,0,199
+       pxor    48(%rax),%xmm2
+.byte  102,15,56,0,207
+       pxor    64(%rax),%xmm3
+.byte  102,15,56,0,215
+       pxor    80(%rax),%xmm4
+.byte  102,15,56,0,223
+       pxor    96(%rax),%xmm5
+.byte  102,15,56,0,231
+       pxor    112(%rax),%xmm6
+.byte  102,15,56,0,239
+       leaq    128(%rax),%rax
+.byte  102,15,56,0,247
+L$enc_sbox:
+       pxor    %xmm5,%xmm4
+       pxor    %xmm0,%xmm1
+       pxor    %xmm15,%xmm2
+       pxor    %xmm1,%xmm5
+       pxor    %xmm15,%xmm4
+
+       pxor    %xmm2,%xmm5
+       pxor    %xmm6,%xmm2
+       pxor    %xmm4,%xmm6
+       pxor    %xmm3,%xmm2
+       pxor    %xmm4,%xmm3
+       pxor    %xmm0,%xmm2
+
+       pxor    %xmm6,%xmm1
+       pxor    %xmm4,%xmm0
+       movdqa  %xmm6,%xmm10
+       movdqa  %xmm0,%xmm9
+       movdqa  %xmm4,%xmm8
+       movdqa  %xmm1,%xmm12
+       movdqa  %xmm5,%xmm11
+
+       pxor    %xmm3,%xmm10
+       pxor    %xmm1,%xmm9
+       pxor    %xmm2,%xmm8
+       movdqa  %xmm10,%xmm13
+       pxor    %xmm3,%xmm12
+       movdqa  %xmm9,%xmm7
+       pxor    %xmm15,%xmm11
+       movdqa  %xmm10,%xmm14
+
+       por     %xmm8,%xmm9
+       por     %xmm11,%xmm10
+       pxor    %xmm7,%xmm14
+       pand    %xmm11,%xmm13
+       pxor    %xmm8,%xmm11
+       pand    %xmm8,%xmm7
+       pand    %xmm11,%xmm14
+       movdqa  %xmm2,%xmm11
+       pxor    %xmm15,%xmm11
+       pand    %xmm11,%xmm12
+       pxor    %xmm12,%xmm10
+       pxor    %xmm12,%xmm9
+       movdqa  %xmm6,%xmm12
+       movdqa  %xmm4,%xmm11
+       pxor    %xmm0,%xmm12
+       pxor    %xmm5,%xmm11
+       movdqa  %xmm12,%xmm8
+       pand    %xmm11,%xmm12
+       por     %xmm11,%xmm8
+       pxor    %xmm12,%xmm7
+       pxor    %xmm14,%xmm10
+       pxor    %xmm13,%xmm9
+       pxor    %xmm14,%xmm8
+       movdqa  %xmm1,%xmm11
+       pxor    %xmm13,%xmm7
+       movdqa  %xmm3,%xmm12
+       pxor    %xmm13,%xmm8
+       movdqa  %xmm0,%xmm13
+       pand    %xmm2,%xmm11
+       movdqa  %xmm6,%xmm14
+       pand    %xmm15,%xmm12
+       pand    %xmm4,%xmm13
+       por     %xmm5,%xmm14
+       pxor    %xmm11,%xmm10
+       pxor    %xmm12,%xmm9
+       pxor    %xmm13,%xmm8
+       pxor    %xmm14,%xmm7
+
+
+
+
+
+       movdqa  %xmm10,%xmm11
+       pand    %xmm8,%xmm10
+       pxor    %xmm9,%xmm11
+
+       movdqa  %xmm7,%xmm13
+       movdqa  %xmm11,%xmm14
+       pxor    %xmm10,%xmm13
+       pand    %xmm13,%xmm14
+
+       movdqa  %xmm8,%xmm12
+       pxor    %xmm9,%xmm14
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm9,%xmm10
+
+       pand    %xmm10,%xmm12
+
+       movdqa  %xmm13,%xmm9
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm12,%xmm9
+       pxor    %xmm12,%xmm8
+
+       pand    %xmm7,%xmm9
+
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm8
+
+       pand    %xmm14,%xmm13
+
+       pxor    %xmm11,%xmm13
+       movdqa  %xmm5,%xmm11
+       movdqa  %xmm4,%xmm7
+       movdqa  %xmm14,%xmm9
+       pxor    %xmm13,%xmm9
+       pand    %xmm5,%xmm9
+       pxor    %xmm4,%xmm5
+       pand    %xmm14,%xmm4
+       pand    %xmm13,%xmm5
+       pxor    %xmm4,%xmm5
+       pxor    %xmm9,%xmm4
+       pxor    %xmm15,%xmm11
+       pxor    %xmm2,%xmm7
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm15,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm2,%xmm15
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm2
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm15
+       pxor    %xmm11,%xmm7
+       pxor    %xmm2,%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm2
+       pxor    %xmm11,%xmm5
+       pxor    %xmm11,%xmm15
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm2
+
+       movdqa  %xmm6,%xmm11
+       movdqa  %xmm0,%xmm7
+       pxor    %xmm3,%xmm11
+       pxor    %xmm1,%xmm7
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm3,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm1,%xmm3
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm1
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm3
+       pxor    %xmm11,%xmm7
+       pxor    %xmm1,%xmm3
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm1
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       pxor    %xmm13,%xmm10
+       pand    %xmm6,%xmm10
+       pxor    %xmm0,%xmm6
+       pand    %xmm14,%xmm0
+       pand    %xmm13,%xmm6
+       pxor    %xmm0,%xmm6
+       pxor    %xmm10,%xmm0
+       pxor    %xmm11,%xmm6
+       pxor    %xmm11,%xmm3
+       pxor    %xmm7,%xmm0
+       pxor    %xmm7,%xmm1
+       pxor    %xmm15,%xmm6
+       pxor    %xmm5,%xmm0
+       pxor    %xmm6,%xmm3
+       pxor    %xmm15,%xmm5
+       pxor    %xmm0,%xmm15
+
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       pxor    %xmm2,%xmm1
+       pxor    %xmm4,%xmm2
+       pxor    %xmm4,%xmm3
+
+       pxor    %xmm2,%xmm5
+       decl    %r10d
+       jl      L$enc_done
+       pshufd  $147,%xmm15,%xmm7
+       pshufd  $147,%xmm0,%xmm8
+       pxor    %xmm7,%xmm15
+       pshufd  $147,%xmm3,%xmm9
+       pxor    %xmm8,%xmm0
+       pshufd  $147,%xmm5,%xmm10
+       pxor    %xmm9,%xmm3
+       pshufd  $147,%xmm2,%xmm11
+       pxor    %xmm10,%xmm5
+       pshufd  $147,%xmm6,%xmm12
+       pxor    %xmm11,%xmm2
+       pshufd  $147,%xmm1,%xmm13
+       pxor    %xmm12,%xmm6
+       pshufd  $147,%xmm4,%xmm14
+       pxor    %xmm13,%xmm1
+       pxor    %xmm14,%xmm4
+
+       pxor    %xmm15,%xmm8
+       pxor    %xmm4,%xmm7
+       pxor    %xmm4,%xmm8
+       pshufd  $78,%xmm15,%xmm15
+       pxor    %xmm0,%xmm9
+       pshufd  $78,%xmm0,%xmm0
+       pxor    %xmm2,%xmm12
+       pxor    %xmm7,%xmm15
+       pxor    %xmm6,%xmm13
+       pxor    %xmm8,%xmm0
+       pxor    %xmm5,%xmm11
+       pshufd  $78,%xmm2,%xmm7
+       pxor    %xmm1,%xmm14
+       pshufd  $78,%xmm6,%xmm8
+       pxor    %xmm3,%xmm10
+       pshufd  $78,%xmm5,%xmm2
+       pxor    %xmm4,%xmm10
+       pshufd  $78,%xmm4,%xmm6
+       pxor    %xmm4,%xmm11
+       pshufd  $78,%xmm1,%xmm5
+       pxor    %xmm11,%xmm7
+       pshufd  $78,%xmm3,%xmm1
+       pxor    %xmm12,%xmm8
+
+       pxor    %xmm10,%xmm2
+       pxor    %xmm14,%xmm6
+       pxor    %xmm13,%xmm5
+       movdqa  %xmm7,%xmm3
+       pxor    %xmm9,%xmm1
+       movdqa  %xmm8,%xmm4
+       movdqa  48(%r11),%xmm7
+       jnz     L$enc_loop
+       movdqa  64(%r11),%xmm7
+       jmp     L$enc_loop
+.p2align       4
+L$enc_done:
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm1,%xmm9
+       psrlq   $1,%xmm1
+       movdqa  %xmm2,%xmm10
+       psrlq   $1,%xmm2
+       pxor    %xmm4,%xmm1
+       pxor    %xmm6,%xmm2
+       pand    %xmm7,%xmm1
+       pand    %xmm7,%xmm2
+       pxor    %xmm1,%xmm4
+       psllq   $1,%xmm1
+       pxor    %xmm2,%xmm6
+       psllq   $1,%xmm2
+       pxor    %xmm9,%xmm1
+       pxor    %xmm10,%xmm2
+       movdqa  %xmm3,%xmm9
+       psrlq   $1,%xmm3
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm5,%xmm3
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm3
+       pand    %xmm7,%xmm15
+       pxor    %xmm3,%xmm5
+       psllq   $1,%xmm3
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm3
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm6,%xmm9
+       psrlq   $2,%xmm6
+       movdqa  %xmm2,%xmm10
+       psrlq   $2,%xmm2
+       pxor    %xmm4,%xmm6
+       pxor    %xmm1,%xmm2
+       pand    %xmm8,%xmm6
+       pand    %xmm8,%xmm2
+       pxor    %xmm6,%xmm4
+       psllq   $2,%xmm6
+       pxor    %xmm2,%xmm1
+       psllq   $2,%xmm2
+       pxor    %xmm9,%xmm6
+       pxor    %xmm10,%xmm2
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm5,%xmm0
+       pxor    %xmm3,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm5
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm3
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm5,%xmm9
+       psrlq   $4,%xmm5
+       movdqa  %xmm3,%xmm10
+       psrlq   $4,%xmm3
+       pxor    %xmm4,%xmm5
+       pxor    %xmm1,%xmm3
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm3
+       pxor    %xmm5,%xmm4
+       psllq   $4,%xmm5
+       pxor    %xmm3,%xmm1
+       psllq   $4,%xmm3
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm6,%xmm0
+       pxor    %xmm2,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm6
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm2
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  (%rax),%xmm7
+       pxor    %xmm7,%xmm3
+       pxor    %xmm7,%xmm5
+       pxor    %xmm7,%xmm2
+       pxor    %xmm7,%xmm6
+       pxor    %xmm7,%xmm1
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm15
+       pxor    %xmm7,%xmm0
+       .byte   0xf3,0xc3
+
+
+
+.p2align       6
+_bsaes_decrypt8:
+       leaq    L$BS0(%rip),%r11
+
+       movdqa  (%rax),%xmm8
+       leaq    16(%rax),%rax
+       movdqa  -48(%r11),%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+.byte  102,68,15,56,0,255
+       pxor    %xmm8,%xmm1
+.byte  102,15,56,0,199
+       pxor    %xmm8,%xmm2
+.byte  102,15,56,0,207
+       pxor    %xmm8,%xmm3
+.byte  102,15,56,0,215
+       pxor    %xmm8,%xmm4
+.byte  102,15,56,0,223
+       pxor    %xmm8,%xmm5
+.byte  102,15,56,0,231
+       pxor    %xmm8,%xmm6
+.byte  102,15,56,0,239
+.byte  102,15,56,0,247
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm5,%xmm9
+       psrlq   $1,%xmm5
+       movdqa  %xmm3,%xmm10
+       psrlq   $1,%xmm3
+       pxor    %xmm6,%xmm5
+       pxor    %xmm4,%xmm3
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm3
+       pxor    %xmm5,%xmm6
+       psllq   $1,%xmm5
+       pxor    %xmm3,%xmm4
+       psllq   $1,%xmm3
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm1,%xmm9
+       psrlq   $1,%xmm1
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm2,%xmm1
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm1
+       pand    %xmm7,%xmm15
+       pxor    %xmm1,%xmm2
+       psllq   $1,%xmm1
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm1
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm4,%xmm9
+       psrlq   $2,%xmm4
+       movdqa  %xmm3,%xmm10
+       psrlq   $2,%xmm3
+       pxor    %xmm6,%xmm4
+       pxor    %xmm5,%xmm3
+       pand    %xmm8,%xmm4
+       pand    %xmm8,%xmm3
+       pxor    %xmm4,%xmm6
+       psllq   $2,%xmm4
+       pxor    %xmm3,%xmm5
+       psllq   $2,%xmm3
+       pxor    %xmm9,%xmm4
+       pxor    %xmm10,%xmm3
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm2,%xmm0
+       pxor    %xmm1,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm2
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm1
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm2,%xmm9
+       psrlq   $4,%xmm2
+       movdqa  %xmm1,%xmm10
+       psrlq   $4,%xmm1
+       pxor    %xmm6,%xmm2
+       pxor    %xmm5,%xmm1
+       pand    %xmm7,%xmm2
+       pand    %xmm7,%xmm1
+       pxor    %xmm2,%xmm6
+       psllq   $4,%xmm2
+       pxor    %xmm1,%xmm5
+       psllq   $4,%xmm1
+       pxor    %xmm9,%xmm2
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm4,%xmm0
+       pxor    %xmm3,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm4
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm3
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       decl    %r10d
+       jmp     L$dec_sbox
+.p2align       4
+L$dec_loop:
+       pxor    0(%rax),%xmm15
+       pxor    16(%rax),%xmm0
+.byte  102,68,15,56,0,255
+       pxor    32(%rax),%xmm1
+.byte  102,15,56,0,199
+       pxor    48(%rax),%xmm2
+.byte  102,15,56,0,207
+       pxor    64(%rax),%xmm3
+.byte  102,15,56,0,215
+       pxor    80(%rax),%xmm4
+.byte  102,15,56,0,223
+       pxor    96(%rax),%xmm5
+.byte  102,15,56,0,231
+       pxor    112(%rax),%xmm6
+.byte  102,15,56,0,239
+       leaq    128(%rax),%rax
+.byte  102,15,56,0,247
+L$dec_sbox:
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm6,%xmm3
+       pxor    %xmm6,%xmm1
+       pxor    %xmm3,%xmm5
+       pxor    %xmm5,%xmm6
+       pxor    %xmm6,%xmm0
+
+       pxor    %xmm0,%xmm15
+       pxor    %xmm4,%xmm1
+       pxor    %xmm15,%xmm2
+       pxor    %xmm15,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqa  %xmm2,%xmm10
+       movdqa  %xmm6,%xmm9
+       movdqa  %xmm0,%xmm8
+       movdqa  %xmm3,%xmm12
+       movdqa  %xmm4,%xmm11
+
+       pxor    %xmm15,%xmm10
+       pxor    %xmm3,%xmm9
+       pxor    %xmm5,%xmm8
+       movdqa  %xmm10,%xmm13
+       pxor    %xmm15,%xmm12
+       movdqa  %xmm9,%xmm7
+       pxor    %xmm1,%xmm11
+       movdqa  %xmm10,%xmm14
+
+       por     %xmm8,%xmm9
+       por     %xmm11,%xmm10
+       pxor    %xmm7,%xmm14
+       pand    %xmm11,%xmm13
+       pxor    %xmm8,%xmm11
+       pand    %xmm8,%xmm7
+       pand    %xmm11,%xmm14
+       movdqa  %xmm5,%xmm11
+       pxor    %xmm1,%xmm11
+       pand    %xmm11,%xmm12
+       pxor    %xmm12,%xmm10
+       pxor    %xmm12,%xmm9
+       movdqa  %xmm2,%xmm12
+       movdqa  %xmm0,%xmm11
+       pxor    %xmm6,%xmm12
+       pxor    %xmm4,%xmm11
+       movdqa  %xmm12,%xmm8
+       pand    %xmm11,%xmm12
+       por     %xmm11,%xmm8
+       pxor    %xmm12,%xmm7
+       pxor    %xmm14,%xmm10
+       pxor    %xmm13,%xmm9
+       pxor    %xmm14,%xmm8
+       movdqa  %xmm3,%xmm11
+       pxor    %xmm13,%xmm7
+       movdqa  %xmm15,%xmm12
+       pxor    %xmm13,%xmm8
+       movdqa  %xmm6,%xmm13
+       pand    %xmm5,%xmm11
+       movdqa  %xmm2,%xmm14
+       pand    %xmm1,%xmm12
+       pand    %xmm0,%xmm13
+       por     %xmm4,%xmm14
+       pxor    %xmm11,%xmm10
+       pxor    %xmm12,%xmm9
+       pxor    %xmm13,%xmm8
+       pxor    %xmm14,%xmm7
+
+
+
+
+
+       movdqa  %xmm10,%xmm11
+       pand    %xmm8,%xmm10
+       pxor    %xmm9,%xmm11
+
+       movdqa  %xmm7,%xmm13
+       movdqa  %xmm11,%xmm14
+       pxor    %xmm10,%xmm13
+       pand    %xmm13,%xmm14
+
+       movdqa  %xmm8,%xmm12
+       pxor    %xmm9,%xmm14
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm9,%xmm10
+
+       pand    %xmm10,%xmm12
+
+       movdqa  %xmm13,%xmm9
+       pxor    %xmm7,%xmm12
+
+       pxor    %xmm12,%xmm9
+       pxor    %xmm12,%xmm8
+
+       pand    %xmm7,%xmm9
+
+       pxor    %xmm9,%xmm13
+       pxor    %xmm9,%xmm8
+
+       pand    %xmm14,%xmm13
+
+       pxor    %xmm11,%xmm13
+       movdqa  %xmm4,%xmm11
+       movdqa  %xmm0,%xmm7
+       movdqa  %xmm14,%xmm9
+       pxor    %xmm13,%xmm9
+       pand    %xmm4,%xmm9
+       pxor    %xmm0,%xmm4
+       pand    %xmm14,%xmm0
+       pand    %xmm13,%xmm4
+       pxor    %xmm0,%xmm4
+       pxor    %xmm9,%xmm0
+       pxor    %xmm1,%xmm11
+       pxor    %xmm5,%xmm7
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm1,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm5,%xmm1
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm5
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm1
+       pxor    %xmm11,%xmm7
+       pxor    %xmm5,%xmm1
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm5
+       pxor    %xmm11,%xmm4
+       pxor    %xmm11,%xmm1
+       pxor    %xmm7,%xmm0
+       pxor    %xmm7,%xmm5
+
+       movdqa  %xmm2,%xmm11
+       movdqa  %xmm6,%xmm7
+       pxor    %xmm15,%xmm11
+       pxor    %xmm3,%xmm7
+       movdqa  %xmm14,%xmm10
+       movdqa  %xmm12,%xmm9
+       pxor    %xmm13,%xmm10
+       pxor    %xmm8,%xmm9
+       pand    %xmm11,%xmm10
+       pand    %xmm15,%xmm9
+       pxor    %xmm7,%xmm11
+       pxor    %xmm3,%xmm15
+       pand    %xmm14,%xmm7
+       pand    %xmm12,%xmm3
+       pand    %xmm13,%xmm11
+       pand    %xmm8,%xmm15
+       pxor    %xmm11,%xmm7
+       pxor    %xmm3,%xmm15
+       pxor    %xmm10,%xmm11
+       pxor    %xmm9,%xmm3
+       pxor    %xmm12,%xmm14
+       pxor    %xmm8,%xmm13
+       movdqa  %xmm14,%xmm10
+       pxor    %xmm13,%xmm10
+       pand    %xmm2,%xmm10
+       pxor    %xmm6,%xmm2
+       pand    %xmm14,%xmm6
+       pand    %xmm13,%xmm2
+       pxor    %xmm6,%xmm2
+       pxor    %xmm10,%xmm6
+       pxor    %xmm11,%xmm2
+       pxor    %xmm11,%xmm15
+       pxor    %xmm7,%xmm6
+       pxor    %xmm7,%xmm3
+       pxor    %xmm6,%xmm0
+       pxor    %xmm4,%xmm5
+
+       pxor    %xmm0,%xmm3
+       pxor    %xmm6,%xmm1
+       pxor    %xmm6,%xmm4
+       pxor    %xmm1,%xmm3
+       pxor    %xmm15,%xmm6
+       pxor    %xmm4,%xmm3
+       pxor    %xmm5,%xmm2
+       pxor    %xmm0,%xmm5
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm15,%xmm3
+       pxor    %xmm2,%xmm6
+       decl    %r10d
+       jl      L$dec_done
+
+       pshufd  $147,%xmm4,%xmm14
+       movdqa  %xmm5,%xmm9
+       pxor    %xmm6,%xmm4
+       pxor    %xmm6,%xmm5
+       pshufd  $147,%xmm15,%xmm7
+       movdqa  %xmm6,%xmm12
+       pxor    %xmm15,%xmm6
+       pxor    %xmm0,%xmm15
+       pshufd  $147,%xmm0,%xmm8
+       pxor    %xmm5,%xmm0
+       pxor    %xmm2,%xmm15
+       pxor    %xmm3,%xmm0
+       pshufd  $147,%xmm3,%xmm10
+       pxor    %xmm15,%xmm5
+       pxor    %xmm4,%xmm3
+       pxor    %xmm2,%xmm4
+       pshufd  $147,%xmm2,%xmm13
+       movdqa  %xmm1,%xmm11
+       pxor    %xmm1,%xmm2
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm3
+       pxor    %xmm12,%xmm2
+       pxor    %xmm9,%xmm3
+       pxor    %xmm11,%xmm3
+       pshufd  $147,%xmm12,%xmm12
+
+       pxor    %xmm4,%xmm6
+       pxor    %xmm7,%xmm4
+       pxor    %xmm8,%xmm6
+       pshufd  $147,%xmm9,%xmm9
+       pxor    %xmm12,%xmm4
+       pxor    %xmm13,%xmm6
+       pxor    %xmm14,%xmm4
+       pshufd  $147,%xmm11,%xmm11
+       pxor    %xmm13,%xmm14
+       pxor    %xmm4,%xmm6
+
+       pxor    %xmm7,%xmm5
+       pshufd  $147,%xmm7,%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+       pxor    %xmm9,%xmm15
+       pshufd  $147,%xmm8,%xmm8
+       pxor    %xmm9,%xmm5
+       pxor    %xmm9,%xmm3
+       pxor    %xmm14,%xmm15
+       pshufd  $147,%xmm9,%xmm9
+       pxor    %xmm10,%xmm5
+       pxor    %xmm10,%xmm1
+       pxor    %xmm10,%xmm0
+       pshufd  $147,%xmm10,%xmm10
+       pxor    %xmm11,%xmm2
+       pxor    %xmm11,%xmm3
+       pxor    %xmm14,%xmm2
+       pxor    %xmm12,%xmm5
+       pxor    %xmm11,%xmm0
+       pxor    %xmm12,%xmm14
+
+       pxor    %xmm14,%xmm3
+       pshufd  $147,%xmm11,%xmm11
+       pxor    %xmm14,%xmm1
+       pxor    %xmm14,%xmm0
+
+       pxor    %xmm12,%xmm14
+       pshufd  $147,%xmm12,%xmm12
+       pxor    %xmm13,%xmm14
+
+
+       pxor    %xmm2,%xmm0
+       pxor    %xmm11,%xmm2
+       pshufd  $147,%xmm13,%xmm13
+       pxor    %xmm7,%xmm15
+       pxor    %xmm12,%xmm2
+       pxor    %xmm9,%xmm15
+       pshufd  $147,%xmm14,%xmm14
+
+       pxor    %xmm6,%xmm5
+       pxor    %xmm8,%xmm6
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm5
+       pxor    %xmm12,%xmm6
+       pxor    %xmm12,%xmm4
+       pxor    %xmm14,%xmm6
+       pshufd  $147,%xmm7,%xmm7
+       pxor    %xmm13,%xmm4
+       pxor    %xmm6,%xmm5
+       pxor    %xmm8,%xmm0
+       pshufd  $147,%xmm8,%xmm8
+
+       pxor    %xmm14,%xmm2
+       pxor    %xmm9,%xmm0
+       pxor    %xmm9,%xmm3
+       pshufd  $147,%xmm9,%xmm9
+       pxor    %xmm13,%xmm15
+       pxor    %xmm10,%xmm13
+       pxor    %xmm2,%xmm0
+       pxor    %xmm13,%xmm5
+
+       pxor    %xmm13,%xmm1
+       pxor    %xmm12,%xmm3
+       pxor    %xmm11,%xmm1
+       pshufd  $147,%xmm11,%xmm11
+       pxor    %xmm13,%xmm3
+       pxor    %xmm14,%xmm1
+       pxor    %xmm10,%xmm13
+
+       pshufd  $147,%xmm12,%xmm12
+       pshufd  $147,%xmm13,%xmm13
+       pshufd  $147,%xmm14,%xmm14
+       pshufd  $147,%xmm10,%xmm10
+
+
+       pxor    %xmm6,%xmm0
+       pxor    %xmm6,%xmm8
+       pxor    %xmm12,%xmm7
+       pxor    %xmm12,%xmm8
+       pxor    %xmm7,%xmm5
+       pxor    %xmm4,%xmm7
+       pxor    %xmm13,%xmm8
+       pxor    %xmm14,%xmm13
+       pxor    %xmm8,%xmm0
+       pxor    %xmm11,%xmm2
+       pxor    %xmm0,%xmm11
+       pxor    %xmm10,%xmm1
+       pxor    %xmm5,%xmm10
+       pxor    %xmm9,%xmm3
+       pxor    %xmm15,%xmm9
+       pxor    %xmm14,%xmm10
+       pxor    %xmm3,%xmm12
+       pxor    %xmm13,%xmm9
+       pxor    %xmm13,%xmm12
+       pxor    %xmm1,%xmm13
+       pxor    %xmm2,%xmm14
+
+       movdqa  %xmm7,%xmm15
+       movdqa  %xmm8,%xmm0
+       movdqa  %xmm9,%xmm1
+       movdqa  %xmm10,%xmm2
+       movdqa  %xmm11,%xmm3
+       movdqa  %xmm12,%xmm4
+       movdqa  %xmm13,%xmm5
+       movdqa  %xmm14,%xmm6
+       movdqa  -16(%r11),%xmm7
+       jnz     L$dec_loop
+       movdqa  -32(%r11),%xmm7
+       jmp     L$dec_loop
+.p2align       4
+L$dec_done:
+       movdqa  0(%r11),%xmm7
+       movdqa  16(%r11),%xmm8
+       movdqa  %xmm2,%xmm9
+       psrlq   $1,%xmm2
+       movdqa  %xmm1,%xmm10
+       psrlq   $1,%xmm1
+       pxor    %xmm4,%xmm2
+       pxor    %xmm6,%xmm1
+       pand    %xmm7,%xmm2
+       pand    %xmm7,%xmm1
+       pxor    %xmm2,%xmm4
+       psllq   $1,%xmm2
+       pxor    %xmm1,%xmm6
+       psllq   $1,%xmm1
+       pxor    %xmm9,%xmm2
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm5,%xmm9
+       psrlq   $1,%xmm5
+       movdqa  %xmm15,%xmm10
+       psrlq   $1,%xmm15
+       pxor    %xmm3,%xmm5
+       pxor    %xmm0,%xmm15
+       pand    %xmm7,%xmm5
+       pand    %xmm7,%xmm15
+       pxor    %xmm5,%xmm3
+       psllq   $1,%xmm5
+       pxor    %xmm15,%xmm0
+       psllq   $1,%xmm15
+       pxor    %xmm9,%xmm5
+       pxor    %xmm10,%xmm15
+       movdqa  32(%r11),%xmm7
+       movdqa  %xmm6,%xmm9
+       psrlq   $2,%xmm6
+       movdqa  %xmm1,%xmm10
+       psrlq   $2,%xmm1
+       pxor    %xmm4,%xmm6
+       pxor    %xmm2,%xmm1
+       pand    %xmm8,%xmm6
+       pand    %xmm8,%xmm1
+       pxor    %xmm6,%xmm4
+       psllq   $2,%xmm6
+       pxor    %xmm1,%xmm2
+       psllq   $2,%xmm1
+       pxor    %xmm9,%xmm6
+       pxor    %xmm10,%xmm1
+       movdqa  %xmm0,%xmm9
+       psrlq   $2,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $2,%xmm15
+       pxor    %xmm3,%xmm0
+       pxor    %xmm5,%xmm15
+       pand    %xmm8,%xmm0
+       pand    %xmm8,%xmm15
+       pxor    %xmm0,%xmm3
+       psllq   $2,%xmm0
+       pxor    %xmm15,%xmm5
+       psllq   $2,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  %xmm3,%xmm9
+       psrlq   $4,%xmm3
+       movdqa  %xmm5,%xmm10
+       psrlq   $4,%xmm5
+       pxor    %xmm4,%xmm3
+       pxor    %xmm2,%xmm5
+       pand    %xmm7,%xmm3
+       pand    %xmm7,%xmm5
+       pxor    %xmm3,%xmm4
+       psllq   $4,%xmm3
+       pxor    %xmm5,%xmm2
+       psllq   $4,%xmm5
+       pxor    %xmm9,%xmm3
+       pxor    %xmm10,%xmm5
+       movdqa  %xmm0,%xmm9
+       psrlq   $4,%xmm0
+       movdqa  %xmm15,%xmm10
+       psrlq   $4,%xmm15
+       pxor    %xmm6,%xmm0
+       pxor    %xmm1,%xmm15
+       pand    %xmm7,%xmm0
+       pand    %xmm7,%xmm15
+       pxor    %xmm0,%xmm6
+       psllq   $4,%xmm0
+       pxor    %xmm15,%xmm1
+       psllq   $4,%xmm15
+       pxor    %xmm9,%xmm0
+       pxor    %xmm10,%xmm15
+       movdqa  (%rax),%xmm7
+       pxor    %xmm7,%xmm5
+       pxor    %xmm7,%xmm3
+       pxor    %xmm7,%xmm1
+       pxor    %xmm7,%xmm6
+       pxor    %xmm7,%xmm2
+       pxor    %xmm7,%xmm4
+       pxor    %xmm7,%xmm15
+       pxor    %xmm7,%xmm0
+       .byte   0xf3,0xc3
+
+
+.p2align       4
+_bsaes_key_convert:
+       leaq    L$masks(%rip),%r11
+       movdqu  (%rcx),%xmm7
+       leaq    16(%rcx),%rcx
+       movdqa  0(%r11),%xmm0
+       movdqa  16(%r11),%xmm1
+       movdqa  32(%r11),%xmm2
+       movdqa  48(%r11),%xmm3
+       movdqa  64(%r11),%xmm4
+       pcmpeqd %xmm5,%xmm5
+
+       movdqu  (%rcx),%xmm6
+       movdqa  %xmm7,(%rax)
+       leaq    16(%rax),%rax
+       decl    %r10d
+       jmp     L$key_loop
+.p2align       4
+L$key_loop:
+.byte  102,15,56,0,244
+
+       movdqa  %xmm0,%xmm8
+       movdqa  %xmm1,%xmm9
+
+       pand    %xmm6,%xmm8
+       pand    %xmm6,%xmm9
+       movdqa  %xmm2,%xmm10
+       pcmpeqb %xmm0,%xmm8
+       psllq   $4,%xmm0
+       movdqa  %xmm3,%xmm11
+       pcmpeqb %xmm1,%xmm9
+       psllq   $4,%xmm1
+
+       pand    %xmm6,%xmm10
+       pand    %xmm6,%xmm11
+       movdqa  %xmm0,%xmm12
+       pcmpeqb %xmm2,%xmm10
+       psllq   $4,%xmm2
+       movdqa  %xmm1,%xmm13
+       pcmpeqb %xmm3,%xmm11
+       psllq   $4,%xmm3
+
+       movdqa  %xmm2,%xmm14
+       movdqa  %xmm3,%xmm15
+       pxor    %xmm5,%xmm8
+       pxor    %xmm5,%xmm9
+
+       pand    %xmm6,%xmm12
+       pand    %xmm6,%xmm13
+       movdqa  %xmm8,0(%rax)
+       pcmpeqb %xmm0,%xmm12
+       psrlq   $4,%xmm0
+       movdqa  %xmm9,16(%rax)
+       pcmpeqb %xmm1,%xmm13
+       psrlq   $4,%xmm1
+       leaq    16(%rcx),%rcx
+
+       pand    %xmm6,%xmm14
+       pand    %xmm6,%xmm15
+       movdqa  %xmm10,32(%rax)
+       pcmpeqb %xmm2,%xmm14
+       psrlq   $4,%xmm2
+       movdqa  %xmm11,48(%rax)
+       pcmpeqb %xmm3,%xmm15
+       psrlq   $4,%xmm3
+       movdqu  (%rcx),%xmm6
+
+       pxor    %xmm5,%xmm13
+       pxor    %xmm5,%xmm14
+       movdqa  %xmm12,64(%rax)
+       movdqa  %xmm13,80(%rax)
+       movdqa  %xmm14,96(%rax)
+       movdqa  %xmm15,112(%rax)
+       leaq    128(%rax),%rax
+       decl    %r10d
+       jnz     L$key_loop
+
+       movdqa  80(%r11),%xmm7
+
+       .byte   0xf3,0xc3
+
+
+.globl _bsaes_cbc_encrypt
+
+.p2align       4
+_bsaes_cbc_encrypt:
+       cmpl    $0,%r9d
+       jne     _asm_AES_cbc_encrypt
+       cmpq    $128,%rdx
+       jb      _asm_AES_cbc_encrypt
+
+       movq    %rsp,%rax
+L$cbc_dec_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movl    240(%rcx),%eax
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+       movq    %r8,%rbx
+       shrq    $4,%r14
+
+       movl    %eax,%edx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %edx,%r10d
+       call    _bsaes_key_convert
+       pxor    (%rsp),%xmm7
+       movdqa  %xmm6,(%rax)
+       movdqa  %xmm7,(%rsp)
+
+       movdqu  (%rbx),%xmm14
+       subq    $8,%r14
+L$cbc_dec_loop:
+       movdqu  0(%r12),%xmm15
+       movdqu  16(%r12),%xmm0
+       movdqu  32(%r12),%xmm1
+       movdqu  48(%r12),%xmm2
+       movdqu  64(%r12),%xmm3
+       movdqu  80(%r12),%xmm4
+       movq    %rsp,%rax
+       movdqu  96(%r12),%xmm5
+       movl    %edx,%r10d
+       movdqu  112(%r12),%xmm6
+       movdqa  %xmm14,32(%rbp)
+
+       call    _bsaes_decrypt8
+
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm1
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm6
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm2
+       movdqu  112(%r12),%xmm14
+       pxor    %xmm13,%xmm4
+       movdqu  %xmm15,0(%r13)
+       leaq    128(%r12),%r12
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm2,96(%r13)
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+       subq    $8,%r14
+       jnc     L$cbc_dec_loop
+
+       addq    $8,%r14
+       jz      L$cbc_dec_done
+
+       movdqu  0(%r12),%xmm15
+       movq    %rsp,%rax
+       movl    %edx,%r10d
+       cmpq    $2,%r14
+       jb      L$cbc_dec_one
+       movdqu  16(%r12),%xmm0
+       je      L$cbc_dec_two
+       movdqu  32(%r12),%xmm1
+       cmpq    $4,%r14
+       jb      L$cbc_dec_three
+       movdqu  48(%r12),%xmm2
+       je      L$cbc_dec_four
+       movdqu  64(%r12),%xmm3
+       cmpq    $6,%r14
+       jb      L$cbc_dec_five
+       movdqu  80(%r12),%xmm4
+       je      L$cbc_dec_six
+       movdqu  96(%r12),%xmm5
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm1
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm6
+       movdqu  96(%r12),%xmm14
+       pxor    %xmm12,%xmm2
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm2,96(%r13)
+       jmp     L$cbc_dec_done
+.p2align       4
+L$cbc_dec_six:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm1
+       movdqu  80(%r12),%xmm14
+       pxor    %xmm11,%xmm6
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       jmp     L$cbc_dec_done
+.p2align       4
+L$cbc_dec_five:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm3
+       movdqu  64(%r12),%xmm14
+       pxor    %xmm10,%xmm1
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       jmp     L$cbc_dec_done
+.p2align       4
+L$cbc_dec_four:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm5
+       movdqu  48(%r12),%xmm14
+       pxor    %xmm9,%xmm3
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       jmp     L$cbc_dec_done
+.p2align       4
+L$cbc_dec_three:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm0
+       movdqu  32(%r12),%xmm14
+       pxor    %xmm8,%xmm5
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       jmp     L$cbc_dec_done
+.p2align       4
+L$cbc_dec_two:
+       movdqa  %xmm14,32(%rbp)
+       call    _bsaes_decrypt8
+       pxor    32(%rbp),%xmm15
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm14
+       pxor    %xmm7,%xmm0
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       jmp     L$cbc_dec_done
+.p2align       4
+L$cbc_dec_one:
+       leaq    (%r12),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    _asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm14
+       movdqu  %xmm14,(%r13)
+       movdqa  %xmm15,%xmm14
+
+L$cbc_dec_done:
+       movdqu  %xmm14,(%rbx)
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+L$cbc_dec_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      L$cbc_dec_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+L$cbc_dec_epilogue:
+       .byte   0xf3,0xc3
+
+
+.globl _bsaes_ctr32_encrypt_blocks
+
+.p2align       4
+_bsaes_ctr32_encrypt_blocks:
+       movq    %rsp,%rax
+L$ctr_enc_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movdqu  (%r8),%xmm0
+       movl    240(%rcx),%eax
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+       movdqa  %xmm0,32(%rbp)
+       cmpq    $8,%rdx
+       jb      L$ctr_enc_short
+
+       movl    %eax,%ebx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %ebx,%r10d
+       call    _bsaes_key_convert
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm7,(%rax)
+
+       movdqa  (%rsp),%xmm8
+       leaq    L$ADD1(%rip),%r11
+       movdqa  32(%rbp),%xmm15
+       movdqa  -32(%r11),%xmm7
+.byte  102,68,15,56,0,199
+.byte  102,68,15,56,0,255
+       movdqa  %xmm8,(%rsp)
+       jmp     L$ctr_enc_loop
+.p2align       4
+L$ctr_enc_loop:
+       movdqa  %xmm15,32(%rbp)
+       movdqa  %xmm15,%xmm0
+       movdqa  %xmm15,%xmm1
+       paddd   0(%r11),%xmm0
+       movdqa  %xmm15,%xmm2
+       paddd   16(%r11),%xmm1
+       movdqa  %xmm15,%xmm3
+       paddd   32(%r11),%xmm2
+       movdqa  %xmm15,%xmm4
+       paddd   48(%r11),%xmm3
+       movdqa  %xmm15,%xmm5
+       paddd   64(%r11),%xmm4
+       movdqa  %xmm15,%xmm6
+       paddd   80(%r11),%xmm5
+       paddd   96(%r11),%xmm6
+
+
+
+       movdqa  (%rsp),%xmm8
+       leaq    16(%rsp),%rax
+       movdqa  -16(%r11),%xmm7
+       pxor    %xmm8,%xmm15
+       pxor    %xmm8,%xmm0
+.byte  102,68,15,56,0,255
+       pxor    %xmm8,%xmm1
+.byte  102,15,56,0,199
+       pxor    %xmm8,%xmm2
+.byte  102,15,56,0,207
+       pxor    %xmm8,%xmm3
+.byte  102,15,56,0,215
+       pxor    %xmm8,%xmm4
+.byte  102,15,56,0,223
+       pxor    %xmm8,%xmm5
+.byte  102,15,56,0,231
+       pxor    %xmm8,%xmm6
+.byte  102,15,56,0,239
+       leaq    L$BS0(%rip),%r11
+.byte  102,15,56,0,247
+       movl    %ebx,%r10d
+
+       call    _bsaes_encrypt8_bitslice
+
+       subq    $8,%r14
+       jc      L$ctr_enc_loop_done
+
+       movdqu  0(%r12),%xmm7
+       movdqu  16(%r12),%xmm8
+       movdqu  32(%r12),%xmm9
+       movdqu  48(%r12),%xmm10
+       movdqu  64(%r12),%xmm11
+       movdqu  80(%r12),%xmm12
+       movdqu  96(%r12),%xmm13
+       movdqu  112(%r12),%xmm14
+       leaq    128(%r12),%r12
+       pxor    %xmm15,%xmm7
+       movdqa  32(%rbp),%xmm15
+       pxor    %xmm8,%xmm0
+       movdqu  %xmm7,0(%r13)
+       pxor    %xmm9,%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    %xmm10,%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    %xmm11,%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    %xmm12,%xmm6
+       movdqu  %xmm2,64(%r13)
+       pxor    %xmm13,%xmm1
+       movdqu  %xmm6,80(%r13)
+       pxor    %xmm14,%xmm4
+       movdqu  %xmm1,96(%r13)
+       leaq    L$ADD1(%rip),%r11
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+       paddd   112(%r11),%xmm15
+       jnz     L$ctr_enc_loop
+
+       jmp     L$ctr_enc_done
+.p2align       4
+L$ctr_enc_loop_done:
+       addq    $8,%r14
+       movdqu  0(%r12),%xmm7
+       pxor    %xmm7,%xmm15
+       movdqu  %xmm15,0(%r13)
+       cmpq    $2,%r14
+       jb      L$ctr_enc_done
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm8,%xmm0
+       movdqu  %xmm0,16(%r13)
+       je      L$ctr_enc_done
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm9,%xmm3
+       movdqu  %xmm3,32(%r13)
+       cmpq    $4,%r14
+       jb      L$ctr_enc_done
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm10,%xmm5
+       movdqu  %xmm5,48(%r13)
+       je      L$ctr_enc_done
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm11,%xmm2
+       movdqu  %xmm2,64(%r13)
+       cmpq    $6,%r14
+       jb      L$ctr_enc_done
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm12,%xmm6
+       movdqu  %xmm6,80(%r13)
+       je      L$ctr_enc_done
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm13,%xmm1
+       movdqu  %xmm1,96(%r13)
+       jmp     L$ctr_enc_done
+
+.p2align       4
+L$ctr_enc_short:
+       leaq    32(%rbp),%rdi
+       leaq    48(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    _asm_AES_encrypt
+       movdqu  (%r12),%xmm0
+       leaq    16(%r12),%r12
+       movl    44(%rbp),%eax
+       bswapl  %eax
+       pxor    48(%rbp),%xmm0
+       incl    %eax
+       movdqu  %xmm0,(%r13)
+       bswapl  %eax
+       leaq    16(%r13),%r13
+       movl    %eax,44(%rsp)
+       decq    %r14
+       jnz     L$ctr_enc_short
+
+L$ctr_enc_done:
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+L$ctr_enc_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      L$ctr_enc_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+L$ctr_enc_epilogue:
+       .byte   0xf3,0xc3
+
+.globl _bsaes_xts_encrypt
+
+.p2align       4
+_bsaes_xts_encrypt:
+       movq    %rsp,%rax
+L$xts_enc_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+
+       leaq    (%r9),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r8),%rdx
+       call    _asm_AES_encrypt
+
+
+       movl    240(%r15),%eax
+       movq    %r14,%rbx
+
+       movl    %eax,%edx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %edx,%r10d
+       call    _bsaes_key_convert
+       pxor    %xmm6,%xmm7
+       movdqa  %xmm7,(%rax)
+
+       andq    $-16,%r14
+       subq    $128,%rsp
+       movdqa  32(%rbp),%xmm6
+
+       pxor    %xmm14,%xmm14
+       movdqa  L$xts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+
+       subq    $128,%r14
+       jc      L$xts_enc_short
+       jmp     L$xts_enc_loop
+
+.p2align       4
+L$xts_enc_loop:
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqu  112(%r12),%xmm14
+       leaq    128(%r12),%r12
+       movdqa  %xmm6,112(%rsp)
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       pxor    %xmm14,%xmm6
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm2,64(%r13)
+       pxor    96(%rsp),%xmm1
+       movdqu  %xmm6,80(%r13)
+       pxor    112(%rsp),%xmm4
+       movdqu  %xmm1,96(%r13)
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       pxor    %xmm14,%xmm14
+       movdqa  L$xts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+
+       subq    $128,%r14
+       jnc     L$xts_enc_loop
+
+L$xts_enc_short:
+       addq    $128,%r14
+       jz      L$xts_enc_done
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       cmpq    $16,%r14
+       je      L$xts_enc_1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       cmpq    $32,%r14
+       je      L$xts_enc_2
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       cmpq    $48,%r14
+       je      L$xts_enc_3
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       cmpq    $64,%r14
+       je      L$xts_enc_4
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       cmpq    $80,%r14
+       je      L$xts_enc_5
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       cmpq    $96,%r14
+       je      L$xts_enc_6
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqa  %xmm6,112(%rsp)
+       leaq    112(%r12),%r12
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm2,64(%r13)
+       pxor    96(%rsp),%xmm1
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm1,96(%r13)
+       leaq    112(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       jmp     L$xts_enc_done
+.p2align       4
+L$xts_enc_6:
+       pxor    %xmm11,%xmm3
+       leaq    96(%r12),%r12
+       pxor    %xmm12,%xmm4
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm2,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       leaq    96(%r13),%r13
+
+       movdqa  96(%rsp),%xmm6
+       jmp     L$xts_enc_done
+.p2align       4
+L$xts_enc_5:
+       pxor    %xmm10,%xmm2
+       leaq    80(%r12),%r12
+       pxor    %xmm11,%xmm3
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       pxor    64(%rsp),%xmm2
+       movdqu  %xmm5,48(%r13)
+       movdqu  %xmm2,64(%r13)
+       leaq    80(%r13),%r13
+
+       movdqa  80(%rsp),%xmm6
+       jmp     L$xts_enc_done
+.p2align       4
+L$xts_enc_4:
+       pxor    %xmm9,%xmm1
+       leaq    64(%r12),%r12
+       pxor    %xmm10,%xmm2
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm5
+       movdqu  %xmm3,32(%r13)
+       movdqu  %xmm5,48(%r13)
+       leaq    64(%r13),%r13
+
+       movdqa  64(%rsp),%xmm6
+       jmp     L$xts_enc_done
+.p2align       4
+L$xts_enc_3:
+       pxor    %xmm8,%xmm0
+       leaq    48(%r12),%r12
+       pxor    %xmm9,%xmm1
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm3
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm3,32(%r13)
+       leaq    48(%r13),%r13
+
+       movdqa  48(%rsp),%xmm6
+       jmp     L$xts_enc_done
+.p2align       4
+L$xts_enc_2:
+       pxor    %xmm7,%xmm15
+       leaq    32(%r12),%r12
+       pxor    %xmm8,%xmm0
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_encrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       leaq    32(%r13),%r13
+
+       movdqa  32(%rsp),%xmm6
+       jmp     L$xts_enc_done
+.p2align       4
+L$xts_enc_1:
+       pxor    %xmm15,%xmm7
+       leaq    16(%r12),%r12
+       movdqa  %xmm7,32(%rbp)
+       leaq    32(%rbp),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    _asm_AES_encrypt
+
+       pxor    32(%rbp),%xmm15
+
+
+
+
+
+       movdqu  %xmm15,0(%r13)
+       leaq    16(%r13),%r13
+
+       movdqa  16(%rsp),%xmm6
+
+L$xts_enc_done:
+       andl    $15,%ebx
+       jz      L$xts_enc_ret
+       movq    %r13,%rdx
+
+L$xts_enc_steal:
+       movzbl  (%r12),%eax
+       movzbl  -16(%rdx),%ecx
+       leaq    1(%r12),%r12
+       movb    %al,-16(%rdx)
+       movb    %cl,0(%rdx)
+       leaq    1(%rdx),%rdx
+       subl    $1,%ebx
+       jnz     L$xts_enc_steal
+
+       movdqu  -16(%r13),%xmm15
+       leaq    32(%rbp),%rdi
+       pxor    %xmm6,%xmm15
+       leaq    32(%rbp),%rsi
+       movdqa  %xmm15,32(%rbp)
+       leaq    (%r15),%rdx
+       call    _asm_AES_encrypt
+
+       pxor    32(%rbp),%xmm6
+       movdqu  %xmm6,-16(%r13)
+
+L$xts_enc_ret:
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+L$xts_enc_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      L$xts_enc_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+L$xts_enc_epilogue:
+       .byte   0xf3,0xc3
+
+
+.globl _bsaes_xts_decrypt
+
+.p2align       4
+_bsaes_xts_decrypt:
+       movq    %rsp,%rax
+L$xts_dec_prologue:
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       leaq    -72(%rsp),%rsp
+       movq    %rsp,%rbp
+       movq    %rdi,%r12
+       movq    %rsi,%r13
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+
+       leaq    (%r9),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r8),%rdx
+       call    _asm_AES_encrypt
+
+
+       movl    240(%r15),%eax
+       movq    %r14,%rbx
+
+       movl    %eax,%edx
+       shlq    $7,%rax
+       subq    $96,%rax
+       subq    %rax,%rsp
+
+       movq    %rsp,%rax
+       movq    %r15,%rcx
+       movl    %edx,%r10d
+       call    _bsaes_key_convert
+       pxor    (%rsp),%xmm7
+       movdqa  %xmm6,(%rax)
+       movdqa  %xmm7,(%rsp)
+
+       xorl    %eax,%eax
+       andq    $-16,%r14
+       testl   $15,%ebx
+       setnz   %al
+       shlq    $4,%rax
+       subq    %rax,%r14
+
+       subq    $128,%rsp
+       movdqa  32(%rbp),%xmm6
+
+       pxor    %xmm14,%xmm14
+       movdqa  L$xts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+
+       subq    $128,%r14
+       jc      L$xts_dec_short
+       jmp     L$xts_dec_loop
+
+.p2align       4
+L$xts_dec_loop:
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqu  112(%r12),%xmm14
+       leaq    128(%r12),%r12
+       movdqa  %xmm6,112(%rsp)
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       pxor    %xmm14,%xmm6
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm1,64(%r13)
+       pxor    96(%rsp),%xmm2
+       movdqu  %xmm6,80(%r13)
+       pxor    112(%rsp),%xmm4
+       movdqu  %xmm2,96(%r13)
+       movdqu  %xmm4,112(%r13)
+       leaq    128(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       pxor    %xmm14,%xmm14
+       movdqa  L$xts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+
+       subq    $128,%r14
+       jnc     L$xts_dec_loop
+
+L$xts_dec_short:
+       addq    $128,%r14
+       jz      L$xts_dec_done
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm15
+       movdqa  %xmm6,0(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm0
+       movdqa  %xmm6,16(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  0(%r12),%xmm7
+       cmpq    $16,%r14
+       je      L$xts_dec_1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm1
+       movdqa  %xmm6,32(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  16(%r12),%xmm8
+       cmpq    $32,%r14
+       je      L$xts_dec_2
+       pxor    %xmm7,%xmm15
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm2
+       movdqa  %xmm6,48(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  32(%r12),%xmm9
+       cmpq    $48,%r14
+       je      L$xts_dec_3
+       pxor    %xmm8,%xmm0
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm3
+       movdqa  %xmm6,64(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  48(%r12),%xmm10
+       cmpq    $64,%r14
+       je      L$xts_dec_4
+       pxor    %xmm9,%xmm1
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm4
+       movdqa  %xmm6,80(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  64(%r12),%xmm11
+       cmpq    $80,%r14
+       je      L$xts_dec_5
+       pxor    %xmm10,%xmm2
+       pshufd  $19,%xmm14,%xmm13
+       pxor    %xmm14,%xmm14
+       movdqa  %xmm6,%xmm5
+       movdqa  %xmm6,96(%rsp)
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       pcmpgtd %xmm6,%xmm14
+       pxor    %xmm13,%xmm6
+       movdqu  80(%r12),%xmm12
+       cmpq    $96,%r14
+       je      L$xts_dec_6
+       pxor    %xmm11,%xmm3
+       movdqu  96(%r12),%xmm13
+       pxor    %xmm12,%xmm4
+       movdqa  %xmm6,112(%rsp)
+       leaq    112(%r12),%r12
+       pxor    %xmm13,%xmm5
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm1,64(%r13)
+       pxor    96(%rsp),%xmm2
+       movdqu  %xmm6,80(%r13)
+       movdqu  %xmm2,96(%r13)
+       leaq    112(%r13),%r13
+
+       movdqa  112(%rsp),%xmm6
+       jmp     L$xts_dec_done
+.p2align       4
+L$xts_dec_6:
+       pxor    %xmm11,%xmm3
+       leaq    96(%r12),%r12
+       pxor    %xmm12,%xmm4
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       pxor    80(%rsp),%xmm6
+       movdqu  %xmm1,64(%r13)
+       movdqu  %xmm6,80(%r13)
+       leaq    96(%r13),%r13
+
+       movdqa  96(%rsp),%xmm6
+       jmp     L$xts_dec_done
+.p2align       4
+L$xts_dec_5:
+       pxor    %xmm10,%xmm2
+       leaq    80(%r12),%r12
+       pxor    %xmm11,%xmm3
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       pxor    64(%rsp),%xmm1
+       movdqu  %xmm3,48(%r13)
+       movdqu  %xmm1,64(%r13)
+       leaq    80(%r13),%r13
+
+       movdqa  80(%rsp),%xmm6
+       jmp     L$xts_dec_done
+.p2align       4
+L$xts_dec_4:
+       pxor    %xmm9,%xmm1
+       leaq    64(%r12),%r12
+       pxor    %xmm10,%xmm2
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       pxor    48(%rsp),%xmm3
+       movdqu  %xmm5,32(%r13)
+       movdqu  %xmm3,48(%r13)
+       leaq    64(%r13),%r13
+
+       movdqa  64(%rsp),%xmm6
+       jmp     L$xts_dec_done
+.p2align       4
+L$xts_dec_3:
+       pxor    %xmm8,%xmm0
+       leaq    48(%r12),%r12
+       pxor    %xmm9,%xmm1
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       pxor    32(%rsp),%xmm5
+       movdqu  %xmm0,16(%r13)
+       movdqu  %xmm5,32(%r13)
+       leaq    48(%r13),%r13
+
+       movdqa  48(%rsp),%xmm6
+       jmp     L$xts_dec_done
+.p2align       4
+L$xts_dec_2:
+       pxor    %xmm7,%xmm15
+       leaq    32(%r12),%r12
+       pxor    %xmm8,%xmm0
+       leaq    128(%rsp),%rax
+       movl    %edx,%r10d
+
+       call    _bsaes_decrypt8
+
+       pxor    0(%rsp),%xmm15
+       pxor    16(%rsp),%xmm0
+       movdqu  %xmm15,0(%r13)
+       movdqu  %xmm0,16(%r13)
+       leaq    32(%r13),%r13
+
+       movdqa  32(%rsp),%xmm6
+       jmp     L$xts_dec_done
+.p2align       4
+L$xts_dec_1:
+       pxor    %xmm15,%xmm7
+       leaq    16(%r12),%r12
+       movdqa  %xmm7,32(%rbp)
+       leaq    32(%rbp),%rdi
+       leaq    32(%rbp),%rsi
+       leaq    (%r15),%rdx
+       call    _asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm15
+
+
+
+
+
+       movdqu  %xmm15,0(%r13)
+       leaq    16(%r13),%r13
+
+       movdqa  16(%rsp),%xmm6
+
+L$xts_dec_done:
+       andl    $15,%ebx
+       jz      L$xts_dec_ret
+
+       pxor    %xmm14,%xmm14
+       movdqa  L$xts_magic(%rip),%xmm12
+       pcmpgtd %xmm6,%xmm14
+       pshufd  $19,%xmm14,%xmm13
+       movdqa  %xmm6,%xmm5
+       paddq   %xmm6,%xmm6
+       pand    %xmm12,%xmm13
+       movdqu  (%r12),%xmm15
+       pxor    %xmm13,%xmm6
+
+       leaq    32(%rbp),%rdi
+       pxor    %xmm6,%xmm15
+       leaq    32(%rbp),%rsi
+       movdqa  %xmm15,32(%rbp)
+       leaq    (%r15),%rdx
+       call    _asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm6
+       movq    %r13,%rdx
+       movdqu  %xmm6,(%r13)
+
+L$xts_dec_steal:
+       movzbl  16(%r12),%eax
+       movzbl  (%rdx),%ecx
+       leaq    1(%r12),%r12
+       movb    %al,(%rdx)
+       movb    %cl,16(%rdx)
+       leaq    1(%rdx),%rdx
+       subl    $1,%ebx
+       jnz     L$xts_dec_steal
+
+       movdqu  (%r13),%xmm15
+       leaq    32(%rbp),%rdi
+       pxor    %xmm5,%xmm15
+       leaq    32(%rbp),%rsi
+       movdqa  %xmm15,32(%rbp)
+       leaq    (%r15),%rdx
+       call    _asm_AES_decrypt
+
+       pxor    32(%rbp),%xmm5
+       movdqu  %xmm5,(%r13)
+
+L$xts_dec_ret:
+       leaq    (%rsp),%rax
+       pxor    %xmm0,%xmm0
+L$xts_dec_bzero:
+       movdqa  %xmm0,0(%rax)
+       movdqa  %xmm0,16(%rax)
+       leaq    32(%rax),%rax
+       cmpq    %rax,%rbp
+       ja      L$xts_dec_bzero
+
+       leaq    (%rbp),%rsp
+       movq    72(%rsp),%r15
+       movq    80(%rsp),%r14
+       movq    88(%rsp),%r13
+       movq    96(%rsp),%r12
+       movq    104(%rsp),%rbx
+       movq    112(%rsp),%rax
+       leaq    120(%rsp),%rsp
+       movq    %rax,%rbp
+L$xts_dec_epilogue:
+       .byte   0xf3,0xc3
+
+
+.p2align       6
+_bsaes_const:
+L$M0ISR:
+.quad  0x0a0e0206070b0f03, 0x0004080c0d010509
+L$ISRM0:
+.quad  0x01040b0e0205080f, 0x0306090c00070a0d
+L$ISR:
+.quad  0x0504070602010003, 0x0f0e0d0c080b0a09
+L$BS0:
+.quad  0x5555555555555555, 0x5555555555555555
+L$BS1:
+.quad  0x3333333333333333, 0x3333333333333333
+L$BS2:
+.quad  0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+L$SR:
+.quad  0x0504070600030201, 0x0f0e0d0c0a09080b
+L$SRM0:
+.quad  0x0304090e00050a0f, 0x01060b0c0207080d
+L$M0SR:
+.quad  0x0a0e02060f03070b, 0x0004080c05090d01
+L$SWPUP:
+.quad  0x0706050403020100, 0x0c0d0e0f0b0a0908
+L$SWPUPM0SR:
+.quad  0x0a0d02060c03070b, 0x0004080f05090e01
+L$ADD1:
+.quad  0x0000000000000000, 0x0000000100000000
+L$ADD2:
+.quad  0x0000000000000000, 0x0000000200000000
+L$ADD3:
+.quad  0x0000000000000000, 0x0000000300000000
+L$ADD4:
+.quad  0x0000000000000000, 0x0000000400000000
+L$ADD5:
+.quad  0x0000000000000000, 0x0000000500000000
+L$ADD6:
+.quad  0x0000000000000000, 0x0000000600000000
+L$ADD7:
+.quad  0x0000000000000000, 0x0000000700000000
+L$ADD8:
+.quad  0x0000000000000000, 0x0000000800000000
+L$xts_magic:
+.long  0x87,0,1,0
+L$masks:
+.quad  0x0101010101010101, 0x0101010101010101
+.quad  0x0202020202020202, 0x0202020202020202
+.quad  0x0404040404040404, 0x0404040404040404
+.quad  0x0808080808080808, 0x0808080808080808
+L$M0:
+.quad  0x02060a0e03070b0f, 0x0004080c0105090d
+L$63:
+.quad  0x6363636363636363, 0x6363636363636363
+.byte  66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.p2align       6
diff --git a/deps/openssl/asm/x64-macosx-gas/aes/vpaes-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/vpaes-x86_64.s
new file mode 100644 (file)
index 0000000..d9d7371
--- /dev/null
@@ -0,0 +1,838 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_encrypt_core:
+       movq    %rdx,%r9
+       movq    $16,%r11
+       movl    240(%rdx),%eax
+       movdqa  %xmm9,%xmm1
+       movdqa  L$k_ipt(%rip),%xmm2
+       pandn   %xmm0,%xmm1
+       movdqu  (%r9),%xmm5
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+.byte  102,15,56,0,208
+       movdqa  L$k_ipt+16(%rip),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm5,%xmm2
+       pxor    %xmm2,%xmm0
+       addq    $16,%r9
+       leaq    L$k_mc_backward(%rip),%r10
+       jmp     L$enc_entry
+
+.p2align       4
+L$enc_loop:
+
+       movdqa  %xmm13,%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+       movdqa  %xmm12,%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       movdqa  %xmm15,%xmm5
+.byte  102,15,56,0,234
+       movdqa  -64(%r11,%r10,1),%xmm1
+       movdqa  %xmm14,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm5,%xmm2
+       movdqa  (%r11,%r10,1),%xmm4
+       movdqa  %xmm0,%xmm3
+.byte  102,15,56,0,193
+       addq    $16,%r9
+       pxor    %xmm2,%xmm0
+.byte  102,15,56,0,220
+       addq    $16,%r11
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,193
+       andq    $48,%r11
+       pxor    %xmm3,%xmm0
+       subq    $1,%rax
+
+L$enc_entry:
+
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  %xmm11,%xmm5
+.byte  102,15,56,0,232
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm5,%xmm3
+       movdqa  %xmm10,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm5,%xmm4
+       movdqa  %xmm10,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm10,%xmm3
+       movdqu  (%r9),%xmm5
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       jnz     L$enc_loop
+
+
+       movdqa  -96(%r10),%xmm4
+       movdqa  -80(%r10),%xmm0
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+.byte  102,15,56,0,195
+       movdqa  64(%r11,%r10,1),%xmm1
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,193
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_decrypt_core:
+       movq    %rdx,%r9
+       movl    240(%rdx),%eax
+       movdqa  %xmm9,%xmm1
+       movdqa  L$k_dipt(%rip),%xmm2
+       pandn   %xmm0,%xmm1
+       movq    %rax,%r11
+       psrld   $4,%xmm1
+       movdqu  (%r9),%xmm5
+       shlq    $4,%r11
+       pand    %xmm9,%xmm0
+.byte  102,15,56,0,208
+       movdqa  L$k_dipt+16(%rip),%xmm0
+       xorq    $48,%r11
+       leaq    L$k_dsbd(%rip),%r10
+.byte  102,15,56,0,193
+       andq    $48,%r11
+       pxor    %xmm5,%xmm2
+       movdqa  L$k_mc_forward+48(%rip),%xmm5
+       pxor    %xmm2,%xmm0
+       addq    $16,%r9
+       addq    %r10,%r11
+       jmp     L$dec_entry
+
+.p2align       4
+L$dec_loop:
+
+
+
+       movdqa  -32(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  -16(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       addq    $16,%r9
+
+.byte  102,15,56,0,197
+       movdqa  0(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  16(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       subq    $1,%rax
+
+.byte  102,15,56,0,197
+       movdqa  32(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  48(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+
+.byte  102,15,56,0,197
+       movdqa  64(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  80(%r10),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+
+.byte  102,15,58,15,237,12
+
+L$dec_entry:
+
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  %xmm11,%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm10,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm10,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqu  (%r9),%xmm0
+       jnz     L$dec_loop
+
+
+       movdqa  96(%r10),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  112(%r10),%xmm0
+       movdqa  -352(%r11),%xmm2
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,194
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_schedule_core:
+
+
+
+
+
+       call    _vpaes_preheat
+
+       movdqa  L$k_rcon(%rip),%xmm8
+       movdqu  (%rdi),%xmm0
+
+
+       movdqa  %xmm0,%xmm3
+       leaq    L$k_ipt(%rip),%r11
+       call    _vpaes_schedule_transform
+       movdqa  %xmm0,%xmm7
+
+       leaq    L$k_sr(%rip),%r10
+       testq   %rcx,%rcx
+       jnz     L$schedule_am_decrypting
+
+
+       movdqu  %xmm0,(%rdx)
+       jmp     L$schedule_go
+
+L$schedule_am_decrypting:
+
+       movdqa  (%r8,%r10,1),%xmm1
+.byte  102,15,56,0,217
+       movdqu  %xmm3,(%rdx)
+       xorq    $48,%r8
+
+L$schedule_go:
+       cmpl    $192,%esi
+       ja      L$schedule_256
+       je      L$schedule_192
+
+
+
+
+
+
+
+
+
+
+L$schedule_128:
+       movl    $10,%esi
+
+L$oop_schedule_128:
+       call    _vpaes_schedule_round
+       decq    %rsi
+       jz      L$schedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+       jmp     L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+L$schedule_192:
+       movdqu  8(%rdi),%xmm0
+       call    _vpaes_schedule_transform
+
+       movdqa  %xmm0,%xmm6
+       pxor    %xmm4,%xmm4
+       movhlps %xmm4,%xmm6
+       movl    $4,%esi
+
+L$oop_schedule_192:
+       call    _vpaes_schedule_round
+.byte  102,15,58,15,198,8
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_192_smear
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_round
+       decq    %rsi
+       jz      L$schedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_192_smear
+       jmp     L$oop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+L$schedule_256:
+       movdqu  16(%rdi),%xmm0
+       call    _vpaes_schedule_transform
+
+       movl    $7,%esi
+
+L$oop_schedule_256:
+       call    _vpaes_schedule_mangle
+
+       movdqa  %xmm0,%xmm6
+
+
+       call    _vpaes_schedule_round
+       decq    %rsi
+       jz      L$schedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+
+
+       pshufd  $255,%xmm0,%xmm0
+       movdqa  %xmm7,%xmm5
+       movdqa  %xmm6,%xmm7
+       call    _vpaes_schedule_low_round
+       movdqa  %xmm5,%xmm7
+
+       jmp     L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+L$schedule_mangle_last:
+
+       leaq    L$k_deskew(%rip),%r11
+       testq   %rcx,%rcx
+       jnz     L$schedule_mangle_last_dec
+
+
+       movdqa  (%r8,%r10,1),%xmm1
+.byte  102,15,56,0,193
+       leaq    L$k_opt(%rip),%r11
+       addq    $32,%rdx
+
+L$schedule_mangle_last_dec:
+       addq    $-16,%rdx
+       pxor    L$k_s63(%rip),%xmm0
+       call    _vpaes_schedule_transform
+
+       movdqu  %xmm0,(%rdx)
+
+
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_schedule_192_smear:
+       pshufd  $128,%xmm6,%xmm0
+       pxor    %xmm0,%xmm6
+       pshufd  $254,%xmm7,%xmm0
+       pxor    %xmm0,%xmm6
+       movdqa  %xmm6,%xmm0
+       pxor    %xmm1,%xmm1
+       movhlps %xmm1,%xmm6
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_schedule_round:
+
+       pxor    %xmm1,%xmm1
+.byte  102,65,15,58,15,200,15
+.byte  102,69,15,58,15,192,15
+       pxor    %xmm1,%xmm7
+
+
+       pshufd  $255,%xmm0,%xmm0
+.byte  102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+       movdqa  %xmm7,%xmm1
+       pslldq  $4,%xmm7
+       pxor    %xmm1,%xmm7
+       movdqa  %xmm7,%xmm1
+       pslldq  $8,%xmm7
+       pxor    %xmm1,%xmm7
+       pxor    L$k_s63(%rip),%xmm7
+
+
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  %xmm11,%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm10,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm10,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm10,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqa  %xmm13,%xmm4
+.byte  102,15,56,0,226
+       movdqa  %xmm12,%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+
+
+       pxor    %xmm7,%xmm0
+       movdqa  %xmm0,%xmm7
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_schedule_transform:
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm0
+       movdqa  (%r11),%xmm2
+.byte  102,15,56,0,208
+       movdqa  16(%r11),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm2,%xmm0
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_schedule_mangle:
+       movdqa  %xmm0,%xmm4
+       movdqa  L$k_mc_forward(%rip),%xmm5
+       testq   %rcx,%rcx
+       jnz     L$schedule_mangle_dec
+
+
+       addq    $16,%rdx
+       pxor    L$k_s63(%rip),%xmm4
+.byte  102,15,56,0,229
+       movdqa  %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+
+       jmp     L$schedule_mangle_both
+.p2align       4
+L$schedule_mangle_dec:
+
+       leaq    L$k_dksd(%rip),%r11
+       movdqa  %xmm9,%xmm1
+       pandn   %xmm4,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm9,%xmm4
+
+       movdqa  0(%r11),%xmm2
+.byte  102,15,56,0,212
+       movdqa  16(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+
+       movdqa  32(%r11),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  48(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+
+       movdqa  64(%r11),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  80(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+
+       movdqa  96(%r11),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  112(%r11),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+
+       addq    $-16,%rdx
+
+L$schedule_mangle_both:
+       movdqa  (%r8,%r10,1),%xmm1
+.byte  102,15,56,0,217
+       addq    $-16,%r8
+       andq    $48,%r8
+       movdqu  %xmm3,(%rdx)
+       .byte   0xf3,0xc3
+
+
+
+
+
+.globl _vpaes_set_encrypt_key
+
+.p2align       4
+_vpaes_set_encrypt_key:
+       movl    %esi,%eax
+       shrl    $5,%eax
+       addl    $5,%eax
+       movl    %eax,240(%rdx)
+
+       movl    $0,%ecx
+       movl    $48,%r8d
+       call    _vpaes_schedule_core
+       xorl    %eax,%eax
+       .byte   0xf3,0xc3
+
+
+.globl _vpaes_set_decrypt_key
+
+.p2align       4
+_vpaes_set_decrypt_key:
+       movl    %esi,%eax
+       shrl    $5,%eax
+       addl    $5,%eax
+       movl    %eax,240(%rdx)
+       shll    $4,%eax
+       leaq    16(%rdx,%rax,1),%rdx
+
+       movl    $1,%ecx
+       movl    %esi,%r8d
+       shrl    $1,%r8d
+       andl    $32,%r8d
+       xorl    $32,%r8d
+       call    _vpaes_schedule_core
+       xorl    %eax,%eax
+       .byte   0xf3,0xc3
+
+
+.globl _vpaes_encrypt
+
+.p2align       4
+_vpaes_encrypt:
+       movdqu  (%rdi),%xmm0
+       call    _vpaes_preheat
+       call    _vpaes_encrypt_core
+       movdqu  %xmm0,(%rsi)
+       .byte   0xf3,0xc3
+
+
+.globl _vpaes_decrypt
+
+.p2align       4
+_vpaes_decrypt:
+       movdqu  (%rdi),%xmm0
+       call    _vpaes_preheat
+       call    _vpaes_decrypt_core
+       movdqu  %xmm0,(%rsi)
+       .byte   0xf3,0xc3
+
+.globl _vpaes_cbc_encrypt
+
+.p2align       4
+_vpaes_cbc_encrypt:
+       xchgq   %rcx,%rdx
+       subq    $16,%rcx
+       jc      L$cbc_abort
+       movdqu  (%r8),%xmm6
+       subq    %rdi,%rsi
+       call    _vpaes_preheat
+       cmpl    $0,%r9d
+       je      L$cbc_dec_loop
+       jmp     L$cbc_enc_loop
+.p2align       4
+L$cbc_enc_loop:
+       movdqu  (%rdi),%xmm0
+       pxor    %xmm6,%xmm0
+       call    _vpaes_encrypt_core
+       movdqa  %xmm0,%xmm6
+       movdqu  %xmm0,(%rsi,%rdi,1)
+       leaq    16(%rdi),%rdi
+       subq    $16,%rcx
+       jnc     L$cbc_enc_loop
+       jmp     L$cbc_done
+.p2align       4
+L$cbc_dec_loop:
+       movdqu  (%rdi),%xmm0
+       movdqa  %xmm0,%xmm7
+       call    _vpaes_decrypt_core
+       pxor    %xmm6,%xmm0
+       movdqa  %xmm7,%xmm6
+       movdqu  %xmm0,(%rsi,%rdi,1)
+       leaq    16(%rdi),%rdi
+       subq    $16,%rcx
+       jnc     L$cbc_dec_loop
+L$cbc_done:
+       movdqu  %xmm6,(%r8)
+L$cbc_abort:
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+
+.p2align       4
+_vpaes_preheat:
+       leaq    L$k_s0F(%rip),%r10
+       movdqa  -32(%r10),%xmm10
+       movdqa  -16(%r10),%xmm11
+       movdqa  0(%r10),%xmm9
+       movdqa  48(%r10),%xmm13
+       movdqa  64(%r10),%xmm12
+       movdqa  80(%r10),%xmm15
+       movdqa  96(%r10),%xmm14
+       .byte   0xf3,0xc3
+
+
+
+
+
+
+
+.p2align       6
+_vpaes_consts:
+L$k_inv:
+.quad  0x0E05060F0D080180, 0x040703090A0B0C02
+.quad  0x01040A060F0B0780, 0x030D0E0C02050809
+
+L$k_s0F:
+.quad  0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+L$k_ipt:
+.quad  0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad  0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+L$k_sb1:
+.quad  0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad  0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+L$k_sb2:
+.quad  0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad  0x69EB88400AE12900, 0xC2A163C8AB82234A
+L$k_sbo:
+.quad  0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad  0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+L$k_mc_forward:
+.quad  0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad  0x080B0A0904070605, 0x000302010C0F0E0D
+.quad  0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad  0x000302010C0F0E0D, 0x080B0A0904070605
+
+L$k_mc_backward:
+.quad  0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad  0x020100030E0D0C0F, 0x0A09080B06050407
+.quad  0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad  0x0A09080B06050407, 0x020100030E0D0C0F
+
+L$k_sr:
+.quad  0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad  0x030E09040F0A0500, 0x0B06010C07020D08
+.quad  0x0F060D040B020900, 0x070E050C030A0108
+.quad  0x0B0E0104070A0D00, 0x0306090C0F020508
+
+L$k_rcon:
+.quad  0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+L$k_s63:
+.quad  0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+L$k_opt:
+.quad  0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad  0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+L$k_deskew:
+.quad  0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad  0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+L$k_dksd:
+.quad  0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad  0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+L$k_dksb:
+.quad  0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad  0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+L$k_dkse:
+.quad  0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad  0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+L$k_dks9:
+.quad  0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad  0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+L$k_dipt:
+.quad  0x0F505B040B545F00, 0x154A411E114E451A
+.quad  0x86E383E660056500, 0x12771772F491F194
+
+L$k_dsb9:
+.quad  0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad  0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+L$k_dsbd:
+.quad  0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad  0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+L$k_dsbb:
+.quad  0xD022649296B44200, 0x602646F6B0F2D404
+.quad  0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+L$k_dsbe:
+.quad  0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad  0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+L$k_dsbo:
+.quad  0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad  0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte  86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.p2align       6
diff --git a/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s b/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s
new file mode 100644 (file)
index 0000000..189f17f
--- /dev/null
@@ -0,0 +1,1027 @@
+.text
+
+
+.globl _gcm_gmult_4bit
+
+.p2align       4
+_gcm_gmult_4bit:
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+L$gmult_prologue:
+
+       movzbq  15(%rdi),%r8
+       leaq    L$rem_4bit(%rip),%r11
+       xorq    %rax,%rax
+       xorq    %rbx,%rbx
+       movb    %r8b,%al
+       movb    %r8b,%bl
+       shlb    $4,%al
+       movq    $14,%rcx
+       movq    8(%rsi,%rax,1),%r8
+       movq    (%rsi,%rax,1),%r9
+       andb    $240,%bl
+       movq    %r8,%rdx
+       jmp     L$oop1
+
+.p2align       4
+L$oop1:
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       movb    (%rdi,%rcx,1),%al
+       shrq    $4,%r9
+       xorq    8(%rsi,%rbx,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rbx,1),%r9
+       movb    %al,%bl
+       xorq    (%r11,%rdx,8),%r9
+       movq    %r8,%rdx
+       shlb    $4,%al
+       xorq    %r10,%r8
+       decq    %rcx
+       js      L$break1
+
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       xorq    8(%rsi,%rax,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rax,1),%r9
+       andb    $240,%bl
+       xorq    (%r11,%rdx,8),%r9
+       movq    %r8,%rdx
+       xorq    %r10,%r8
+       jmp     L$oop1
+
+.p2align       4
+L$break1:
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       xorq    8(%rsi,%rax,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rax,1),%r9
+       andb    $240,%bl
+       xorq    (%r11,%rdx,8),%r9
+       movq    %r8,%rdx
+       xorq    %r10,%r8
+
+       shrq    $4,%r8
+       andq    $15,%rdx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       xorq    8(%rsi,%rbx,1),%r8
+       shlq    $60,%r10
+       xorq    (%rsi,%rbx,1),%r9
+       xorq    %r10,%r8
+       xorq    (%r11,%rdx,8),%r9
+
+       bswapq  %r8
+       bswapq  %r9
+       movq    %r8,8(%rdi)
+       movq    %r9,(%rdi)
+
+       movq    16(%rsp),%rbx
+       leaq    24(%rsp),%rsp
+L$gmult_epilogue:
+       .byte   0xf3,0xc3
+
+.globl _gcm_ghash_4bit
+
+.p2align       4
+_gcm_ghash_4bit:
+       pushq   %rbx
+       pushq   %rbp
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+       subq    $280,%rsp
+L$ghash_prologue:
+       movq    %rdx,%r14
+       movq    %rcx,%r15
+       subq    $-128,%rsi
+       leaq    16+128(%rsp),%rbp
+       xorl    %edx,%edx
+       movq    0+0-128(%rsi),%r8
+       movq    0+8-128(%rsi),%rax
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    16+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    16+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,0(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,0(%rbp)
+       movq    32+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,0-128(%rbp)
+       movq    32+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,1(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,8(%rbp)
+       movq    48+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,8-128(%rbp)
+       movq    48+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,2(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,16(%rbp)
+       movq    64+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,16-128(%rbp)
+       movq    64+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,3(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,24(%rbp)
+       movq    80+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,24-128(%rbp)
+       movq    80+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,4(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,32(%rbp)
+       movq    96+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,32-128(%rbp)
+       movq    96+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,5(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,40(%rbp)
+       movq    112+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,40-128(%rbp)
+       movq    112+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,6(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,48(%rbp)
+       movq    128+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,48-128(%rbp)
+       movq    128+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,7(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,56(%rbp)
+       movq    144+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,56-128(%rbp)
+       movq    144+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,8(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,64(%rbp)
+       movq    160+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,64-128(%rbp)
+       movq    160+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,9(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,72(%rbp)
+       movq    176+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,72-128(%rbp)
+       movq    176+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,10(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,80(%rbp)
+       movq    192+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,80-128(%rbp)
+       movq    192+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,11(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,88(%rbp)
+       movq    208+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,88-128(%rbp)
+       movq    208+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,12(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,96(%rbp)
+       movq    224+0-128(%rsi),%r8
+       shlb    $4,%dl
+       movq    %rax,96-128(%rbp)
+       movq    224+8-128(%rsi),%rax
+       shlq    $60,%r10
+       movb    %dl,13(%rsp)
+       orq     %r10,%rbx
+       movb    %al,%dl
+       shrq    $4,%rax
+       movq    %r8,%r10
+       shrq    $4,%r8
+       movq    %r9,104(%rbp)
+       movq    240+0-128(%rsi),%r9
+       shlb    $4,%dl
+       movq    %rbx,104-128(%rbp)
+       movq    240+8-128(%rsi),%rbx
+       shlq    $60,%r10
+       movb    %dl,14(%rsp)
+       orq     %r10,%rax
+       movb    %bl,%dl
+       shrq    $4,%rbx
+       movq    %r9,%r10
+       shrq    $4,%r9
+       movq    %r8,112(%rbp)
+       shlb    $4,%dl
+       movq    %rax,112-128(%rbp)
+       shlq    $60,%r10
+       movb    %dl,15(%rsp)
+       orq     %r10,%rbx
+       movq    %r9,120(%rbp)
+       movq    %rbx,120-128(%rbp)
+       addq    $-128,%rsi
+       movq    8(%rdi),%r8
+       movq    0(%rdi),%r9
+       addq    %r14,%r15
+       leaq    L$rem_8bit(%rip),%r11
+       jmp     L$outer_loop
+.p2align       4
+L$outer_loop:
+       xorq    (%r14),%r9
+       movq    8(%r14),%rdx
+       leaq    16(%r14),%r14
+       xorq    %r8,%rdx
+       movq    %r9,(%rdi)
+       movq    %rdx,8(%rdi)
+       shrq    $32,%rdx
+       xorq    %rax,%rax
+       roll    $8,%edx
+       movb    %dl,%al
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       shrl    $4,%ebx
+       roll    $8,%edx
+       movq    8(%rsi,%rax,1),%r8
+       movq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    8(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    4(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    0(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       shrl    $4,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r12,2),%r12
+       movzbl  %dl,%ebx
+       shlb    $4,%al
+       movzbq  (%rsp,%rcx,1),%r13
+       shrl    $4,%ebx
+       shlq    $48,%r12
+       xorq    %r8,%r13
+       movq    %r9,%r10
+       xorq    %r12,%r9
+       shrq    $8,%r8
+       movzbq  %r13b,%r13
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rcx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rcx,8),%r9
+       roll    $8,%edx
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       movb    %dl,%al
+       xorq    %r10,%r8
+       movzwq  (%r11,%r13,2),%r13
+       movzbl  %dl,%ecx
+       shlb    $4,%al
+       movzbq  (%rsp,%rbx,1),%r12
+       andl    $240,%ecx
+       shlq    $48,%r13
+       xorq    %r8,%r12
+       movq    %r9,%r10
+       xorq    %r13,%r9
+       shrq    $8,%r8
+       movzbq  %r12b,%r12
+       movl    -4(%rdi),%edx
+       shrq    $8,%r9
+       xorq    -128(%rbp,%rbx,8),%r8
+       shlq    $56,%r10
+       xorq    (%rbp,%rbx,8),%r9
+       movzwq  (%r11,%r12,2),%r12
+       xorq    8(%rsi,%rax,1),%r8
+       xorq    (%rsi,%rax,1),%r9
+       shlq    $48,%r12
+       xorq    %r10,%r8
+       xorq    %r12,%r9
+       movzbq  %r8b,%r13
+       shrq    $4,%r8
+       movq    %r9,%r10
+       shlb    $4,%r13b
+       shrq    $4,%r9
+       xorq    8(%rsi,%rcx,1),%r8
+       movzwq  (%r11,%r13,2),%r13
+       shlq    $60,%r10
+       xorq    (%rsi,%rcx,1),%r9
+       xorq    %r10,%r8
+       shlq    $48,%r13
+       bswapq  %r8
+       xorq    %r13,%r9
+       bswapq  %r9
+       cmpq    %r15,%r14
+       jb      L$outer_loop
+       movq    %r8,8(%rdi)
+       movq    %r9,(%rdi)
+
+       leaq    280(%rsp),%rsi
+       movq    0(%rsi),%r15
+       movq    8(%rsi),%r14
+       movq    16(%rsi),%r13
+       movq    24(%rsi),%r12
+       movq    32(%rsi),%rbp
+       movq    40(%rsi),%rbx
+       leaq    48(%rsi),%rsp
+L$ghash_epilogue:
+       .byte   0xf3,0xc3
+
+.globl _gcm_init_clmul
+
+.p2align       4
+_gcm_init_clmul:
+       movdqu  (%rsi),%xmm2
+       pshufd  $78,%xmm2,%xmm2
+
+
+       pshufd  $255,%xmm2,%xmm4
+       movdqa  %xmm2,%xmm3
+       psllq   $1,%xmm2
+       pxor    %xmm5,%xmm5
+       psrlq   $63,%xmm3
+       pcmpgtd %xmm4,%xmm5
+       pslldq  $8,%xmm3
+       por     %xmm3,%xmm2
+
+
+       pand    L$0x1c2_polynomial(%rip),%xmm5
+       pxor    %xmm5,%xmm2
+
+
+       movdqa  %xmm2,%xmm0
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,194,0
+.byte  102,15,58,68,202,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       movdqu  %xmm2,(%rdi)
+       movdqu  %xmm0,16(%rdi)
+       .byte   0xf3,0xc3
+
+.globl _gcm_gmult_clmul
+
+.p2align       4
+_gcm_gmult_clmul:
+       movdqu  (%rdi),%xmm0
+       movdqa  L$bswap_mask(%rip),%xmm5
+       movdqu  (%rsi),%xmm2
+.byte  102,15,56,0,197
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,194,0
+.byte  102,15,58,68,202,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,197
+       movdqu  %xmm0,(%rdi)
+       .byte   0xf3,0xc3
+
+.globl _gcm_ghash_clmul
+
+.p2align       4
+_gcm_ghash_clmul:
+       movdqa  L$bswap_mask(%rip),%xmm5
+
+       movdqu  (%rdi),%xmm0
+       movdqu  (%rsi),%xmm2
+.byte  102,15,56,0,197
+
+       subq    $16,%rcx
+       jz      L$odd_tail
+
+       movdqu  16(%rsi),%xmm8
+
+
+
+
+
+       movdqu  (%rdx),%xmm3
+       movdqu  16(%rdx),%xmm6
+.byte  102,15,56,0,221
+.byte  102,15,56,0,245
+       pxor    %xmm3,%xmm0
+       movdqa  %xmm6,%xmm7
+       pshufd  $78,%xmm6,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm6,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,242,0
+.byte  102,15,58,68,250,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm6,%xmm3
+       pxor    %xmm7,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm7
+       pxor    %xmm4,%xmm6
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm8,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm8,%xmm4
+
+       leaq    32(%rdx),%rdx
+       subq    $32,%rcx
+       jbe     L$even_tail
+
+L$mod_loop:
+.byte  102,65,15,58,68,192,0
+.byte  102,65,15,58,68,200,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+       movdqu  (%rdx),%xmm3
+       pxor    %xmm6,%xmm0
+       pxor    %xmm7,%xmm1
+
+       movdqu  16(%rdx),%xmm6
+.byte  102,15,56,0,221
+.byte  102,15,56,0,245
+
+       movdqa  %xmm6,%xmm7
+       pshufd  $78,%xmm6,%xmm9
+       pshufd  $78,%xmm2,%xmm10
+       pxor    %xmm6,%xmm9
+       pxor    %xmm2,%xmm10
+       pxor    %xmm3,%xmm1
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+.byte  102,15,58,68,242,0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+.byte  102,15,58,68,250,17
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+
+.byte  102,69,15,58,68,202,0
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm8,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm8,%xmm4
+
+       pxor    %xmm6,%xmm9
+       pxor    %xmm7,%xmm9
+       movdqa  %xmm9,%xmm10
+       psrldq  $8,%xmm9
+       pslldq  $8,%xmm10
+       pxor    %xmm9,%xmm7
+       pxor    %xmm10,%xmm6
+
+       leaq    32(%rdx),%rdx
+       subq    $32,%rcx
+       ja      L$mod_loop
+
+L$even_tail:
+.byte  102,65,15,58,68,192,0
+.byte  102,65,15,58,68,200,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+       pxor    %xmm6,%xmm0
+       pxor    %xmm7,%xmm1
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       testq   %rcx,%rcx
+       jnz     L$done
+
+L$odd_tail:
+       movdqu  (%rdx),%xmm3
+.byte  102,15,56,0,221
+       pxor    %xmm3,%xmm0
+       movdqa  %xmm0,%xmm1
+       pshufd  $78,%xmm0,%xmm3
+       pshufd  $78,%xmm2,%xmm4
+       pxor    %xmm0,%xmm3
+       pxor    %xmm2,%xmm4
+.byte  102,15,58,68,194,0
+.byte  102,15,58,68,202,17
+.byte  102,15,58,68,220,0
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+
+       movdqa  %xmm3,%xmm4
+       psrldq  $8,%xmm3
+       pslldq  $8,%xmm4
+       pxor    %xmm3,%xmm1
+       pxor    %xmm4,%xmm0
+
+       movdqa  %xmm0,%xmm3
+       psllq   $1,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $5,%xmm0
+       pxor    %xmm3,%xmm0
+       psllq   $57,%xmm0
+       movdqa  %xmm0,%xmm4
+       pslldq  $8,%xmm0
+       psrldq  $8,%xmm4
+       pxor    %xmm3,%xmm0
+       pxor    %xmm4,%xmm1
+
+
+       movdqa  %xmm0,%xmm4
+       psrlq   $5,%xmm0
+       pxor    %xmm4,%xmm0
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+       pxor    %xmm1,%xmm4
+       psrlq   $1,%xmm0
+       pxor    %xmm4,%xmm0
+L$done:
+.byte  102,15,56,0,197
+       movdqu  %xmm0,(%rdi)
+       .byte   0xf3,0xc3
+L$SEH_end_gcm_ghash_clmul:
+
+.p2align       6
+L$bswap_mask:
+.byte  15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$0x1c2_polynomial:
+.byte  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.p2align       6
+
+L$rem_4bit:
+.long  0,0,0,471859200,0,943718400,0,610271232
+.long  0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long  0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long  0,2441084928,0,2376073216,0,2847932416,0,3051356160
+
+L$rem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte  71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align       6
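
(Editorial note, not part of the generated file above: the L$rem_4bit / L$rem_8bit tables and the PCLMULQDQ-based gcm_*_clmul routines all implement the same GF(2^128) multiplication that GHASH is built on. As a point of reference only, a bit-at-a-time version of that multiply, following the NIST SP 800-38D description rather than this file's table-driven or carry-less-multiply formulations, looks roughly like the sketch below; the struct and function names are illustrative, not symbols from this commit.)

    /* Reference GF(2^128) multiply for GHASH, per SP 800-38D Algorithm 1.
     * hi holds GCM bits 0..63 (leftmost), lo holds bits 64..127. */
    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;

    static u128 gf128_mul(u128 X, u128 Y)
    {
        u128 Z = { 0, 0 };   /* accumulator */
        u128 V = Y;          /* running multiple of Y */

        for (int i = 0; i < 128; i++) {
            /* x_i: bit i of X in GCM's left-to-right numbering */
            uint64_t bit = (i < 64) ? (X.hi >> (63 - i)) & 1
                                    : (X.lo >> (127 - i)) & 1;
            if (bit) { Z.hi ^= V.hi; Z.lo ^= V.lo; }

            /* V = V * x mod (x^128 + x^7 + x^2 + x + 1) */
            int carry = (int)(V.lo & 1);
            V.lo = (V.lo >> 1) | (V.hi << 63);
            V.hi >>= 1;
            if (carry)
                V.hi ^= 0xe100000000000000ULL;   /* R = 0xE1 || 0^120 */
        }
        return Z;
    }

(The assembly reaches the same result far faster: the 4-bit/8-bit paths consume a nibble or byte at a time using the precomputed remainder tables, and the clmul paths use PCLMULQDQ with a Karatsuba split and the shift-based reduction seen in the psllq/psrlq sequences above.)
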
diff --git a/deps/openssl/asm/x64-win32-masm/aes/bsaes-x86_64.asm b/deps/openssl/asm/x64-win32-masm/aes/bsaes-x86_64.asm
new file mode 100644 (file)
index 0000000..cc055e7
--- /dev/null
@@ -0,0 +1,2811 @@
+OPTION DOTNAME
+.text$ SEGMENT ALIGN(64) 'CODE'
+
+EXTERN asm_AES_encrypt:NEAR
+EXTERN asm_AES_decrypt:NEAR
+
+
+ALIGN  64
+_bsaes_encrypt8        PROC PRIVATE
+       lea     r11,QWORD PTR[$L$BS0]
+
+       movdqa  xmm8,XMMWORD PTR[rax]
+       lea     rax,QWORD PTR[16+rax]
+       movdqa  xmm7,XMMWORD PTR[80+r11]
+       pxor    xmm15,xmm8
+       pxor    xmm0,xmm8
+DB     102,68,15,56,0,255
+       pxor    xmm1,xmm8
+DB     102,15,56,0,199
+       pxor    xmm2,xmm8
+DB     102,15,56,0,207
+       pxor    xmm3,xmm8
+DB     102,15,56,0,215
+       pxor    xmm4,xmm8
+DB     102,15,56,0,223
+       pxor    xmm5,xmm8
+DB     102,15,56,0,231
+       pxor    xmm6,xmm8
+DB     102,15,56,0,239
+DB     102,15,56,0,247
+_bsaes_encrypt8_bitslice::
+       movdqa  xmm7,XMMWORD PTR[r11]
+       movdqa  xmm8,XMMWORD PTR[16+r11]
+       movdqa  xmm9,xmm5
+       psrlq   xmm5,1
+       movdqa  xmm10,xmm3
+       psrlq   xmm3,1
+       pxor    xmm5,xmm6
+       pxor    xmm3,xmm4
+       pand    xmm5,xmm7
+       pand    xmm3,xmm7
+       pxor    xmm6,xmm5
+       psllq   xmm5,1
+       pxor    xmm4,xmm3
+       psllq   xmm3,1
+       pxor    xmm5,xmm9
+       pxor    xmm3,xmm10
+       movdqa  xmm9,xmm1
+       psrlq   xmm1,1
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,1
+       pxor    xmm1,xmm2
+       pxor    xmm15,xmm0
+       pand    xmm1,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm2,xmm1
+       psllq   xmm1,1
+       pxor    xmm0,xmm15
+       psllq   xmm15,1
+       pxor    xmm1,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm7,XMMWORD PTR[32+r11]
+       movdqa  xmm9,xmm4
+       psrlq   xmm4,2
+       movdqa  xmm10,xmm3
+       psrlq   xmm3,2
+       pxor    xmm4,xmm6
+       pxor    xmm3,xmm5
+       pand    xmm4,xmm8
+       pand    xmm3,xmm8
+       pxor    xmm6,xmm4
+       psllq   xmm4,2
+       pxor    xmm5,xmm3
+       psllq   xmm3,2
+       pxor    xmm4,xmm9
+       pxor    xmm3,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,2
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,2
+       pxor    xmm0,xmm2
+       pxor    xmm15,xmm1
+       pand    xmm0,xmm8
+       pand    xmm15,xmm8
+       pxor    xmm2,xmm0
+       psllq   xmm0,2
+       pxor    xmm1,xmm15
+       psllq   xmm15,2
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm9,xmm2
+       psrlq   xmm2,4
+       movdqa  xmm10,xmm1
+       psrlq   xmm1,4
+       pxor    xmm2,xmm6
+       pxor    xmm1,xmm5
+       pand    xmm2,xmm7
+       pand    xmm1,xmm7
+       pxor    xmm6,xmm2
+       psllq   xmm2,4
+       pxor    xmm5,xmm1
+       psllq   xmm1,4
+       pxor    xmm2,xmm9
+       pxor    xmm1,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,4
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,4
+       pxor    xmm0,xmm4
+       pxor    xmm15,xmm3
+       pand    xmm0,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm4,xmm0
+       psllq   xmm0,4
+       pxor    xmm3,xmm15
+       psllq   xmm15,4
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       dec     r10d
+       jmp     $L$enc_sbox
+ALIGN  16
+$L$enc_loop::
+       pxor    xmm15,XMMWORD PTR[rax]
+       pxor    xmm0,XMMWORD PTR[16+rax]
+DB     102,68,15,56,0,255
+       pxor    xmm1,XMMWORD PTR[32+rax]
+DB     102,15,56,0,199
+       pxor    xmm2,XMMWORD PTR[48+rax]
+DB     102,15,56,0,207
+       pxor    xmm3,XMMWORD PTR[64+rax]
+DB     102,15,56,0,215
+       pxor    xmm4,XMMWORD PTR[80+rax]
+DB     102,15,56,0,223
+       pxor    xmm5,XMMWORD PTR[96+rax]
+DB     102,15,56,0,231
+       pxor    xmm6,XMMWORD PTR[112+rax]
+DB     102,15,56,0,239
+       lea     rax,QWORD PTR[128+rax]
+DB     102,15,56,0,247
+$L$enc_sbox::
+       pxor    xmm4,xmm5
+       pxor    xmm1,xmm0
+       pxor    xmm2,xmm15
+       pxor    xmm5,xmm1
+       pxor    xmm4,xmm15
+
+       pxor    xmm5,xmm2
+       pxor    xmm2,xmm6
+       pxor    xmm6,xmm4
+       pxor    xmm2,xmm3
+       pxor    xmm3,xmm4
+       pxor    xmm2,xmm0
+
+       pxor    xmm1,xmm6
+       pxor    xmm0,xmm4
+       movdqa  xmm10,xmm6
+       movdqa  xmm9,xmm0
+       movdqa  xmm8,xmm4
+       movdqa  xmm12,xmm1
+       movdqa  xmm11,xmm5
+
+       pxor    xmm10,xmm3
+       pxor    xmm9,xmm1
+       pxor    xmm8,xmm2
+       movdqa  xmm13,xmm10
+       pxor    xmm12,xmm3
+       movdqa  xmm7,xmm9
+       pxor    xmm11,xmm15
+       movdqa  xmm14,xmm10
+
+       por     xmm9,xmm8
+       por     xmm10,xmm11
+       pxor    xmm14,xmm7
+       pand    xmm13,xmm11
+       pxor    xmm11,xmm8
+       pand    xmm7,xmm8
+       pand    xmm14,xmm11
+       movdqa  xmm11,xmm2
+       pxor    xmm11,xmm15
+       pand    xmm12,xmm11
+       pxor    xmm10,xmm12
+       pxor    xmm9,xmm12
+       movdqa  xmm12,xmm6
+       movdqa  xmm11,xmm4
+       pxor    xmm12,xmm0
+       pxor    xmm11,xmm5
+       movdqa  xmm8,xmm12
+       pand    xmm12,xmm11
+       por     xmm8,xmm11
+       pxor    xmm7,xmm12
+       pxor    xmm10,xmm14
+       pxor    xmm9,xmm13
+       pxor    xmm8,xmm14
+       movdqa  xmm11,xmm1
+       pxor    xmm7,xmm13
+       movdqa  xmm12,xmm3
+       pxor    xmm8,xmm13
+       movdqa  xmm13,xmm0
+       pand    xmm11,xmm2
+       movdqa  xmm14,xmm6
+       pand    xmm12,xmm15
+       pand    xmm13,xmm4
+       por     xmm14,xmm5
+       pxor    xmm10,xmm11
+       pxor    xmm9,xmm12
+       pxor    xmm8,xmm13
+       pxor    xmm7,xmm14
+
+
+
+
+
+       movdqa  xmm11,xmm10
+       pand    xmm10,xmm8
+       pxor    xmm11,xmm9
+
+       movdqa  xmm13,xmm7
+       movdqa  xmm14,xmm11
+       pxor    xmm13,xmm10
+       pand    xmm14,xmm13
+
+       movdqa  xmm12,xmm8
+       pxor    xmm14,xmm9
+       pxor    xmm12,xmm7
+
+       pxor    xmm10,xmm9
+
+       pand    xmm12,xmm10
+
+       movdqa  xmm9,xmm13
+       pxor    xmm12,xmm7
+
+       pxor    xmm9,xmm12
+       pxor    xmm8,xmm12
+
+       pand    xmm9,xmm7
+
+       pxor    xmm13,xmm9
+       pxor    xmm8,xmm9
+
+       pand    xmm13,xmm14
+
+       pxor    xmm13,xmm11
+       movdqa  xmm11,xmm5
+       movdqa  xmm7,xmm4
+       movdqa  xmm9,xmm14
+       pxor    xmm9,xmm13
+       pand    xmm9,xmm5
+       pxor    xmm5,xmm4
+       pand    xmm4,xmm14
+       pand    xmm5,xmm13
+       pxor    xmm5,xmm4
+       pxor    xmm4,xmm9
+       pxor    xmm11,xmm15
+       pxor    xmm7,xmm2
+       pxor    xmm14,xmm12
+       pxor    xmm13,xmm8
+       movdqa  xmm10,xmm14
+       movdqa  xmm9,xmm12
+       pxor    xmm10,xmm13
+       pxor    xmm9,xmm8
+       pand    xmm10,xmm11
+       pand    xmm9,xmm15
+       pxor    xmm11,xmm7
+       pxor    xmm15,xmm2
+       pand    xmm7,xmm14
+       pand    xmm2,xmm12
+       pand    xmm11,xmm13
+       pand    xmm15,xmm8
+       pxor    xmm7,xmm11
+       pxor    xmm15,xmm2
+       pxor    xmm11,xmm10
+       pxor    xmm2,xmm9
+       pxor    xmm5,xmm11
+       pxor    xmm15,xmm11
+       pxor    xmm4,xmm7
+       pxor    xmm2,xmm7
+
+       movdqa  xmm11,xmm6
+       movdqa  xmm7,xmm0
+       pxor    xmm11,xmm3
+       pxor    xmm7,xmm1
+       movdqa  xmm10,xmm14
+       movdqa  xmm9,xmm12
+       pxor    xmm10,xmm13
+       pxor    xmm9,xmm8
+       pand    xmm10,xmm11
+       pand    xmm9,xmm3
+       pxor    xmm11,xmm7
+       pxor    xmm3,xmm1
+       pand    xmm7,xmm14
+       pand    xmm1,xmm12
+       pand    xmm11,xmm13
+       pand    xmm3,xmm8
+       pxor    xmm7,xmm11
+       pxor    xmm3,xmm1
+       pxor    xmm11,xmm10
+       pxor    xmm1,xmm9
+       pxor    xmm14,xmm12
+       pxor    xmm13,xmm8
+       movdqa  xmm10,xmm14
+       pxor    xmm10,xmm13
+       pand    xmm10,xmm6
+       pxor    xmm6,xmm0
+       pand    xmm0,xmm14
+       pand    xmm6,xmm13
+       pxor    xmm6,xmm0
+       pxor    xmm0,xmm10
+       pxor    xmm6,xmm11
+       pxor    xmm3,xmm11
+       pxor    xmm0,xmm7
+       pxor    xmm1,xmm7
+       pxor    xmm6,xmm15
+       pxor    xmm0,xmm5
+       pxor    xmm3,xmm6
+       pxor    xmm5,xmm15
+       pxor    xmm15,xmm0
+
+       pxor    xmm0,xmm4
+       pxor    xmm4,xmm1
+       pxor    xmm1,xmm2
+       pxor    xmm2,xmm4
+       pxor    xmm3,xmm4
+
+       pxor    xmm5,xmm2
+       dec     r10d
+       jl      $L$enc_done
+       pshufd  xmm7,xmm15,093h
+       pshufd  xmm8,xmm0,093h
+       pxor    xmm15,xmm7
+       pshufd  xmm9,xmm3,093h
+       pxor    xmm0,xmm8
+       pshufd  xmm10,xmm5,093h
+       pxor    xmm3,xmm9
+       pshufd  xmm11,xmm2,093h
+       pxor    xmm5,xmm10
+       pshufd  xmm12,xmm6,093h
+       pxor    xmm2,xmm11
+       pshufd  xmm13,xmm1,093h
+       pxor    xmm6,xmm12
+       pshufd  xmm14,xmm4,093h
+       pxor    xmm1,xmm13
+       pxor    xmm4,xmm14
+
+       pxor    xmm8,xmm15
+       pxor    xmm7,xmm4
+       pxor    xmm8,xmm4
+       pshufd  xmm15,xmm15,04Eh
+       pxor    xmm9,xmm0
+       pshufd  xmm0,xmm0,04Eh
+       pxor    xmm12,xmm2
+       pxor    xmm15,xmm7
+       pxor    xmm13,xmm6
+       pxor    xmm0,xmm8
+       pxor    xmm11,xmm5
+       pshufd  xmm7,xmm2,04Eh
+       pxor    xmm14,xmm1
+       pshufd  xmm8,xmm6,04Eh
+       pxor    xmm10,xmm3
+       pshufd  xmm2,xmm5,04Eh
+       pxor    xmm10,xmm4
+       pshufd  xmm6,xmm4,04Eh
+       pxor    xmm11,xmm4
+       pshufd  xmm5,xmm1,04Eh
+       pxor    xmm7,xmm11
+       pshufd  xmm1,xmm3,04Eh
+       pxor    xmm8,xmm12
+
+       pxor    xmm2,xmm10
+       pxor    xmm6,xmm14
+       pxor    xmm5,xmm13
+       movdqa  xmm3,xmm7
+       pxor    xmm1,xmm9
+       movdqa  xmm4,xmm8
+       movdqa  xmm7,XMMWORD PTR[48+r11]
+       jnz     $L$enc_loop
+       movdqa  xmm7,XMMWORD PTR[64+r11]
+       jmp     $L$enc_loop
+ALIGN  16
+$L$enc_done::
+       movdqa  xmm7,XMMWORD PTR[r11]
+       movdqa  xmm8,XMMWORD PTR[16+r11]
+       movdqa  xmm9,xmm1
+       psrlq   xmm1,1
+       movdqa  xmm10,xmm2
+       psrlq   xmm2,1
+       pxor    xmm1,xmm4
+       pxor    xmm2,xmm6
+       pand    xmm1,xmm7
+       pand    xmm2,xmm7
+       pxor    xmm4,xmm1
+       psllq   xmm1,1
+       pxor    xmm6,xmm2
+       psllq   xmm2,1
+       pxor    xmm1,xmm9
+       pxor    xmm2,xmm10
+       movdqa  xmm9,xmm3
+       psrlq   xmm3,1
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,1
+       pxor    xmm3,xmm5
+       pxor    xmm15,xmm0
+       pand    xmm3,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm5,xmm3
+       psllq   xmm3,1
+       pxor    xmm0,xmm15
+       psllq   xmm15,1
+       pxor    xmm3,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm7,XMMWORD PTR[32+r11]
+       movdqa  xmm9,xmm6
+       psrlq   xmm6,2
+       movdqa  xmm10,xmm2
+       psrlq   xmm2,2
+       pxor    xmm6,xmm4
+       pxor    xmm2,xmm1
+       pand    xmm6,xmm8
+       pand    xmm2,xmm8
+       pxor    xmm4,xmm6
+       psllq   xmm6,2
+       pxor    xmm1,xmm2
+       psllq   xmm2,2
+       pxor    xmm6,xmm9
+       pxor    xmm2,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,2
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,2
+       pxor    xmm0,xmm5
+       pxor    xmm15,xmm3
+       pand    xmm0,xmm8
+       pand    xmm15,xmm8
+       pxor    xmm5,xmm0
+       psllq   xmm0,2
+       pxor    xmm3,xmm15
+       psllq   xmm15,2
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm9,xmm5
+       psrlq   xmm5,4
+       movdqa  xmm10,xmm3
+       psrlq   xmm3,4
+       pxor    xmm5,xmm4
+       pxor    xmm3,xmm1
+       pand    xmm5,xmm7
+       pand    xmm3,xmm7
+       pxor    xmm4,xmm5
+       psllq   xmm5,4
+       pxor    xmm1,xmm3
+       psllq   xmm3,4
+       pxor    xmm5,xmm9
+       pxor    xmm3,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,4
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,4
+       pxor    xmm0,xmm6
+       pxor    xmm15,xmm2
+       pand    xmm0,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm6,xmm0
+       psllq   xmm0,4
+       pxor    xmm2,xmm15
+       psllq   xmm15,4
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm7,XMMWORD PTR[rax]
+       pxor    xmm3,xmm7
+       pxor    xmm5,xmm7
+       pxor    xmm2,xmm7
+       pxor    xmm6,xmm7
+       pxor    xmm1,xmm7
+       pxor    xmm4,xmm7
+       pxor    xmm15,xmm7
+       pxor    xmm0,xmm7
+       DB      0F3h,0C3h               ;repret
+_bsaes_encrypt8        ENDP
+
+
+ALIGN  64
+_bsaes_decrypt8        PROC PRIVATE
+       lea     r11,QWORD PTR[$L$BS0]
+
+       movdqa  xmm8,XMMWORD PTR[rax]
+       lea     rax,QWORD PTR[16+rax]
+       movdqa  xmm7,XMMWORD PTR[((-48))+r11]
+       pxor    xmm15,xmm8
+       pxor    xmm0,xmm8
+DB     102,68,15,56,0,255
+       pxor    xmm1,xmm8
+DB     102,15,56,0,199
+       pxor    xmm2,xmm8
+DB     102,15,56,0,207
+       pxor    xmm3,xmm8
+DB     102,15,56,0,215
+       pxor    xmm4,xmm8
+DB     102,15,56,0,223
+       pxor    xmm5,xmm8
+DB     102,15,56,0,231
+       pxor    xmm6,xmm8
+DB     102,15,56,0,239
+DB     102,15,56,0,247
+       movdqa  xmm7,XMMWORD PTR[r11]
+       movdqa  xmm8,XMMWORD PTR[16+r11]
+       movdqa  xmm9,xmm5
+       psrlq   xmm5,1
+       movdqa  xmm10,xmm3
+       psrlq   xmm3,1
+       pxor    xmm5,xmm6
+       pxor    xmm3,xmm4
+       pand    xmm5,xmm7
+       pand    xmm3,xmm7
+       pxor    xmm6,xmm5
+       psllq   xmm5,1
+       pxor    xmm4,xmm3
+       psllq   xmm3,1
+       pxor    xmm5,xmm9
+       pxor    xmm3,xmm10
+       movdqa  xmm9,xmm1
+       psrlq   xmm1,1
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,1
+       pxor    xmm1,xmm2
+       pxor    xmm15,xmm0
+       pand    xmm1,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm2,xmm1
+       psllq   xmm1,1
+       pxor    xmm0,xmm15
+       psllq   xmm15,1
+       pxor    xmm1,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm7,XMMWORD PTR[32+r11]
+       movdqa  xmm9,xmm4
+       psrlq   xmm4,2
+       movdqa  xmm10,xmm3
+       psrlq   xmm3,2
+       pxor    xmm4,xmm6
+       pxor    xmm3,xmm5
+       pand    xmm4,xmm8
+       pand    xmm3,xmm8
+       pxor    xmm6,xmm4
+       psllq   xmm4,2
+       pxor    xmm5,xmm3
+       psllq   xmm3,2
+       pxor    xmm4,xmm9
+       pxor    xmm3,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,2
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,2
+       pxor    xmm0,xmm2
+       pxor    xmm15,xmm1
+       pand    xmm0,xmm8
+       pand    xmm15,xmm8
+       pxor    xmm2,xmm0
+       psllq   xmm0,2
+       pxor    xmm1,xmm15
+       psllq   xmm15,2
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm9,xmm2
+       psrlq   xmm2,4
+       movdqa  xmm10,xmm1
+       psrlq   xmm1,4
+       pxor    xmm2,xmm6
+       pxor    xmm1,xmm5
+       pand    xmm2,xmm7
+       pand    xmm1,xmm7
+       pxor    xmm6,xmm2
+       psllq   xmm2,4
+       pxor    xmm5,xmm1
+       psllq   xmm1,4
+       pxor    xmm2,xmm9
+       pxor    xmm1,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,4
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,4
+       pxor    xmm0,xmm4
+       pxor    xmm15,xmm3
+       pand    xmm0,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm4,xmm0
+       psllq   xmm0,4
+       pxor    xmm3,xmm15
+       psllq   xmm15,4
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       dec     r10d
+       jmp     $L$dec_sbox
+ALIGN  16
+$L$dec_loop::
+       pxor    xmm15,XMMWORD PTR[rax]
+       pxor    xmm0,XMMWORD PTR[16+rax]
+DB     102,68,15,56,0,255
+       pxor    xmm1,XMMWORD PTR[32+rax]
+DB     102,15,56,0,199
+       pxor    xmm2,XMMWORD PTR[48+rax]
+DB     102,15,56,0,207
+       pxor    xmm3,XMMWORD PTR[64+rax]
+DB     102,15,56,0,215
+       pxor    xmm4,XMMWORD PTR[80+rax]
+DB     102,15,56,0,223
+       pxor    xmm5,XMMWORD PTR[96+rax]
+DB     102,15,56,0,231
+       pxor    xmm6,XMMWORD PTR[112+rax]
+DB     102,15,56,0,239
+       lea     rax,QWORD PTR[128+rax]
+DB     102,15,56,0,247
+$L$dec_sbox::
+       pxor    xmm2,xmm3
+
+       pxor    xmm3,xmm6
+       pxor    xmm1,xmm6
+       pxor    xmm5,xmm3
+       pxor    xmm6,xmm5
+       pxor    xmm0,xmm6
+
+       pxor    xmm15,xmm0
+       pxor    xmm1,xmm4
+       pxor    xmm2,xmm15
+       pxor    xmm4,xmm15
+       pxor    xmm0,xmm2
+       movdqa  xmm10,xmm2
+       movdqa  xmm9,xmm6
+       movdqa  xmm8,xmm0
+       movdqa  xmm12,xmm3
+       movdqa  xmm11,xmm4
+
+       pxor    xmm10,xmm15
+       pxor    xmm9,xmm3
+       pxor    xmm8,xmm5
+       movdqa  xmm13,xmm10
+       pxor    xmm12,xmm15
+       movdqa  xmm7,xmm9
+       pxor    xmm11,xmm1
+       movdqa  xmm14,xmm10
+
+       por     xmm9,xmm8
+       por     xmm10,xmm11
+       pxor    xmm14,xmm7
+       pand    xmm13,xmm11
+       pxor    xmm11,xmm8
+       pand    xmm7,xmm8
+       pand    xmm14,xmm11
+       movdqa  xmm11,xmm5
+       pxor    xmm11,xmm1
+       pand    xmm12,xmm11
+       pxor    xmm10,xmm12
+       pxor    xmm9,xmm12
+       movdqa  xmm12,xmm2
+       movdqa  xmm11,xmm0
+       pxor    xmm12,xmm6
+       pxor    xmm11,xmm4
+       movdqa  xmm8,xmm12
+       pand    xmm12,xmm11
+       por     xmm8,xmm11
+       pxor    xmm7,xmm12
+       pxor    xmm10,xmm14
+       pxor    xmm9,xmm13
+       pxor    xmm8,xmm14
+       movdqa  xmm11,xmm3
+       pxor    xmm7,xmm13
+       movdqa  xmm12,xmm15
+       pxor    xmm8,xmm13
+       movdqa  xmm13,xmm6
+       pand    xmm11,xmm5
+       movdqa  xmm14,xmm2
+       pand    xmm12,xmm1
+       pand    xmm13,xmm0
+       por     xmm14,xmm4
+       pxor    xmm10,xmm11
+       pxor    xmm9,xmm12
+       pxor    xmm8,xmm13
+       pxor    xmm7,xmm14
+
+
+
+
+
+       movdqa  xmm11,xmm10
+       pand    xmm10,xmm8
+       pxor    xmm11,xmm9
+
+       movdqa  xmm13,xmm7
+       movdqa  xmm14,xmm11
+       pxor    xmm13,xmm10
+       pand    xmm14,xmm13
+
+       movdqa  xmm12,xmm8
+       pxor    xmm14,xmm9
+       pxor    xmm12,xmm7
+
+       pxor    xmm10,xmm9
+
+       pand    xmm12,xmm10
+
+       movdqa  xmm9,xmm13
+       pxor    xmm12,xmm7
+
+       pxor    xmm9,xmm12
+       pxor    xmm8,xmm12
+
+       pand    xmm9,xmm7
+
+       pxor    xmm13,xmm9
+       pxor    xmm8,xmm9
+
+       pand    xmm13,xmm14
+
+       pxor    xmm13,xmm11
+       movdqa  xmm11,xmm4
+       movdqa  xmm7,xmm0
+       movdqa  xmm9,xmm14
+       pxor    xmm9,xmm13
+       pand    xmm9,xmm4
+       pxor    xmm4,xmm0
+       pand    xmm0,xmm14
+       pand    xmm4,xmm13
+       pxor    xmm4,xmm0
+       pxor    xmm0,xmm9
+       pxor    xmm11,xmm1
+       pxor    xmm7,xmm5
+       pxor    xmm14,xmm12
+       pxor    xmm13,xmm8
+       movdqa  xmm10,xmm14
+       movdqa  xmm9,xmm12
+       pxor    xmm10,xmm13
+       pxor    xmm9,xmm8
+       pand    xmm10,xmm11
+       pand    xmm9,xmm1
+       pxor    xmm11,xmm7
+       pxor    xmm1,xmm5
+       pand    xmm7,xmm14
+       pand    xmm5,xmm12
+       pand    xmm11,xmm13
+       pand    xmm1,xmm8
+       pxor    xmm7,xmm11
+       pxor    xmm1,xmm5
+       pxor    xmm11,xmm10
+       pxor    xmm5,xmm9
+       pxor    xmm4,xmm11
+       pxor    xmm1,xmm11
+       pxor    xmm0,xmm7
+       pxor    xmm5,xmm7
+
+       movdqa  xmm11,xmm2
+       movdqa  xmm7,xmm6
+       pxor    xmm11,xmm15
+       pxor    xmm7,xmm3
+       movdqa  xmm10,xmm14
+       movdqa  xmm9,xmm12
+       pxor    xmm10,xmm13
+       pxor    xmm9,xmm8
+       pand    xmm10,xmm11
+       pand    xmm9,xmm15
+       pxor    xmm11,xmm7
+       pxor    xmm15,xmm3
+       pand    xmm7,xmm14
+       pand    xmm3,xmm12
+       pand    xmm11,xmm13
+       pand    xmm15,xmm8
+       pxor    xmm7,xmm11
+       pxor    xmm15,xmm3
+       pxor    xmm11,xmm10
+       pxor    xmm3,xmm9
+       pxor    xmm14,xmm12
+       pxor    xmm13,xmm8
+       movdqa  xmm10,xmm14
+       pxor    xmm10,xmm13
+       pand    xmm10,xmm2
+       pxor    xmm2,xmm6
+       pand    xmm6,xmm14
+       pand    xmm2,xmm13
+       pxor    xmm2,xmm6
+       pxor    xmm6,xmm10
+       pxor    xmm2,xmm11
+       pxor    xmm15,xmm11
+       pxor    xmm6,xmm7
+       pxor    xmm3,xmm7
+       pxor    xmm0,xmm6
+       pxor    xmm5,xmm4
+
+       pxor    xmm3,xmm0
+       pxor    xmm1,xmm6
+       pxor    xmm4,xmm6
+       pxor    xmm3,xmm1
+       pxor    xmm6,xmm15
+       pxor    xmm3,xmm4
+       pxor    xmm2,xmm5
+       pxor    xmm5,xmm0
+       pxor    xmm2,xmm3
+
+       pxor    xmm3,xmm15
+       pxor    xmm6,xmm2
+       dec     r10d
+       jl      $L$dec_done
+
+       pshufd  xmm14,xmm4,093h
+       movdqa  xmm9,xmm5
+       pxor    xmm4,xmm6
+       pxor    xmm5,xmm6
+       pshufd  xmm7,xmm15,093h
+       movdqa  xmm12,xmm6
+       pxor    xmm6,xmm15
+       pxor    xmm15,xmm0
+       pshufd  xmm8,xmm0,093h
+       pxor    xmm0,xmm5
+       pxor    xmm15,xmm2
+       pxor    xmm0,xmm3
+       pshufd  xmm10,xmm3,093h
+       pxor    xmm5,xmm15
+       pxor    xmm3,xmm4
+       pxor    xmm4,xmm2
+       pshufd  xmm13,xmm2,093h
+       movdqa  xmm11,xmm1
+       pxor    xmm2,xmm1
+       pxor    xmm1,xmm3
+       pxor    xmm3,xmm4
+       pxor    xmm2,xmm12
+       pxor    xmm3,xmm9
+       pxor    xmm3,xmm11
+       pshufd  xmm12,xmm12,093h
+
+       pxor    xmm6,xmm4
+       pxor    xmm4,xmm7
+       pxor    xmm6,xmm8
+       pshufd  xmm9,xmm9,093h
+       pxor    xmm4,xmm12
+       pxor    xmm6,xmm13
+       pxor    xmm4,xmm14
+       pshufd  xmm11,xmm11,093h
+       pxor    xmm14,xmm13
+       pxor    xmm6,xmm4
+
+       pxor    xmm5,xmm7
+       pshufd  xmm7,xmm7,093h
+       pxor    xmm15,xmm8
+       pxor    xmm0,xmm8
+       pxor    xmm15,xmm9
+       pshufd  xmm8,xmm8,093h
+       pxor    xmm5,xmm9
+       pxor    xmm3,xmm9
+       pxor    xmm15,xmm14
+       pshufd  xmm9,xmm9,093h
+       pxor    xmm5,xmm10
+       pxor    xmm1,xmm10
+       pxor    xmm0,xmm10
+       pshufd  xmm10,xmm10,093h
+       pxor    xmm2,xmm11
+       pxor    xmm3,xmm11
+       pxor    xmm2,xmm14
+       pxor    xmm5,xmm12
+       pxor    xmm0,xmm11
+       pxor    xmm14,xmm12
+
+       pxor    xmm3,xmm14
+       pshufd  xmm11,xmm11,093h
+       pxor    xmm1,xmm14
+       pxor    xmm0,xmm14
+
+       pxor    xmm14,xmm12
+       pshufd  xmm12,xmm12,093h
+       pxor    xmm14,xmm13
+
+
+       pxor    xmm0,xmm2
+       pxor    xmm2,xmm11
+       pshufd  xmm13,xmm13,093h
+       pxor    xmm15,xmm7
+       pxor    xmm2,xmm12
+       pxor    xmm15,xmm9
+       pshufd  xmm14,xmm14,093h
+
+       pxor    xmm5,xmm6
+       pxor    xmm6,xmm8
+       pxor    xmm4,xmm7
+       pxor    xmm5,xmm7
+       pxor    xmm6,xmm12
+       pxor    xmm4,xmm12
+       pxor    xmm6,xmm14
+       pshufd  xmm7,xmm7,093h
+       pxor    xmm4,xmm13
+       pxor    xmm5,xmm6
+       pxor    xmm0,xmm8
+       pshufd  xmm8,xmm8,093h
+
+       pxor    xmm2,xmm14
+       pxor    xmm0,xmm9
+       pxor    xmm3,xmm9
+       pshufd  xmm9,xmm9,093h
+       pxor    xmm15,xmm13
+       pxor    xmm13,xmm10
+       pxor    xmm0,xmm2
+       pxor    xmm5,xmm13
+
+       pxor    xmm1,xmm13
+       pxor    xmm3,xmm12
+       pxor    xmm1,xmm11
+       pshufd  xmm11,xmm11,093h
+       pxor    xmm3,xmm13
+       pxor    xmm1,xmm14
+       pxor    xmm13,xmm10
+
+       pshufd  xmm12,xmm12,093h
+       pshufd  xmm13,xmm13,093h
+       pshufd  xmm14,xmm14,093h
+       pshufd  xmm10,xmm10,093h
+
+
+       pxor    xmm0,xmm6
+       pxor    xmm8,xmm6
+       pxor    xmm7,xmm12
+       pxor    xmm8,xmm12
+       pxor    xmm5,xmm7
+       pxor    xmm7,xmm4
+       pxor    xmm8,xmm13
+       pxor    xmm13,xmm14
+       pxor    xmm0,xmm8
+       pxor    xmm2,xmm11
+       pxor    xmm11,xmm0
+       pxor    xmm1,xmm10
+       pxor    xmm10,xmm5
+       pxor    xmm3,xmm9
+       pxor    xmm9,xmm15
+       pxor    xmm10,xmm14
+       pxor    xmm12,xmm3
+       pxor    xmm9,xmm13
+       pxor    xmm12,xmm13
+       pxor    xmm13,xmm1
+       pxor    xmm14,xmm2
+
+       movdqa  xmm15,xmm7
+       movdqa  xmm0,xmm8
+       movdqa  xmm1,xmm9
+       movdqa  xmm2,xmm10
+       movdqa  xmm3,xmm11
+       movdqa  xmm4,xmm12
+       movdqa  xmm5,xmm13
+       movdqa  xmm6,xmm14
+       movdqa  xmm7,XMMWORD PTR[((-16))+r11]
+       jnz     $L$dec_loop
+       movdqa  xmm7,XMMWORD PTR[((-32))+r11]
+       jmp     $L$dec_loop
+ALIGN  16
+$L$dec_done::
+       movdqa  xmm7,XMMWORD PTR[r11]
+       movdqa  xmm8,XMMWORD PTR[16+r11]
+       movdqa  xmm9,xmm2
+       psrlq   xmm2,1
+       movdqa  xmm10,xmm1
+       psrlq   xmm1,1
+       pxor    xmm2,xmm4
+       pxor    xmm1,xmm6
+       pand    xmm2,xmm7
+       pand    xmm1,xmm7
+       pxor    xmm4,xmm2
+       psllq   xmm2,1
+       pxor    xmm6,xmm1
+       psllq   xmm1,1
+       pxor    xmm2,xmm9
+       pxor    xmm1,xmm10
+       movdqa  xmm9,xmm5
+       psrlq   xmm5,1
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,1
+       pxor    xmm5,xmm3
+       pxor    xmm15,xmm0
+       pand    xmm5,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm3,xmm5
+       psllq   xmm5,1
+       pxor    xmm0,xmm15
+       psllq   xmm15,1
+       pxor    xmm5,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm7,XMMWORD PTR[32+r11]
+       movdqa  xmm9,xmm6
+       psrlq   xmm6,2
+       movdqa  xmm10,xmm1
+       psrlq   xmm1,2
+       pxor    xmm6,xmm4
+       pxor    xmm1,xmm2
+       pand    xmm6,xmm8
+       pand    xmm1,xmm8
+       pxor    xmm4,xmm6
+       psllq   xmm6,2
+       pxor    xmm2,xmm1
+       psllq   xmm1,2
+       pxor    xmm6,xmm9
+       pxor    xmm1,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,2
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,2
+       pxor    xmm0,xmm3
+       pxor    xmm15,xmm5
+       pand    xmm0,xmm8
+       pand    xmm15,xmm8
+       pxor    xmm3,xmm0
+       psllq   xmm0,2
+       pxor    xmm5,xmm15
+       psllq   xmm15,2
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm9,xmm3
+       psrlq   xmm3,4
+       movdqa  xmm10,xmm5
+       psrlq   xmm5,4
+       pxor    xmm3,xmm4
+       pxor    xmm5,xmm2
+       pand    xmm3,xmm7
+       pand    xmm5,xmm7
+       pxor    xmm4,xmm3
+       psllq   xmm3,4
+       pxor    xmm2,xmm5
+       psllq   xmm5,4
+       pxor    xmm3,xmm9
+       pxor    xmm5,xmm10
+       movdqa  xmm9,xmm0
+       psrlq   xmm0,4
+       movdqa  xmm10,xmm15
+       psrlq   xmm15,4
+       pxor    xmm0,xmm6
+       pxor    xmm15,xmm1
+       pand    xmm0,xmm7
+       pand    xmm15,xmm7
+       pxor    xmm6,xmm0
+       psllq   xmm0,4
+       pxor    xmm1,xmm15
+       psllq   xmm15,4
+       pxor    xmm0,xmm9
+       pxor    xmm15,xmm10
+       movdqa  xmm7,XMMWORD PTR[rax]
+       pxor    xmm5,xmm7
+       pxor    xmm3,xmm7
+       pxor    xmm1,xmm7
+       pxor    xmm6,xmm7
+       pxor    xmm2,xmm7
+       pxor    xmm4,xmm7
+       pxor    xmm15,xmm7
+       pxor    xmm0,xmm7
+       DB      0F3h,0C3h               ;repret
+_bsaes_decrypt8        ENDP
+
+ALIGN  16
+_bsaes_key_convert     PROC PRIVATE
+       lea     r11,QWORD PTR[$L$masks]
+       movdqu  xmm7,XMMWORD PTR[rcx]
+       lea     rcx,QWORD PTR[16+rcx]
+       movdqa  xmm0,XMMWORD PTR[r11]
+       movdqa  xmm1,XMMWORD PTR[16+r11]
+       movdqa  xmm2,XMMWORD PTR[32+r11]
+       movdqa  xmm3,XMMWORD PTR[48+r11]
+       movdqa  xmm4,XMMWORD PTR[64+r11]
+       pcmpeqd xmm5,xmm5
+
+       movdqu  xmm6,XMMWORD PTR[rcx]
+       movdqa  XMMWORD PTR[rax],xmm7
+       lea     rax,QWORD PTR[16+rax]
+       dec     r10d
+       jmp     $L$key_loop
+ALIGN  16
+$L$key_loop::
+DB     102,15,56,0,244
+
+       movdqa  xmm8,xmm0
+       movdqa  xmm9,xmm1
+
+       pand    xmm8,xmm6
+       pand    xmm9,xmm6
+       movdqa  xmm10,xmm2
+       pcmpeqb xmm8,xmm0
+       psllq   xmm0,4
+       movdqa  xmm11,xmm3
+       pcmpeqb xmm9,xmm1
+       psllq   xmm1,4
+
+       pand    xmm10,xmm6
+       pand    xmm11,xmm6
+       movdqa  xmm12,xmm0
+       pcmpeqb xmm10,xmm2
+       psllq   xmm2,4
+       movdqa  xmm13,xmm1
+       pcmpeqb xmm11,xmm3
+       psllq   xmm3,4
+
+       movdqa  xmm14,xmm2
+       movdqa  xmm15,xmm3
+       pxor    xmm8,xmm5
+       pxor    xmm9,xmm5
+
+       pand    xmm12,xmm6
+       pand    xmm13,xmm6
+       movdqa  XMMWORD PTR[rax],xmm8
+       pcmpeqb xmm12,xmm0
+       psrlq   xmm0,4
+       movdqa  XMMWORD PTR[16+rax],xmm9
+       pcmpeqb xmm13,xmm1
+       psrlq   xmm1,4
+       lea     rcx,QWORD PTR[16+rcx]
+
+       pand    xmm14,xmm6
+       pand    xmm15,xmm6
+       movdqa  XMMWORD PTR[32+rax],xmm10
+       pcmpeqb xmm14,xmm2
+       psrlq   xmm2,4
+       movdqa  XMMWORD PTR[48+rax],xmm11
+       pcmpeqb xmm15,xmm3
+       psrlq   xmm3,4
+       movdqu  xmm6,XMMWORD PTR[rcx]
+
+       pxor    xmm13,xmm5
+       pxor    xmm14,xmm5
+       movdqa  XMMWORD PTR[64+rax],xmm12
+       movdqa  XMMWORD PTR[80+rax],xmm13
+       movdqa  XMMWORD PTR[96+rax],xmm14
+       movdqa  XMMWORD PTR[112+rax],xmm15
+       lea     rax,QWORD PTR[128+rax]
+       dec     r10d
+       jnz     $L$key_loop
+
+       movdqa  xmm7,XMMWORD PTR[80+r11]
+
+       DB      0F3h,0C3h               ;repret
+_bsaes_key_convert     ENDP
+EXTERN asm_AES_cbc_encrypt:NEAR
+PUBLIC bsaes_cbc_encrypt
+
+ALIGN  16
+bsaes_cbc_encrypt      PROC PUBLIC
+       mov     r11d,DWORD PTR[48+rsp]
+       cmp     r11d,0
+       jne     asm_AES_cbc_encrypt
+       cmp     r8,128
+       jb      asm_AES_cbc_encrypt
+
+       mov     rax,rsp
+$L$cbc_dec_prologue::
+       push    rbp
+       push    rbx
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       lea     rsp,QWORD PTR[((-72))+rsp]
+       mov     r10,QWORD PTR[160+rsp]
+       lea     rsp,QWORD PTR[((-160))+rsp]
+       movaps  XMMWORD PTR[64+rsp],xmm6
+       movaps  XMMWORD PTR[80+rsp],xmm7
+       movaps  XMMWORD PTR[96+rsp],xmm8
+       movaps  XMMWORD PTR[112+rsp],xmm9
+       movaps  XMMWORD PTR[128+rsp],xmm10
+       movaps  XMMWORD PTR[144+rsp],xmm11
+       movaps  XMMWORD PTR[160+rsp],xmm12
+       movaps  XMMWORD PTR[176+rsp],xmm13
+       movaps  XMMWORD PTR[192+rsp],xmm14
+       movaps  XMMWORD PTR[208+rsp],xmm15
+$L$cbc_dec_body::
+       mov     rbp,rsp
+       mov     eax,DWORD PTR[240+r9]
+       mov     r12,rcx
+       mov     r13,rdx
+       mov     r14,r8
+       mov     r15,r9
+       mov     rbx,r10
+       shr     r14,4
+
+       mov     edx,eax
+       shl     rax,7
+       sub     rax,96
+       sub     rsp,rax
+
+       mov     rax,rsp
+       mov     rcx,r15
+       mov     r10d,edx
+       call    _bsaes_key_convert
+       pxor    xmm7,XMMWORD PTR[rsp]
+       movdqa  XMMWORD PTR[rax],xmm6
+       movdqa  XMMWORD PTR[rsp],xmm7
+
+       movdqu  xmm14,XMMWORD PTR[rbx]
+       sub     r14,8
+$L$cbc_dec_loop::
+       movdqu  xmm15,XMMWORD PTR[r12]
+       movdqu  xmm0,XMMWORD PTR[16+r12]
+       movdqu  xmm1,XMMWORD PTR[32+r12]
+       movdqu  xmm2,XMMWORD PTR[48+r12]
+       movdqu  xmm3,XMMWORD PTR[64+r12]
+       movdqu  xmm4,XMMWORD PTR[80+r12]
+       mov     rax,rsp
+       movdqu  xmm5,XMMWORD PTR[96+r12]
+       mov     r10d,edx
+       movdqu  xmm6,XMMWORD PTR[112+r12]
+       movdqa  XMMWORD PTR[32+rbp],xmm14
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm7
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm5,xmm8
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       pxor    xmm3,xmm9
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       pxor    xmm1,xmm10
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       pxor    xmm6,xmm11
+       movdqu  xmm13,XMMWORD PTR[96+r12]
+       pxor    xmm2,xmm12
+       movdqu  xmm14,XMMWORD PTR[112+r12]
+       pxor    xmm4,xmm13
+       movdqu  XMMWORD PTR[r13],xmm15
+       lea     r12,QWORD PTR[128+r12]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       movdqu  XMMWORD PTR[96+r13],xmm2
+       movdqu  XMMWORD PTR[112+r13],xmm4
+       lea     r13,QWORD PTR[128+r13]
+       sub     r14,8
+       jnc     $L$cbc_dec_loop
+
+       add     r14,8
+       jz      $L$cbc_dec_done
+
+       movdqu  xmm15,XMMWORD PTR[r12]
+       mov     rax,rsp
+       mov     r10d,edx
+       cmp     r14,2
+       jb      $L$cbc_dec_one
+       movdqu  xmm0,XMMWORD PTR[16+r12]
+       je      $L$cbc_dec_two
+       movdqu  xmm1,XMMWORD PTR[32+r12]
+       cmp     r14,4
+       jb      $L$cbc_dec_three
+       movdqu  xmm2,XMMWORD PTR[48+r12]
+       je      $L$cbc_dec_four
+       movdqu  xmm3,XMMWORD PTR[64+r12]
+       cmp     r14,6
+       jb      $L$cbc_dec_five
+       movdqu  xmm4,XMMWORD PTR[80+r12]
+       je      $L$cbc_dec_six
+       movdqu  xmm5,XMMWORD PTR[96+r12]
+       movdqa  XMMWORD PTR[32+rbp],xmm14
+       call    _bsaes_decrypt8
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm7
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm5,xmm8
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       pxor    xmm3,xmm9
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       pxor    xmm1,xmm10
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       pxor    xmm6,xmm11
+       movdqu  xmm14,XMMWORD PTR[96+r12]
+       pxor    xmm2,xmm12
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       movdqu  XMMWORD PTR[96+r13],xmm2
+       jmp     $L$cbc_dec_done
+ALIGN  16
+$L$cbc_dec_six::
+       movdqa  XMMWORD PTR[32+rbp],xmm14
+       call    _bsaes_decrypt8
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm7
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm5,xmm8
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       pxor    xmm3,xmm9
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       pxor    xmm1,xmm10
+       movdqu  xmm14,XMMWORD PTR[80+r12]
+       pxor    xmm6,xmm11
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       jmp     $L$cbc_dec_done
+ALIGN  16
+$L$cbc_dec_five::
+       movdqa  XMMWORD PTR[32+rbp],xmm14
+       call    _bsaes_decrypt8
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm7
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm5,xmm8
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       pxor    xmm3,xmm9
+       movdqu  xmm14,XMMWORD PTR[64+r12]
+       pxor    xmm1,xmm10
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       jmp     $L$cbc_dec_done
+ALIGN  16
+$L$cbc_dec_four::
+       movdqa  XMMWORD PTR[32+rbp],xmm14
+       call    _bsaes_decrypt8
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm7
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm5,xmm8
+       movdqu  xmm14,XMMWORD PTR[48+r12]
+       pxor    xmm3,xmm9
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       jmp     $L$cbc_dec_done
+ALIGN  16
+$L$cbc_dec_three::
+       movdqa  XMMWORD PTR[32+rbp],xmm14
+       call    _bsaes_decrypt8
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm7
+       movdqu  xmm14,XMMWORD PTR[32+r12]
+       pxor    xmm5,xmm8
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       jmp     $L$cbc_dec_done
+ALIGN  16
+$L$cbc_dec_two::
+       movdqa  XMMWORD PTR[32+rbp],xmm14
+       call    _bsaes_decrypt8
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm14,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm7
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       jmp     $L$cbc_dec_done
+ALIGN  16
+$L$cbc_dec_one::
+       lea     rcx,QWORD PTR[r12]
+       lea     rdx,QWORD PTR[32+rbp]
+       lea     r8,QWORD PTR[r15]
+       call    asm_AES_decrypt
+
+       pxor    xmm14,XMMWORD PTR[32+rbp]
+       movdqu  XMMWORD PTR[r13],xmm14
+       movdqa  xmm14,xmm15
+
+$L$cbc_dec_done::
+       movdqu  XMMWORD PTR[rbx],xmm14
+       lea     rax,QWORD PTR[rsp]
+       pxor    xmm0,xmm0
+$L$cbc_dec_bzero::
+       movdqa  XMMWORD PTR[rax],xmm0
+       movdqa  XMMWORD PTR[16+rax],xmm0
+       lea     rax,QWORD PTR[32+rax]
+       cmp     rbp,rax
+       ja      $L$cbc_dec_bzero
+
+       lea     rsp,QWORD PTR[rbp]
+       movaps  xmm6,XMMWORD PTR[64+rbp]
+       movaps  xmm7,XMMWORD PTR[80+rbp]
+       movaps  xmm8,XMMWORD PTR[96+rbp]
+       movaps  xmm9,XMMWORD PTR[112+rbp]
+       movaps  xmm10,XMMWORD PTR[128+rbp]
+       movaps  xmm11,XMMWORD PTR[144+rbp]
+       movaps  xmm12,XMMWORD PTR[160+rbp]
+       movaps  xmm13,XMMWORD PTR[176+rbp]
+       movaps  xmm14,XMMWORD PTR[192+rbp]
+       movaps  xmm15,XMMWORD PTR[208+rbp]
+       lea     rsp,QWORD PTR[160+rbp]
+       mov     r15,QWORD PTR[72+rsp]
+       mov     r14,QWORD PTR[80+rsp]
+       mov     r13,QWORD PTR[88+rsp]
+       mov     r12,QWORD PTR[96+rsp]
+       mov     rbx,QWORD PTR[104+rsp]
+       mov     rax,QWORD PTR[112+rsp]
+       lea     rsp,QWORD PTR[120+rsp]
+       mov     rbp,rax
+$L$cbc_dec_epilogue::
+       DB      0F3h,0C3h               ;repret
+bsaes_cbc_encrypt      ENDP
+
+PUBLIC bsaes_ctr32_encrypt_blocks
+
+ALIGN  16
+bsaes_ctr32_encrypt_blocks     PROC PUBLIC
+       mov     rax,rsp
+$L$ctr_enc_prologue::
+       push    rbp
+       push    rbx
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       lea     rsp,QWORD PTR[((-72))+rsp]
+       mov     r10,QWORD PTR[160+rsp]
+       lea     rsp,QWORD PTR[((-160))+rsp]
+       movaps  XMMWORD PTR[64+rsp],xmm6
+       movaps  XMMWORD PTR[80+rsp],xmm7
+       movaps  XMMWORD PTR[96+rsp],xmm8
+       movaps  XMMWORD PTR[112+rsp],xmm9
+       movaps  XMMWORD PTR[128+rsp],xmm10
+       movaps  XMMWORD PTR[144+rsp],xmm11
+       movaps  XMMWORD PTR[160+rsp],xmm12
+       movaps  XMMWORD PTR[176+rsp],xmm13
+       movaps  XMMWORD PTR[192+rsp],xmm14
+       movaps  XMMWORD PTR[208+rsp],xmm15
+$L$ctr_enc_body::
+       mov     rbp,rsp
+       movdqu  xmm0,XMMWORD PTR[r10]
+       mov     eax,DWORD PTR[240+r9]
+       mov     r12,rcx
+       mov     r13,rdx
+       mov     r14,r8
+       mov     r15,r9
+       movdqa  XMMWORD PTR[32+rbp],xmm0
+       cmp     r8,8
+       jb      $L$ctr_enc_short
+
+       mov     ebx,eax
+       shl     rax,7
+       sub     rax,96
+       sub     rsp,rax
+
+       mov     rax,rsp
+       mov     rcx,r15
+       mov     r10d,ebx
+       call    _bsaes_key_convert
+       pxor    xmm7,xmm6
+       movdqa  XMMWORD PTR[rax],xmm7
+
+       movdqa  xmm8,XMMWORD PTR[rsp]
+       lea     r11,QWORD PTR[$L$ADD1]
+       movdqa  xmm15,XMMWORD PTR[32+rbp]
+       movdqa  xmm7,XMMWORD PTR[((-32))+r11]
+DB     102,68,15,56,0,199
+DB     102,68,15,56,0,255
+       movdqa  XMMWORD PTR[rsp],xmm8
+       jmp     $L$ctr_enc_loop
+ALIGN  16
+$L$ctr_enc_loop::
+       movdqa  XMMWORD PTR[32+rbp],xmm15
+       movdqa  xmm0,xmm15
+       movdqa  xmm1,xmm15
+       paddd   xmm0,XMMWORD PTR[r11]
+       movdqa  xmm2,xmm15
+       paddd   xmm1,XMMWORD PTR[16+r11]
+       movdqa  xmm3,xmm15
+       paddd   xmm2,XMMWORD PTR[32+r11]
+       movdqa  xmm4,xmm15
+       paddd   xmm3,XMMWORD PTR[48+r11]
+       movdqa  xmm5,xmm15
+       paddd   xmm4,XMMWORD PTR[64+r11]
+       movdqa  xmm6,xmm15
+       paddd   xmm5,XMMWORD PTR[80+r11]
+       paddd   xmm6,XMMWORD PTR[96+r11]
+
+
+
+       movdqa  xmm8,XMMWORD PTR[rsp]
+       lea     rax,QWORD PTR[16+rsp]
+       movdqa  xmm7,XMMWORD PTR[((-16))+r11]
+       pxor    xmm15,xmm8
+       pxor    xmm0,xmm8
+DB     102,68,15,56,0,255
+       pxor    xmm1,xmm8
+DB     102,15,56,0,199
+       pxor    xmm2,xmm8
+DB     102,15,56,0,207
+       pxor    xmm3,xmm8
+DB     102,15,56,0,215
+       pxor    xmm4,xmm8
+DB     102,15,56,0,223
+       pxor    xmm5,xmm8
+DB     102,15,56,0,231
+       pxor    xmm6,xmm8
+DB     102,15,56,0,239
+       lea     r11,QWORD PTR[$L$BS0]
+DB     102,15,56,0,247
+       mov     r10d,ebx
+
+       call    _bsaes_encrypt8_bitslice
+
+       sub     r14,8
+       jc      $L$ctr_enc_loop_done
+
+       movdqu  xmm7,XMMWORD PTR[r12]
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       movdqu  xmm13,XMMWORD PTR[96+r12]
+       movdqu  xmm14,XMMWORD PTR[112+r12]
+       lea     r12,QWORD PTR[128+r12]
+       pxor    xmm7,xmm15
+       movdqa  xmm15,XMMWORD PTR[32+rbp]
+       pxor    xmm0,xmm8
+       movdqu  XMMWORD PTR[r13],xmm7
+       pxor    xmm3,xmm9
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm5,xmm10
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       pxor    xmm2,xmm11
+       movdqu  XMMWORD PTR[48+r13],xmm5
+       pxor    xmm6,xmm12
+       movdqu  XMMWORD PTR[64+r13],xmm2
+       pxor    xmm1,xmm13
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       pxor    xmm4,xmm14
+       movdqu  XMMWORD PTR[96+r13],xmm1
+       lea     r11,QWORD PTR[$L$ADD1]
+       movdqu  XMMWORD PTR[112+r13],xmm4
+       lea     r13,QWORD PTR[128+r13]
+       paddd   xmm15,XMMWORD PTR[112+r11]
+       jnz     $L$ctr_enc_loop
+
+       jmp     $L$ctr_enc_done
+ALIGN  16
+$L$ctr_enc_loop_done::
+       add     r14,8
+       movdqu  xmm7,XMMWORD PTR[r12]
+       pxor    xmm15,xmm7
+       movdqu  XMMWORD PTR[r13],xmm15
+       cmp     r14,2
+       jb      $L$ctr_enc_done
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm0,xmm8
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       je      $L$ctr_enc_done
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm3,xmm9
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       cmp     r14,4
+       jb      $L$ctr_enc_done
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       pxor    xmm5,xmm10
+       movdqu  XMMWORD PTR[48+r13],xmm5
+       je      $L$ctr_enc_done
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       pxor    xmm2,xmm11
+       movdqu  XMMWORD PTR[64+r13],xmm2
+       cmp     r14,6
+       jb      $L$ctr_enc_done
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       pxor    xmm6,xmm12
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       je      $L$ctr_enc_done
+       movdqu  xmm13,XMMWORD PTR[96+r12]
+       pxor    xmm1,xmm13
+       movdqu  XMMWORD PTR[96+r13],xmm1
+       jmp     $L$ctr_enc_done
+
+ALIGN  16
+$L$ctr_enc_short::
+       lea     rcx,QWORD PTR[32+rbp]
+       lea     rdx,QWORD PTR[48+rbp]
+       lea     r8,QWORD PTR[r15]
+       call    asm_AES_encrypt
+       movdqu  xmm0,XMMWORD PTR[r12]
+       lea     r12,QWORD PTR[16+r12]
+       mov     eax,DWORD PTR[44+rbp]
+       bswap   eax
+       pxor    xmm0,XMMWORD PTR[48+rbp]
+       inc     eax
+       movdqu  XMMWORD PTR[r13],xmm0
+       bswap   eax
+       lea     r13,QWORD PTR[16+r13]
+       mov     DWORD PTR[44+rsp],eax
+       dec     r14
+       jnz     $L$ctr_enc_short
+
+$L$ctr_enc_done::
+       lea     rax,QWORD PTR[rsp]
+       pxor    xmm0,xmm0
+$L$ctr_enc_bzero::
+       movdqa  XMMWORD PTR[rax],xmm0
+       movdqa  XMMWORD PTR[16+rax],xmm0
+       lea     rax,QWORD PTR[32+rax]
+       cmp     rbp,rax
+       ja      $L$ctr_enc_bzero
+
+       lea     rsp,QWORD PTR[rbp]
+       movaps  xmm6,XMMWORD PTR[64+rbp]
+       movaps  xmm7,XMMWORD PTR[80+rbp]
+       movaps  xmm8,XMMWORD PTR[96+rbp]
+       movaps  xmm9,XMMWORD PTR[112+rbp]
+       movaps  xmm10,XMMWORD PTR[128+rbp]
+       movaps  xmm11,XMMWORD PTR[144+rbp]
+       movaps  xmm12,XMMWORD PTR[160+rbp]
+       movaps  xmm13,XMMWORD PTR[176+rbp]
+       movaps  xmm14,XMMWORD PTR[192+rbp]
+       movaps  xmm15,XMMWORD PTR[208+rbp]
+       lea     rsp,QWORD PTR[160+rbp]
+       mov     r15,QWORD PTR[72+rsp]
+       mov     r14,QWORD PTR[80+rsp]
+       mov     r13,QWORD PTR[88+rsp]
+       mov     r12,QWORD PTR[96+rsp]
+       mov     rbx,QWORD PTR[104+rsp]
+       mov     rax,QWORD PTR[112+rsp]
+       lea     rsp,QWORD PTR[120+rsp]
+       mov     rbp,rax
+$L$ctr_enc_epilogue::
+       DB      0F3h,0C3h               ;repret
+bsaes_ctr32_encrypt_blocks     ENDP
+PUBLIC bsaes_xts_encrypt
+
+ALIGN  16
+bsaes_xts_encrypt      PROC PUBLIC
+       mov     rax,rsp
+$L$xts_enc_prologue::
+       push    rbp
+       push    rbx
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       lea     rsp,QWORD PTR[((-72))+rsp]
+       mov     r10,QWORD PTR[160+rsp]
+       mov     r11d,DWORD PTR[168+rsp]
+       lea     rsp,QWORD PTR[((-160))+rsp]
+       movaps  XMMWORD PTR[64+rsp],xmm6
+       movaps  XMMWORD PTR[80+rsp],xmm7
+       movaps  XMMWORD PTR[96+rsp],xmm8
+       movaps  XMMWORD PTR[112+rsp],xmm9
+       movaps  XMMWORD PTR[128+rsp],xmm10
+       movaps  XMMWORD PTR[144+rsp],xmm11
+       movaps  XMMWORD PTR[160+rsp],xmm12
+       movaps  XMMWORD PTR[176+rsp],xmm13
+       movaps  XMMWORD PTR[192+rsp],xmm14
+       movaps  XMMWORD PTR[208+rsp],xmm15
+$L$xts_enc_body::
+       mov     rbp,rsp
+       mov     r12,rcx
+       mov     r13,rdx
+       mov     r14,r8
+       mov     r15,r9
+
+       lea     rcx,QWORD PTR[r11]
+       lea     rdx,QWORD PTR[32+rbp]
+       lea     r8,QWORD PTR[r10]
+       call    asm_AES_encrypt
+
+
+       mov     eax,DWORD PTR[240+r15]
+       mov     rbx,r14
+
+       mov     edx,eax
+       shl     rax,7
+       sub     rax,96
+       sub     rsp,rax
+
+       mov     rax,rsp
+       mov     rcx,r15
+       mov     r10d,edx
+       call    _bsaes_key_convert
+       pxor    xmm7,xmm6
+       movdqa  XMMWORD PTR[rax],xmm7
+
+       and     r14,-16
+       sub     rsp,080h
+       movdqa  xmm6,XMMWORD PTR[32+rbp]
+
+       pxor    xmm14,xmm14
+       movdqa  xmm12,XMMWORD PTR[$L$xts_magic]
+       pcmpgtd xmm14,xmm6
+
+       sub     r14,080h
+       jc      $L$xts_enc_short
+       jmp     $L$xts_enc_loop
+
+ALIGN  16
+$L$xts_enc_loop::
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm15,xmm6
+       movdqa  XMMWORD PTR[rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm0,xmm6
+       movdqa  XMMWORD PTR[16+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm7,XMMWORD PTR[r12]
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm1,xmm6
+       movdqa  XMMWORD PTR[32+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm15,xmm7
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm2,xmm6
+       movdqa  XMMWORD PTR[48+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm0,xmm8
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm3,xmm6
+       movdqa  XMMWORD PTR[64+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       pxor    xmm1,xmm9
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm4,xmm6
+       movdqa  XMMWORD PTR[80+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       pxor    xmm2,xmm10
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm5,xmm6
+       movdqa  XMMWORD PTR[96+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       pxor    xmm3,xmm11
+       movdqu  xmm13,XMMWORD PTR[96+r12]
+       pxor    xmm4,xmm12
+       movdqu  xmm14,XMMWORD PTR[112+r12]
+       lea     r12,QWORD PTR[128+r12]
+       movdqa  XMMWORD PTR[112+rsp],xmm6
+       pxor    xmm5,xmm13
+       lea     rax,QWORD PTR[128+rsp]
+       pxor    xmm6,xmm14
+       mov     r10d,edx
+
+       call    _bsaes_encrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm3,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm5,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       pxor    xmm2,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm5
+       pxor    xmm6,XMMWORD PTR[80+rsp]
+       movdqu  XMMWORD PTR[64+r13],xmm2
+       pxor    xmm1,XMMWORD PTR[96+rsp]
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       pxor    xmm4,XMMWORD PTR[112+rsp]
+       movdqu  XMMWORD PTR[96+r13],xmm1
+       movdqu  XMMWORD PTR[112+r13],xmm4
+       lea     r13,QWORD PTR[128+r13]
+
+       movdqa  xmm6,XMMWORD PTR[112+rsp]
+       pxor    xmm14,xmm14
+       movdqa  xmm12,XMMWORD PTR[$L$xts_magic]
+       pcmpgtd xmm14,xmm6
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+
+       sub     r14,080h
+       jnc     $L$xts_enc_loop
+
+$L$xts_enc_short::
+       add     r14,080h
+       jz      $L$xts_enc_done
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm15,xmm6
+       movdqa  XMMWORD PTR[rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm0,xmm6
+       movdqa  XMMWORD PTR[16+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm7,XMMWORD PTR[r12]
+       cmp     r14,16
+       je      $L$xts_enc_1
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm1,xmm6
+       movdqa  XMMWORD PTR[32+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       cmp     r14,32
+       je      $L$xts_enc_2
+       pxor    xmm15,xmm7
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm2,xmm6
+       movdqa  XMMWORD PTR[48+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       cmp     r14,48
+       je      $L$xts_enc_3
+       pxor    xmm0,xmm8
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm3,xmm6
+       movdqa  XMMWORD PTR[64+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       cmp     r14,64
+       je      $L$xts_enc_4
+       pxor    xmm1,xmm9
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm4,xmm6
+       movdqa  XMMWORD PTR[80+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       cmp     r14,80
+       je      $L$xts_enc_5
+       pxor    xmm2,xmm10
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm5,xmm6
+       movdqa  XMMWORD PTR[96+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       cmp     r14,96
+       je      $L$xts_enc_6
+       pxor    xmm3,xmm11
+       movdqu  xmm13,XMMWORD PTR[96+r12]
+       pxor    xmm4,xmm12
+       movdqa  XMMWORD PTR[112+rsp],xmm6
+       lea     r12,QWORD PTR[112+r12]
+       pxor    xmm5,xmm13
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_encrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm3,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm5,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       pxor    xmm2,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm5
+       pxor    xmm6,XMMWORD PTR[80+rsp]
+       movdqu  XMMWORD PTR[64+r13],xmm2
+       pxor    xmm1,XMMWORD PTR[96+rsp]
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       movdqu  XMMWORD PTR[96+r13],xmm1
+       lea     r13,QWORD PTR[112+r13]
+
+       movdqa  xmm6,XMMWORD PTR[112+rsp]
+       jmp     $L$xts_enc_done
+ALIGN  16
+$L$xts_enc_6::
+       pxor    xmm3,xmm11
+       lea     r12,QWORD PTR[96+r12]
+       pxor    xmm4,xmm12
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_encrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm3,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm5,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       pxor    xmm2,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm5
+       pxor    xmm6,XMMWORD PTR[80+rsp]
+       movdqu  XMMWORD PTR[64+r13],xmm2
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       lea     r13,QWORD PTR[96+r13]
+
+       movdqa  xmm6,XMMWORD PTR[96+rsp]
+       jmp     $L$xts_enc_done
+ALIGN  16
+$L$xts_enc_5::
+       pxor    xmm2,xmm10
+       lea     r12,QWORD PTR[80+r12]
+       pxor    xmm3,xmm11
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_encrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm3,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm5,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       pxor    xmm2,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm5
+       movdqu  XMMWORD PTR[64+r13],xmm2
+       lea     r13,QWORD PTR[80+r13]
+
+       movdqa  xmm6,XMMWORD PTR[80+rsp]
+       jmp     $L$xts_enc_done
+ALIGN  16
+$L$xts_enc_4::
+       pxor    xmm1,xmm9
+       lea     r12,QWORD PTR[64+r12]
+       pxor    xmm2,xmm10
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_encrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm3,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm5,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       movdqu  XMMWORD PTR[48+r13],xmm5
+       lea     r13,QWORD PTR[64+r13]
+
+       movdqa  xmm6,XMMWORD PTR[64+rsp]
+       jmp     $L$xts_enc_done
+ALIGN  16
+$L$xts_enc_3::
+       pxor    xmm0,xmm8
+       lea     r12,QWORD PTR[48+r12]
+       pxor    xmm1,xmm9
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_encrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm3,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm3
+       lea     r13,QWORD PTR[48+r13]
+
+       movdqa  xmm6,XMMWORD PTR[48+rsp]
+       jmp     $L$xts_enc_done
+ALIGN  16
+$L$xts_enc_2::
+       pxor    xmm15,xmm7
+       lea     r12,QWORD PTR[32+r12]
+       pxor    xmm0,xmm8
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_encrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       lea     r13,QWORD PTR[32+r13]
+
+       movdqa  xmm6,XMMWORD PTR[32+rsp]
+       jmp     $L$xts_enc_done
+ALIGN  16
+$L$xts_enc_1::
+       pxor    xmm7,xmm15
+       lea     r12,QWORD PTR[16+r12]
+       movdqa  XMMWORD PTR[32+rbp],xmm7
+       lea     rcx,QWORD PTR[32+rbp]
+       lea     rdx,QWORD PTR[32+rbp]
+       lea     r8,QWORD PTR[r15]
+       call    asm_AES_encrypt
+
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+
+
+
+
+
+       movdqu  XMMWORD PTR[r13],xmm15
+       lea     r13,QWORD PTR[16+r13]
+
+       movdqa  xmm6,XMMWORD PTR[16+rsp]
+
+$L$xts_enc_done::
+       and     ebx,15
+       jz      $L$xts_enc_ret
+       mov     rdx,r13
+
+$L$xts_enc_steal::
+       movzx   eax,BYTE PTR[r12]
+       movzx   ecx,BYTE PTR[((-16))+rdx]
+       lea     r12,QWORD PTR[1+r12]
+       mov     BYTE PTR[((-16))+rdx],al
+       mov     BYTE PTR[rdx],cl
+       lea     rdx,QWORD PTR[1+rdx]
+       sub     ebx,1
+       jnz     $L$xts_enc_steal
+
+       movdqu  xmm15,XMMWORD PTR[((-16))+r13]
+       lea     rcx,QWORD PTR[32+rbp]
+       pxor    xmm15,xmm6
+       lea     rdx,QWORD PTR[32+rbp]
+       movdqa  XMMWORD PTR[32+rbp],xmm15
+       lea     r8,QWORD PTR[r15]
+       call    asm_AES_encrypt
+
+       pxor    xmm6,XMMWORD PTR[32+rbp]
+       movdqu  XMMWORD PTR[(-16)+r13],xmm6
+
+$L$xts_enc_ret::
+       lea     rax,QWORD PTR[rsp]
+       pxor    xmm0,xmm0
+$L$xts_enc_bzero::
+       movdqa  XMMWORD PTR[rax],xmm0
+       movdqa  XMMWORD PTR[16+rax],xmm0
+       lea     rax,QWORD PTR[32+rax]
+       cmp     rbp,rax
+       ja      $L$xts_enc_bzero
+
+       lea     rsp,QWORD PTR[rbp]
+       movaps  xmm6,XMMWORD PTR[64+rbp]
+       movaps  xmm7,XMMWORD PTR[80+rbp]
+       movaps  xmm8,XMMWORD PTR[96+rbp]
+       movaps  xmm9,XMMWORD PTR[112+rbp]
+       movaps  xmm10,XMMWORD PTR[128+rbp]
+       movaps  xmm11,XMMWORD PTR[144+rbp]
+       movaps  xmm12,XMMWORD PTR[160+rbp]
+       movaps  xmm13,XMMWORD PTR[176+rbp]
+       movaps  xmm14,XMMWORD PTR[192+rbp]
+       movaps  xmm15,XMMWORD PTR[208+rbp]
+       lea     rsp,QWORD PTR[160+rbp]
+       mov     r15,QWORD PTR[72+rsp]
+       mov     r14,QWORD PTR[80+rsp]
+       mov     r13,QWORD PTR[88+rsp]
+       mov     r12,QWORD PTR[96+rsp]
+       mov     rbx,QWORD PTR[104+rsp]
+       mov     rax,QWORD PTR[112+rsp]
+       lea     rsp,QWORD PTR[120+rsp]
+       mov     rbp,rax
+$L$xts_enc_epilogue::
+       DB      0F3h,0C3h               ;repret
+bsaes_xts_encrypt      ENDP
+
+PUBLIC bsaes_xts_decrypt
+
+ALIGN  16
+bsaes_xts_decrypt      PROC PUBLIC
+       mov     rax,rsp
+$L$xts_dec_prologue::
+       push    rbp
+       push    rbx
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       lea     rsp,QWORD PTR[((-72))+rsp]
+       mov     r10,QWORD PTR[160+rsp]
+       mov     r11d,DWORD PTR[168+rsp]
+       lea     rsp,QWORD PTR[((-160))+rsp]
+       movaps  XMMWORD PTR[64+rsp],xmm6
+       movaps  XMMWORD PTR[80+rsp],xmm7
+       movaps  XMMWORD PTR[96+rsp],xmm8
+       movaps  XMMWORD PTR[112+rsp],xmm9
+       movaps  XMMWORD PTR[128+rsp],xmm10
+       movaps  XMMWORD PTR[144+rsp],xmm11
+       movaps  XMMWORD PTR[160+rsp],xmm12
+       movaps  XMMWORD PTR[176+rsp],xmm13
+       movaps  XMMWORD PTR[192+rsp],xmm14
+       movaps  XMMWORD PTR[208+rsp],xmm15
+$L$xts_dec_body::
+       mov     rbp,rsp
+       mov     r12,rcx
+       mov     r13,rdx
+       mov     r14,r8
+       mov     r15,r9
+
+       lea     rcx,QWORD PTR[r11]
+       lea     rdx,QWORD PTR[32+rbp]
+       lea     r8,QWORD PTR[r10]
+       call    asm_AES_encrypt
+
+
+       mov     eax,DWORD PTR[240+r15]
+       mov     rbx,r14
+
+       mov     edx,eax
+       shl     rax,7
+       sub     rax,96
+       sub     rsp,rax
+
+       mov     rax,rsp
+       mov     rcx,r15
+       mov     r10d,edx
+       call    _bsaes_key_convert
+       pxor    xmm7,XMMWORD PTR[rsp]
+       movdqa  XMMWORD PTR[rax],xmm6
+       movdqa  XMMWORD PTR[rsp],xmm7
+
+       xor     eax,eax
+       and     r14,-16
+       test    ebx,15
+       setnz   al
+       shl     rax,4
+       sub     r14,rax
+
+       sub     rsp,080h
+       movdqa  xmm6,XMMWORD PTR[32+rbp]
+
+       pxor    xmm14,xmm14
+       movdqa  xmm12,XMMWORD PTR[$L$xts_magic]
+       pcmpgtd xmm14,xmm6
+
+       sub     r14,080h
+       jc      $L$xts_dec_short
+       jmp     $L$xts_dec_loop
+
+ALIGN  16
+$L$xts_dec_loop::
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm15,xmm6
+       movdqa  XMMWORD PTR[rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm0,xmm6
+       movdqa  XMMWORD PTR[16+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm7,XMMWORD PTR[r12]
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm1,xmm6
+       movdqa  XMMWORD PTR[32+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       pxor    xmm15,xmm7
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm2,xmm6
+       movdqa  XMMWORD PTR[48+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       pxor    xmm0,xmm8
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm3,xmm6
+       movdqa  XMMWORD PTR[64+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       pxor    xmm1,xmm9
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm4,xmm6
+       movdqa  XMMWORD PTR[80+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       pxor    xmm2,xmm10
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm5,xmm6
+       movdqa  XMMWORD PTR[96+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       pxor    xmm3,xmm11
+       movdqu  xmm13,XMMWORD PTR[96+r12]
+       pxor    xmm4,xmm12
+       movdqu  xmm14,XMMWORD PTR[112+r12]
+       lea     r12,QWORD PTR[128+r12]
+       movdqa  XMMWORD PTR[112+rsp],xmm6
+       pxor    xmm5,xmm13
+       lea     rax,QWORD PTR[128+rsp]
+       pxor    xmm6,xmm14
+       mov     r10d,edx
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm5,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm3,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       pxor    xmm1,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       pxor    xmm6,XMMWORD PTR[80+rsp]
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       pxor    xmm2,XMMWORD PTR[96+rsp]
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       pxor    xmm4,XMMWORD PTR[112+rsp]
+       movdqu  XMMWORD PTR[96+r13],xmm2
+       movdqu  XMMWORD PTR[112+r13],xmm4
+       lea     r13,QWORD PTR[128+r13]
+
+       movdqa  xmm6,XMMWORD PTR[112+rsp]
+       pxor    xmm14,xmm14
+       movdqa  xmm12,XMMWORD PTR[$L$xts_magic]
+       pcmpgtd xmm14,xmm6
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+
+       sub     r14,080h
+       jnc     $L$xts_dec_loop
+
+$L$xts_dec_short::
+       add     r14,080h
+       jz      $L$xts_dec_done
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm15,xmm6
+       movdqa  XMMWORD PTR[rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm0,xmm6
+       movdqa  XMMWORD PTR[16+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm7,XMMWORD PTR[r12]
+       cmp     r14,16
+       je      $L$xts_dec_1
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm1,xmm6
+       movdqa  XMMWORD PTR[32+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm8,XMMWORD PTR[16+r12]
+       cmp     r14,32
+       je      $L$xts_dec_2
+       pxor    xmm15,xmm7
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm2,xmm6
+       movdqa  XMMWORD PTR[48+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm9,XMMWORD PTR[32+r12]
+       cmp     r14,48
+       je      $L$xts_dec_3
+       pxor    xmm0,xmm8
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm3,xmm6
+       movdqa  XMMWORD PTR[64+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm10,XMMWORD PTR[48+r12]
+       cmp     r14,64
+       je      $L$xts_dec_4
+       pxor    xmm1,xmm9
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm4,xmm6
+       movdqa  XMMWORD PTR[80+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm11,XMMWORD PTR[64+r12]
+       cmp     r14,80
+       je      $L$xts_dec_5
+       pxor    xmm2,xmm10
+       pshufd  xmm13,xmm14,013h
+       pxor    xmm14,xmm14
+       movdqa  xmm5,xmm6
+       movdqa  XMMWORD PTR[96+rsp],xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       pcmpgtd xmm14,xmm6
+       pxor    xmm6,xmm13
+       movdqu  xmm12,XMMWORD PTR[80+r12]
+       cmp     r14,96
+       je      $L$xts_dec_6
+       pxor    xmm3,xmm11
+       movdqu  xmm13,XMMWORD PTR[96+r12]
+       pxor    xmm4,xmm12
+       movdqa  XMMWORD PTR[112+rsp],xmm6
+       lea     r12,QWORD PTR[112+r12]
+       pxor    xmm5,xmm13
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm5,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm3,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       pxor    xmm1,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       pxor    xmm6,XMMWORD PTR[80+rsp]
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       pxor    xmm2,XMMWORD PTR[96+rsp]
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       movdqu  XMMWORD PTR[96+r13],xmm2
+       lea     r13,QWORD PTR[112+r13]
+
+       movdqa  xmm6,XMMWORD PTR[112+rsp]
+       jmp     $L$xts_dec_done
+ALIGN  16
+$L$xts_dec_6::
+       pxor    xmm3,xmm11
+       lea     r12,QWORD PTR[96+r12]
+       pxor    xmm4,xmm12
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm5,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm3,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       pxor    xmm1,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       pxor    xmm6,XMMWORD PTR[80+rsp]
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       movdqu  XMMWORD PTR[80+r13],xmm6
+       lea     r13,QWORD PTR[96+r13]
+
+       movdqa  xmm6,XMMWORD PTR[96+rsp]
+       jmp     $L$xts_dec_done
+ALIGN  16
+$L$xts_dec_5::
+       pxor    xmm2,xmm10
+       lea     r12,QWORD PTR[80+r12]
+       pxor    xmm3,xmm11
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm5,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm3,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       pxor    xmm1,XMMWORD PTR[64+rsp]
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       movdqu  XMMWORD PTR[64+r13],xmm1
+       lea     r13,QWORD PTR[80+r13]
+
+       movdqa  xmm6,XMMWORD PTR[80+rsp]
+       jmp     $L$xts_dec_done
+ALIGN  16
+$L$xts_dec_4::
+       pxor    xmm1,xmm9
+       lea     r12,QWORD PTR[64+r12]
+       pxor    xmm2,xmm10
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm5,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       pxor    xmm3,XMMWORD PTR[48+rsp]
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       movdqu  XMMWORD PTR[48+r13],xmm3
+       lea     r13,QWORD PTR[64+r13]
+
+       movdqa  xmm6,XMMWORD PTR[64+rsp]
+       jmp     $L$xts_dec_done
+ALIGN  16
+$L$xts_dec_3::
+       pxor    xmm0,xmm8
+       lea     r12,QWORD PTR[48+r12]
+       pxor    xmm1,xmm9
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       pxor    xmm5,XMMWORD PTR[32+rsp]
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       movdqu  XMMWORD PTR[32+r13],xmm5
+       lea     r13,QWORD PTR[48+r13]
+
+       movdqa  xmm6,XMMWORD PTR[48+rsp]
+       jmp     $L$xts_dec_done
+ALIGN  16
+$L$xts_dec_2::
+       pxor    xmm15,xmm7
+       lea     r12,QWORD PTR[32+r12]
+       pxor    xmm0,xmm8
+       lea     rax,QWORD PTR[128+rsp]
+       mov     r10d,edx
+
+       call    _bsaes_decrypt8
+
+       pxor    xmm15,XMMWORD PTR[rsp]
+       pxor    xmm0,XMMWORD PTR[16+rsp]
+       movdqu  XMMWORD PTR[r13],xmm15
+       movdqu  XMMWORD PTR[16+r13],xmm0
+       lea     r13,QWORD PTR[32+r13]
+
+       movdqa  xmm6,XMMWORD PTR[32+rsp]
+       jmp     $L$xts_dec_done
+ALIGN  16
+$L$xts_dec_1::
+       pxor    xmm7,xmm15
+       lea     r12,QWORD PTR[16+r12]
+       movdqa  XMMWORD PTR[32+rbp],xmm7
+       lea     rcx,QWORD PTR[32+rbp]
+       lea     rdx,QWORD PTR[32+rbp]
+       lea     r8,QWORD PTR[r15]
+       call    asm_AES_decrypt
+
+       pxor    xmm15,XMMWORD PTR[32+rbp]
+
+
+
+
+
+       movdqu  XMMWORD PTR[r13],xmm15
+       lea     r13,QWORD PTR[16+r13]
+
+       movdqa  xmm6,XMMWORD PTR[16+rsp]
+
+$L$xts_dec_done::
+       and     ebx,15
+       jz      $L$xts_dec_ret
+
+       pxor    xmm14,xmm14
+       movdqa  xmm12,XMMWORD PTR[$L$xts_magic]
+       pcmpgtd xmm14,xmm6
+       pshufd  xmm13,xmm14,013h
+       movdqa  xmm5,xmm6
+       paddq   xmm6,xmm6
+       pand    xmm13,xmm12
+       movdqu  xmm15,XMMWORD PTR[r12]
+       pxor    xmm6,xmm13
+
+       lea     rcx,QWORD PTR[32+rbp]
+       pxor    xmm15,xmm6
+       lea     rdx,QWORD PTR[32+rbp]
+       movdqa  XMMWORD PTR[32+rbp],xmm15
+       lea     r8,QWORD PTR[r15]
+       call    asm_AES_decrypt
+
+       pxor    xmm6,XMMWORD PTR[32+rbp]
+       mov     rdx,r13
+       movdqu  XMMWORD PTR[r13],xmm6
+
+$L$xts_dec_steal::
+       movzx   eax,BYTE PTR[16+r12]
+       movzx   ecx,BYTE PTR[rdx]
+       lea     r12,QWORD PTR[1+r12]
+       mov     BYTE PTR[rdx],al
+       mov     BYTE PTR[16+rdx],cl
+       lea     rdx,QWORD PTR[1+rdx]
+       sub     ebx,1
+       jnz     $L$xts_dec_steal
+
+       movdqu  xmm15,XMMWORD PTR[r13]
+       lea     rcx,QWORD PTR[32+rbp]
+       pxor    xmm15,xmm5
+       lea     rdx,QWORD PTR[32+rbp]
+       movdqa  XMMWORD PTR[32+rbp],xmm15
+       lea     r8,QWORD PTR[r15]
+       call    asm_AES_decrypt
+
+       pxor    xmm5,XMMWORD PTR[32+rbp]
+       movdqu  XMMWORD PTR[r13],xmm5
+
+$L$xts_dec_ret::
+       lea     rax,QWORD PTR[rsp]
+       pxor    xmm0,xmm0
+$L$xts_dec_bzero::
+       movdqa  XMMWORD PTR[rax],xmm0
+       movdqa  XMMWORD PTR[16+rax],xmm0
+       lea     rax,QWORD PTR[32+rax]
+       cmp     rbp,rax
+       ja      $L$xts_dec_bzero
+
+       lea     rsp,QWORD PTR[rbp]
+       movaps  xmm6,XMMWORD PTR[64+rbp]
+       movaps  xmm7,XMMWORD PTR[80+rbp]
+       movaps  xmm8,XMMWORD PTR[96+rbp]
+       movaps  xmm9,XMMWORD PTR[112+rbp]
+       movaps  xmm10,XMMWORD PTR[128+rbp]
+       movaps  xmm11,XMMWORD PTR[144+rbp]
+       movaps  xmm12,XMMWORD PTR[160+rbp]
+       movaps  xmm13,XMMWORD PTR[176+rbp]
+       movaps  xmm14,XMMWORD PTR[192+rbp]
+       movaps  xmm15,XMMWORD PTR[208+rbp]
+       lea     rsp,QWORD PTR[160+rbp]
+       mov     r15,QWORD PTR[72+rsp]
+       mov     r14,QWORD PTR[80+rsp]
+       mov     r13,QWORD PTR[88+rsp]
+       mov     r12,QWORD PTR[96+rsp]
+       mov     rbx,QWORD PTR[104+rsp]
+       mov     rax,QWORD PTR[112+rsp]
+       lea     rsp,QWORD PTR[120+rsp]
+       mov     rbp,rax
+$L$xts_dec_epilogue::
+       DB      0F3h,0C3h               ;repret
+bsaes_xts_decrypt      ENDP
+
+ALIGN  64
+_bsaes_const::
+$L$M0ISR::
+       DQ      00a0e0206070b0f03h,00004080c0d010509h
+$L$ISRM0::
+       DQ      001040b0e0205080fh,00306090c00070a0dh
+$L$ISR::
+       DQ      00504070602010003h,00f0e0d0c080b0a09h
+$L$BS0::
+       DQ      05555555555555555h,05555555555555555h
+$L$BS1::
+       DQ      03333333333333333h,03333333333333333h
+$L$BS2::
+       DQ      00f0f0f0f0f0f0f0fh,00f0f0f0f0f0f0f0fh
+$L$SR::
+       DQ      00504070600030201h,00f0e0d0c0a09080bh
+$L$SRM0::
+       DQ      00304090e00050a0fh,001060b0c0207080dh
+$L$M0SR::
+       DQ      00a0e02060f03070bh,00004080c05090d01h
+$L$SWPUP::
+       DQ      00706050403020100h,00c0d0e0f0b0a0908h
+$L$SWPUPM0SR::
+       DQ      00a0d02060c03070bh,00004080f05090e01h
+$L$ADD1::
+       DQ      00000000000000000h,00000000100000000h
+$L$ADD2::
+       DQ      00000000000000000h,00000000200000000h
+$L$ADD3::
+       DQ      00000000000000000h,00000000300000000h
+$L$ADD4::
+       DQ      00000000000000000h,00000000400000000h
+$L$ADD5::
+       DQ      00000000000000000h,00000000500000000h
+$L$ADD6::
+       DQ      00000000000000000h,00000000600000000h
+$L$ADD7::
+       DQ      00000000000000000h,00000000700000000h
+$L$ADD8::
+       DQ      00000000000000000h,00000000800000000h
+$L$xts_magic::
+       DD      087h,0,1,0
+$L$masks::
+       DQ      00101010101010101h,00101010101010101h
+       DQ      00202020202020202h,00202020202020202h
+       DQ      00404040404040404h,00404040404040404h
+       DQ      00808080808080808h,00808080808080808h
+$L$M0::
+       DQ      002060a0e03070b0fh,00004080c0105090dh
+$L$63::
+       DQ      06363636363636363h,06363636363636363h
+DB     66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102
+DB     111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44
+DB     32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44
+DB     32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32
+DB     65,110,100,121,32,80,111,108,121,97,107,111,118,0
+ALIGN  64
+
+EXTERN __imp_RtlVirtualUnwind:NEAR
+
+ALIGN  16
+se_handler     PROC PRIVATE
+       push    rsi
+       push    rdi
+       push    rbx
+       push    rbp
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       pushfq
+       sub     rsp,64
+
+       mov     rax,QWORD PTR[120+r8]
+       mov     rbx,QWORD PTR[248+r8]
+
+       mov     rsi,QWORD PTR[8+r9]
+       mov     r11,QWORD PTR[56+r9]
+
+       mov     r10d,DWORD PTR[r11]
+       lea     r10,QWORD PTR[r10*1+rsi]
+       cmp     rbx,r10
+       jb      $L$in_prologue
+
+       mov     rax,QWORD PTR[152+r8]
+
+       mov     r10d,DWORD PTR[4+r11]
+       lea     r10,QWORD PTR[r10*1+rsi]
+       cmp     rbx,r10
+       jae     $L$in_prologue
+
+       mov     rax,QWORD PTR[160+r8]
+
+       lea     rsi,QWORD PTR[64+rax]
+       lea     rdi,QWORD PTR[512+r8]
+       mov     ecx,20
+       DD      0a548f3fch
+
+       lea     rax,QWORD PTR[160+rax]
+
+       mov     rbp,QWORD PTR[112+rax]
+       mov     rbx,QWORD PTR[104+rax]
+       mov     r12,QWORD PTR[96+rax]
+       mov     r13,QWORD PTR[88+rax]
+       mov     r14,QWORD PTR[80+rax]
+       mov     r15,QWORD PTR[72+rax]
+       lea     rax,QWORD PTR[120+rax]
+       mov     QWORD PTR[144+r8],rbx
+       mov     QWORD PTR[160+r8],rbp
+       mov     QWORD PTR[216+r8],r12
+       mov     QWORD PTR[224+r8],r13
+       mov     QWORD PTR[232+r8],r14
+       mov     QWORD PTR[240+r8],r15
+
+$L$in_prologue::
+       mov     QWORD PTR[152+r8],rax
+
+       mov     rdi,QWORD PTR[40+r9]
+       mov     rsi,r8
+       mov     ecx,154
+       DD      0a548f3fch
+
+
+       mov     rsi,r9
+       xor     rcx,rcx
+       mov     rdx,QWORD PTR[8+rsi]
+       mov     r8,QWORD PTR[rsi]
+       mov     r9,QWORD PTR[16+rsi]
+       mov     r10,QWORD PTR[40+rsi]
+       lea     r11,QWORD PTR[56+rsi]
+       lea     r12,QWORD PTR[24+rsi]
+       mov     QWORD PTR[32+rsp],r10
+       mov     QWORD PTR[40+rsp],r11
+       mov     QWORD PTR[48+rsp],r12
+       mov     QWORD PTR[56+rsp],rcx
+       call    QWORD PTR[__imp_RtlVirtualUnwind]
+
+       mov     eax,1
+       add     rsp,64
+       popfq
+       pop     r15
+       pop     r14
+       pop     r13
+       pop     r12
+       pop     rbp
+       pop     rbx
+       pop     rdi
+       pop     rsi
+       DB      0F3h,0C3h               ;repret
+se_handler     ENDP
+
+.text$ ENDS
+.pdata SEGMENT READONLY ALIGN(4)
+ALIGN  4
+       DD      imagerel $L$cbc_dec_prologue
+       DD      imagerel $L$cbc_dec_epilogue
+       DD      imagerel $L$cbc_dec_info
+
+       DD      imagerel $L$ctr_enc_prologue
+       DD      imagerel $L$ctr_enc_epilogue
+       DD      imagerel $L$ctr_enc_info
+
+       DD      imagerel $L$xts_enc_prologue
+       DD      imagerel $L$xts_enc_epilogue
+       DD      imagerel $L$xts_enc_info
+
+       DD      imagerel $L$xts_dec_prologue
+       DD      imagerel $L$xts_dec_epilogue
+       DD      imagerel $L$xts_dec_info
+
+.pdata ENDS
+.xdata SEGMENT READONLY ALIGN(8)
+ALIGN  8
+$L$cbc_dec_info::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$cbc_dec_body,imagerel $L$cbc_dec_epilogue
+
+$L$ctr_enc_info::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$ctr_enc_body,imagerel $L$ctr_enc_epilogue
+
+$L$xts_enc_info::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$xts_enc_body,imagerel $L$xts_enc_epilogue
+
+$L$xts_dec_info::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$xts_dec_body,imagerel $L$xts_dec_epilogue
+
+
+.xdata ENDS
+END
diff --git a/deps/openssl/asm/x64-win32-masm/aes/vpaes-x86_64.asm b/deps/openssl/asm/x64-win32-masm/aes/vpaes-x86_64.asm
new file mode 100644 (file)
index 0000000..b92bddd
--- /dev/null
@@ -0,0 +1,1161 @@
+OPTION DOTNAME
+.text$ SEGMENT ALIGN(64) 'CODE'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_encrypt_core    PROC PRIVATE
+       mov     r9,rdx
+       mov     r11,16
+       mov     eax,DWORD PTR[240+rdx]
+       movdqa  xmm1,xmm9
+       movdqa  xmm2,XMMWORD PTR[$L$k_ipt]
+       pandn   xmm1,xmm0
+       movdqu  xmm5,XMMWORD PTR[r9]
+       psrld   xmm1,4
+       pand    xmm0,xmm9
+DB     102,15,56,0,208
+       movdqa  xmm0,XMMWORD PTR[(($L$k_ipt+16))]
+DB     102,15,56,0,193
+       pxor    xmm2,xmm5
+       pxor    xmm0,xmm2
+       add     r9,16
+       lea     r10,QWORD PTR[$L$k_mc_backward]
+       jmp     $L$enc_entry
+
+ALIGN  16
+$L$enc_loop::
+
+       movdqa  xmm4,xmm13
+DB     102,15,56,0,226
+       pxor    xmm4,xmm5
+       movdqa  xmm0,xmm12
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+       movdqa  xmm5,xmm15
+DB     102,15,56,0,234
+       movdqa  xmm1,XMMWORD PTR[((-64))+r10*1+r11]
+       movdqa  xmm2,xmm14
+DB     102,15,56,0,211
+       pxor    xmm2,xmm5
+       movdqa  xmm4,XMMWORD PTR[r10*1+r11]
+       movdqa  xmm3,xmm0
+DB     102,15,56,0,193
+       add     r9,16
+       pxor    xmm0,xmm2
+DB     102,15,56,0,220
+       add     r11,16
+       pxor    xmm3,xmm0
+DB     102,15,56,0,193
+       and     r11,030h
+       pxor    xmm0,xmm3
+       sub     rax,1
+
+$L$enc_entry::
+
+       movdqa  xmm1,xmm9
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm9
+       movdqa  xmm5,xmm11
+DB     102,15,56,0,232
+       pxor    xmm0,xmm1
+       movdqa  xmm3,xmm10
+DB     102,15,56,0,217
+       pxor    xmm3,xmm5
+       movdqa  xmm4,xmm10
+DB     102,15,56,0,224
+       pxor    xmm4,xmm5
+       movdqa  xmm2,xmm10
+DB     102,15,56,0,211
+       pxor    xmm2,xmm0
+       movdqa  xmm3,xmm10
+       movdqu  xmm5,XMMWORD PTR[r9]
+DB     102,15,56,0,220
+       pxor    xmm3,xmm1
+       jnz     $L$enc_loop
+
+
+       movdqa  xmm4,XMMWORD PTR[((-96))+r10]
+       movdqa  xmm0,XMMWORD PTR[((-80))+r10]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm5
+DB     102,15,56,0,195
+       movdqa  xmm1,XMMWORD PTR[64+r10*1+r11]
+       pxor    xmm0,xmm4
+DB     102,15,56,0,193
+       DB      0F3h,0C3h               ;repret
+_vpaes_encrypt_core    ENDP
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_decrypt_core    PROC PRIVATE
+       mov     r9,rdx
+       mov     eax,DWORD PTR[240+rdx]
+       movdqa  xmm1,xmm9
+       movdqa  xmm2,XMMWORD PTR[$L$k_dipt]
+       pandn   xmm1,xmm0
+       mov     r11,rax
+       psrld   xmm1,4
+       movdqu  xmm5,XMMWORD PTR[r9]
+       shl     r11,4
+       pand    xmm0,xmm9
+DB     102,15,56,0,208
+       movdqa  xmm0,XMMWORD PTR[(($L$k_dipt+16))]
+       xor     r11,030h
+       lea     r10,QWORD PTR[$L$k_dsbd]
+DB     102,15,56,0,193
+       and     r11,030h
+       pxor    xmm2,xmm5
+       movdqa  xmm5,XMMWORD PTR[(($L$k_mc_forward+48))]
+       pxor    xmm0,xmm2
+       add     r9,16
+       add     r11,r10
+       jmp     $L$dec_entry
+
+ALIGN  16
+$L$dec_loop::
+
+
+
+       movdqa  xmm4,XMMWORD PTR[((-32))+r10]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR[((-16))+r10]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+       add     r9,16
+
+DB     102,15,56,0,197
+       movdqa  xmm4,XMMWORD PTR[r10]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR[16+r10]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+       sub     rax,1
+
+DB     102,15,56,0,197
+       movdqa  xmm4,XMMWORD PTR[32+r10]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR[48+r10]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+
+DB     102,15,56,0,197
+       movdqa  xmm4,XMMWORD PTR[64+r10]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR[80+r10]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+
+DB     102,15,58,15,237,12
+
+$L$dec_entry::
+
+       movdqa  xmm1,xmm9
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm9
+       movdqa  xmm2,xmm11
+DB     102,15,56,0,208
+       pxor    xmm0,xmm1
+       movdqa  xmm3,xmm10
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+       movdqa  xmm4,xmm10
+DB     102,15,56,0,224
+       pxor    xmm4,xmm2
+       movdqa  xmm2,xmm10
+DB     102,15,56,0,211
+       pxor    xmm2,xmm0
+       movdqa  xmm3,xmm10
+DB     102,15,56,0,220
+       pxor    xmm3,xmm1
+       movdqu  xmm0,XMMWORD PTR[r9]
+       jnz     $L$dec_loop
+
+
+       movdqa  xmm4,XMMWORD PTR[96+r10]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR[112+r10]
+       movdqa  xmm2,XMMWORD PTR[((-352))+r11]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+DB     102,15,56,0,194
+       DB      0F3h,0C3h               ;repret
+_vpaes_decrypt_core    ENDP
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_schedule_core   PROC PRIVATE
+
+
+
+
+
+       call    _vpaes_preheat
+
+       movdqa  xmm8,XMMWORD PTR[$L$k_rcon]
+       movdqu  xmm0,XMMWORD PTR[rdi]
+
+
+       movdqa  xmm3,xmm0
+       lea     r11,QWORD PTR[$L$k_ipt]
+       call    _vpaes_schedule_transform
+       movdqa  xmm7,xmm0
+
+       lea     r10,QWORD PTR[$L$k_sr]
+       test    rcx,rcx
+       jnz     $L$schedule_am_decrypting
+
+
+       movdqu  XMMWORD PTR[rdx],xmm0
+       jmp     $L$schedule_go
+
+$L$schedule_am_decrypting::
+
+       movdqa  xmm1,XMMWORD PTR[r10*1+r8]
+DB     102,15,56,0,217
+       movdqu  XMMWORD PTR[rdx],xmm3
+       xor     r8,030h
+
+$L$schedule_go::
+       cmp     esi,192
+       ja      $L$schedule_256
+       je      $L$schedule_192
+
+
+
+
+
+
+
+
+
+
+$L$schedule_128::
+       mov     esi,10
+
+$L$oop_schedule_128::
+       call    _vpaes_schedule_round
+       dec     rsi
+       jz      $L$schedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+       jmp     $L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+$L$schedule_192::
+       movdqu  xmm0,XMMWORD PTR[8+rdi]
+       call    _vpaes_schedule_transform
+
+       movdqa  xmm6,xmm0
+       pxor    xmm4,xmm4
+       movhlps xmm6,xmm4
+       mov     esi,4
+
+$L$oop_schedule_192::
+       call    _vpaes_schedule_round
+DB     102,15,58,15,198,8
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_192_smear
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_round
+       dec     rsi
+       jz      $L$schedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+       call    _vpaes_schedule_192_smear
+       jmp     $L$oop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+$L$schedule_256::
+       movdqu  xmm0,XMMWORD PTR[16+rdi]
+       call    _vpaes_schedule_transform
+
+       mov     esi,7
+
+$L$oop_schedule_256::
+       call    _vpaes_schedule_mangle
+
+       movdqa  xmm6,xmm0
+
+
+       call    _vpaes_schedule_round
+       dec     rsi
+       jz      $L$schedule_mangle_last
+       call    _vpaes_schedule_mangle
+
+
+
+       pshufd  xmm0,xmm0,0FFh
+       movdqa  xmm5,xmm7
+       movdqa  xmm7,xmm6
+       call    _vpaes_schedule_low_round
+       movdqa  xmm7,xmm5
+
+       jmp     $L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+$L$schedule_mangle_last::
+
+       lea     r11,QWORD PTR[$L$k_deskew]
+       test    rcx,rcx
+       jnz     $L$schedule_mangle_last_dec
+
+
+       movdqa  xmm1,XMMWORD PTR[r10*1+r8]
+DB     102,15,56,0,193
+       lea     r11,QWORD PTR[$L$k_opt]
+       add     rdx,32
+
+$L$schedule_mangle_last_dec::
+       add     rdx,-16
+       pxor    xmm0,XMMWORD PTR[$L$k_s63]
+       call    _vpaes_schedule_transform
+
+       movdqu  XMMWORD PTR[rdx],xmm0
+
+
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
+       DB      0F3h,0C3h               ;repret
+_vpaes_schedule_core   ENDP
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_schedule_192_smear      PROC PRIVATE
+       pshufd  xmm0,xmm6,080h
+       pxor    xmm6,xmm0
+       pshufd  xmm0,xmm7,0FEh
+       pxor    xmm6,xmm0
+       movdqa  xmm0,xmm6
+       pxor    xmm1,xmm1
+       movhlps xmm6,xmm1
+       DB      0F3h,0C3h               ;repret
+_vpaes_schedule_192_smear      ENDP
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_schedule_round  PROC PRIVATE
+
+       pxor    xmm1,xmm1
+DB     102,65,15,58,15,200,15
+DB     102,69,15,58,15,192,15
+       pxor    xmm7,xmm1
+
+
+       pshufd  xmm0,xmm0,0FFh
+DB     102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round::
+
+       movdqa  xmm1,xmm7
+       pslldq  xmm7,4
+       pxor    xmm7,xmm1
+       movdqa  xmm1,xmm7
+       pslldq  xmm7,8
+       pxor    xmm7,xmm1
+       pxor    xmm7,XMMWORD PTR[$L$k_s63]
+
+
+       movdqa  xmm1,xmm9
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm9
+       movdqa  xmm2,xmm11
+DB     102,15,56,0,208
+       pxor    xmm0,xmm1
+       movdqa  xmm3,xmm10
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+       movdqa  xmm4,xmm10
+DB     102,15,56,0,224
+       pxor    xmm4,xmm2
+       movdqa  xmm2,xmm10
+DB     102,15,56,0,211
+       pxor    xmm2,xmm0
+       movdqa  xmm3,xmm10
+DB     102,15,56,0,220
+       pxor    xmm3,xmm1
+       movdqa  xmm4,xmm13
+DB     102,15,56,0,226
+       movdqa  xmm0,xmm12
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+
+
+       pxor    xmm0,xmm7
+       movdqa  xmm7,xmm0
+       DB      0F3h,0C3h               ;repret
+_vpaes_schedule_round  ENDP
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_schedule_transform      PROC PRIVATE
+       movdqa  xmm1,xmm9
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm9
+       movdqa  xmm2,XMMWORD PTR[r11]
+DB     102,15,56,0,208
+       movdqa  xmm0,XMMWORD PTR[16+r11]
+DB     102,15,56,0,193
+       pxor    xmm0,xmm2
+       DB      0F3h,0C3h               ;repret
+_vpaes_schedule_transform      ENDP
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_schedule_mangle PROC PRIVATE
+       movdqa  xmm4,xmm0
+       movdqa  xmm5,XMMWORD PTR[$L$k_mc_forward]
+       test    rcx,rcx
+       jnz     $L$schedule_mangle_dec
+
+
+       add     rdx,16
+       pxor    xmm4,XMMWORD PTR[$L$k_s63]
+DB     102,15,56,0,229
+       movdqa  xmm3,xmm4
+DB     102,15,56,0,229
+       pxor    xmm3,xmm4
+DB     102,15,56,0,229
+       pxor    xmm3,xmm4
+
+       jmp     $L$schedule_mangle_both
+ALIGN  16
+$L$schedule_mangle_dec::
+
+       lea     r11,QWORD PTR[$L$k_dksd]
+       movdqa  xmm1,xmm9
+       pandn   xmm1,xmm4
+       psrld   xmm1,4
+       pand    xmm4,xmm9
+
+       movdqa  xmm2,XMMWORD PTR[r11]
+DB     102,15,56,0,212
+       movdqa  xmm3,XMMWORD PTR[16+r11]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+DB     102,15,56,0,221
+
+       movdqa  xmm2,XMMWORD PTR[32+r11]
+DB     102,15,56,0,212
+       pxor    xmm2,xmm3
+       movdqa  xmm3,XMMWORD PTR[48+r11]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+DB     102,15,56,0,221
+
+       movdqa  xmm2,XMMWORD PTR[64+r11]
+DB     102,15,56,0,212
+       pxor    xmm2,xmm3
+       movdqa  xmm3,XMMWORD PTR[80+r11]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+DB     102,15,56,0,221
+
+       movdqa  xmm2,XMMWORD PTR[96+r11]
+DB     102,15,56,0,212
+       pxor    xmm2,xmm3
+       movdqa  xmm3,XMMWORD PTR[112+r11]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+
+       add     rdx,-16
+
+$L$schedule_mangle_both::
+       movdqa  xmm1,XMMWORD PTR[r10*1+r8]
+DB     102,15,56,0,217
+       add     r8,-16
+       and     r8,030h
+       movdqu  XMMWORD PTR[rdx],xmm3
+       DB      0F3h,0C3h               ;repret
+_vpaes_schedule_mangle ENDP
+
+
+
+
+PUBLIC vpaes_set_encrypt_key
+
+ALIGN  16
+vpaes_set_encrypt_key  PROC PUBLIC
+       mov     QWORD PTR[8+rsp],rdi    ;WIN64 prologue
+       mov     QWORD PTR[16+rsp],rsi
+       mov     rax,rsp
+$L$SEH_begin_vpaes_set_encrypt_key::
+       mov     rdi,rcx
+       mov     rsi,rdx
+       mov     rdx,r8
+
+
+       lea     rsp,QWORD PTR[((-184))+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm6
+       movaps  XMMWORD PTR[32+rsp],xmm7
+       movaps  XMMWORD PTR[48+rsp],xmm8
+       movaps  XMMWORD PTR[64+rsp],xmm9
+       movaps  XMMWORD PTR[80+rsp],xmm10
+       movaps  XMMWORD PTR[96+rsp],xmm11
+       movaps  XMMWORD PTR[112+rsp],xmm12
+       movaps  XMMWORD PTR[128+rsp],xmm13
+       movaps  XMMWORD PTR[144+rsp],xmm14
+       movaps  XMMWORD PTR[160+rsp],xmm15
+$L$enc_key_body::
+       mov     eax,esi
+       shr     eax,5
+       add     eax,5
+       mov     DWORD PTR[240+rdx],eax
+
+       mov     ecx,0
+       mov     r8d,030h
+       call    _vpaes_schedule_core
+       movaps  xmm6,XMMWORD PTR[16+rsp]
+       movaps  xmm7,XMMWORD PTR[32+rsp]
+       movaps  xmm8,XMMWORD PTR[48+rsp]
+       movaps  xmm9,XMMWORD PTR[64+rsp]
+       movaps  xmm10,XMMWORD PTR[80+rsp]
+       movaps  xmm11,XMMWORD PTR[96+rsp]
+       movaps  xmm12,XMMWORD PTR[112+rsp]
+       movaps  xmm13,XMMWORD PTR[128+rsp]
+       movaps  xmm14,XMMWORD PTR[144+rsp]
+       movaps  xmm15,XMMWORD PTR[160+rsp]
+       lea     rsp,QWORD PTR[184+rsp]
+$L$enc_key_epilogue::
+       xor     eax,eax
+       mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
+       mov     rsi,QWORD PTR[16+rsp]
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_vpaes_set_encrypt_key::
+vpaes_set_encrypt_key  ENDP
+
+PUBLIC vpaes_set_decrypt_key
+
+ALIGN  16
+vpaes_set_decrypt_key  PROC PUBLIC
+       mov     QWORD PTR[8+rsp],rdi    ;WIN64 prologue
+       mov     QWORD PTR[16+rsp],rsi
+       mov     rax,rsp
+$L$SEH_begin_vpaes_set_decrypt_key::
+       mov     rdi,rcx
+       mov     rsi,rdx
+       mov     rdx,r8
+
+
+       lea     rsp,QWORD PTR[((-184))+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm6
+       movaps  XMMWORD PTR[32+rsp],xmm7
+       movaps  XMMWORD PTR[48+rsp],xmm8
+       movaps  XMMWORD PTR[64+rsp],xmm9
+       movaps  XMMWORD PTR[80+rsp],xmm10
+       movaps  XMMWORD PTR[96+rsp],xmm11
+       movaps  XMMWORD PTR[112+rsp],xmm12
+       movaps  XMMWORD PTR[128+rsp],xmm13
+       movaps  XMMWORD PTR[144+rsp],xmm14
+       movaps  XMMWORD PTR[160+rsp],xmm15
+$L$dec_key_body::
+       mov     eax,esi
+       shr     eax,5
+       add     eax,5
+       mov     DWORD PTR[240+rdx],eax
+       shl     eax,4
+       lea     rdx,QWORD PTR[16+rax*1+rdx]
+
+       mov     ecx,1
+       mov     r8d,esi
+       shr     r8d,1
+       and     r8d,32
+       xor     r8d,32
+       call    _vpaes_schedule_core
+       movaps  xmm6,XMMWORD PTR[16+rsp]
+       movaps  xmm7,XMMWORD PTR[32+rsp]
+       movaps  xmm8,XMMWORD PTR[48+rsp]
+       movaps  xmm9,XMMWORD PTR[64+rsp]
+       movaps  xmm10,XMMWORD PTR[80+rsp]
+       movaps  xmm11,XMMWORD PTR[96+rsp]
+       movaps  xmm12,XMMWORD PTR[112+rsp]
+       movaps  xmm13,XMMWORD PTR[128+rsp]
+       movaps  xmm14,XMMWORD PTR[144+rsp]
+       movaps  xmm15,XMMWORD PTR[160+rsp]
+       lea     rsp,QWORD PTR[184+rsp]
+$L$dec_key_epilogue::
+       xor     eax,eax
+       mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
+       mov     rsi,QWORD PTR[16+rsp]
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_vpaes_set_decrypt_key::
+vpaes_set_decrypt_key  ENDP
+
+PUBLIC vpaes_encrypt
+
+ALIGN  16
+vpaes_encrypt  PROC PUBLIC
+       mov     QWORD PTR[8+rsp],rdi    ;WIN64 prologue
+       mov     QWORD PTR[16+rsp],rsi
+       mov     rax,rsp
+$L$SEH_begin_vpaes_encrypt::
+       mov     rdi,rcx
+       mov     rsi,rdx
+       mov     rdx,r8
+
+
+       lea     rsp,QWORD PTR[((-184))+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm6
+       movaps  XMMWORD PTR[32+rsp],xmm7
+       movaps  XMMWORD PTR[48+rsp],xmm8
+       movaps  XMMWORD PTR[64+rsp],xmm9
+       movaps  XMMWORD PTR[80+rsp],xmm10
+       movaps  XMMWORD PTR[96+rsp],xmm11
+       movaps  XMMWORD PTR[112+rsp],xmm12
+       movaps  XMMWORD PTR[128+rsp],xmm13
+       movaps  XMMWORD PTR[144+rsp],xmm14
+       movaps  XMMWORD PTR[160+rsp],xmm15
+$L$enc_body::
+       movdqu  xmm0,XMMWORD PTR[rdi]
+       call    _vpaes_preheat
+       call    _vpaes_encrypt_core
+       movdqu  XMMWORD PTR[rsi],xmm0
+       movaps  xmm6,XMMWORD PTR[16+rsp]
+       movaps  xmm7,XMMWORD PTR[32+rsp]
+       movaps  xmm8,XMMWORD PTR[48+rsp]
+       movaps  xmm9,XMMWORD PTR[64+rsp]
+       movaps  xmm10,XMMWORD PTR[80+rsp]
+       movaps  xmm11,XMMWORD PTR[96+rsp]
+       movaps  xmm12,XMMWORD PTR[112+rsp]
+       movaps  xmm13,XMMWORD PTR[128+rsp]
+       movaps  xmm14,XMMWORD PTR[144+rsp]
+       movaps  xmm15,XMMWORD PTR[160+rsp]
+       lea     rsp,QWORD PTR[184+rsp]
+$L$enc_epilogue::
+       mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
+       mov     rsi,QWORD PTR[16+rsp]
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_vpaes_encrypt::
+vpaes_encrypt  ENDP
+
+PUBLIC vpaes_decrypt
+
+ALIGN  16
+vpaes_decrypt  PROC PUBLIC
+       mov     QWORD PTR[8+rsp],rdi    ;WIN64 prologue
+       mov     QWORD PTR[16+rsp],rsi
+       mov     rax,rsp
+$L$SEH_begin_vpaes_decrypt::
+       mov     rdi,rcx
+       mov     rsi,rdx
+       mov     rdx,r8
+
+
+       lea     rsp,QWORD PTR[((-184))+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm6
+       movaps  XMMWORD PTR[32+rsp],xmm7
+       movaps  XMMWORD PTR[48+rsp],xmm8
+       movaps  XMMWORD PTR[64+rsp],xmm9
+       movaps  XMMWORD PTR[80+rsp],xmm10
+       movaps  XMMWORD PTR[96+rsp],xmm11
+       movaps  XMMWORD PTR[112+rsp],xmm12
+       movaps  XMMWORD PTR[128+rsp],xmm13
+       movaps  XMMWORD PTR[144+rsp],xmm14
+       movaps  XMMWORD PTR[160+rsp],xmm15
+$L$dec_body::
+       movdqu  xmm0,XMMWORD PTR[rdi]
+       call    _vpaes_preheat
+       call    _vpaes_decrypt_core
+       movdqu  XMMWORD PTR[rsi],xmm0
+       movaps  xmm6,XMMWORD PTR[16+rsp]
+       movaps  xmm7,XMMWORD PTR[32+rsp]
+       movaps  xmm8,XMMWORD PTR[48+rsp]
+       movaps  xmm9,XMMWORD PTR[64+rsp]
+       movaps  xmm10,XMMWORD PTR[80+rsp]
+       movaps  xmm11,XMMWORD PTR[96+rsp]
+       movaps  xmm12,XMMWORD PTR[112+rsp]
+       movaps  xmm13,XMMWORD PTR[128+rsp]
+       movaps  xmm14,XMMWORD PTR[144+rsp]
+       movaps  xmm15,XMMWORD PTR[160+rsp]
+       lea     rsp,QWORD PTR[184+rsp]
+$L$dec_epilogue::
+       mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
+       mov     rsi,QWORD PTR[16+rsp]
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_vpaes_decrypt::
+vpaes_decrypt  ENDP
+PUBLIC vpaes_cbc_encrypt
+
+ALIGN  16
+vpaes_cbc_encrypt      PROC PUBLIC
+       mov     QWORD PTR[8+rsp],rdi    ;WIN64 prologue
+       mov     QWORD PTR[16+rsp],rsi
+       mov     rax,rsp
+$L$SEH_begin_vpaes_cbc_encrypt::
+       mov     rdi,rcx
+       mov     rsi,rdx
+       mov     rdx,r8
+       mov     rcx,r9
+       mov     r8,QWORD PTR[40+rsp]
+       mov     r9,QWORD PTR[48+rsp]
+
+
+       xchg    rdx,rcx
+       sub     rcx,16
+       jc      $L$cbc_abort
+       lea     rsp,QWORD PTR[((-184))+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm6
+       movaps  XMMWORD PTR[32+rsp],xmm7
+       movaps  XMMWORD PTR[48+rsp],xmm8
+       movaps  XMMWORD PTR[64+rsp],xmm9
+       movaps  XMMWORD PTR[80+rsp],xmm10
+       movaps  XMMWORD PTR[96+rsp],xmm11
+       movaps  XMMWORD PTR[112+rsp],xmm12
+       movaps  XMMWORD PTR[128+rsp],xmm13
+       movaps  XMMWORD PTR[144+rsp],xmm14
+       movaps  XMMWORD PTR[160+rsp],xmm15
+$L$cbc_body::
+       movdqu  xmm6,XMMWORD PTR[r8]
+       sub     rsi,rdi
+       call    _vpaes_preheat
+       cmp     r9d,0
+       je      $L$cbc_dec_loop
+       jmp     $L$cbc_enc_loop
+ALIGN  16
+$L$cbc_enc_loop::
+       movdqu  xmm0,XMMWORD PTR[rdi]
+       pxor    xmm0,xmm6
+       call    _vpaes_encrypt_core
+       movdqa  xmm6,xmm0
+       movdqu  XMMWORD PTR[rdi*1+rsi],xmm0
+       lea     rdi,QWORD PTR[16+rdi]
+       sub     rcx,16
+       jnc     $L$cbc_enc_loop
+       jmp     $L$cbc_done
+ALIGN  16
+$L$cbc_dec_loop::
+       movdqu  xmm0,XMMWORD PTR[rdi]
+       movdqa  xmm7,xmm0
+       call    _vpaes_decrypt_core
+       pxor    xmm0,xmm6
+       movdqa  xmm6,xmm7
+       movdqu  XMMWORD PTR[rdi*1+rsi],xmm0
+       lea     rdi,QWORD PTR[16+rdi]
+       sub     rcx,16
+       jnc     $L$cbc_dec_loop
+$L$cbc_done::
+       movdqu  XMMWORD PTR[r8],xmm6
+       movaps  xmm6,XMMWORD PTR[16+rsp]
+       movaps  xmm7,XMMWORD PTR[32+rsp]
+       movaps  xmm8,XMMWORD PTR[48+rsp]
+       movaps  xmm9,XMMWORD PTR[64+rsp]
+       movaps  xmm10,XMMWORD PTR[80+rsp]
+       movaps  xmm11,XMMWORD PTR[96+rsp]
+       movaps  xmm12,XMMWORD PTR[112+rsp]
+       movaps  xmm13,XMMWORD PTR[128+rsp]
+       movaps  xmm14,XMMWORD PTR[144+rsp]
+       movaps  xmm15,XMMWORD PTR[160+rsp]
+       lea     rsp,QWORD PTR[184+rsp]
+$L$cbc_epilogue::
+$L$cbc_abort::
+       mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
+       mov     rsi,QWORD PTR[16+rsp]
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_vpaes_cbc_encrypt::
+vpaes_cbc_encrypt      ENDP
+
+
+
+
+
+
+
+ALIGN  16
+_vpaes_preheat PROC PRIVATE
+       lea     r10,QWORD PTR[$L$k_s0F]
+       movdqa  xmm10,XMMWORD PTR[((-32))+r10]
+       movdqa  xmm11,XMMWORD PTR[((-16))+r10]
+       movdqa  xmm9,XMMWORD PTR[r10]
+       movdqa  xmm13,XMMWORD PTR[48+r10]
+       movdqa  xmm12,XMMWORD PTR[64+r10]
+       movdqa  xmm15,XMMWORD PTR[80+r10]
+       movdqa  xmm14,XMMWORD PTR[96+r10]
+       DB      0F3h,0C3h               ;repret
+_vpaes_preheat ENDP
+
+
+
+
+
+
+ALIGN  64
+_vpaes_consts::
+$L$k_inv::
+       DQ      00E05060F0D080180h,0040703090A0B0C02h
+       DQ      001040A060F0B0780h,0030D0E0C02050809h
+
+$L$k_s0F::
+       DQ      00F0F0F0F0F0F0F0Fh,00F0F0F0F0F0F0F0Fh
+
+$L$k_ipt::
+       DQ      0C2B2E8985A2A7000h,0CABAE09052227808h
+       DQ      04C01307D317C4D00h,0CD80B1FCB0FDCC81h
+
+$L$k_sb1::
+       DQ      0B19BE18FCB503E00h,0A5DF7A6E142AF544h
+       DQ      03618D415FAE22300h,03BF7CCC10D2ED9EFh
+$L$k_sb2::
+       DQ      0E27A93C60B712400h,05EB7E955BC982FCDh
+       DQ      069EB88400AE12900h,0C2A163C8AB82234Ah
+$L$k_sbo::
+       DQ      0D0D26D176FBDC700h,015AABF7AC502A878h
+       DQ      0CFE474A55FBB6A00h,08E1E90D1412B35FAh
+
+$L$k_mc_forward::
+       DQ      00407060500030201h,00C0F0E0D080B0A09h
+       DQ      0080B0A0904070605h,0000302010C0F0E0Dh
+       DQ      00C0F0E0D080B0A09h,00407060500030201h
+       DQ      0000302010C0F0E0Dh,0080B0A0904070605h
+
+$L$k_mc_backward::
+       DQ      00605040702010003h,00E0D0C0F0A09080Bh
+       DQ      0020100030E0D0C0Fh,00A09080B06050407h
+       DQ      00E0D0C0F0A09080Bh,00605040702010003h
+       DQ      00A09080B06050407h,0020100030E0D0C0Fh
+
+$L$k_sr::
+       DQ      00706050403020100h,00F0E0D0C0B0A0908h
+       DQ      0030E09040F0A0500h,00B06010C07020D08h
+       DQ      00F060D040B020900h,0070E050C030A0108h
+       DQ      00B0E0104070A0D00h,00306090C0F020508h
+
+$L$k_rcon::
+       DQ      01F8391B9AF9DEEB6h,0702A98084D7C7D81h
+
+$L$k_s63::
+       DQ      05B5B5B5B5B5B5B5Bh,05B5B5B5B5B5B5B5Bh
+
+$L$k_opt::
+       DQ      0FF9F4929D6B66000h,0F7974121DEBE6808h
+       DQ      001EDBD5150BCEC00h,0E10D5DB1B05C0CE0h
+
+$L$k_deskew::
+       DQ      007E4A34047A4E300h,01DFEB95A5DBEF91Ah
+       DQ      05F36B5DC83EA6900h,02841C2ABF49D1E77h
+
+
+
+
+
+$L$k_dksd::
+       DQ      0FEB91A5DA3E44700h,00740E3A45A1DBEF9h
+       DQ      041C277F4B5368300h,05FDC69EAAB289D1Eh
+$L$k_dksb::
+       DQ      09A4FCA1F8550D500h,003D653861CC94C99h
+       DQ      0115BEDA7B6FC4A00h,0D993256F7E3482C8h
+$L$k_dkse::
+       DQ      0D5031CCA1FC9D600h,053859A4C994F5086h
+       DQ      0A23196054FDC7BE8h,0CD5EF96A20B31487h
+$L$k_dks9::
+       DQ      0B6116FC87ED9A700h,04AED933482255BFCh
+       DQ      04576516227143300h,08BB89FACE9DAFDCEh
+
+
+
+
+
+$L$k_dipt::
+       DQ      00F505B040B545F00h,0154A411E114E451Ah
+       DQ      086E383E660056500h,012771772F491F194h
+
+$L$k_dsb9::
+       DQ      0851C03539A86D600h,0CAD51F504F994CC9h
+       DQ      0C03B1789ECD74900h,0725E2C9EB2FBA565h
+$L$k_dsbd::
+       DQ      07D57CCDFE6B1A200h,0F56E9B13882A4439h
+       DQ      03CE2FAF724C6CB00h,02931180D15DEEFD3h
+$L$k_dsbb::
+       DQ      0D022649296B44200h,0602646F6B0F2D404h
+       DQ      0C19498A6CD596700h,0F3FF0C3E3255AA6Bh
+$L$k_dsbe::
+       DQ      046F2929626D4D000h,02242600464B4F6B0h
+       DQ      00C55A6CDFFAAC100h,09467F36B98593E32h
+$L$k_dsbo::
+       DQ      01387EA537EF94000h,0C7AA6DB9D4943E2Dh
+       DQ      012D7560F93441D00h,0CA4B8159D8C58E9Ch
+DB     86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111
+DB     110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52
+DB     47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109
+DB     98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85
+DB     110,105,118,101,114,115,105,116,121,41,0
+ALIGN  64
+
+EXTERN __imp_RtlVirtualUnwind:NEAR
+
+ALIGN  16
+se_handler     PROC PRIVATE
+       push    rsi
+       push    rdi
+       push    rbx
+       push    rbp
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       pushfq
+       sub     rsp,64
+
+       mov     rax,QWORD PTR[120+r8]
+       mov     rbx,QWORD PTR[248+r8]
+
+       mov     rsi,QWORD PTR[8+r9]
+       mov     r11,QWORD PTR[56+r9]
+
+       mov     r10d,DWORD PTR[r11]
+       lea     r10,QWORD PTR[r10*1+rsi]
+       cmp     rbx,r10
+       jb      $L$in_prologue
+
+       mov     rax,QWORD PTR[152+r8]
+
+       mov     r10d,DWORD PTR[4+r11]
+       lea     r10,QWORD PTR[r10*1+rsi]
+       cmp     rbx,r10
+       jae     $L$in_prologue
+
+       lea     rsi,QWORD PTR[16+rax]
+       lea     rdi,QWORD PTR[512+r8]
+       mov     ecx,20
+       DD      0a548f3fch
+
+       lea     rax,QWORD PTR[184+rax]
+
+$L$in_prologue::
+       mov     rdi,QWORD PTR[8+rax]
+       mov     rsi,QWORD PTR[16+rax]
+       mov     QWORD PTR[152+r8],rax
+       mov     QWORD PTR[168+r8],rsi
+       mov     QWORD PTR[176+r8],rdi
+
+       mov     rdi,QWORD PTR[40+r9]
+       mov     rsi,r8
+       mov     ecx,154
+       DD      0a548f3fch
+
+
+       mov     rsi,r9
+       xor     rcx,rcx
+       mov     rdx,QWORD PTR[8+rsi]
+       mov     r8,QWORD PTR[rsi]
+       mov     r9,QWORD PTR[16+rsi]
+       mov     r10,QWORD PTR[40+rsi]
+       lea     r11,QWORD PTR[56+rsi]
+       lea     r12,QWORD PTR[24+rsi]
+       mov     QWORD PTR[32+rsp],r10
+       mov     QWORD PTR[40+rsp],r11
+       mov     QWORD PTR[48+rsp],r12
+       mov     QWORD PTR[56+rsp],rcx
+       call    QWORD PTR[__imp_RtlVirtualUnwind]
+
+       mov     eax,1
+       add     rsp,64
+       popfq
+       pop     r15
+       pop     r14
+       pop     r13
+       pop     r12
+       pop     rbp
+       pop     rbx
+       pop     rdi
+       pop     rsi
+       DB      0F3h,0C3h               ;repret
+se_handler     ENDP
+
+.text$ ENDS
+.pdata SEGMENT READONLY ALIGN(4)
+ALIGN  4
+       DD      imagerel $L$SEH_begin_vpaes_set_encrypt_key
+       DD      imagerel $L$SEH_end_vpaes_set_encrypt_key
+       DD      imagerel $L$SEH_info_vpaes_set_encrypt_key
+
+       DD      imagerel $L$SEH_begin_vpaes_set_decrypt_key
+       DD      imagerel $L$SEH_end_vpaes_set_decrypt_key
+       DD      imagerel $L$SEH_info_vpaes_set_decrypt_key
+
+       DD      imagerel $L$SEH_begin_vpaes_encrypt
+       DD      imagerel $L$SEH_end_vpaes_encrypt
+       DD      imagerel $L$SEH_info_vpaes_encrypt
+
+       DD      imagerel $L$SEH_begin_vpaes_decrypt
+       DD      imagerel $L$SEH_end_vpaes_decrypt
+       DD      imagerel $L$SEH_info_vpaes_decrypt
+
+       DD      imagerel $L$SEH_begin_vpaes_cbc_encrypt
+       DD      imagerel $L$SEH_end_vpaes_cbc_encrypt
+       DD      imagerel $L$SEH_info_vpaes_cbc_encrypt
+
+.pdata ENDS
+.xdata SEGMENT READONLY ALIGN(8)
+ALIGN  8
+$L$SEH_info_vpaes_set_encrypt_key::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$enc_key_body,imagerel $L$enc_key_epilogue
+
+$L$SEH_info_vpaes_set_decrypt_key::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$dec_key_body,imagerel $L$dec_key_epilogue
+
+$L$SEH_info_vpaes_encrypt::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$enc_body,imagerel $L$enc_epilogue
+
+$L$SEH_info_vpaes_decrypt::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$dec_body,imagerel $L$dec_epilogue
+
+$L$SEH_info_vpaes_cbc_encrypt::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$cbc_body,imagerel $L$cbc_epilogue
+
+
+.xdata ENDS
+END
diff --git a/deps/openssl/asm/x64-win32-masm/aes/vpaesni-x86_64.asm b/deps/openssl/asm/x64-win32-masm/aes/vpaesni-x86_64.asm
new file mode 100644
index 0000000..e69de29
diff --git a/deps/openssl/asm/x64-win32-masm/modes/ghash-x86_64.asm b/deps/openssl/asm/x64-win32-masm/modes/ghash-x86_64.asm
new file mode 100644 (file)
index 0000000..01fe307
--- /dev/null
@@ -0,0 +1,1202 @@
+OPTION DOTNAME
+.text$ SEGMENT ALIGN(64) 'CODE'
+
+PUBLIC gcm_gmult_4bit
+
+ALIGN  16
+gcm_gmult_4bit PROC PUBLIC
+       mov     QWORD PTR[8+rsp],rdi    ;WIN64 prologue
+       mov     QWORD PTR[16+rsp],rsi
+       mov     rax,rsp
+$L$SEH_begin_gcm_gmult_4bit::
+       mov     rdi,rcx
+       mov     rsi,rdx
+
+
+       push    rbx
+       push    rbp
+       push    r12
+$L$gmult_prologue::
+
+       movzx   r8,BYTE PTR[15+rdi]
+       lea     r11,QWORD PTR[$L$rem_4bit]
+       xor     rax,rax
+       xor     rbx,rbx
+       mov     al,r8b
+       mov     bl,r8b
+       shl     al,4
+       mov     rcx,14
+       mov     r8,QWORD PTR[8+rax*1+rsi]
+       mov     r9,QWORD PTR[rax*1+rsi]
+       and     bl,0f0h
+       mov     rdx,r8
+       jmp     $L$oop1
+
+ALIGN  16
+$L$oop1::
+       shr     r8,4
+       and     rdx,0fh
+       mov     r10,r9
+       mov     al,BYTE PTR[rcx*1+rdi]
+       shr     r9,4
+       xor     r8,QWORD PTR[8+rbx*1+rsi]
+       shl     r10,60
+       xor     r9,QWORD PTR[rbx*1+rsi]
+       mov     bl,al
+       xor     r9,QWORD PTR[rdx*8+r11]
+       mov     rdx,r8
+       shl     al,4
+       xor     r8,r10
+       dec     rcx
+       js      $L$break1
+
+       shr     r8,4
+       and     rdx,0fh
+       mov     r10,r9
+       shr     r9,4
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       shl     r10,60
+       xor     r9,QWORD PTR[rax*1+rsi]
+       and     bl,0f0h
+       xor     r9,QWORD PTR[rdx*8+r11]
+       mov     rdx,r8
+       xor     r8,r10
+       jmp     $L$oop1
+
+ALIGN  16
+$L$break1::
+       shr     r8,4
+       and     rdx,0fh
+       mov     r10,r9
+       shr     r9,4
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       shl     r10,60
+       xor     r9,QWORD PTR[rax*1+rsi]
+       and     bl,0f0h
+       xor     r9,QWORD PTR[rdx*8+r11]
+       mov     rdx,r8
+       xor     r8,r10
+
+       shr     r8,4
+       and     rdx,0fh
+       mov     r10,r9
+       shr     r9,4
+       xor     r8,QWORD PTR[8+rbx*1+rsi]
+       shl     r10,60
+       xor     r9,QWORD PTR[rbx*1+rsi]
+       xor     r8,r10
+       xor     r9,QWORD PTR[rdx*8+r11]
+
+       bswap   r8
+       bswap   r9
+       mov     QWORD PTR[8+rdi],r8
+       mov     QWORD PTR[rdi],r9
+
+       mov     rbx,QWORD PTR[16+rsp]
+       lea     rsp,QWORD PTR[24+rsp]
+$L$gmult_epilogue::
+       mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
+       mov     rsi,QWORD PTR[16+rsp]
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_gcm_gmult_4bit::
+gcm_gmult_4bit ENDP
+PUBLIC gcm_ghash_4bit
+
+ALIGN  16
+gcm_ghash_4bit PROC PUBLIC
+       mov     QWORD PTR[8+rsp],rdi    ;WIN64 prologue
+       mov     QWORD PTR[16+rsp],rsi
+       mov     rax,rsp
+$L$SEH_begin_gcm_ghash_4bit::
+       mov     rdi,rcx
+       mov     rsi,rdx
+       mov     rdx,r8
+       mov     rcx,r9
+
+
+       push    rbx
+       push    rbp
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       sub     rsp,280
+$L$ghash_prologue::
+       mov     r14,rdx
+       mov     r15,rcx
+       sub     rsi,-128
+       lea     rbp,QWORD PTR[((16+128))+rsp]
+       xor     edx,edx
+       mov     r8,QWORD PTR[((0+0-128))+rsi]
+       mov     rax,QWORD PTR[((0+8-128))+rsi]
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     r9,QWORD PTR[((16+0-128))+rsi]
+       shl     dl,4
+       mov     rbx,QWORD PTR[((16+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[rbp],r8
+       mov     r8,QWORD PTR[((32+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((0-128))+rbp],rax
+       mov     rax,QWORD PTR[((32+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[1+rsp],dl
+       or      rbx,r10
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     QWORD PTR[8+rbp],r9
+       mov     r9,QWORD PTR[((48+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((8-128))+rbp],rbx
+       mov     rbx,QWORD PTR[((48+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[2+rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[16+rbp],r8
+       mov     r8,QWORD PTR[((64+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((16-128))+rbp],rax
+       mov     rax,QWORD PTR[((64+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[3+rsp],dl
+       or      rbx,r10
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     QWORD PTR[24+rbp],r9
+       mov     r9,QWORD PTR[((80+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((24-128))+rbp],rbx
+       mov     rbx,QWORD PTR[((80+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[4+rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[32+rbp],r8
+       mov     r8,QWORD PTR[((96+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((32-128))+rbp],rax
+       mov     rax,QWORD PTR[((96+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[5+rsp],dl
+       or      rbx,r10
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     QWORD PTR[40+rbp],r9
+       mov     r9,QWORD PTR[((112+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((40-128))+rbp],rbx
+       mov     rbx,QWORD PTR[((112+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[6+rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[48+rbp],r8
+       mov     r8,QWORD PTR[((128+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((48-128))+rbp],rax
+       mov     rax,QWORD PTR[((128+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[7+rsp],dl
+       or      rbx,r10
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     QWORD PTR[56+rbp],r9
+       mov     r9,QWORD PTR[((144+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((56-128))+rbp],rbx
+       mov     rbx,QWORD PTR[((144+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[8+rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[64+rbp],r8
+       mov     r8,QWORD PTR[((160+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((64-128))+rbp],rax
+       mov     rax,QWORD PTR[((160+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[9+rsp],dl
+       or      rbx,r10
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     QWORD PTR[72+rbp],r9
+       mov     r9,QWORD PTR[((176+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((72-128))+rbp],rbx
+       mov     rbx,QWORD PTR[((176+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[10+rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[80+rbp],r8
+       mov     r8,QWORD PTR[((192+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((80-128))+rbp],rax
+       mov     rax,QWORD PTR[((192+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[11+rsp],dl
+       or      rbx,r10
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     QWORD PTR[88+rbp],r9
+       mov     r9,QWORD PTR[((208+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((88-128))+rbp],rbx
+       mov     rbx,QWORD PTR[((208+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[12+rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[96+rbp],r8
+       mov     r8,QWORD PTR[((224+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((96-128))+rbp],rax
+       mov     rax,QWORD PTR[((224+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[13+rsp],dl
+       or      rbx,r10
+       mov     dl,al
+       shr     rax,4
+       mov     r10,r8
+       shr     r8,4
+       mov     QWORD PTR[104+rbp],r9
+       mov     r9,QWORD PTR[((240+0-128))+rsi]
+       shl     dl,4
+       mov     QWORD PTR[((104-128))+rbp],rbx
+       mov     rbx,QWORD PTR[((240+8-128))+rsi]
+       shl     r10,60
+       mov     BYTE PTR[14+rsp],dl
+       or      rax,r10
+       mov     dl,bl
+       shr     rbx,4
+       mov     r10,r9
+       shr     r9,4
+       mov     QWORD PTR[112+rbp],r8
+       shl     dl,4
+       mov     QWORD PTR[((112-128))+rbp],rax
+       shl     r10,60
+       mov     BYTE PTR[15+rsp],dl
+       or      rbx,r10
+       mov     QWORD PTR[120+rbp],r9
+       mov     QWORD PTR[((120-128))+rbp],rbx
+       add     rsi,-128
+       mov     r8,QWORD PTR[8+rdi]
+       mov     r9,QWORD PTR[rdi]
+       add     r15,r14
+       lea     r11,QWORD PTR[$L$rem_8bit]
+       jmp     $L$outer_loop
+ALIGN  16
+$L$outer_loop::
+       xor     r9,QWORD PTR[r14]
+       mov     rdx,QWORD PTR[8+r14]
+       lea     r14,QWORD PTR[16+r14]
+       xor     rdx,r8
+       mov     QWORD PTR[rdi],r9
+       mov     QWORD PTR[8+rdi],rdx
+       shr     rdx,32
+       xor     rax,rax
+       rol     edx,8
+       mov     al,dl
+       movzx   ebx,dl
+       shl     al,4
+       shr     ebx,4
+       rol     edx,8
+       mov     r8,QWORD PTR[8+rax*1+rsi]
+       mov     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       shr     ecx,4
+       xor     r12,r8
+       mov     r10,r9
+       shr     r8,8
+       movzx   r12,r12b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r12,WORD PTR[r12*2+r11]
+       movzx   ebx,dl
+       shl     al,4
+       movzx   r13,BYTE PTR[rcx*1+rsp]
+       shr     ebx,4
+       shl     r12,48
+       xor     r13,r8
+       mov     r10,r9
+       xor     r9,r12
+       shr     r8,8
+       movzx   r13,r13b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rcx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rcx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r13,WORD PTR[r13*2+r11]
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       shr     ecx,4
+       shl     r13,48
+       xor     r12,r8
+       mov     r10,r9
+       xor     r9,r13
+       shr     r8,8
+       movzx   r12,r12b
+       mov     edx,DWORD PTR[8+rdi]
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r12,WORD PTR[r12*2+r11]
+       movzx   ebx,dl
+       shl     al,4
+       movzx   r13,BYTE PTR[rcx*1+rsp]
+       shr     ebx,4
+       shl     r12,48
+       xor     r13,r8
+       mov     r10,r9
+       xor     r9,r12
+       shr     r8,8
+       movzx   r13,r13b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rcx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rcx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r13,WORD PTR[r13*2+r11]
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       shr     ecx,4
+       shl     r13,48
+       xor     r12,r8
+       mov     r10,r9
+       xor     r9,r13
+       shr     r8,8
+       movzx   r12,r12b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r12,WORD PTR[r12*2+r11]
+       movzx   ebx,dl
+       shl     al,4
+       movzx   r13,BYTE PTR[rcx*1+rsp]
+       shr     ebx,4
+       shl     r12,48
+       xor     r13,r8
+       mov     r10,r9
+       xor     r9,r12
+       shr     r8,8
+       movzx   r13,r13b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rcx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rcx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r13,WORD PTR[r13*2+r11]
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       shr     ecx,4
+       shl     r13,48
+       xor     r12,r8
+       mov     r10,r9
+       xor     r9,r13
+       shr     r8,8
+       movzx   r12,r12b
+       mov     edx,DWORD PTR[4+rdi]
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r12,WORD PTR[r12*2+r11]
+       movzx   ebx,dl
+       shl     al,4
+       movzx   r13,BYTE PTR[rcx*1+rsp]
+       shr     ebx,4
+       shl     r12,48
+       xor     r13,r8
+       mov     r10,r9
+       xor     r9,r12
+       shr     r8,8
+       movzx   r13,r13b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rcx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rcx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r13,WORD PTR[r13*2+r11]
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       shr     ecx,4
+       shl     r13,48
+       xor     r12,r8
+       mov     r10,r9
+       xor     r9,r13
+       shr     r8,8
+       movzx   r12,r12b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r12,WORD PTR[r12*2+r11]
+       movzx   ebx,dl
+       shl     al,4
+       movzx   r13,BYTE PTR[rcx*1+rsp]
+       shr     ebx,4
+       shl     r12,48
+       xor     r13,r8
+       mov     r10,r9
+       xor     r9,r12
+       shr     r8,8
+       movzx   r13,r13b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rcx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rcx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r13,WORD PTR[r13*2+r11]
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       shr     ecx,4
+       shl     r13,48
+       xor     r12,r8
+       mov     r10,r9
+       xor     r9,r13
+       shr     r8,8
+       movzx   r12,r12b
+       mov     edx,DWORD PTR[rdi]
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r12,WORD PTR[r12*2+r11]
+       movzx   ebx,dl
+       shl     al,4
+       movzx   r13,BYTE PTR[rcx*1+rsp]
+       shr     ebx,4
+       shl     r12,48
+       xor     r13,r8
+       mov     r10,r9
+       xor     r9,r12
+       shr     r8,8
+       movzx   r13,r13b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rcx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rcx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r13,WORD PTR[r13*2+r11]
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       shr     ecx,4
+       shl     r13,48
+       xor     r12,r8
+       mov     r10,r9
+       xor     r9,r13
+       shr     r8,8
+       movzx   r12,r12b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r12,WORD PTR[r12*2+r11]
+       movzx   ebx,dl
+       shl     al,4
+       movzx   r13,BYTE PTR[rcx*1+rsp]
+       shr     ebx,4
+       shl     r12,48
+       xor     r13,r8
+       mov     r10,r9
+       xor     r9,r12
+       shr     r8,8
+       movzx   r13,r13b
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rcx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rcx*8+rbp]
+       rol     edx,8
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       mov     al,dl
+       xor     r8,r10
+       movzx   r13,WORD PTR[r13*2+r11]
+       movzx   ecx,dl
+       shl     al,4
+       movzx   r12,BYTE PTR[rbx*1+rsp]
+       and     ecx,240
+       shl     r13,48
+       xor     r12,r8
+       mov     r10,r9
+       xor     r9,r13
+       shr     r8,8
+       movzx   r12,r12b
+       mov     edx,DWORD PTR[((-4))+rdi]
+       shr     r9,8
+       xor     r8,QWORD PTR[((-128))+rbx*8+rbp]
+       shl     r10,56
+       xor     r9,QWORD PTR[rbx*8+rbp]
+       movzx   r12,WORD PTR[r12*2+r11]
+       xor     r8,QWORD PTR[8+rax*1+rsi]
+       xor     r9,QWORD PTR[rax*1+rsi]
+       shl     r12,48
+       xor     r8,r10
+       xor     r9,r12
+       movzx   r13,r8b
+       shr     r8,4
+       mov     r10,r9
+       shl     r13b,4
+       shr     r9,4
+       xor     r8,QWORD PTR[8+rcx*1+rsi]
+       movzx   r13,WORD PTR[r13*2+r11]
+       shl     r10,60
+       xor     r9,QWORD PTR[rcx*1+rsi]
+       xor     r8,r10
+       shl     r13,48
+       bswap   r8
+       xor     r9,r13
+       bswap   r9
+       cmp     r14,r15
+       jb      $L$outer_loop
+       mov     QWORD PTR[8+rdi],r8
+       mov     QWORD PTR[rdi],r9
+
+       lea     rsi,QWORD PTR[280+rsp]
+       mov     r15,QWORD PTR[rsi]
+       mov     r14,QWORD PTR[8+rsi]
+       mov     r13,QWORD PTR[16+rsi]
+       mov     r12,QWORD PTR[24+rsi]
+       mov     rbp,QWORD PTR[32+rsi]
+       mov     rbx,QWORD PTR[40+rsi]
+       lea     rsp,QWORD PTR[48+rsi]
+$L$ghash_epilogue::
+       mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
+       mov     rsi,QWORD PTR[16+rsp]
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_gcm_ghash_4bit::
+gcm_ghash_4bit ENDP
+PUBLIC gcm_init_clmul
+
+ALIGN  16
+gcm_init_clmul PROC PUBLIC
+       movdqu  xmm2,XMMWORD PTR[rdx]
+       pshufd  xmm2,xmm2,78
+
+
+       pshufd  xmm4,xmm2,255
+       movdqa  xmm3,xmm2
+       psllq   xmm2,1
+       pxor    xmm5,xmm5
+       psrlq   xmm3,63
+       pcmpgtd xmm5,xmm4
+       pslldq  xmm3,8
+       por     xmm2,xmm3
+
+
+       pand    xmm5,XMMWORD PTR[$L$0x1c2_polynomial]
+       pxor    xmm2,xmm5
+
+
+       movdqa  xmm0,xmm2
+       movdqa  xmm1,xmm0
+       pshufd  xmm3,xmm0,78
+       pshufd  xmm4,xmm2,78
+       pxor    xmm3,xmm0
+       pxor    xmm4,xmm2
+DB     102,15,58,68,194,0
+DB     102,15,58,68,202,17
+DB     102,15,58,68,220,0
+       pxor    xmm3,xmm0
+       pxor    xmm3,xmm1
+
+       movdqa  xmm4,xmm3
+       psrldq  xmm3,8
+       pslldq  xmm4,8
+       pxor    xmm1,xmm3
+       pxor    xmm0,xmm4
+
+       movdqa  xmm3,xmm0
+       psllq   xmm0,1
+       pxor    xmm0,xmm3
+       psllq   xmm0,5
+       pxor    xmm0,xmm3
+       psllq   xmm0,57
+       movdqa  xmm4,xmm0
+       pslldq  xmm0,8
+       psrldq  xmm4,8
+       pxor    xmm0,xmm3
+       pxor    xmm1,xmm4
+
+
+       movdqa  xmm4,xmm0
+       psrlq   xmm0,5
+       pxor    xmm0,xmm4
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+       pxor    xmm4,xmm1
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+       movdqu  XMMWORD PTR[rcx],xmm2
+       movdqu  XMMWORD PTR[16+rcx],xmm0
+       DB      0F3h,0C3h               ;repret
+gcm_init_clmul ENDP
+PUBLIC gcm_gmult_clmul
+
+ALIGN  16
+gcm_gmult_clmul        PROC PUBLIC
+       movdqu  xmm0,XMMWORD PTR[rcx]
+       movdqa  xmm5,XMMWORD PTR[$L$bswap_mask]
+       movdqu  xmm2,XMMWORD PTR[rdx]
+DB     102,15,56,0,197
+       movdqa  xmm1,xmm0
+       pshufd  xmm3,xmm0,78
+       pshufd  xmm4,xmm2,78
+       pxor    xmm3,xmm0
+       pxor    xmm4,xmm2
+DB     102,15,58,68,194,0
+DB     102,15,58,68,202,17
+DB     102,15,58,68,220,0
+       pxor    xmm3,xmm0
+       pxor    xmm3,xmm1
+
+       movdqa  xmm4,xmm3
+       psrldq  xmm3,8
+       pslldq  xmm4,8
+       pxor    xmm1,xmm3
+       pxor    xmm0,xmm4
+
+       movdqa  xmm3,xmm0
+       psllq   xmm0,1
+       pxor    xmm0,xmm3
+       psllq   xmm0,5
+       pxor    xmm0,xmm3
+       psllq   xmm0,57
+       movdqa  xmm4,xmm0
+       pslldq  xmm0,8
+       psrldq  xmm4,8
+       pxor    xmm0,xmm3
+       pxor    xmm1,xmm4
+
+
+       movdqa  xmm4,xmm0
+       psrlq   xmm0,5
+       pxor    xmm0,xmm4
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+       pxor    xmm4,xmm1
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+DB     102,15,56,0,197
+       movdqu  XMMWORD PTR[rcx],xmm0
+       DB      0F3h,0C3h               ;repret
+gcm_gmult_clmul        ENDP
+PUBLIC gcm_ghash_clmul
+
+ALIGN  16
+gcm_ghash_clmul        PROC PUBLIC
+$L$SEH_begin_gcm_ghash_clmul::
+
+DB     048h,083h,0ech,058h
+
+DB     00fh,029h,034h,024h
+
+DB     00fh,029h,07ch,024h,010h
+
+DB     044h,00fh,029h,044h,024h,020h
+
+DB     044h,00fh,029h,04ch,024h,030h
+
+DB     044h,00fh,029h,054h,024h,040h
+
+       movdqa  xmm5,XMMWORD PTR[$L$bswap_mask]
+
+       movdqu  xmm0,XMMWORD PTR[rcx]
+       movdqu  xmm2,XMMWORD PTR[rdx]
+DB     102,15,56,0,197
+
+       sub     r9,010h
+       jz      $L$odd_tail
+
+       movdqu  xmm8,XMMWORD PTR[16+rdx]
+
+
+
+
+
+       movdqu  xmm3,XMMWORD PTR[r8]
+       movdqu  xmm6,XMMWORD PTR[16+r8]
+DB     102,15,56,0,221
+DB     102,15,56,0,245
+       pxor    xmm0,xmm3
+       movdqa  xmm7,xmm6
+       pshufd  xmm3,xmm6,78
+       pshufd  xmm4,xmm2,78
+       pxor    xmm3,xmm6
+       pxor    xmm4,xmm2
+DB     102,15,58,68,242,0
+DB     102,15,58,68,250,17
+DB     102,15,58,68,220,0
+       pxor    xmm3,xmm6
+       pxor    xmm3,xmm7
+
+       movdqa  xmm4,xmm3
+       psrldq  xmm3,8
+       pslldq  xmm4,8
+       pxor    xmm7,xmm3
+       pxor    xmm6,xmm4
+       movdqa  xmm1,xmm0
+       pshufd  xmm3,xmm0,78
+       pshufd  xmm4,xmm8,78
+       pxor    xmm3,xmm0
+       pxor    xmm4,xmm8
+
+       lea     r8,QWORD PTR[32+r8]
+       sub     r9,020h
+       jbe     $L$even_tail
+
+$L$mod_loop::
+DB     102,65,15,58,68,192,0
+DB     102,65,15,58,68,200,17
+DB     102,15,58,68,220,0
+       pxor    xmm3,xmm0
+       pxor    xmm3,xmm1
+
+       movdqa  xmm4,xmm3
+       psrldq  xmm3,8
+       pslldq  xmm4,8
+       pxor    xmm1,xmm3
+       pxor    xmm0,xmm4
+       movdqu  xmm3,XMMWORD PTR[r8]
+       pxor    xmm0,xmm6
+       pxor    xmm1,xmm7
+
+       movdqu  xmm6,XMMWORD PTR[16+r8]
+DB     102,15,56,0,221
+DB     102,15,56,0,245
+
+       movdqa  xmm7,xmm6
+       pshufd  xmm9,xmm6,78
+       pshufd  xmm10,xmm2,78
+       pxor    xmm9,xmm6
+       pxor    xmm10,xmm2
+       pxor    xmm1,xmm3
+
+       movdqa  xmm3,xmm0
+       psllq   xmm0,1
+       pxor    xmm0,xmm3
+       psllq   xmm0,5
+       pxor    xmm0,xmm3
+DB     102,15,58,68,242,0
+       psllq   xmm0,57
+       movdqa  xmm4,xmm0
+       pslldq  xmm0,8
+       psrldq  xmm4,8
+       pxor    xmm0,xmm3
+       pxor    xmm1,xmm4
+
+DB     102,15,58,68,250,17
+       movdqa  xmm4,xmm0
+       psrlq   xmm0,5
+       pxor    xmm0,xmm4
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+       pxor    xmm4,xmm1
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+
+DB     102,69,15,58,68,202,0
+       movdqa  xmm1,xmm0
+       pshufd  xmm3,xmm0,78
+       pshufd  xmm4,xmm8,78
+       pxor    xmm3,xmm0
+       pxor    xmm4,xmm8
+
+       pxor    xmm9,xmm6
+       pxor    xmm9,xmm7
+       movdqa  xmm10,xmm9
+       psrldq  xmm9,8
+       pslldq  xmm10,8
+       pxor    xmm7,xmm9
+       pxor    xmm6,xmm10
+
+       lea     r8,QWORD PTR[32+r8]
+       sub     r9,020h
+       ja      $L$mod_loop
+
+$L$even_tail::
+DB     102,65,15,58,68,192,0
+DB     102,65,15,58,68,200,17
+DB     102,15,58,68,220,0
+       pxor    xmm3,xmm0
+       pxor    xmm3,xmm1
+
+       movdqa  xmm4,xmm3
+       psrldq  xmm3,8
+       pslldq  xmm4,8
+       pxor    xmm1,xmm3
+       pxor    xmm0,xmm4
+       pxor    xmm0,xmm6
+       pxor    xmm1,xmm7
+
+       movdqa  xmm3,xmm0
+       psllq   xmm0,1
+       pxor    xmm0,xmm3
+       psllq   xmm0,5
+       pxor    xmm0,xmm3
+       psllq   xmm0,57
+       movdqa  xmm4,xmm0
+       pslldq  xmm0,8
+       psrldq  xmm4,8
+       pxor    xmm0,xmm3
+       pxor    xmm1,xmm4
+
+
+       movdqa  xmm4,xmm0
+       psrlq   xmm0,5
+       pxor    xmm0,xmm4
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+       pxor    xmm4,xmm1
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+       test    r9,r9
+       jnz     $L$done
+
+$L$odd_tail::
+       movdqu  xmm3,XMMWORD PTR[r8]
+DB     102,15,56,0,221
+       pxor    xmm0,xmm3
+       movdqa  xmm1,xmm0
+       pshufd  xmm3,xmm0,78
+       pshufd  xmm4,xmm2,78
+       pxor    xmm3,xmm0
+       pxor    xmm4,xmm2
+DB     102,15,58,68,194,0
+DB     102,15,58,68,202,17
+DB     102,15,58,68,220,0
+       pxor    xmm3,xmm0
+       pxor    xmm3,xmm1
+
+       movdqa  xmm4,xmm3
+       psrldq  xmm3,8
+       pslldq  xmm4,8
+       pxor    xmm1,xmm3
+       pxor    xmm0,xmm4
+
+       movdqa  xmm3,xmm0
+       psllq   xmm0,1
+       pxor    xmm0,xmm3
+       psllq   xmm0,5
+       pxor    xmm0,xmm3
+       psllq   xmm0,57
+       movdqa  xmm4,xmm0
+       pslldq  xmm0,8
+       psrldq  xmm4,8
+       pxor    xmm0,xmm3
+       pxor    xmm1,xmm4
+
+
+       movdqa  xmm4,xmm0
+       psrlq   xmm0,5
+       pxor    xmm0,xmm4
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+       pxor    xmm4,xmm1
+       psrlq   xmm0,1
+       pxor    xmm0,xmm4
+$L$done::
+DB     102,15,56,0,197
+       movdqu  XMMWORD PTR[rcx],xmm0
+       movaps  xmm6,XMMWORD PTR[rsp]
+       movaps  xmm7,XMMWORD PTR[16+rsp]
+       movaps  xmm8,XMMWORD PTR[32+rsp]
+       movaps  xmm9,XMMWORD PTR[48+rsp]
+       movaps  xmm10,XMMWORD PTR[64+rsp]
+       add     rsp,058h
+       DB      0F3h,0C3h               ;repret
+$L$SEH_end_gcm_ghash_clmul::
+gcm_ghash_clmul        ENDP
+ALIGN  64
+$L$bswap_mask::
+DB     15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$0x1c2_polynomial::
+DB     1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0c2h
+ALIGN  64
+
+$L$rem_4bit::
+       DD      0,0,0,471859200,0,943718400,0,610271232
+       DD      0,1887436800,0,1822425088,0,1220542464,0,1423966208
+       DD      0,3774873600,0,4246732800,0,3644850176,0,3311403008
+       DD      0,2441084928,0,2376073216,0,2847932416,0,3051356160
+
+$L$rem_8bit::
+       DW      00000h,001C2h,00384h,00246h,00708h,006CAh,0048Ch,0054Eh
+       DW      00E10h,00FD2h,00D94h,00C56h,00918h,008DAh,00A9Ch,00B5Eh
+       DW      01C20h,01DE2h,01FA4h,01E66h,01B28h,01AEAh,018ACh,0196Eh
+       DW      01230h,013F2h,011B4h,01076h,01538h,014FAh,016BCh,0177Eh
+       DW      03840h,03982h,03BC4h,03A06h,03F48h,03E8Ah,03CCCh,03D0Eh
+       DW      03650h,03792h,035D4h,03416h,03158h,0309Ah,032DCh,0331Eh
+       DW      02460h,025A2h,027E4h,02626h,02368h,022AAh,020ECh,0212Eh
+       DW      02A70h,02BB2h,029F4h,02836h,02D78h,02CBAh,02EFCh,02F3Eh
+       DW      07080h,07142h,07304h,072C6h,07788h,0764Ah,0740Ch,075CEh
+       DW      07E90h,07F52h,07D14h,07CD6h,07998h,0785Ah,07A1Ch,07BDEh
+       DW      06CA0h,06D62h,06F24h,06EE6h,06BA8h,06A6Ah,0682Ch,069EEh
+       DW      062B0h,06372h,06134h,060F6h,065B8h,0647Ah,0663Ch,067FEh
+       DW      048C0h,04902h,04B44h,04A86h,04FC8h,04E0Ah,04C4Ch,04D8Eh
+       DW      046D0h,04712h,04554h,04496h,041D8h,0401Ah,0425Ch,0439Eh
+       DW      054E0h,05522h,05764h,056A6h,053E8h,0522Ah,0506Ch,051AEh
+       DW      05AF0h,05B32h,05974h,058B6h,05DF8h,05C3Ah,05E7Ch,05FBEh
+       DW      0E100h,0E0C2h,0E284h,0E346h,0E608h,0E7CAh,0E58Ch,0E44Eh
+       DW      0EF10h,0EED2h,0EC94h,0ED56h,0E818h,0E9DAh,0EB9Ch,0EA5Eh
+       DW      0FD20h,0FCE2h,0FEA4h,0FF66h,0FA28h,0FBEAh,0F9ACh,0F86Eh
+       DW      0F330h,0F2F2h,0F0B4h,0F176h,0F438h,0F5FAh,0F7BCh,0F67Eh
+       DW      0D940h,0D882h,0DAC4h,0DB06h,0DE48h,0DF8Ah,0DDCCh,0DC0Eh
+       DW      0D750h,0D692h,0D4D4h,0D516h,0D058h,0D19Ah,0D3DCh,0D21Eh
+       DW      0C560h,0C4A2h,0C6E4h,0C726h,0C268h,0C3AAh,0C1ECh,0C02Eh
+       DW      0CB70h,0CAB2h,0C8F4h,0C936h,0CC78h,0CDBAh,0CFFCh,0CE3Eh
+       DW      09180h,09042h,09204h,093C6h,09688h,0974Ah,0950Ch,094CEh
+       DW      09F90h,09E52h,09C14h,09DD6h,09898h,0995Ah,09B1Ch,09ADEh
+       DW      08DA0h,08C62h,08E24h,08FE6h,08AA8h,08B6Ah,0892Ch,088EEh
+       DW      083B0h,08272h,08034h,081F6h,084B8h,0857Ah,0873Ch,086FEh
+       DW      0A9C0h,0A802h,0AA44h,0AB86h,0AEC8h,0AF0Ah,0AD4Ch,0AC8Eh
+       DW      0A7D0h,0A612h,0A454h,0A596h,0A0D8h,0A11Ah,0A35Ch,0A29Eh
+       DW      0B5E0h,0B422h,0B664h,0B7A6h,0B2E8h,0B32Ah,0B16Ch,0B0AEh
+       DW      0BBF0h,0BA32h,0B874h,0B9B6h,0BCF8h,0BD3Ah,0BF7Ch,0BEBEh
+
+DB     71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52
+DB     44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+DB     60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+DB     114,103,62,0
+ALIGN  64
+EXTERN __imp_RtlVirtualUnwind:NEAR
+
+ALIGN  16
+se_handler     PROC PRIVATE
+       push    rsi
+       push    rdi
+       push    rbx
+       push    rbp
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+       pushfq
+       sub     rsp,64
+
+       mov     rax,QWORD PTR[120+r8]
+       mov     rbx,QWORD PTR[248+r8]
+
+       mov     rsi,QWORD PTR[8+r9]
+       mov     r11,QWORD PTR[56+r9]
+
+       mov     r10d,DWORD PTR[r11]
+       lea     r10,QWORD PTR[r10*1+rsi]
+       cmp     rbx,r10
+       jb      $L$in_prologue
+
+       mov     rax,QWORD PTR[152+r8]
+
+       mov     r10d,DWORD PTR[4+r11]
+       lea     r10,QWORD PTR[r10*1+rsi]
+       cmp     rbx,r10
+       jae     $L$in_prologue
+
+       lea     rax,QWORD PTR[24+rax]
+
+       mov     rbx,QWORD PTR[((-8))+rax]
+       mov     rbp,QWORD PTR[((-16))+rax]
+       mov     r12,QWORD PTR[((-24))+rax]
+       mov     QWORD PTR[144+r8],rbx
+       mov     QWORD PTR[160+r8],rbp
+       mov     QWORD PTR[216+r8],r12
+
+$L$in_prologue::
+       mov     rdi,QWORD PTR[8+rax]
+       mov     rsi,QWORD PTR[16+rax]
+       mov     QWORD PTR[152+r8],rax
+       mov     QWORD PTR[168+r8],rsi
+       mov     QWORD PTR[176+r8],rdi
+
+       mov     rdi,QWORD PTR[40+r9]
+       mov     rsi,r8
+       mov     ecx,154
+       DD      0a548f3fch
+
+
+       mov     rsi,r9
+       xor     rcx,rcx
+       mov     rdx,QWORD PTR[8+rsi]
+       mov     r8,QWORD PTR[rsi]
+       mov     r9,QWORD PTR[16+rsi]
+       mov     r10,QWORD PTR[40+rsi]
+       lea     r11,QWORD PTR[56+rsi]
+       lea     r12,QWORD PTR[24+rsi]
+       mov     QWORD PTR[32+rsp],r10
+       mov     QWORD PTR[40+rsp],r11
+       mov     QWORD PTR[48+rsp],r12
+       mov     QWORD PTR[56+rsp],rcx
+       call    QWORD PTR[__imp_RtlVirtualUnwind]
+
+       mov     eax,1
+       add     rsp,64
+       popfq
+       pop     r15
+       pop     r14
+       pop     r13
+       pop     r12
+       pop     rbp
+       pop     rbx
+       pop     rdi
+       pop     rsi
+       DB      0F3h,0C3h               ;repret
+se_handler     ENDP
+
+.text$ ENDS
+.pdata SEGMENT READONLY ALIGN(4)
+ALIGN  4
+       DD      imagerel $L$SEH_begin_gcm_gmult_4bit
+       DD      imagerel $L$SEH_end_gcm_gmult_4bit
+       DD      imagerel $L$SEH_info_gcm_gmult_4bit
+
+       DD      imagerel $L$SEH_begin_gcm_ghash_4bit
+       DD      imagerel $L$SEH_end_gcm_ghash_4bit
+       DD      imagerel $L$SEH_info_gcm_ghash_4bit
+
+       DD      imagerel $L$SEH_begin_gcm_ghash_clmul
+       DD      imagerel $L$SEH_end_gcm_ghash_clmul
+       DD      imagerel $L$SEH_info_gcm_ghash_clmul
+
+.pdata ENDS
+.xdata SEGMENT READONLY ALIGN(8)
+ALIGN  8
+$L$SEH_info_gcm_gmult_4bit::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$gmult_prologue,imagerel $L$gmult_epilogue
+
+$L$SEH_info_gcm_ghash_4bit::
+DB     9,0,0,0
+       DD      imagerel se_handler
+       DD      imagerel $L$ghash_prologue,imagerel $L$ghash_epilogue
+
+$L$SEH_info_gcm_ghash_clmul::
+DB     001h,01fh,00bh,000h
+DB     01fh,0a8h,004h,000h
+
+DB     019h,098h,003h,000h
+
+DB     013h,088h,002h,000h
+
+DB     00dh,078h,001h,000h
+
+DB     008h,068h,000h,000h
+
+DB     004h,0a2h,000h,000h
+
+
+.xdata ENDS
+END
diff --git a/deps/openssl/asm/x86-elf-gas/aes/vpaes-x86.s b/deps/openssl/asm/x86-elf-gas/aes/vpaes-x86.s
new file mode 100644
index 0000000..c53a507
--- /dev/null
@@ -0,0 +1,661 @@
+.file  "vpaes-x86.s"
+.text
+.align 64
+.L_vpaes_consts:
+.long  218628480,235210255,168496130,67568393
+.long  252381056,17041926,33884169,51187212
+.long  252645135,252645135,252645135,252645135
+.long  1512730624,3266504856,1377990664,3401244816
+.long  830229760,1275146365,2969422977,3447763452
+.long  3411033600,2979783055,338359620,2782886510
+.long  4209124096,907596821,221174255,1006095553
+.long  191964160,3799684038,3164090317,1589111125
+.long  182528256,1777043520,2877432650,3265356744
+.long  1874708224,3503451415,3305285752,363511674
+.long  1606117888,3487855781,1093350906,2384367825
+.long  197121,67569157,134941193,202313229
+.long  67569157,134941193,202313229,197121
+.long  134941193,202313229,197121,67569157
+.long  202313229,197121,67569157,134941193
+.long  33619971,100992007,168364043,235736079
+.long  235736079,33619971,100992007,168364043
+.long  168364043,235736079,33619971,100992007
+.long  100992007,168364043,235736079,33619971
+.long  50462976,117835012,185207048,252579084
+.long  252314880,51251460,117574920,184942860
+.long  184682752,252054788,50987272,118359308
+.long  118099200,185467140,251790600,50727180
+.long  2946363062,528716217,1300004225,1881839624
+.long  1532713819,1532713819,1532713819,1532713819
+.long  3602276352,4288629033,3737020424,4153884961
+.long  1354558464,32357713,2958822624,3775749553
+.long  1201988352,132424512,1572796698,503232858
+.long  2213177600,1597421020,4103937655,675398315
+.long  2749646592,4273543773,1511898873,121693092
+.long  3040248576,1103263732,2871565598,1608280554
+.long  2236667136,2588920351,482954393,64377734
+.long  3069987328,291237287,2117370568,3650299247
+.long  533321216,3573750986,2572112006,1401264716
+.long  1339849704,2721158661,548607111,3445553514
+.long  2128193280,3054596040,2183486460,1257083700
+.long  655635200,1165381986,3923443150,2344132524
+.long  190078720,256924420,290342170,357187870
+.long  1610966272,2263057382,4103205268,309794674
+.long  2592527872,2233205587,1335446729,3402964816
+.long  3973531904,3225098121,3002836325,1918774430
+.long  3870401024,2102906079,2284471353,4117666579
+.long  617007872,1021508343,366931923,691083277
+.long  2528395776,3491914898,2968704004,1613121270
+.long  3445188352,3247741094,844474987,4093578302
+.long  651481088,1190302358,1689581232,574775300
+.long  4289380608,206939853,2555985458,2489840491
+.long  2130264064,327674451,3566485037,3349835193
+.long  2470714624,316102159,3636825756,3393945945
+.byte  86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte  111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte  83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte  114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte  118,101,114,115,105,116,121,41,0
+.align 64
+.type  _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+       addl    (%esp),%ebp
+       movdqa  -48(%ebp),%xmm7
+       movdqa  -16(%ebp),%xmm6
+       ret
+.size  _vpaes_preheat,.-_vpaes_preheat
+.type  _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+       movl    $16,%ecx
+       movl    240(%edx),%eax
+       movdqa  %xmm6,%xmm1
+       movdqa  (%ebp),%xmm2
+       pandn   %xmm0,%xmm1
+       movdqu  (%edx),%xmm5
+       psrld   $4,%xmm1
+       pand    %xmm6,%xmm0
+.byte  102,15,56,0,208
+       movdqa  16(%ebp),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm5,%xmm2
+       pxor    %xmm2,%xmm0
+       addl    $16,%edx
+       leal    192(%ebp),%ebx
+       jmp     .L000enc_entry
+.align 16
+.L001enc_loop:
+       movdqa  32(%ebp),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+       movdqa  48(%ebp),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       movdqa  64(%ebp),%xmm5
+.byte  102,15,56,0,234
+       movdqa  -64(%ebx,%ecx,1),%xmm1
+       movdqa  80(%ebp),%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm5,%xmm2
+       movdqa  (%ebx,%ecx,1),%xmm4
+       movdqa  %xmm0,%xmm3
+.byte  102,15,56,0,193
+       addl    $16,%edx
+       pxor    %xmm2,%xmm0
+.byte  102,15,56,0,220
+       addl    $16,%ecx
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,193
+       andl    $48,%ecx
+       pxor    %xmm3,%xmm0
+       subl    $1,%eax
+.L000enc_entry:
+       movdqa  %xmm6,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm6,%xmm0
+       movdqa  -32(%ebp),%xmm5
+.byte  102,15,56,0,232
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm7,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm5,%xmm3
+       movdqa  %xmm7,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm5,%xmm4
+       movdqa  %xmm7,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm7,%xmm3
+       movdqu  (%edx),%xmm5
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       jnz     .L001enc_loop
+       movdqa  96(%ebp),%xmm4
+       movdqa  112(%ebp),%xmm0
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+.byte  102,15,56,0,195
+       movdqa  64(%ebx,%ecx,1),%xmm1
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,193
+       ret
+.size  _vpaes_encrypt_core,.-_vpaes_encrypt_core
+.type  _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+       movl    240(%edx),%eax
+       leal    608(%ebp),%ebx
+       movdqa  %xmm6,%xmm1
+       movdqa  -64(%ebx),%xmm2
+       pandn   %xmm0,%xmm1
+       movl    %eax,%ecx
+       psrld   $4,%xmm1
+       movdqu  (%edx),%xmm5
+       shll    $4,%ecx
+       pand    %xmm6,%xmm0
+.byte  102,15,56,0,208
+       movdqa  -48(%ebx),%xmm0
+       xorl    $48,%ecx
+.byte  102,15,56,0,193
+       andl    $48,%ecx
+       pxor    %xmm5,%xmm2
+       movdqa  176(%ebp),%xmm5
+       pxor    %xmm2,%xmm0
+       addl    $16,%edx
+       leal    -352(%ebx,%ecx,1),%ecx
+       jmp     .L002dec_entry
+.align 16
+.L003dec_loop:
+       movdqa  -32(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  -16(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       addl    $16,%edx
+.byte  102,15,56,0,197
+       movdqa  (%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  16(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       subl    $1,%eax
+.byte  102,15,56,0,197
+       movdqa  32(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  48(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,197
+       movdqa  64(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  80(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,58,15,237,12
+.L002dec_entry:
+       movdqa  %xmm6,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm6,%xmm0
+       movdqa  -32(%ebp),%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm7,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm7,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm7,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm7,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqu  (%edx),%xmm0
+       jnz     .L003dec_loop
+       movdqa  96(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  112(%ebx),%xmm0
+       movdqa  (%ecx),%xmm2
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,194
+       ret
+.size  _vpaes_decrypt_core,.-_vpaes_decrypt_core
+.type  _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+       addl    (%esp),%ebp
+       movdqu  (%esi),%xmm0
+       movdqa  320(%ebp),%xmm2
+       movdqa  %xmm0,%xmm3
+       leal    (%ebp),%ebx
+       movdqa  %xmm2,4(%esp)
+       call    _vpaes_schedule_transform
+       movdqa  %xmm0,%xmm7
+       testl   %edi,%edi
+       jnz     .L004schedule_am_decrypting
+       movdqu  %xmm0,(%edx)
+       jmp     .L005schedule_go
+.L004schedule_am_decrypting:
+       movdqa  256(%ebp,%ecx,1),%xmm1
+.byte  102,15,56,0,217
+       movdqu  %xmm3,(%edx)
+       xorl    $48,%ecx
+.L005schedule_go:
+       cmpl    $192,%eax
+       ja      .L006schedule_256
+       je      .L007schedule_192
+.L008schedule_128:
+       movl    $10,%eax
+.L009loop_schedule_128:
+       call    _vpaes_schedule_round
+       decl    %eax
+       jz      .L010schedule_mangle_last
+       call    _vpaes_schedule_mangle
+       jmp     .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+       movdqu  8(%esi),%xmm0
+       call    _vpaes_schedule_transform
+       movdqa  %xmm0,%xmm6
+       pxor    %xmm4,%xmm4
+       movhlps %xmm4,%xmm6
+       movl    $4,%eax
+.L011loop_schedule_192:
+       call    _vpaes_schedule_round
+.byte  102,15,58,15,198,8
+       call    _vpaes_schedule_mangle
+       call    _vpaes_schedule_192_smear
+       call    _vpaes_schedule_mangle
+       call    _vpaes_schedule_round
+       decl    %eax
+       jz      .L010schedule_mangle_last
+       call    _vpaes_schedule_mangle
+       call    _vpaes_schedule_192_smear
+       jmp     .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+       movdqu  16(%esi),%xmm0
+       call    _vpaes_schedule_transform
+       movl    $7,%eax
+.L012loop_schedule_256:
+       call    _vpaes_schedule_mangle
+       movdqa  %xmm0,%xmm6
+       call    _vpaes_schedule_round
+       decl    %eax
+       jz      .L010schedule_mangle_last
+       call    _vpaes_schedule_mangle
+       pshufd  $255,%xmm0,%xmm0
+       movdqa  %xmm7,20(%esp)
+       movdqa  %xmm6,%xmm7
+       call    .L_vpaes_schedule_low_round
+       movdqa  20(%esp),%xmm7
+       jmp     .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+       leal    384(%ebp),%ebx
+       testl   %edi,%edi
+       jnz     .L013schedule_mangle_last_dec
+       movdqa  256(%ebp,%ecx,1),%xmm1
+.byte  102,15,56,0,193
+       leal    352(%ebp),%ebx
+       addl    $32,%edx
+.L013schedule_mangle_last_dec:
+       addl    $-16,%edx
+       pxor    336(%ebp),%xmm0
+       call    _vpaes_schedule_transform
+       movdqu  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       ret
+.size  _vpaes_schedule_core,.-_vpaes_schedule_core
+.type  _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+       pshufd  $128,%xmm6,%xmm0
+       pxor    %xmm0,%xmm6
+       pshufd  $254,%xmm7,%xmm0
+       pxor    %xmm0,%xmm6
+       movdqa  %xmm6,%xmm0
+       pxor    %xmm1,%xmm1
+       movhlps %xmm1,%xmm6
+       ret
+.size  _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.type  _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+       movdqa  8(%esp),%xmm2
+       pxor    %xmm1,%xmm1
+.byte  102,15,58,15,202,15
+.byte  102,15,58,15,210,15
+       pxor    %xmm1,%xmm7
+       pshufd  $255,%xmm0,%xmm0
+.byte  102,15,58,15,192,1
+       movdqa  %xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+       movdqa  %xmm7,%xmm1
+       pslldq  $4,%xmm7
+       pxor    %xmm1,%xmm7
+       movdqa  %xmm7,%xmm1
+       pslldq  $8,%xmm7
+       pxor    %xmm1,%xmm7
+       pxor    336(%ebp),%xmm7
+       movdqa  -16(%ebp),%xmm4
+       movdqa  -48(%ebp),%xmm5
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm4,%xmm0
+       movdqa  -32(%ebp),%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm5,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm5,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm5,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm5,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqa  32(%ebp),%xmm4
+.byte  102,15,56,0,226
+       movdqa  48(%ebp),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       pxor    %xmm7,%xmm0
+       movdqa  %xmm0,%xmm7
+       ret
+.size  _vpaes_schedule_round,.-_vpaes_schedule_round
+.type  _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+       movdqa  -16(%ebp),%xmm2
+       movdqa  %xmm2,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm2,%xmm0
+       movdqa  (%ebx),%xmm2
+.byte  102,15,56,0,208
+       movdqa  16(%ebx),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm2,%xmm0
+       ret
+.size  _vpaes_schedule_transform,.-_vpaes_schedule_transform
+.type  _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+       movdqa  %xmm0,%xmm4
+       movdqa  128(%ebp),%xmm5
+       testl   %edi,%edi
+       jnz     .L014schedule_mangle_dec
+       addl    $16,%edx
+       pxor    336(%ebp),%xmm4
+.byte  102,15,56,0,229
+       movdqa  %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+       jmp     .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+       movdqa  -16(%ebp),%xmm2
+       leal    416(%ebp),%esi
+       movdqa  %xmm2,%xmm1
+       pandn   %xmm4,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm2,%xmm4
+       movdqa  (%esi),%xmm2
+.byte  102,15,56,0,212
+       movdqa  16(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+       movdqa  32(%esi),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  48(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+       movdqa  64(%esi),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  80(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+       movdqa  96(%esi),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  112(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       addl    $-16,%edx
+.L015schedule_mangle_both:
+       movdqa  256(%ebp,%ecx,1),%xmm1
+.byte  102,15,56,0,217
+       addl    $-16,%ecx
+       andl    $48,%ecx
+       movdqu  %xmm3,(%edx)
+       ret
+.size  _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl vpaes_set_encrypt_key
+.type  vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%eax
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movl    %eax,%ebx
+       shrl    $5,%ebx
+       addl    $5,%ebx
+       movl    %ebx,240(%edx)
+       movl    $48,%ecx
+       movl    $0,%edi
+       leal    .L_vpaes_consts+0x30-.L016pic_point,%ebp
+       call    _vpaes_schedule_core
+.L016pic_point:
+       movl    48(%esp),%esp
+       xorl    %eax,%eax
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl vpaes_set_decrypt_key
+.type  vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%eax
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movl    %eax,%ebx
+       shrl    $5,%ebx
+       addl    $5,%ebx
+       movl    %ebx,240(%edx)
+       shll    $4,%ebx
+       leal    16(%edx,%ebx,1),%edx
+       movl    $1,%edi
+       movl    %eax,%ecx
+       shrl    $1,%ecx
+       andl    $32,%ecx
+       xorl    $32,%ecx
+       leal    .L_vpaes_consts+0x30-.L017pic_point,%ebp
+       call    _vpaes_schedule_core
+.L017pic_point:
+       movl    48(%esp),%esp
+       xorl    %eax,%eax
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl vpaes_encrypt
+.type  vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       leal    .L_vpaes_consts+0x30-.L018pic_point,%ebp
+       call    _vpaes_preheat
+.L018pic_point:
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%edi
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movdqu  (%esi),%xmm0
+       call    _vpaes_encrypt_core
+       movdqu  %xmm0,(%edi)
+       movl    48(%esp),%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl vpaes_decrypt
+.type  vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       leal    .L_vpaes_consts+0x30-.L019pic_point,%ebp
+       call    _vpaes_preheat
+.L019pic_point:
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%edi
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movdqu  (%esi),%xmm0
+       call    _vpaes_decrypt_core
+       movdqu  %xmm0,(%edi)
+       movl    48(%esp),%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl vpaes_cbc_encrypt
+.type  vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       movl    24(%esp),%edi
+       movl    28(%esp),%eax
+       movl    32(%esp),%edx
+       subl    $16,%eax
+       jc      .L020cbc_abort
+       leal    -56(%esp),%ebx
+       movl    36(%esp),%ebp
+       andl    $-16,%ebx
+       movl    40(%esp),%ecx
+       xchgl   %esp,%ebx
+       movdqu  (%ebp),%xmm1
+       subl    %esi,%edi
+       movl    %ebx,48(%esp)
+       movl    %edi,(%esp)
+       movl    %edx,4(%esp)
+       movl    %ebp,8(%esp)
+       movl    %eax,%edi
+       leal    .L_vpaes_consts+0x30-.L021pic_point,%ebp
+       call    _vpaes_preheat
+.L021pic_point:
+       cmpl    $0,%ecx
+       je      .L022cbc_dec_loop
+       jmp     .L023cbc_enc_loop
+.align 16
+.L023cbc_enc_loop:
+       movdqu  (%esi),%xmm0
+       pxor    %xmm1,%xmm0
+       call    _vpaes_encrypt_core
+       movl    (%esp),%ebx
+       movl    4(%esp),%edx
+       movdqa  %xmm0,%xmm1
+       movdqu  %xmm0,(%ebx,%esi,1)
+       leal    16(%esi),%esi
+       subl    $16,%edi
+       jnc     .L023cbc_enc_loop
+       jmp     .L024cbc_done
+.align 16
+.L022cbc_dec_loop:
+       movdqu  (%esi),%xmm0
+       movdqa  %xmm1,16(%esp)
+       movdqa  %xmm0,32(%esp)
+       call    _vpaes_decrypt_core
+       movl    (%esp),%ebx
+       movl    4(%esp),%edx
+       pxor    16(%esp),%xmm0
+       movdqa  32(%esp),%xmm1
+       movdqu  %xmm0,(%ebx,%esi,1)
+       leal    16(%esi),%esi
+       subl    $16,%edi
+       jnc     .L022cbc_dec_loop
+.L024cbc_done:
+       movl    8(%esp),%ebx
+       movl    48(%esp),%esp
+       movdqu  %xmm1,(%ebx)
+.L020cbc_abort:
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
diff --git a/deps/openssl/asm/x86-elf-gas/modes/ghash-x86.s b/deps/openssl/asm/x86-elf-gas/modes/ghash-x86.s
new file mode 100644 (file)
index 0000000..cb9ae20
--- /dev/null
@@ -0,0 +1,728 @@
+.file  "ghash-x86.s"
+.text
+.globl gcm_gmult_4bit_x86
+.type  gcm_gmult_4bit_x86,@function
+.align 16
+gcm_gmult_4bit_x86:
+.L_gcm_gmult_4bit_x86_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       subl    $84,%esp
+       movl    104(%esp),%edi
+       movl    108(%esp),%esi
+       movl    (%edi),%ebp
+       movl    4(%edi),%edx
+       movl    8(%edi),%ecx
+       movl    12(%edi),%ebx
+       movl    $0,16(%esp)
+       movl    $471859200,20(%esp)
+       movl    $943718400,24(%esp)
+       movl    $610271232,28(%esp)
+       movl    $1887436800,32(%esp)
+       movl    $1822425088,36(%esp)
+       movl    $1220542464,40(%esp)
+       movl    $1423966208,44(%esp)
+       movl    $3774873600,48(%esp)
+       movl    $4246732800,52(%esp)
+       movl    $3644850176,56(%esp)
+       movl    $3311403008,60(%esp)
+       movl    $2441084928,64(%esp)
+       movl    $2376073216,68(%esp)
+       movl    $2847932416,72(%esp)
+       movl    $3051356160,76(%esp)
+       movl    %ebp,(%esp)
+       movl    %edx,4(%esp)
+       movl    %ecx,8(%esp)
+       movl    %ebx,12(%esp)
+       shrl    $20,%ebx
+       andl    $240,%ebx
+       movl    4(%esi,%ebx,1),%ebp
+       movl    (%esi,%ebx,1),%edx
+       movl    12(%esi,%ebx,1),%ecx
+       movl    8(%esi,%ebx,1),%ebx
+       xorl    %eax,%eax
+       movl    $15,%edi
+       jmp     .L000x86_loop
+.align 16
+.L000x86_loop:
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       andb    $240,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       decl    %edi
+       js      .L001x86_break
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       shlb    $4,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       jmp     .L000x86_loop
+.align 16
+.L001x86_break:
+       bswap   %ebx
+       bswap   %ecx
+       bswap   %edx
+       bswap   %ebp
+       movl    104(%esp),%edi
+       movl    %ebx,12(%edi)
+       movl    %ecx,8(%edi)
+       movl    %edx,4(%edi)
+       movl    %ebp,(%edi)
+       addl    $84,%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  gcm_gmult_4bit_x86,.-.L_gcm_gmult_4bit_x86_begin
+.globl gcm_ghash_4bit_x86
+.type  gcm_ghash_4bit_x86,@function
+.align 16
+gcm_ghash_4bit_x86:
+.L_gcm_ghash_4bit_x86_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       subl    $84,%esp
+       movl    104(%esp),%ebx
+       movl    108(%esp),%esi
+       movl    112(%esp),%edi
+       movl    116(%esp),%ecx
+       addl    %edi,%ecx
+       movl    %ecx,116(%esp)
+       movl    (%ebx),%ebp
+       movl    4(%ebx),%edx
+       movl    8(%ebx),%ecx
+       movl    12(%ebx),%ebx
+       movl    $0,16(%esp)
+       movl    $471859200,20(%esp)
+       movl    $943718400,24(%esp)
+       movl    $610271232,28(%esp)
+       movl    $1887436800,32(%esp)
+       movl    $1822425088,36(%esp)
+       movl    $1220542464,40(%esp)
+       movl    $1423966208,44(%esp)
+       movl    $3774873600,48(%esp)
+       movl    $4246732800,52(%esp)
+       movl    $3644850176,56(%esp)
+       movl    $3311403008,60(%esp)
+       movl    $2441084928,64(%esp)
+       movl    $2376073216,68(%esp)
+       movl    $2847932416,72(%esp)
+       movl    $3051356160,76(%esp)
+.align 16
+.L002x86_outer_loop:
+       xorl    12(%edi),%ebx
+       xorl    8(%edi),%ecx
+       xorl    4(%edi),%edx
+       xorl    (%edi),%ebp
+       movl    %ebx,12(%esp)
+       movl    %ecx,8(%esp)
+       movl    %edx,4(%esp)
+       movl    %ebp,(%esp)
+       shrl    $20,%ebx
+       andl    $240,%ebx
+       movl    4(%esi,%ebx,1),%ebp
+       movl    (%esi,%ebx,1),%edx
+       movl    12(%esi,%ebx,1),%ecx
+       movl    8(%esi,%ebx,1),%ebx
+       xorl    %eax,%eax
+       movl    $15,%edi
+       jmp     .L003x86_loop
+.align 16
+.L003x86_loop:
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       andb    $240,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       decl    %edi
+       js      .L004x86_break
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       shlb    $4,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       jmp     .L003x86_loop
+.align 16
+.L004x86_break:
+       bswap   %ebx
+       bswap   %ecx
+       bswap   %edx
+       bswap   %ebp
+       movl    112(%esp),%edi
+       leal    16(%edi),%edi
+       cmpl    116(%esp),%edi
+       movl    %edi,112(%esp)
+       jb      .L002x86_outer_loop
+       movl    104(%esp),%edi
+       movl    %ebx,12(%edi)
+       movl    %ecx,8(%edi)
+       movl    %edx,4(%edi)
+       movl    %ebp,(%edi)
+       addl    $84,%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
+.type  _mmx_gmult_4bit_inner,@function
+.align 16
+_mmx_gmult_4bit_inner:
+       xorl    %ecx,%ecx
+       movl    %ebx,%edx
+       movb    %dl,%cl
+       shlb    $4,%cl
+       andl    $240,%edx
+       movq    8(%esi,%ecx,1),%mm0
+       movq    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    14(%edi),%cl
+       psllq   $60,%mm2
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    13(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    12(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    11(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    10(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    9(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    8(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    7(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    6(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    5(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    4(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    3(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    2(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    1(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    (%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       movl    4(%eax,%ebp,8),%edi
+       psrlq   $32,%mm0
+       movd    %mm1,%edx
+       psrlq   $32,%mm1
+       movd    %mm0,%ecx
+       movd    %mm1,%ebp
+       shll    $4,%edi
+       bswap   %ebx
+       bswap   %edx
+       bswap   %ecx
+       xorl    %edi,%ebp
+       bswap   %ebp
+       ret
+.size  _mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
+.globl gcm_gmult_4bit_mmx
+.type  gcm_gmult_4bit_mmx,@function
+.align 16
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%edi
+       movl    24(%esp),%esi
+       call    .L005pic_point
+.L005pic_point:
+       popl    %eax
+       leal    .Lrem_4bit-.L005pic_point(%eax),%eax
+       movzbl  15(%edi),%ebx
+       call    _mmx_gmult_4bit_inner
+       movl    20(%esp),%edi
+       emms
+       movl    %ebx,12(%edi)
+       movl    %edx,4(%edi)
+       movl    %ecx,8(%edi)
+       movl    %ebp,(%edi)
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  gcm_gmult_4bit_mmx,.-.L_gcm_gmult_4bit_mmx_begin
+.globl gcm_ghash_4bit_mmx
+.type  gcm_ghash_4bit_mmx,@function
+.align 16
+gcm_ghash_4bit_mmx:
+.L_gcm_ghash_4bit_mmx_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%ebp
+       movl    24(%esp),%esi
+       movl    28(%esp),%edi
+       movl    32(%esp),%ecx
+       call    .L006pic_point
+.L006pic_point:
+       popl    %eax
+       leal    .Lrem_4bit-.L006pic_point(%eax),%eax
+       addl    %edi,%ecx
+       movl    %ecx,32(%esp)
+       subl    $20,%esp
+       movl    12(%ebp),%ebx
+       movl    4(%ebp),%edx
+       movl    8(%ebp),%ecx
+       movl    (%ebp),%ebp
+       jmp     .L007mmx_outer_loop
+.align 16
+.L007mmx_outer_loop:
+       xorl    12(%edi),%ebx
+       xorl    4(%edi),%edx
+       xorl    8(%edi),%ecx
+       xorl    (%edi),%ebp
+       movl    %edi,48(%esp)
+       movl    %ebx,12(%esp)
+       movl    %edx,4(%esp)
+       movl    %ecx,8(%esp)
+       movl    %ebp,(%esp)
+       movl    %esp,%edi
+       shrl    $24,%ebx
+       call    _mmx_gmult_4bit_inner
+       movl    48(%esp),%edi
+       leal    16(%edi),%edi
+       cmpl    52(%esp),%edi
+       jb      .L007mmx_outer_loop
+       movl    40(%esp),%edi
+       emms
+       movl    %ebx,12(%edi)
+       movl    %edx,4(%edi)
+       movl    %ecx,8(%edi)
+       movl    %ebp,(%edi)
+       addl    $20,%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.size  gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.align 64
+.Lrem_4bit:
+.long  0,0,0,29491200,0,58982400,0,38141952
+.long  0,117964800,0,113901568,0,76283904,0,88997888
+.long  0,235929600,0,265420800,0,227803136,0,206962688
+.long  0,152567808,0,148504576,0,177995776,0,190709760
+.align 64
+.L008rem_8bit:
+.value 0,450,900,582,1800,1738,1164,1358
+.value 3600,4050,3476,3158,2328,2266,2716,2910
+.value 7200,7650,8100,7782,6952,6890,6316,6510
+.value 4656,5106,4532,4214,5432,5370,5820,6014
+.value 14400,14722,15300,14854,16200,16010,15564,15630
+.value 13904,14226,13780,13334,12632,12442,13020,13086
+.value 9312,9634,10212,9766,9064,8874,8428,8494
+.value 10864,11186,10740,10294,11640,11450,12028,12094
+.value 28800,28994,29444,29382,30600,30282,29708,30158
+.value 32400,32594,32020,31958,31128,30810,31260,31710
+.value 27808,28002,28452,28390,27560,27242,26668,27118
+.value 25264,25458,24884,24822,26040,25722,26172,26622
+.value 18624,18690,19268,19078,20424,19978,19532,19854
+.value 18128,18194,17748,17558,16856,16410,16988,17310
+.value 21728,21794,22372,22182,21480,21034,20588,20910
+.value 23280,23346,22900,22710,24056,23610,24188,24510
+.value 57600,57538,57988,58182,58888,59338,58764,58446
+.value 61200,61138,60564,60758,59416,59866,60316,59998
+.value 64800,64738,65188,65382,64040,64490,63916,63598
+.value 62256,62194,61620,61814,62520,62970,63420,63102
+.value 55616,55426,56004,56070,56904,57226,56780,56334
+.value 55120,54930,54484,54550,53336,53658,54236,53790
+.value 50528,50338,50916,50982,49768,50090,49644,49198
+.value 52080,51890,51444,51510,52344,52666,53244,52798
+.value 37248,36930,37380,37830,38536,38730,38156,38094
+.value 40848,40530,39956,40406,39064,39258,39708,39646
+.value 36256,35938,36388,36838,35496,35690,35116,35054
+.value 33712,33394,32820,33270,33976,34170,34620,34558
+.value 43456,43010,43588,43910,44744,44810,44364,44174
+.value 42960,42514,42068,42390,41176,41242,41820,41630
+.value 46560,46114,46692,47014,45800,45866,45420,45230
+.value 48112,47666,47220,47542,48376,48442,49020,48830
+.byte  71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte  82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte  112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte  0
diff --git a/deps/openssl/asm/x86-macosx-gas/aes/vpaes-x86.s b/deps/openssl/asm/x86-macosx-gas/aes/vpaes-x86.s
new file mode 100644 (file)
index 0000000..f6d164f
--- /dev/null
@@ -0,0 +1,635 @@
+.file  "vpaes-x86.s"
+.text
+.align 6,0x90
+L_vpaes_consts:
+.long  218628480,235210255,168496130,67568393
+.long  252381056,17041926,33884169,51187212
+.long  252645135,252645135,252645135,252645135
+.long  1512730624,3266504856,1377990664,3401244816
+.long  830229760,1275146365,2969422977,3447763452
+.long  3411033600,2979783055,338359620,2782886510
+.long  4209124096,907596821,221174255,1006095553
+.long  191964160,3799684038,3164090317,1589111125
+.long  182528256,1777043520,2877432650,3265356744
+.long  1874708224,3503451415,3305285752,363511674
+.long  1606117888,3487855781,1093350906,2384367825
+.long  197121,67569157,134941193,202313229
+.long  67569157,134941193,202313229,197121
+.long  134941193,202313229,197121,67569157
+.long  202313229,197121,67569157,134941193
+.long  33619971,100992007,168364043,235736079
+.long  235736079,33619971,100992007,168364043
+.long  168364043,235736079,33619971,100992007
+.long  100992007,168364043,235736079,33619971
+.long  50462976,117835012,185207048,252579084
+.long  252314880,51251460,117574920,184942860
+.long  184682752,252054788,50987272,118359308
+.long  118099200,185467140,251790600,50727180
+.long  2946363062,528716217,1300004225,1881839624
+.long  1532713819,1532713819,1532713819,1532713819
+.long  3602276352,4288629033,3737020424,4153884961
+.long  1354558464,32357713,2958822624,3775749553
+.long  1201988352,132424512,1572796698,503232858
+.long  2213177600,1597421020,4103937655,675398315
+.long  2749646592,4273543773,1511898873,121693092
+.long  3040248576,1103263732,2871565598,1608280554
+.long  2236667136,2588920351,482954393,64377734
+.long  3069987328,291237287,2117370568,3650299247
+.long  533321216,3573750986,2572112006,1401264716
+.long  1339849704,2721158661,548607111,3445553514
+.long  2128193280,3054596040,2183486460,1257083700
+.long  655635200,1165381986,3923443150,2344132524
+.long  190078720,256924420,290342170,357187870
+.long  1610966272,2263057382,4103205268,309794674
+.long  2592527872,2233205587,1335446729,3402964816
+.long  3973531904,3225098121,3002836325,1918774430
+.long  3870401024,2102906079,2284471353,4117666579
+.long  617007872,1021508343,366931923,691083277
+.long  2528395776,3491914898,2968704004,1613121270
+.long  3445188352,3247741094,844474987,4093578302
+.long  651481088,1190302358,1689581232,574775300
+.long  4289380608,206939853,2555985458,2489840491
+.long  2130264064,327674451,3566485037,3349835193
+.long  2470714624,316102159,3636825756,3393945945
+.byte  86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte  111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte  83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte  114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte  118,101,114,115,105,116,121,41,0
+.align 6,0x90
+.align 4
+__vpaes_preheat:
+       addl    (%esp),%ebp
+       movdqa  -48(%ebp),%xmm7
+       movdqa  -16(%ebp),%xmm6
+       ret
+.align 4
+__vpaes_encrypt_core:
+       movl    $16,%ecx
+       movl    240(%edx),%eax
+       movdqa  %xmm6,%xmm1
+       movdqa  (%ebp),%xmm2
+       pandn   %xmm0,%xmm1
+       movdqu  (%edx),%xmm5
+       psrld   $4,%xmm1
+       pand    %xmm6,%xmm0
+.byte  102,15,56,0,208
+       movdqa  16(%ebp),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm5,%xmm2
+       pxor    %xmm2,%xmm0
+       addl    $16,%edx
+       leal    192(%ebp),%ebx
+       jmp     L000enc_entry
+.align 4,0x90
+L001enc_loop:
+       movdqa  32(%ebp),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+       movdqa  48(%ebp),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       movdqa  64(%ebp),%xmm5
+.byte  102,15,56,0,234
+       movdqa  -64(%ebx,%ecx,1),%xmm1
+       movdqa  80(%ebp),%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm5,%xmm2
+       movdqa  (%ebx,%ecx,1),%xmm4
+       movdqa  %xmm0,%xmm3
+.byte  102,15,56,0,193
+       addl    $16,%edx
+       pxor    %xmm2,%xmm0
+.byte  102,15,56,0,220
+       addl    $16,%ecx
+       pxor    %xmm0,%xmm3
+.byte  102,15,56,0,193
+       andl    $48,%ecx
+       pxor    %xmm3,%xmm0
+       subl    $1,%eax
+L000enc_entry:
+       movdqa  %xmm6,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm6,%xmm0
+       movdqa  -32(%ebp),%xmm5
+.byte  102,15,56,0,232
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm7,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm5,%xmm3
+       movdqa  %xmm7,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm5,%xmm4
+       movdqa  %xmm7,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm7,%xmm3
+       movdqu  (%edx),%xmm5
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       jnz     L001enc_loop
+       movdqa  96(%ebp),%xmm4
+       movdqa  112(%ebp),%xmm0
+.byte  102,15,56,0,226
+       pxor    %xmm5,%xmm4
+.byte  102,15,56,0,195
+       movdqa  64(%ebx,%ecx,1),%xmm1
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,193
+       ret
+.align 4
+__vpaes_decrypt_core:
+       movl    240(%edx),%eax
+       leal    608(%ebp),%ebx
+       movdqa  %xmm6,%xmm1
+       movdqa  -64(%ebx),%xmm2
+       pandn   %xmm0,%xmm1
+       movl    %eax,%ecx
+       psrld   $4,%xmm1
+       movdqu  (%edx),%xmm5
+       shll    $4,%ecx
+       pand    %xmm6,%xmm0
+.byte  102,15,56,0,208
+       movdqa  -48(%ebx),%xmm0
+       xorl    $48,%ecx
+.byte  102,15,56,0,193
+       andl    $48,%ecx
+       pxor    %xmm5,%xmm2
+       movdqa  176(%ebp),%xmm5
+       pxor    %xmm2,%xmm0
+       addl    $16,%edx
+       leal    -352(%ebx,%ecx,1),%ecx
+       jmp     L002dec_entry
+.align 4,0x90
+L003dec_loop:
+       movdqa  -32(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  -16(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       addl    $16,%edx
+.byte  102,15,56,0,197
+       movdqa  (%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  16(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       subl    $1,%eax
+.byte  102,15,56,0,197
+       movdqa  32(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  48(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,197
+       movdqa  64(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  80(%ebx),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,58,15,237,12
+L002dec_entry:
+       movdqa  %xmm6,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm6,%xmm0
+       movdqa  -32(%ebp),%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm7,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm7,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm7,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm7,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqu  (%edx),%xmm0
+       jnz     L003dec_loop
+       movdqa  96(%ebx),%xmm4
+.byte  102,15,56,0,226
+       pxor    %xmm0,%xmm4
+       movdqa  112(%ebx),%xmm0
+       movdqa  (%ecx),%xmm2
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+.byte  102,15,56,0,194
+       ret
+.align 4
+__vpaes_schedule_core:
+       addl    (%esp),%ebp
+       movdqu  (%esi),%xmm0
+       movdqa  320(%ebp),%xmm2
+       movdqa  %xmm0,%xmm3
+       leal    (%ebp),%ebx
+       movdqa  %xmm2,4(%esp)
+       call    __vpaes_schedule_transform
+       movdqa  %xmm0,%xmm7
+       testl   %edi,%edi
+       jnz     L004schedule_am_decrypting
+       movdqu  %xmm0,(%edx)
+       jmp     L005schedule_go
+L004schedule_am_decrypting:
+       movdqa  256(%ebp,%ecx,1),%xmm1
+.byte  102,15,56,0,217
+       movdqu  %xmm3,(%edx)
+       xorl    $48,%ecx
+L005schedule_go:
+       cmpl    $192,%eax
+       ja      L006schedule_256
+       je      L007schedule_192
+L008schedule_128:
+       movl    $10,%eax
+L009loop_schedule_128:
+       call    __vpaes_schedule_round
+       decl    %eax
+       jz      L010schedule_mangle_last
+       call    __vpaes_schedule_mangle
+       jmp     L009loop_schedule_128
+.align 4,0x90
+L007schedule_192:
+       movdqu  8(%esi),%xmm0
+       call    __vpaes_schedule_transform
+       movdqa  %xmm0,%xmm6
+       pxor    %xmm4,%xmm4
+       movhlps %xmm4,%xmm6
+       movl    $4,%eax
+L011loop_schedule_192:
+       call    __vpaes_schedule_round
+.byte  102,15,58,15,198,8
+       call    __vpaes_schedule_mangle
+       call    __vpaes_schedule_192_smear
+       call    __vpaes_schedule_mangle
+       call    __vpaes_schedule_round
+       decl    %eax
+       jz      L010schedule_mangle_last
+       call    __vpaes_schedule_mangle
+       call    __vpaes_schedule_192_smear
+       jmp     L011loop_schedule_192
+.align 4,0x90
+L006schedule_256:
+       movdqu  16(%esi),%xmm0
+       call    __vpaes_schedule_transform
+       movl    $7,%eax
+L012loop_schedule_256:
+       call    __vpaes_schedule_mangle
+       movdqa  %xmm0,%xmm6
+       call    __vpaes_schedule_round
+       decl    %eax
+       jz      L010schedule_mangle_last
+       call    __vpaes_schedule_mangle
+       pshufd  $255,%xmm0,%xmm0
+       movdqa  %xmm7,20(%esp)
+       movdqa  %xmm6,%xmm7
+       call    L_vpaes_schedule_low_round
+       movdqa  20(%esp),%xmm7
+       jmp     L012loop_schedule_256
+.align 4,0x90
+L010schedule_mangle_last:
+       leal    384(%ebp),%ebx
+       testl   %edi,%edi
+       jnz     L013schedule_mangle_last_dec
+       movdqa  256(%ebp,%ecx,1),%xmm1
+.byte  102,15,56,0,193
+       leal    352(%ebp),%ebx
+       addl    $32,%edx
+L013schedule_mangle_last_dec:
+       addl    $-16,%edx
+       pxor    336(%ebp),%xmm0
+       call    __vpaes_schedule_transform
+       movdqu  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       ret
+.align 4
+__vpaes_schedule_192_smear:
+       pshufd  $128,%xmm6,%xmm0
+       pxor    %xmm0,%xmm6
+       pshufd  $254,%xmm7,%xmm0
+       pxor    %xmm0,%xmm6
+       movdqa  %xmm6,%xmm0
+       pxor    %xmm1,%xmm1
+       movhlps %xmm1,%xmm6
+       ret
+.align 4
+__vpaes_schedule_round:
+       movdqa  8(%esp),%xmm2
+       pxor    %xmm1,%xmm1
+.byte  102,15,58,15,202,15
+.byte  102,15,58,15,210,15
+       pxor    %xmm1,%xmm7
+       pshufd  $255,%xmm0,%xmm0
+.byte  102,15,58,15,192,1
+       movdqa  %xmm2,8(%esp)
+L_vpaes_schedule_low_round:
+       movdqa  %xmm7,%xmm1
+       pslldq  $4,%xmm7
+       pxor    %xmm1,%xmm7
+       movdqa  %xmm7,%xmm1
+       pslldq  $8,%xmm7
+       pxor    %xmm1,%xmm7
+       pxor    336(%ebp),%xmm7
+       movdqa  -16(%ebp),%xmm4
+       movdqa  -48(%ebp),%xmm5
+       movdqa  %xmm4,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm4,%xmm0
+       movdqa  -32(%ebp),%xmm2
+.byte  102,15,56,0,208
+       pxor    %xmm1,%xmm0
+       movdqa  %xmm5,%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       movdqa  %xmm5,%xmm4
+.byte  102,15,56,0,224
+       pxor    %xmm2,%xmm4
+       movdqa  %xmm5,%xmm2
+.byte  102,15,56,0,211
+       pxor    %xmm0,%xmm2
+       movdqa  %xmm5,%xmm3
+.byte  102,15,56,0,220
+       pxor    %xmm1,%xmm3
+       movdqa  32(%ebp),%xmm4
+.byte  102,15,56,0,226
+       movdqa  48(%ebp),%xmm0
+.byte  102,15,56,0,195
+       pxor    %xmm4,%xmm0
+       pxor    %xmm7,%xmm0
+       movdqa  %xmm0,%xmm7
+       ret
+.align 4
+__vpaes_schedule_transform:
+       movdqa  -16(%ebp),%xmm2
+       movdqa  %xmm2,%xmm1
+       pandn   %xmm0,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm2,%xmm0
+       movdqa  (%ebx),%xmm2
+.byte  102,15,56,0,208
+       movdqa  16(%ebx),%xmm0
+.byte  102,15,56,0,193
+       pxor    %xmm2,%xmm0
+       ret
+.align 4
+__vpaes_schedule_mangle:
+       movdqa  %xmm0,%xmm4
+       movdqa  128(%ebp),%xmm5
+       testl   %edi,%edi
+       jnz     L014schedule_mangle_dec
+       addl    $16,%edx
+       pxor    336(%ebp),%xmm4
+.byte  102,15,56,0,229
+       movdqa  %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+.byte  102,15,56,0,229
+       pxor    %xmm4,%xmm3
+       jmp     L015schedule_mangle_both
+.align 4,0x90
+L014schedule_mangle_dec:
+       movdqa  -16(%ebp),%xmm2
+       leal    416(%ebp),%esi
+       movdqa  %xmm2,%xmm1
+       pandn   %xmm4,%xmm1
+       psrld   $4,%xmm1
+       pand    %xmm2,%xmm4
+       movdqa  (%esi),%xmm2
+.byte  102,15,56,0,212
+       movdqa  16(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+       movdqa  32(%esi),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  48(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+       movdqa  64(%esi),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  80(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+.byte  102,15,56,0,221
+       movdqa  96(%esi),%xmm2
+.byte  102,15,56,0,212
+       pxor    %xmm3,%xmm2
+       movdqa  112(%esi),%xmm3
+.byte  102,15,56,0,217
+       pxor    %xmm2,%xmm3
+       addl    $-16,%edx
+L015schedule_mangle_both:
+       movdqa  256(%ebp,%ecx,1),%xmm1
+.byte  102,15,56,0,217
+       addl    $-16,%ecx
+       andl    $48,%ecx
+       movdqu  %xmm3,(%edx)
+       ret
+.globl _vpaes_set_encrypt_key
+.align 4
+_vpaes_set_encrypt_key:
+L_vpaes_set_encrypt_key_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%eax
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movl    %eax,%ebx
+       shrl    $5,%ebx
+       addl    $5,%ebx
+       movl    %ebx,240(%edx)
+       movl    $48,%ecx
+       movl    $0,%edi
+       leal    L_vpaes_consts+0x30-L016pic_point,%ebp
+       call    __vpaes_schedule_core
+L016pic_point:
+       movl    48(%esp),%esp
+       xorl    %eax,%eax
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _vpaes_set_decrypt_key
+.align 4
+_vpaes_set_decrypt_key:
+L_vpaes_set_decrypt_key_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%eax
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movl    %eax,%ebx
+       shrl    $5,%ebx
+       addl    $5,%ebx
+       movl    %ebx,240(%edx)
+       shll    $4,%ebx
+       leal    16(%edx,%ebx,1),%edx
+       movl    $1,%edi
+       movl    %eax,%ecx
+       shrl    $1,%ecx
+       andl    $32,%ecx
+       xorl    $32,%ecx
+       leal    L_vpaes_consts+0x30-L017pic_point,%ebp
+       call    __vpaes_schedule_core
+L017pic_point:
+       movl    48(%esp),%esp
+       xorl    %eax,%eax
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _vpaes_encrypt
+.align 4
+_vpaes_encrypt:
+L_vpaes_encrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       leal    L_vpaes_consts+0x30-L018pic_point,%ebp
+       call    __vpaes_preheat
+L018pic_point:
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%edi
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movdqu  (%esi),%xmm0
+       call    __vpaes_encrypt_core
+       movdqu  %xmm0,(%edi)
+       movl    48(%esp),%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _vpaes_decrypt
+.align 4
+_vpaes_decrypt:
+L_vpaes_decrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       leal    L_vpaes_consts+0x30-L019pic_point,%ebp
+       call    __vpaes_preheat
+L019pic_point:
+       movl    20(%esp),%esi
+       leal    -56(%esp),%ebx
+       movl    24(%esp),%edi
+       andl    $-16,%ebx
+       movl    28(%esp),%edx
+       xchgl   %esp,%ebx
+       movl    %ebx,48(%esp)
+       movdqu  (%esi),%xmm0
+       call    __vpaes_decrypt_core
+       movdqu  %xmm0,(%edi)
+       movl    48(%esp),%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _vpaes_cbc_encrypt
+.align 4
+_vpaes_cbc_encrypt:
+L_vpaes_cbc_encrypt_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%esi
+       movl    24(%esp),%edi
+       movl    28(%esp),%eax
+       movl    32(%esp),%edx
+       subl    $16,%eax
+       jc      L020cbc_abort
+       leal    -56(%esp),%ebx
+       movl    36(%esp),%ebp
+       andl    $-16,%ebx
+       movl    40(%esp),%ecx
+       xchgl   %esp,%ebx
+       movdqu  (%ebp),%xmm1
+       subl    %esi,%edi
+       movl    %ebx,48(%esp)
+       movl    %edi,(%esp)
+       movl    %edx,4(%esp)
+       movl    %ebp,8(%esp)
+       movl    %eax,%edi
+       leal    L_vpaes_consts+0x30-L021pic_point,%ebp
+       call    __vpaes_preheat
+L021pic_point:
+       cmpl    $0,%ecx
+       je      L022cbc_dec_loop
+       jmp     L023cbc_enc_loop
+.align 4,0x90
+L023cbc_enc_loop:
+       movdqu  (%esi),%xmm0
+       pxor    %xmm1,%xmm0
+       call    __vpaes_encrypt_core
+       movl    (%esp),%ebx
+       movl    4(%esp),%edx
+       movdqa  %xmm0,%xmm1
+       movdqu  %xmm0,(%ebx,%esi,1)
+       leal    16(%esi),%esi
+       subl    $16,%edi
+       jnc     L023cbc_enc_loop
+       jmp     L024cbc_done
+.align 4,0x90
+L022cbc_dec_loop:
+       movdqu  (%esi),%xmm0
+       movdqa  %xmm1,16(%esp)
+       movdqa  %xmm0,32(%esp)
+       call    __vpaes_decrypt_core
+       movl    (%esp),%ebx
+       movl    4(%esp),%edx
+       pxor    16(%esp),%xmm0
+       movdqa  32(%esp),%xmm1
+       movdqu  %xmm0,(%ebx,%esi,1)
+       leal    16(%esi),%esi
+       subl    $16,%edi
+       jnc     L022cbc_dec_loop
+L024cbc_done:
+       movl    8(%esp),%ebx
+       movl    48(%esp),%esp
+       movdqu  %xmm1,(%ebx)
+L020cbc_abort:
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
diff --git a/deps/openssl/asm/x86-macosx-gas/modes/ghash-x86.s b/deps/openssl/asm/x86-macosx-gas/modes/ghash-x86.s
new file mode 100644 (file)
index 0000000..dc6ba14
--- /dev/null
@@ -0,0 +1,718 @@
+.file  "ghash-x86.s"
+.text
+.globl _gcm_gmult_4bit_x86
+.align 4
+_gcm_gmult_4bit_x86:
+L_gcm_gmult_4bit_x86_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       subl    $84,%esp
+       movl    104(%esp),%edi
+       movl    108(%esp),%esi
+       movl    (%edi),%ebp
+       movl    4(%edi),%edx
+       movl    8(%edi),%ecx
+       movl    12(%edi),%ebx
+       movl    $0,16(%esp)
+       movl    $471859200,20(%esp)
+       movl    $943718400,24(%esp)
+       movl    $610271232,28(%esp)
+       movl    $1887436800,32(%esp)
+       movl    $1822425088,36(%esp)
+       movl    $1220542464,40(%esp)
+       movl    $1423966208,44(%esp)
+       movl    $3774873600,48(%esp)
+       movl    $4246732800,52(%esp)
+       movl    $3644850176,56(%esp)
+       movl    $3311403008,60(%esp)
+       movl    $2441084928,64(%esp)
+       movl    $2376073216,68(%esp)
+       movl    $2847932416,72(%esp)
+       movl    $3051356160,76(%esp)
+       movl    %ebp,(%esp)
+       movl    %edx,4(%esp)
+       movl    %ecx,8(%esp)
+       movl    %ebx,12(%esp)
+       shrl    $20,%ebx
+       andl    $240,%ebx
+       movl    4(%esi,%ebx,1),%ebp
+       movl    (%esi,%ebx,1),%edx
+       movl    12(%esi,%ebx,1),%ecx
+       movl    8(%esi,%ebx,1),%ebx
+       xorl    %eax,%eax
+       movl    $15,%edi
+       jmp     L000x86_loop
+.align 4,0x90
+L000x86_loop:
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       andb    $240,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       decl    %edi
+       js      L001x86_break
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       shlb    $4,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       jmp     L000x86_loop
+.align 4,0x90
+L001x86_break:
+       bswap   %ebx
+       bswap   %ecx
+       bswap   %edx
+       bswap   %ebp
+       movl    104(%esp),%edi
+       movl    %ebx,12(%edi)
+       movl    %ecx,8(%edi)
+       movl    %edx,4(%edi)
+       movl    %ebp,(%edi)
+       addl    $84,%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _gcm_ghash_4bit_x86
+.align 4
+_gcm_ghash_4bit_x86:
+L_gcm_ghash_4bit_x86_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       subl    $84,%esp
+       movl    104(%esp),%ebx
+       movl    108(%esp),%esi
+       movl    112(%esp),%edi
+       movl    116(%esp),%ecx
+       addl    %edi,%ecx
+       movl    %ecx,116(%esp)
+       movl    (%ebx),%ebp
+       movl    4(%ebx),%edx
+       movl    8(%ebx),%ecx
+       movl    12(%ebx),%ebx
+       movl    $0,16(%esp)
+       movl    $471859200,20(%esp)
+       movl    $943718400,24(%esp)
+       movl    $610271232,28(%esp)
+       movl    $1887436800,32(%esp)
+       movl    $1822425088,36(%esp)
+       movl    $1220542464,40(%esp)
+       movl    $1423966208,44(%esp)
+       movl    $3774873600,48(%esp)
+       movl    $4246732800,52(%esp)
+       movl    $3644850176,56(%esp)
+       movl    $3311403008,60(%esp)
+       movl    $2441084928,64(%esp)
+       movl    $2376073216,68(%esp)
+       movl    $2847932416,72(%esp)
+       movl    $3051356160,76(%esp)
+.align 4,0x90
+L002x86_outer_loop:
+       xorl    12(%edi),%ebx
+       xorl    8(%edi),%ecx
+       xorl    4(%edi),%edx
+       xorl    (%edi),%ebp
+       movl    %ebx,12(%esp)
+       movl    %ecx,8(%esp)
+       movl    %edx,4(%esp)
+       movl    %ebp,(%esp)
+       shrl    $20,%ebx
+       andl    $240,%ebx
+       movl    4(%esi,%ebx,1),%ebp
+       movl    (%esi,%ebx,1),%edx
+       movl    12(%esi,%ebx,1),%ecx
+       movl    8(%esi,%ebx,1),%ebx
+       xorl    %eax,%eax
+       movl    $15,%edi
+       jmp     L003x86_loop
+.align 4,0x90
+L003x86_loop:
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       andb    $240,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       decl    %edi
+       js      L004x86_break
+       movb    %bl,%al
+       shrdl   $4,%ecx,%ebx
+       andb    $15,%al
+       shrdl   $4,%edx,%ecx
+       shrdl   $4,%ebp,%edx
+       shrl    $4,%ebp
+       xorl    16(%esp,%eax,4),%ebp
+       movb    (%esp,%edi,1),%al
+       shlb    $4,%al
+       xorl    8(%esi,%eax,1),%ebx
+       xorl    12(%esi,%eax,1),%ecx
+       xorl    (%esi,%eax,1),%edx
+       xorl    4(%esi,%eax,1),%ebp
+       jmp     L003x86_loop
+.align 4,0x90
+L004x86_break:
+       bswap   %ebx
+       bswap   %ecx
+       bswap   %edx
+       bswap   %ebp
+       movl    112(%esp),%edi
+       leal    16(%edi),%edi
+       cmpl    116(%esp),%edi
+       movl    %edi,112(%esp)
+       jb      L002x86_outer_loop
+       movl    104(%esp),%edi
+       movl    %ebx,12(%edi)
+       movl    %ecx,8(%edi)
+       movl    %edx,4(%edi)
+       movl    %ebp,(%edi)
+       addl    $84,%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.align 4
+__mmx_gmult_4bit_inner:
+       xorl    %ecx,%ecx
+       movl    %ebx,%edx
+       movb    %dl,%cl
+       shlb    $4,%cl
+       andl    $240,%edx
+       movq    8(%esi,%ecx,1),%mm0
+       movq    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    14(%edi),%cl
+       psllq   $60,%mm2
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    13(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    12(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    11(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    10(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    9(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    8(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    7(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    6(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    5(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    4(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    3(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    2(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    1(%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       movb    (%edi),%cl
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movl    %ecx,%edx
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       shlb    $4,%cl
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%ecx,1),%mm0
+       psllq   $60,%mm2
+       andl    $240,%edx
+       pxor    (%eax,%ebp,8),%mm1
+       andl    $15,%ebx
+       pxor    (%esi,%ecx,1),%mm1
+       movd    %mm0,%ebp
+       pxor    %mm2,%mm0
+       psrlq   $4,%mm0
+       movq    %mm1,%mm2
+       psrlq   $4,%mm1
+       pxor    8(%esi,%edx,1),%mm0
+       psllq   $60,%mm2
+       pxor    (%eax,%ebx,8),%mm1
+       andl    $15,%ebp
+       pxor    (%esi,%edx,1),%mm1
+       movd    %mm0,%ebx
+       pxor    %mm2,%mm0
+       movl    4(%eax,%ebp,8),%edi
+       psrlq   $32,%mm0
+       movd    %mm1,%edx
+       psrlq   $32,%mm1
+       movd    %mm0,%ecx
+       movd    %mm1,%ebp
+       shll    $4,%edi
+       bswap   %ebx
+       bswap   %edx
+       bswap   %ecx
+       xorl    %edi,%ebp
+       bswap   %ebp
+       ret
+.globl _gcm_gmult_4bit_mmx
+.align 4
+_gcm_gmult_4bit_mmx:
+L_gcm_gmult_4bit_mmx_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%edi
+       movl    24(%esp),%esi
+       call    L005pic_point
+L005pic_point:
+       popl    %eax
+       leal    Lrem_4bit-L005pic_point(%eax),%eax
+       movzbl  15(%edi),%ebx
+       call    __mmx_gmult_4bit_inner
+       movl    20(%esp),%edi
+       emms
+       movl    %ebx,12(%edi)
+       movl    %edx,4(%edi)
+       movl    %ecx,8(%edi)
+       movl    %ebp,(%edi)
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.globl _gcm_ghash_4bit_mmx
+.align 4
+_gcm_ghash_4bit_mmx:
+L_gcm_ghash_4bit_mmx_begin:
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+       movl    20(%esp),%ebp
+       movl    24(%esp),%esi
+       movl    28(%esp),%edi
+       movl    32(%esp),%ecx
+       call    L006pic_point
+L006pic_point:
+       popl    %eax
+       leal    Lrem_4bit-L006pic_point(%eax),%eax
+       addl    %edi,%ecx
+       movl    %ecx,32(%esp)
+       subl    $20,%esp
+       movl    12(%ebp),%ebx
+       movl    4(%ebp),%edx
+       movl    8(%ebp),%ecx
+       movl    (%ebp),%ebp
+       jmp     L007mmx_outer_loop
+.align 4,0x90
+L007mmx_outer_loop:
+       xorl    12(%edi),%ebx
+       xorl    4(%edi),%edx
+       xorl    8(%edi),%ecx
+       xorl    (%edi),%ebp
+       movl    %edi,48(%esp)
+       movl    %ebx,12(%esp)
+       movl    %edx,4(%esp)
+       movl    %ecx,8(%esp)
+       movl    %ebp,(%esp)
+       movl    %esp,%edi
+       shrl    $24,%ebx
+       call    __mmx_gmult_4bit_inner
+       movl    48(%esp),%edi
+       leal    16(%edi),%edi
+       cmpl    52(%esp),%edi
+       jb      L007mmx_outer_loop
+       movl    40(%esp),%edi
+       emms
+       movl    %ebx,12(%edi)
+       movl    %edx,4(%edi)
+       movl    %ecx,8(%edi)
+       movl    %ebp,(%edi)
+       addl    $20,%esp
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       popl    %ebp
+       ret
+.align 6,0x90
+Lrem_4bit:
+.long  0,0,0,29491200,0,58982400,0,38141952
+.long  0,117964800,0,113901568,0,76283904,0,88997888
+.long  0,235929600,0,265420800,0,227803136,0,206962688
+.long  0,152567808,0,148504576,0,177995776,0,190709760
+.align 6,0x90
+L008rem_8bit:
+.value 0,450,900,582,1800,1738,1164,1358
+.value 3600,4050,3476,3158,2328,2266,2716,2910
+.value 7200,7650,8100,7782,6952,6890,6316,6510
+.value 4656,5106,4532,4214,5432,5370,5820,6014
+.value 14400,14722,15300,14854,16200,16010,15564,15630
+.value 13904,14226,13780,13334,12632,12442,13020,13086
+.value 9312,9634,10212,9766,9064,8874,8428,8494
+.value 10864,11186,10740,10294,11640,11450,12028,12094
+.value 28800,28994,29444,29382,30600,30282,29708,30158
+.value 32400,32594,32020,31958,31128,30810,31260,31710
+.value 27808,28002,28452,28390,27560,27242,26668,27118
+.value 25264,25458,24884,24822,26040,25722,26172,26622
+.value 18624,18690,19268,19078,20424,19978,19532,19854
+.value 18128,18194,17748,17558,16856,16410,16988,17310
+.value 21728,21794,22372,22182,21480,21034,20588,20910
+.value 23280,23346,22900,22710,24056,23610,24188,24510
+.value 57600,57538,57988,58182,58888,59338,58764,58446
+.value 61200,61138,60564,60758,59416,59866,60316,59998
+.value 64800,64738,65188,65382,64040,64490,63916,63598
+.value 62256,62194,61620,61814,62520,62970,63420,63102
+.value 55616,55426,56004,56070,56904,57226,56780,56334
+.value 55120,54930,54484,54550,53336,53658,54236,53790
+.value 50528,50338,50916,50982,49768,50090,49644,49198
+.value 52080,51890,51444,51510,52344,52666,53244,52798
+.value 37248,36930,37380,37830,38536,38730,38156,38094
+.value 40848,40530,39956,40406,39064,39258,39708,39646
+.value 36256,35938,36388,36838,35496,35690,35116,35054
+.value 33712,33394,32820,33270,33976,34170,34620,34558
+.value 43456,43010,43588,43910,44744,44810,44364,44174
+.value 42960,42514,42068,42390,41176,41242,41820,41630
+.value 46560,46114,46692,47014,45800,45866,45420,45230
+.value 48112,47666,47220,47542,48376,48442,49020,48830
+.byte  71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte  82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte  112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte  0
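
The unrolled MMX sequence above is the classic 4-bit, table-driven GHASH multiply: each step consumes one nibble of Xi, shifts the 128-bit accumulator right by four bits, and folds the dropped nibble back in through the reduction constants (the Lrem_4bit data above holds the same constants in a pre-shifted layout convenient for the MMX scheduling). For orientation, here is a minimal C sketch of that algorithm, mirroring the portable path in OpenSSL's crypto/modes/gcm128.c rather than the generated code itself:

/*
 * Illustrative sketch only: the reference 4-bit GHASH multiply
 * (Xi <- Xi * H over GF(2^128)), as in the portable code of
 * OpenSSL's crypto/modes/gcm128.c.  Htable[i] is assumed to hold
 * i*H in GCM's bit-reflected representation (as gcm_init_4bit()
 * builds it); the assembly above computes the same result with MMX
 * registers and its own pre-shifted copy of rem_4bit.
 */
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

static const uint64_t rem_4bit[16] = {
    (uint64_t)0x0000 << 48, (uint64_t)0x1C20 << 48,
    (uint64_t)0x3840 << 48, (uint64_t)0x2460 << 48,
    (uint64_t)0x7080 << 48, (uint64_t)0x6CA0 << 48,
    (uint64_t)0x48C0 << 48, (uint64_t)0x54E0 << 48,
    (uint64_t)0xE100 << 48, (uint64_t)0xFD20 << 48,
    (uint64_t)0xD940 << 48, (uint64_t)0xC560 << 48,
    (uint64_t)0x9180 << 48, (uint64_t)0x8DA0 << 48,
    (uint64_t)0xA9C0 << 48, (uint64_t)0xB5E0 << 48
};

static void gmult_4bit_ref(uint8_t Xi[16], const u128 Htable[16])
{
    unsigned nlo = Xi[15];          /* start with the last byte of Xi   */
    unsigned nhi = nlo >> 4;
    u128 Z = Htable[nlo & 0xf];     /* low nibble seeds the accumulator */
    int cnt = 15;
    uint64_t rem;

    for (;;) {
        /* shift Z right by 4 bits and reduce the dropped nibble */
        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nhi].hi;     /* add (high nibble)*H */
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = Xi[cnt];
        nhi = nlo >> 4;

        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nlo & 0xf].hi;   /* add (low nibble)*H */
        Z.lo ^= Htable[nlo & 0xf].lo;
    }

    /* store big-endian, matching the bswap/store sequence above */
    for (cnt = 0; cnt < 8; cnt++) {
        Xi[cnt]     = (uint8_t)(Z.hi >> (56 - 8 * cnt));
        Xi[8 + cnt] = (uint8_t)(Z.lo >> (56 - 8 * cnt));
    }
}
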
diff --git a/deps/openssl/asm/x86-win32-masm/aes/vpaes-x86.asm b/deps/openssl/asm/x86-win32-masm/aes/vpaes-x86.asm
new file mode 100644
index 0000000..621f58f
--- /dev/null
@@ -0,0 +1,662 @@
+TITLE  vpaes-x86.asm
+IF @Version LT 800
+ECHO MASM version 8.00 or later is strongly recommended.
+ENDIF
+.686
+.XMM
+IF @Version LT 800
+XMMWORD STRUCT 16
+DQ     2 dup (?)
+XMMWORD        ENDS
+ENDIF
+
+.MODEL FLAT
+OPTION DOTNAME
+IF @Version LT 800
+.text$ SEGMENT PAGE 'CODE'
+ELSE
+.text$ SEGMENT ALIGN(64) 'CODE'
+ENDIF
+ALIGN  64
+$L_vpaes_consts::
+DD     218628480,235210255,168496130,67568393
+DD     252381056,17041926,33884169,51187212
+DD     252645135,252645135,252645135,252645135
+DD     1512730624,3266504856,1377990664,3401244816
+DD     830229760,1275146365,2969422977,3447763452
+DD     3411033600,2979783055,338359620,2782886510
+DD     4209124096,907596821,221174255,1006095553
+DD     191964160,3799684038,3164090317,1589111125
+DD     182528256,1777043520,2877432650,3265356744
+DD     1874708224,3503451415,3305285752,363511674
+DD     1606117888,3487855781,1093350906,2384367825
+DD     197121,67569157,134941193,202313229
+DD     67569157,134941193,202313229,197121
+DD     134941193,202313229,197121,67569157
+DD     202313229,197121,67569157,134941193
+DD     33619971,100992007,168364043,235736079
+DD     235736079,33619971,100992007,168364043
+DD     168364043,235736079,33619971,100992007
+DD     100992007,168364043,235736079,33619971
+DD     50462976,117835012,185207048,252579084
+DD     252314880,51251460,117574920,184942860
+DD     184682752,252054788,50987272,118359308
+DD     118099200,185467140,251790600,50727180
+DD     2946363062,528716217,1300004225,1881839624
+DD     1532713819,1532713819,1532713819,1532713819
+DD     3602276352,4288629033,3737020424,4153884961
+DD     1354558464,32357713,2958822624,3775749553
+DD     1201988352,132424512,1572796698,503232858
+DD     2213177600,1597421020,4103937655,675398315
+DD     2749646592,4273543773,1511898873,121693092
+DD     3040248576,1103263732,2871565598,1608280554
+DD     2236667136,2588920351,482954393,64377734
+DD     3069987328,291237287,2117370568,3650299247
+DD     533321216,3573750986,2572112006,1401264716
+DD     1339849704,2721158661,548607111,3445553514
+DD     2128193280,3054596040,2183486460,1257083700
+DD     655635200,1165381986,3923443150,2344132524
+DD     190078720,256924420,290342170,357187870
+DD     1610966272,2263057382,4103205268,309794674
+DD     2592527872,2233205587,1335446729,3402964816
+DD     3973531904,3225098121,3002836325,1918774430
+DD     3870401024,2102906079,2284471353,4117666579
+DD     617007872,1021508343,366931923,691083277
+DD     2528395776,3491914898,2968704004,1613121270
+DD     3445188352,3247741094,844474987,4093578302
+DD     651481088,1190302358,1689581232,574775300
+DD     4289380608,206939853,2555985458,2489840491
+DD     2130264064,327674451,3566485037,3349835193
+DD     2470714624,316102159,3636825756,3393945945
+DB     86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+DB     111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+DB     83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+DB     114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+DB     118,101,114,115,105,116,121,41,0
+ALIGN  64
+ALIGN  16
+__vpaes_preheat        PROC PRIVATE
+       add     ebp,DWORD PTR [esp]
+       movdqa  xmm7,XMMWORD PTR [ebp-48]
+       movdqa  xmm6,XMMWORD PTR [ebp-16]
+       ret
+__vpaes_preheat ENDP
+ALIGN  16
+__vpaes_encrypt_core   PROC PRIVATE
+       mov     ecx,16
+       mov     eax,DWORD PTR 240[edx]
+       movdqa  xmm1,xmm6
+       movdqa  xmm2,XMMWORD PTR [ebp]
+       pandn   xmm1,xmm0
+       movdqu  xmm5,XMMWORD PTR [edx]
+       psrld   xmm1,4
+       pand    xmm0,xmm6
+DB     102,15,56,0,208
+       movdqa  xmm0,XMMWORD PTR 16[ebp]
+DB     102,15,56,0,193
+       pxor    xmm2,xmm5
+       pxor    xmm0,xmm2
+       add     edx,16
+       lea     ebx,DWORD PTR 192[ebp]
+       jmp     $L000enc_entry
+ALIGN  16
+$L001enc_loop:
+       movdqa  xmm4,XMMWORD PTR 32[ebp]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm5
+       movdqa  xmm0,XMMWORD PTR 48[ebp]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+       movdqa  xmm5,XMMWORD PTR 64[ebp]
+DB     102,15,56,0,234
+       movdqa  xmm1,XMMWORD PTR [ecx*1+ebx-64]
+       movdqa  xmm2,XMMWORD PTR 80[ebp]
+DB     102,15,56,0,211
+       pxor    xmm2,xmm5
+       movdqa  xmm4,XMMWORD PTR [ecx*1+ebx]
+       movdqa  xmm3,xmm0
+DB     102,15,56,0,193
+       add     edx,16
+       pxor    xmm0,xmm2
+DB     102,15,56,0,220
+       add     ecx,16
+       pxor    xmm3,xmm0
+DB     102,15,56,0,193
+       and     ecx,48
+       pxor    xmm0,xmm3
+       sub     eax,1
+$L000enc_entry:
+       movdqa  xmm1,xmm6
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm6
+       movdqa  xmm5,XMMWORD PTR [ebp-32]
+DB     102,15,56,0,232
+       pxor    xmm0,xmm1
+       movdqa  xmm3,xmm7
+DB     102,15,56,0,217
+       pxor    xmm3,xmm5
+       movdqa  xmm4,xmm7
+DB     102,15,56,0,224
+       pxor    xmm4,xmm5
+       movdqa  xmm2,xmm7
+DB     102,15,56,0,211
+       pxor    xmm2,xmm0
+       movdqa  xmm3,xmm7
+       movdqu  xmm5,XMMWORD PTR [edx]
+DB     102,15,56,0,220
+       pxor    xmm3,xmm1
+       jnz     $L001enc_loop
+       movdqa  xmm4,XMMWORD PTR 96[ebp]
+       movdqa  xmm0,XMMWORD PTR 112[ebp]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm5
+DB     102,15,56,0,195
+       movdqa  xmm1,XMMWORD PTR 64[ecx*1+ebx]
+       pxor    xmm0,xmm4
+DB     102,15,56,0,193
+       ret
+__vpaes_encrypt_core ENDP
+ALIGN  16
+__vpaes_decrypt_core   PROC PRIVATE
+       mov     eax,DWORD PTR 240[edx]
+       lea     ebx,DWORD PTR 608[ebp]
+       movdqa  xmm1,xmm6
+       movdqa  xmm2,XMMWORD PTR [ebx-64]
+       pandn   xmm1,xmm0
+       mov     ecx,eax
+       psrld   xmm1,4
+       movdqu  xmm5,XMMWORD PTR [edx]
+       shl     ecx,4
+       pand    xmm0,xmm6
+DB     102,15,56,0,208
+       movdqa  xmm0,XMMWORD PTR [ebx-48]
+       xor     ecx,48
+DB     102,15,56,0,193
+       and     ecx,48
+       pxor    xmm2,xmm5
+       movdqa  xmm5,XMMWORD PTR 176[ebp]
+       pxor    xmm0,xmm2
+       add     edx,16
+       lea     ecx,DWORD PTR [ecx*1+ebx-352]
+       jmp     $L002dec_entry
+ALIGN  16
+$L003dec_loop:
+       movdqa  xmm4,XMMWORD PTR [ebx-32]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR [ebx-16]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+       add     edx,16
+DB     102,15,56,0,197
+       movdqa  xmm4,XMMWORD PTR [ebx]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR 16[ebx]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+       sub     eax,1
+DB     102,15,56,0,197
+       movdqa  xmm4,XMMWORD PTR 32[ebx]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR 48[ebx]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+DB     102,15,56,0,197
+       movdqa  xmm4,XMMWORD PTR 64[ebx]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR 80[ebx]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+DB     102,15,58,15,237,12
+$L002dec_entry:
+       movdqa  xmm1,xmm6
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm6
+       movdqa  xmm2,XMMWORD PTR [ebp-32]
+DB     102,15,56,0,208
+       pxor    xmm0,xmm1
+       movdqa  xmm3,xmm7
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+       movdqa  xmm4,xmm7
+DB     102,15,56,0,224
+       pxor    xmm4,xmm2
+       movdqa  xmm2,xmm7
+DB     102,15,56,0,211
+       pxor    xmm2,xmm0
+       movdqa  xmm3,xmm7
+DB     102,15,56,0,220
+       pxor    xmm3,xmm1
+       movdqu  xmm0,XMMWORD PTR [edx]
+       jnz     $L003dec_loop
+       movdqa  xmm4,XMMWORD PTR 96[ebx]
+DB     102,15,56,0,226
+       pxor    xmm4,xmm0
+       movdqa  xmm0,XMMWORD PTR 112[ebx]
+       movdqa  xmm2,XMMWORD PTR [ecx]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+DB     102,15,56,0,194
+       ret
+__vpaes_decrypt_core ENDP
+ALIGN  16
+__vpaes_schedule_core  PROC PRIVATE
+       add     ebp,DWORD PTR [esp]
+       movdqu  xmm0,XMMWORD PTR [esi]
+       movdqa  xmm2,XMMWORD PTR 320[ebp]
+       movdqa  xmm3,xmm0
+       lea     ebx,DWORD PTR [ebp]
+       movdqa  XMMWORD PTR 4[esp],xmm2
+       call    __vpaes_schedule_transform
+       movdqa  xmm7,xmm0
+       test    edi,edi
+       jnz     $L004schedule_am_decrypting
+       movdqu  XMMWORD PTR [edx],xmm0
+       jmp     $L005schedule_go
+$L004schedule_am_decrypting:
+       movdqa  xmm1,XMMWORD PTR 256[ecx*1+ebp]
+DB     102,15,56,0,217
+       movdqu  XMMWORD PTR [edx],xmm3
+       xor     ecx,48
+$L005schedule_go:
+       cmp     eax,192
+       ja      $L006schedule_256
+       je      $L007schedule_192
+$L008schedule_128:
+       mov     eax,10
+$L009loop_schedule_128:
+       call    __vpaes_schedule_round
+       dec     eax
+       jz      $L010schedule_mangle_last
+       call    __vpaes_schedule_mangle
+       jmp     $L009loop_schedule_128
+ALIGN  16
+$L007schedule_192:
+       movdqu  xmm0,XMMWORD PTR 8[esi]
+       call    __vpaes_schedule_transform
+       movdqa  xmm6,xmm0
+       pxor    xmm4,xmm4
+       movhlps xmm6,xmm4
+       mov     eax,4
+$L011loop_schedule_192:
+       call    __vpaes_schedule_round
+DB     102,15,58,15,198,8
+       call    __vpaes_schedule_mangle
+       call    __vpaes_schedule_192_smear
+       call    __vpaes_schedule_mangle
+       call    __vpaes_schedule_round
+       dec     eax
+       jz      $L010schedule_mangle_last
+       call    __vpaes_schedule_mangle
+       call    __vpaes_schedule_192_smear
+       jmp     $L011loop_schedule_192
+ALIGN  16
+$L006schedule_256:
+       movdqu  xmm0,XMMWORD PTR 16[esi]
+       call    __vpaes_schedule_transform
+       mov     eax,7
+$L012loop_schedule_256:
+       call    __vpaes_schedule_mangle
+       movdqa  xmm6,xmm0
+       call    __vpaes_schedule_round
+       dec     eax
+       jz      $L010schedule_mangle_last
+       call    __vpaes_schedule_mangle
+       pshufd  xmm0,xmm0,255
+       movdqa  XMMWORD PTR 20[esp],xmm7
+       movdqa  xmm7,xmm6
+       call    $L_vpaes_schedule_low_round
+       movdqa  xmm7,XMMWORD PTR 20[esp]
+       jmp     $L012loop_schedule_256
+ALIGN  16
+$L010schedule_mangle_last:
+       lea     ebx,DWORD PTR 384[ebp]
+       test    edi,edi
+       jnz     $L013schedule_mangle_last_dec
+       movdqa  xmm1,XMMWORD PTR 256[ecx*1+ebp]
+DB     102,15,56,0,193
+       lea     ebx,DWORD PTR 352[ebp]
+       add     edx,32
+$L013schedule_mangle_last_dec:
+       add     edx,-16
+       pxor    xmm0,XMMWORD PTR 336[ebp]
+       call    __vpaes_schedule_transform
+       movdqu  XMMWORD PTR [edx],xmm0
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
+       ret
+__vpaes_schedule_core ENDP
+ALIGN  16
+__vpaes_schedule_192_smear     PROC PRIVATE
+       pshufd  xmm0,xmm6,128
+       pxor    xmm6,xmm0
+       pshufd  xmm0,xmm7,254
+       pxor    xmm6,xmm0
+       movdqa  xmm0,xmm6
+       pxor    xmm1,xmm1
+       movhlps xmm6,xmm1
+       ret
+__vpaes_schedule_192_smear ENDP
+ALIGN  16
+__vpaes_schedule_round PROC PRIVATE
+       movdqa  xmm2,XMMWORD PTR 8[esp]
+       pxor    xmm1,xmm1
+DB     102,15,58,15,202,15
+DB     102,15,58,15,210,15
+       pxor    xmm7,xmm1
+       pshufd  xmm0,xmm0,255
+DB     102,15,58,15,192,1
+       movdqa  XMMWORD PTR 8[esp],xmm2
+$L_vpaes_schedule_low_round::
+       movdqa  xmm1,xmm7
+       pslldq  xmm7,4
+       pxor    xmm7,xmm1
+       movdqa  xmm1,xmm7
+       pslldq  xmm7,8
+       pxor    xmm7,xmm1
+       pxor    xmm7,XMMWORD PTR 336[ebp]
+       movdqa  xmm4,XMMWORD PTR [ebp-16]
+       movdqa  xmm5,XMMWORD PTR [ebp-48]
+       movdqa  xmm1,xmm4
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm4
+       movdqa  xmm2,XMMWORD PTR [ebp-32]
+DB     102,15,56,0,208
+       pxor    xmm0,xmm1
+       movdqa  xmm3,xmm5
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+       movdqa  xmm4,xmm5
+DB     102,15,56,0,224
+       pxor    xmm4,xmm2
+       movdqa  xmm2,xmm5
+DB     102,15,56,0,211
+       pxor    xmm2,xmm0
+       movdqa  xmm3,xmm5
+DB     102,15,56,0,220
+       pxor    xmm3,xmm1
+       movdqa  xmm4,XMMWORD PTR 32[ebp]
+DB     102,15,56,0,226
+       movdqa  xmm0,XMMWORD PTR 48[ebp]
+DB     102,15,56,0,195
+       pxor    xmm0,xmm4
+       pxor    xmm0,xmm7
+       movdqa  xmm7,xmm0
+       ret
+__vpaes_schedule_round ENDP
+ALIGN  16
+__vpaes_schedule_transform     PROC PRIVATE
+       movdqa  xmm2,XMMWORD PTR [ebp-16]
+       movdqa  xmm1,xmm2
+       pandn   xmm1,xmm0
+       psrld   xmm1,4
+       pand    xmm0,xmm2
+       movdqa  xmm2,XMMWORD PTR [ebx]
+DB     102,15,56,0,208
+       movdqa  xmm0,XMMWORD PTR 16[ebx]
+DB     102,15,56,0,193
+       pxor    xmm0,xmm2
+       ret
+__vpaes_schedule_transform ENDP
+ALIGN  16
+__vpaes_schedule_mangle        PROC PRIVATE
+       movdqa  xmm4,xmm0
+       movdqa  xmm5,XMMWORD PTR 128[ebp]
+       test    edi,edi
+       jnz     $L014schedule_mangle_dec
+       add     edx,16
+       pxor    xmm4,XMMWORD PTR 336[ebp]
+DB     102,15,56,0,229
+       movdqa  xmm3,xmm4
+DB     102,15,56,0,229
+       pxor    xmm3,xmm4
+DB     102,15,56,0,229
+       pxor    xmm3,xmm4
+       jmp     $L015schedule_mangle_both
+ALIGN  16
+$L014schedule_mangle_dec:
+       movdqa  xmm2,XMMWORD PTR [ebp-16]
+       lea     esi,DWORD PTR 416[ebp]
+       movdqa  xmm1,xmm2
+       pandn   xmm1,xmm4
+       psrld   xmm1,4
+       pand    xmm4,xmm2
+       movdqa  xmm2,XMMWORD PTR [esi]
+DB     102,15,56,0,212
+       movdqa  xmm3,XMMWORD PTR 16[esi]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+DB     102,15,56,0,221
+       movdqa  xmm2,XMMWORD PTR 32[esi]
+DB     102,15,56,0,212
+       pxor    xmm2,xmm3
+       movdqa  xmm3,XMMWORD PTR 48[esi]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+DB     102,15,56,0,221
+       movdqa  xmm2,XMMWORD PTR 64[esi]
+DB     102,15,56,0,212
+       pxor    xmm2,xmm3
+       movdqa  xmm3,XMMWORD PTR 80[esi]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+DB     102,15,56,0,221
+       movdqa  xmm2,XMMWORD PTR 96[esi]
+DB     102,15,56,0,212
+       pxor    xmm2,xmm3
+       movdqa  xmm3,XMMWORD PTR 112[esi]
+DB     102,15,56,0,217
+       pxor    xmm3,xmm2
+       add     edx,-16
+$L015schedule_mangle_both:
+       movdqa  xmm1,XMMWORD PTR 256[ecx*1+ebp]
+DB     102,15,56,0,217
+       add     ecx,-16
+       and     ecx,48
+       movdqu  XMMWORD PTR [edx],xmm3
+       ret
+__vpaes_schedule_mangle ENDP
+ALIGN  16
+_vpaes_set_encrypt_key PROC PUBLIC
+$L_vpaes_set_encrypt_key_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       mov     esi,DWORD PTR 20[esp]
+       lea     ebx,DWORD PTR [esp-56]
+       mov     eax,DWORD PTR 24[esp]
+       and     ebx,-16
+       mov     edx,DWORD PTR 28[esp]
+       xchg    ebx,esp
+       mov     DWORD PTR 48[esp],ebx
+       mov     ebx,eax
+       shr     ebx,5
+       add     ebx,5
+       mov     DWORD PTR 240[edx],ebx
+       mov     ecx,48
+       mov     edi,0
+       mov     ebp,OFFSET ($L_vpaes_consts+030h-$L016pic_point)
+       call    __vpaes_schedule_core
+$L016pic_point:
+       mov     esp,DWORD PTR 48[esp]
+       xor     eax,eax
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_vpaes_set_encrypt_key ENDP
+ALIGN  16
+_vpaes_set_decrypt_key PROC PUBLIC
+$L_vpaes_set_decrypt_key_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       mov     esi,DWORD PTR 20[esp]
+       lea     ebx,DWORD PTR [esp-56]
+       mov     eax,DWORD PTR 24[esp]
+       and     ebx,-16
+       mov     edx,DWORD PTR 28[esp]
+       xchg    ebx,esp
+       mov     DWORD PTR 48[esp],ebx
+       mov     ebx,eax
+       shr     ebx,5
+       add     ebx,5
+       mov     DWORD PTR 240[edx],ebx
+       shl     ebx,4
+       lea     edx,DWORD PTR 16[ebx*1+edx]
+       mov     edi,1
+       mov     ecx,eax
+       shr     ecx,1
+       and     ecx,32
+       xor     ecx,32
+       mov     ebp,OFFSET ($L_vpaes_consts+030h-$L017pic_point)
+       call    __vpaes_schedule_core
+$L017pic_point:
+       mov     esp,DWORD PTR 48[esp]
+       xor     eax,eax
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_vpaes_set_decrypt_key ENDP
+ALIGN  16
+_vpaes_encrypt PROC PUBLIC
+$L_vpaes_encrypt_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       mov     ebp,OFFSET ($L_vpaes_consts+030h-$L018pic_point)
+       call    __vpaes_preheat
+$L018pic_point:
+       mov     esi,DWORD PTR 20[esp]
+       lea     ebx,DWORD PTR [esp-56]
+       mov     edi,DWORD PTR 24[esp]
+       and     ebx,-16
+       mov     edx,DWORD PTR 28[esp]
+       xchg    ebx,esp
+       mov     DWORD PTR 48[esp],ebx
+       movdqu  xmm0,XMMWORD PTR [esi]
+       call    __vpaes_encrypt_core
+       movdqu  XMMWORD PTR [edi],xmm0
+       mov     esp,DWORD PTR 48[esp]
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_vpaes_encrypt ENDP
+ALIGN  16
+_vpaes_decrypt PROC PUBLIC
+$L_vpaes_decrypt_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       mov     ebp,OFFSET ($L_vpaes_consts+030h-$L019pic_point)
+       call    __vpaes_preheat
+$L019pic_point:
+       mov     esi,DWORD PTR 20[esp]
+       lea     ebx,DWORD PTR [esp-56]
+       mov     edi,DWORD PTR 24[esp]
+       and     ebx,-16
+       mov     edx,DWORD PTR 28[esp]
+       xchg    ebx,esp
+       mov     DWORD PTR 48[esp],ebx
+       movdqu  xmm0,XMMWORD PTR [esi]
+       call    __vpaes_decrypt_core
+       movdqu  XMMWORD PTR [edi],xmm0
+       mov     esp,DWORD PTR 48[esp]
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_vpaes_decrypt ENDP
+ALIGN  16
+_vpaes_cbc_encrypt     PROC PUBLIC
+$L_vpaes_cbc_encrypt_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       mov     esi,DWORD PTR 20[esp]
+       mov     edi,DWORD PTR 24[esp]
+       mov     eax,DWORD PTR 28[esp]
+       mov     edx,DWORD PTR 32[esp]
+       sub     eax,16
+       jc      $L020cbc_abort
+       lea     ebx,DWORD PTR [esp-56]
+       mov     ebp,DWORD PTR 36[esp]
+       and     ebx,-16
+       mov     ecx,DWORD PTR 40[esp]
+       xchg    ebx,esp
+       movdqu  xmm1,XMMWORD PTR [ebp]
+       sub     edi,esi
+       mov     DWORD PTR 48[esp],ebx
+       mov     DWORD PTR [esp],edi
+       mov     DWORD PTR 4[esp],edx
+       mov     DWORD PTR 8[esp],ebp
+       mov     edi,eax
+       mov     ebp,OFFSET ($L_vpaes_consts+030h-$L021pic_point)
+       call    __vpaes_preheat
+$L021pic_point:
+       cmp     ecx,0
+       je      $L022cbc_dec_loop
+       jmp     $L023cbc_enc_loop
+ALIGN  16
+$L023cbc_enc_loop:
+       movdqu  xmm0,XMMWORD PTR [esi]
+       pxor    xmm0,xmm1
+       call    __vpaes_encrypt_core
+       mov     ebx,DWORD PTR [esp]
+       mov     edx,DWORD PTR 4[esp]
+       movdqa  xmm1,xmm0
+       movdqu  XMMWORD PTR [esi*1+ebx],xmm0
+       lea     esi,DWORD PTR 16[esi]
+       sub     edi,16
+       jnc     $L023cbc_enc_loop
+       jmp     $L024cbc_done
+ALIGN  16
+$L022cbc_dec_loop:
+       movdqu  xmm0,XMMWORD PTR [esi]
+       movdqa  XMMWORD PTR 16[esp],xmm1
+       movdqa  XMMWORD PTR 32[esp],xmm0
+       call    __vpaes_decrypt_core
+       mov     ebx,DWORD PTR [esp]
+       mov     edx,DWORD PTR 4[esp]
+       pxor    xmm0,XMMWORD PTR 16[esp]
+       movdqa  xmm1,XMMWORD PTR 32[esp]
+       movdqu  XMMWORD PTR [esi*1+ebx],xmm0
+       lea     esi,DWORD PTR 16[esi]
+       sub     edi,16
+       jnc     $L022cbc_dec_loop
+$L024cbc_done:
+       mov     ebx,DWORD PTR 8[esp]
+       mov     esp,DWORD PTR 48[esp]
+       movdqu  XMMWORD PTR [ebx],xmm1
+$L020cbc_abort:
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_vpaes_cbc_encrypt ENDP
+.text$ ENDS
+END
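
The vpaes-x86.asm module above exposes five public entry points (the _vpaes_* PROC PUBLIC blocks): key schedule setup for encryption and decryption, single-block encrypt/decrypt, and a CBC routine. A hedged sketch of the C prototypes these are generated against, in the shape OpenSSL's e_aes.c uses, plus a toy CBC call; whether this SSSE3 path is actually taken at runtime is decided by capability checks on the C side, which are not part of this file:

/*
 * Illustrative only: C-level view of the vpaes entry points above.
 * The prototypes follow the shape used by OpenSSL's e_aes.c; the
 * cbc_encrypt_demo() helper is a made-up example, not OpenSSL code.
 */
#include <stddef.h>
#include <openssl/aes.h>        /* AES_KEY */

int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
void vpaes_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t length,
                       const AES_KEY *key, unsigned char *ivec, int enc);

/* Encrypt a buffer (length must be a multiple of 16) in CBC mode. */
static void cbc_encrypt_demo(const unsigned char key_bytes[16],
                             unsigned char iv[16],
                             const unsigned char *in, unsigned char *out,
                             size_t length)
{
    AES_KEY ks;
    vpaes_set_encrypt_key(key_bytes, 128, &ks);
    vpaes_cbc_encrypt(in, out, length, &ks, iv, 1 /* encrypt */);
}
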
diff --git a/deps/openssl/asm/x86-win32-masm/modes/ghash-x86.asm b/deps/openssl/asm/x86-win32-masm/modes/ghash-x86.asm
new file mode 100644 (file)
index 0000000..d5041d2
--- /dev/null
@@ -0,0 +1,738 @@
+TITLE  ghash-x86.asm
+IF @Version LT 800
+ECHO MASM version 8.00 or later is strongly recommended.
+ENDIF
+.686
+.XMM
+IF @Version LT 800
+XMMWORD STRUCT 16
+DQ     2 dup (?)
+XMMWORD        ENDS
+ENDIF
+
+.MODEL FLAT
+OPTION DOTNAME
+IF @Version LT 800
+.text$ SEGMENT PAGE 'CODE'
+ELSE
+.text$ SEGMENT ALIGN(64) 'CODE'
+ENDIF
+ALIGN  16
+_gcm_gmult_4bit_x86    PROC PUBLIC
+$L_gcm_gmult_4bit_x86_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       sub     esp,84
+       mov     edi,DWORD PTR 104[esp]
+       mov     esi,DWORD PTR 108[esp]
+       mov     ebp,DWORD PTR [edi]
+       mov     edx,DWORD PTR 4[edi]
+       mov     ecx,DWORD PTR 8[edi]
+       mov     ebx,DWORD PTR 12[edi]
+       mov     DWORD PTR 16[esp],0
+       mov     DWORD PTR 20[esp],471859200
+       mov     DWORD PTR 24[esp],943718400
+       mov     DWORD PTR 28[esp],610271232
+       mov     DWORD PTR 32[esp],1887436800
+       mov     DWORD PTR 36[esp],1822425088
+       mov     DWORD PTR 40[esp],1220542464
+       mov     DWORD PTR 44[esp],1423966208
+       mov     DWORD PTR 48[esp],3774873600
+       mov     DWORD PTR 52[esp],4246732800
+       mov     DWORD PTR 56[esp],3644850176
+       mov     DWORD PTR 60[esp],3311403008
+       mov     DWORD PTR 64[esp],2441084928
+       mov     DWORD PTR 68[esp],2376073216
+       mov     DWORD PTR 72[esp],2847932416
+       mov     DWORD PTR 76[esp],3051356160
+       mov     DWORD PTR [esp],ebp
+       mov     DWORD PTR 4[esp],edx
+       mov     DWORD PTR 8[esp],ecx
+       mov     DWORD PTR 12[esp],ebx
+       shr     ebx,20
+       and     ebx,240
+       mov     ebp,DWORD PTR 4[ebx*1+esi]
+       mov     edx,DWORD PTR [ebx*1+esi]
+       mov     ecx,DWORD PTR 12[ebx*1+esi]
+       mov     ebx,DWORD PTR 8[ebx*1+esi]
+       xor     eax,eax
+       mov     edi,15
+       jmp     $L000x86_loop
+ALIGN  16
+$L000x86_loop:
+       mov     al,bl
+       shrd    ebx,ecx,4
+       and     al,15
+       shrd    ecx,edx,4
+       shrd    edx,ebp,4
+       shr     ebp,4
+       xor     ebp,DWORD PTR 16[eax*4+esp]
+       mov     al,BYTE PTR [edi*1+esp]
+       and     al,240
+       xor     ebx,DWORD PTR 8[eax*1+esi]
+       xor     ecx,DWORD PTR 12[eax*1+esi]
+       xor     edx,DWORD PTR [eax*1+esi]
+       xor     ebp,DWORD PTR 4[eax*1+esi]
+       dec     edi
+       js      $L001x86_break
+       mov     al,bl
+       shrd    ebx,ecx,4
+       and     al,15
+       shrd    ecx,edx,4
+       shrd    edx,ebp,4
+       shr     ebp,4
+       xor     ebp,DWORD PTR 16[eax*4+esp]
+       mov     al,BYTE PTR [edi*1+esp]
+       shl     al,4
+       xor     ebx,DWORD PTR 8[eax*1+esi]
+       xor     ecx,DWORD PTR 12[eax*1+esi]
+       xor     edx,DWORD PTR [eax*1+esi]
+       xor     ebp,DWORD PTR 4[eax*1+esi]
+       jmp     $L000x86_loop
+ALIGN  16
+$L001x86_break:
+       bswap   ebx
+       bswap   ecx
+       bswap   edx
+       bswap   ebp
+       mov     edi,DWORD PTR 104[esp]
+       mov     DWORD PTR 12[edi],ebx
+       mov     DWORD PTR 8[edi],ecx
+       mov     DWORD PTR 4[edi],edx
+       mov     DWORD PTR [edi],ebp
+       add     esp,84
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_gcm_gmult_4bit_x86 ENDP
+ALIGN  16
+_gcm_ghash_4bit_x86    PROC PUBLIC
+$L_gcm_ghash_4bit_x86_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       sub     esp,84
+       mov     ebx,DWORD PTR 104[esp]
+       mov     esi,DWORD PTR 108[esp]
+       mov     edi,DWORD PTR 112[esp]
+       mov     ecx,DWORD PTR 116[esp]
+       add     ecx,edi
+       mov     DWORD PTR 116[esp],ecx
+       mov     ebp,DWORD PTR [ebx]
+       mov     edx,DWORD PTR 4[ebx]
+       mov     ecx,DWORD PTR 8[ebx]
+       mov     ebx,DWORD PTR 12[ebx]
+       mov     DWORD PTR 16[esp],0
+       mov     DWORD PTR 20[esp],471859200
+       mov     DWORD PTR 24[esp],943718400
+       mov     DWORD PTR 28[esp],610271232
+       mov     DWORD PTR 32[esp],1887436800
+       mov     DWORD PTR 36[esp],1822425088
+       mov     DWORD PTR 40[esp],1220542464
+       mov     DWORD PTR 44[esp],1423966208
+       mov     DWORD PTR 48[esp],3774873600
+       mov     DWORD PTR 52[esp],4246732800
+       mov     DWORD PTR 56[esp],3644850176
+       mov     DWORD PTR 60[esp],3311403008
+       mov     DWORD PTR 64[esp],2441084928
+       mov     DWORD PTR 68[esp],2376073216
+       mov     DWORD PTR 72[esp],2847932416
+       mov     DWORD PTR 76[esp],3051356160
+ALIGN  16
+$L002x86_outer_loop:
+       xor     ebx,DWORD PTR 12[edi]
+       xor     ecx,DWORD PTR 8[edi]
+       xor     edx,DWORD PTR 4[edi]
+       xor     ebp,DWORD PTR [edi]
+       mov     DWORD PTR 12[esp],ebx
+       mov     DWORD PTR 8[esp],ecx
+       mov     DWORD PTR 4[esp],edx
+       mov     DWORD PTR [esp],ebp
+       shr     ebx,20
+       and     ebx,240
+       mov     ebp,DWORD PTR 4[ebx*1+esi]
+       mov     edx,DWORD PTR [ebx*1+esi]
+       mov     ecx,DWORD PTR 12[ebx*1+esi]
+       mov     ebx,DWORD PTR 8[ebx*1+esi]
+       xor     eax,eax
+       mov     edi,15
+       jmp     $L003x86_loop
+ALIGN  16
+$L003x86_loop:
+       mov     al,bl
+       shrd    ebx,ecx,4
+       and     al,15
+       shrd    ecx,edx,4
+       shrd    edx,ebp,4
+       shr     ebp,4
+       xor     ebp,DWORD PTR 16[eax*4+esp]
+       mov     al,BYTE PTR [edi*1+esp]
+       and     al,240
+       xor     ebx,DWORD PTR 8[eax*1+esi]
+       xor     ecx,DWORD PTR 12[eax*1+esi]
+       xor     edx,DWORD PTR [eax*1+esi]
+       xor     ebp,DWORD PTR 4[eax*1+esi]
+       dec     edi
+       js      $L004x86_break
+       mov     al,bl
+       shrd    ebx,ecx,4
+       and     al,15
+       shrd    ecx,edx,4
+       shrd    edx,ebp,4
+       shr     ebp,4
+       xor     ebp,DWORD PTR 16[eax*4+esp]
+       mov     al,BYTE PTR [edi*1+esp]
+       shl     al,4
+       xor     ebx,DWORD PTR 8[eax*1+esi]
+       xor     ecx,DWORD PTR 12[eax*1+esi]
+       xor     edx,DWORD PTR [eax*1+esi]
+       xor     ebp,DWORD PTR 4[eax*1+esi]
+       jmp     $L003x86_loop
+ALIGN  16
+$L004x86_break:
+       bswap   ebx
+       bswap   ecx
+       bswap   edx
+       bswap   ebp
+       mov     edi,DWORD PTR 112[esp]
+       lea     edi,DWORD PTR 16[edi]
+       cmp     edi,DWORD PTR 116[esp]
+       mov     DWORD PTR 112[esp],edi
+       jb      $L002x86_outer_loop
+       mov     edi,DWORD PTR 104[esp]
+       mov     DWORD PTR 12[edi],ebx
+       mov     DWORD PTR 8[edi],ecx
+       mov     DWORD PTR 4[edi],edx
+       mov     DWORD PTR [edi],ebp
+       add     esp,84
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_gcm_ghash_4bit_x86 ENDP
+ALIGN  16
+__mmx_gmult_4bit_inner PROC PRIVATE
+       xor     ecx,ecx
+       mov     edx,ebx
+       mov     cl,dl
+       shl     cl,4
+       and     edx,240
+       movq    mm0,QWORD PTR 8[ecx*1+esi]
+       movq    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 14[edi]
+       psllq   mm2,60
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 13[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 12[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 11[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 10[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 9[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 8[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 7[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 6[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 5[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 4[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 3[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 2[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR 1[edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       mov     cl,BYTE PTR [edi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       mov     edx,ecx
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       shl     cl,4
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[ecx*1+esi]
+       psllq   mm2,60
+       and     edx,240
+       pxor    mm1,QWORD PTR [ebp*8+eax]
+       and     ebx,15
+       pxor    mm1,QWORD PTR [ecx*1+esi]
+       movd    ebp,mm0
+       pxor    mm0,mm2
+       psrlq   mm0,4
+       movq    mm2,mm1
+       psrlq   mm1,4
+       pxor    mm0,QWORD PTR 8[edx*1+esi]
+       psllq   mm2,60
+       pxor    mm1,QWORD PTR [ebx*8+eax]
+       and     ebp,15
+       pxor    mm1,QWORD PTR [edx*1+esi]
+       movd    ebx,mm0
+       pxor    mm0,mm2
+       mov     edi,DWORD PTR 4[ebp*8+eax]
+       psrlq   mm0,32
+       movd    edx,mm1
+       psrlq   mm1,32
+       movd    ecx,mm0
+       movd    ebp,mm1
+       shl     edi,4
+       bswap   ebx
+       bswap   edx
+       bswap   ecx
+       xor     ebp,edi
+       bswap   ebp
+       ret
+__mmx_gmult_4bit_inner ENDP
+ALIGN  16
+_gcm_gmult_4bit_mmx    PROC PUBLIC
+$L_gcm_gmult_4bit_mmx_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       mov     edi,DWORD PTR 20[esp]
+       mov     esi,DWORD PTR 24[esp]
+       call    $L005pic_point
+$L005pic_point:
+       pop     eax
+       lea     eax,DWORD PTR ($Lrem_4bit-$L005pic_point)[eax]
+       movzx   ebx,BYTE PTR 15[edi]
+       call    __mmx_gmult_4bit_inner
+       mov     edi,DWORD PTR 20[esp]
+       emms
+       mov     DWORD PTR 12[edi],ebx
+       mov     DWORD PTR 4[edi],edx
+       mov     DWORD PTR 8[edi],ecx
+       mov     DWORD PTR [edi],ebp
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_gcm_gmult_4bit_mmx ENDP
+ALIGN  16
+_gcm_ghash_4bit_mmx    PROC PUBLIC
+$L_gcm_ghash_4bit_mmx_begin::
+       push    ebp
+       push    ebx
+       push    esi
+       push    edi
+       mov     ebp,DWORD PTR 20[esp]
+       mov     esi,DWORD PTR 24[esp]
+       mov     edi,DWORD PTR 28[esp]
+       mov     ecx,DWORD PTR 32[esp]
+       call    $L006pic_point
+$L006pic_point:
+       pop     eax
+       lea     eax,DWORD PTR ($Lrem_4bit-$L006pic_point)[eax]
+       add     ecx,edi
+       mov     DWORD PTR 32[esp],ecx
+       sub     esp,20
+       mov     ebx,DWORD PTR 12[ebp]
+       mov     edx,DWORD PTR 4[ebp]
+       mov     ecx,DWORD PTR 8[ebp]
+       mov     ebp,DWORD PTR [ebp]
+       jmp     $L007mmx_outer_loop
+ALIGN  16
+$L007mmx_outer_loop:
+       xor     ebx,DWORD PTR 12[edi]
+       xor     edx,DWORD PTR 4[edi]
+       xor     ecx,DWORD PTR 8[edi]
+       xor     ebp,DWORD PTR [edi]
+       mov     DWORD PTR 48[esp],edi
+       mov     DWORD PTR 12[esp],ebx
+       mov     DWORD PTR 4[esp],edx
+       mov     DWORD PTR 8[esp],ecx
+       mov     DWORD PTR [esp],ebp
+       mov     edi,esp
+       shr     ebx,24
+       call    __mmx_gmult_4bit_inner
+       mov     edi,DWORD PTR 48[esp]
+       lea     edi,DWORD PTR 16[edi]
+       cmp     edi,DWORD PTR 52[esp]
+       jb      $L007mmx_outer_loop
+       mov     edi,DWORD PTR 40[esp]
+       emms
+       mov     DWORD PTR 12[edi],ebx
+       mov     DWORD PTR 4[edi],edx
+       mov     DWORD PTR 8[edi],ecx
+       mov     DWORD PTR [edi],ebp
+       add     esp,20
+       pop     edi
+       pop     esi
+       pop     ebx
+       pop     ebp
+       ret
+_gcm_ghash_4bit_mmx ENDP
+ALIGN  64
+$Lrem_4bit::
+DD     0,0,0,29491200,0,58982400,0,38141952
+DD     0,117964800,0,113901568,0,76283904,0,88997888
+DD     0,235929600,0,265420800,0,227803136,0,206962688
+DD     0,152567808,0,148504576,0,177995776,0,190709760
+ALIGN  64
+$L008rem_8bit:
+DW     0,450,900,582,1800,1738,1164,1358
+DW     3600,4050,3476,3158,2328,2266,2716,2910
+DW     7200,7650,8100,7782,6952,6890,6316,6510
+DW     4656,5106,4532,4214,5432,5370,5820,6014
+DW     14400,14722,15300,14854,16200,16010,15564,15630
+DW     13904,14226,13780,13334,12632,12442,13020,13086
+DW     9312,9634,10212,9766,9064,8874,8428,8494
+DW     10864,11186,10740,10294,11640,11450,12028,12094
+DW     28800,28994,29444,29382,30600,30282,29708,30158
+DW     32400,32594,32020,31958,31128,30810,31260,31710
+DW     27808,28002,28452,28390,27560,27242,26668,27118
+DW     25264,25458,24884,24822,26040,25722,26172,26622
+DW     18624,18690,19268,19078,20424,19978,19532,19854
+DW     18128,18194,17748,17558,16856,16410,16988,17310
+DW     21728,21794,22372,22182,21480,21034,20588,20910
+DW     23280,23346,22900,22710,24056,23610,24188,24510
+DW     57600,57538,57988,58182,58888,59338,58764,58446
+DW     61200,61138,60564,60758,59416,59866,60316,59998
+DW     64800,64738,65188,65382,64040,64490,63916,63598
+DW     62256,62194,61620,61814,62520,62970,63420,63102
+DW     55616,55426,56004,56070,56904,57226,56780,56334
+DW     55120,54930,54484,54550,53336,53658,54236,53790
+DW     50528,50338,50916,50982,49768,50090,49644,49198
+DW     52080,51890,51444,51510,52344,52666,53244,52798
+DW     37248,36930,37380,37830,38536,38730,38156,38094
+DW     40848,40530,39956,40406,39064,39258,39708,39646
+DW     36256,35938,36388,36838,35496,35690,35116,35054
+DW     33712,33394,32820,33270,33976,34170,34620,34558
+DW     43456,43010,43588,43910,44744,44810,44364,44174
+DW     42960,42514,42068,42390,41176,41242,41820,41630
+DW     46560,46114,46692,47014,45800,45866,45420,45230
+DW     48112,47666,47220,47542,48376,48442,49020,48830
+DB     71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+DB     82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+DB     112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+DB     0
+.text$ ENDS
+END
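
The ghash-x86.asm module above exports two interchangeable GHASH implementations: a plain-x86 pair (gcm_gmult_4bit_x86 / gcm_ghash_4bit_x86) and an MMX pair (gcm_gmult_4bit_mmx / gcm_ghash_4bit_mmx). A hedged sketch of the signatures crypto/modes/gcm128.c binds them to, with a stand-in runtime selector (the real selection is driven by OPENSSL_ia32cap_P, not by the flag shown here):

/*
 * Illustrative only: the entry points exported by ghash-x86, with the
 * argument shapes gcm128.c uses.  Xi is the running 128-bit hash value;
 * Htable holds 16 precomputed multiples of the hash key H.  The
 * pick_ghash() selector is a stand-in for OpenSSL's real capability
 * dispatch, shown only to indicate intent.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

void gcm_gmult_4bit_x86(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(uint64_t Xi[2], const u128 Htable[16],
                        const uint8_t *inp, size_t len);
void gcm_gmult_4bit_mmx(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(uint64_t Xi[2], const u128 Htable[16],
                        const uint8_t *inp, size_t len);

typedef void (*gcm_ghash_fn)(uint64_t Xi[2], const u128 Htable[16],
                             const uint8_t *inp, size_t len);

/* Prefer the MMX implementation when the CPU supports it. */
static gcm_ghash_fn pick_ghash(int cpu_has_mmx)
{
    return cpu_has_mmx ? gcm_ghash_4bit_mmx : gcm_ghash_4bit_x86;
}

The openssl.gyp hunks below wire the new files into the build and add the matching defines (VPAES_ASM, GHASH_ASM, and, for x86_64, BSAES_ASM with OPENSSL_IA32_SSE2) so the C layers pick up the assembly implementations.
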
index 3becfca..3dc9106 100644
           # Enable asm
           'defines': [
             'AES_ASM',
+            'VPAES_ASM',
             'BF_ASM',
             'BNCO_ASM',
             'BN_ASM',
             'SHA1_ASM',
             'SHA256_ASM',
             'SHA512_ASM',
+            'GHASH_ASM',
             'WHIRLPOOL_ASM',
             'WP_ASM'
           ],
               'sources': [
                 'asm/x86-elf-gas/aes/aes-586.s',
                 'asm/x86-elf-gas/aes/aesni-x86.s',
+                'asm/x86-elf-gas/aes/vpaes-x86.s',
                 'asm/x86-elf-gas/bf/bf-686.s',
                 'asm/x86-elf-gas/bn/x86-mont.s',
                 'asm/x86-elf-gas/bn/x86.s',
                 'asm/x86-elf-gas/sha/sha256-586.s',
                 'asm/x86-elf-gas/sha/sha512-586.s',
                 'asm/x86-elf-gas/whrlpool/wp-mmx.s',
+                'asm/x86-elf-gas/modes/ghash-x86.s',
                 'asm/x86-elf-gas/x86cpuid.s',
                 'openssl/crypto/whrlpool/wp_block.c'
               ]
               'defines': [
                 'OPENSSL_BN_ASM_MONT5',
                 'OPENSSL_BN_ASM_GF2m',
+                'OPENSSL_IA32_SSE2',
+                'BSAES_ASM',
               ],
               'sources': [
                 'asm/x64-elf-gas/aes/aes-x86_64.s',
                 'asm/x64-elf-gas/aes/aesni-x86_64.s',
+                'asm/x64-elf-gas/aes/vpaes-x86_64.s',
+                'asm/x64-elf-gas/aes/bsaes-x86_64.s',
                 'asm/x64-elf-gas/aes/aesni-sha1-x86_64.s',
                 'asm/x64-elf-gas/bn/modexp512-x86_64.s',
                 'asm/x64-elf-gas/bn/x86_64-mont.s',
                 'asm/x64-elf-gas/sha/sha256-x86_64.s',
                 'asm/x64-elf-gas/sha/sha512-x86_64.s',
                 'asm/x64-elf-gas/whrlpool/wp-x86_64.s',
+                'asm/x64-elf-gas/modes/ghash-x86_64.s',
                 'asm/x64-elf-gas/x86_64cpuid.s',
                 # Non-generated asm
                 'openssl/crypto/bn/asm/x86_64-gcc.c',
               'sources': [
                 'asm/x86-macosx-gas/aes/aes-586.s',
                 'asm/x86-macosx-gas/aes/aesni-x86.s',
+                'asm/x86-macosx-gas/aes/vpaes-x86.s',
                 'asm/x86-macosx-gas/bf/bf-686.s',
                 'asm/x86-macosx-gas/bn/x86-mont.s',
                 'asm/x86-macosx-gas/bn/x86.s',
                 'asm/x86-macosx-gas/sha/sha256-586.s',
                 'asm/x86-macosx-gas/sha/sha512-586.s',
                 'asm/x86-macosx-gas/whrlpool/wp-mmx.s',
+                'asm/x86-macosx-gas/modes/ghash-x86.s',
                 'asm/x86-macosx-gas/x86cpuid.s',
                 'openssl/crypto/whrlpool/wp_block.c'
               ]
               'defines': [
                 'OPENSSL_BN_ASM_MONT5',
                 'OPENSSL_BN_ASM_GF2m',
+                'OPENSSL_IA32_SSE2',
+                'BSAES_ASM',
               ],
               'sources': [
                 'asm/x64-macosx-gas/aes/aes-x86_64.s',
                 'asm/x64-macosx-gas/aes/aesni-x86_64.s',
+                'asm/x64-macosx-gas/aes/vpaes-x86_64.s',
+                'asm/x64-macosx-gas/aes/bsaes-x86_64.s',
                 'asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s',
                 'asm/x64-macosx-gas/bn/modexp512-x86_64.s',
                 'asm/x64-macosx-gas/bn/x86_64-mont.s',
                 'asm/x64-macosx-gas/sha/sha256-x86_64.s',
                 'asm/x64-macosx-gas/sha/sha512-x86_64.s',
                 'asm/x64-macosx-gas/whrlpool/wp-x86_64.s',
+                'asm/x64-macosx-gas/modes/ghash-x86_64.s',
                 'asm/x64-macosx-gas/x86_64cpuid.s',
                 # Non-generated asm
                 'openssl/crypto/bn/asm/x86_64-gcc.c',
               'sources': [
                 'asm/x86-win32-masm/aes/aes-586.asm',
                 'asm/x86-win32-masm/aes/aesni-x86.asm',
+                'asm/x86-win32-masm/aes/vpaes-x86.asm',
                 'asm/x86-win32-masm/bf/bf-686.asm',
                 'asm/x86-win32-masm/bn/x86-mont.asm',
                 'asm/x86-win32-masm/bn/x86.asm',
                 'asm/x86-win32-masm/sha/sha256-586.asm',
                 'asm/x86-win32-masm/sha/sha512-586.asm',
                 'asm/x86-win32-masm/whrlpool/wp-mmx.asm',
+                'asm/x86-win32-masm/modes/ghash-x86.asm',
                 'asm/x86-win32-masm/x86cpuid.asm',
                 'openssl/crypto/whrlpool/wp_block.c'
               ],
               'defines': [
                 'OPENSSL_BN_ASM_MONT5',
                 'OPENSSL_BN_ASM_GF2m',
+                'OPENSSL_IA32_SSE2',
+                'BSAES_ASM',
               ],
               'sources': [
                 'asm/x64-win32-masm/aes/aes-x86_64.asm',
                 'asm/x64-win32-masm/aes/aesni-x86_64.asm',
+                'asm/x64-win32-masm/aes/vpaes-x86_64.asm',
+                'asm/x64-win32-masm/aes/bsaes-x86_64.asm',
                 'asm/x64-win32-masm/aes/aesni-sha1-x86_64.asm',
                 'asm/x64-win32-masm/bn/modexp512-x86_64.asm',
                 'asm/x64-win32-masm/bn/x86_64-mont.asm',
                 'asm/x64-win32-masm/sha/sha256-x86_64.asm',
                 'asm/x64-win32-masm/sha/sha512-x86_64.asm',
                 'asm/x64-win32-masm/whrlpool/wp-x86_64.asm',
+                'asm/x64-win32-masm/modes/ghash-x86_64.asm',
                 'asm/x64-win32-masm/x86_64cpuid.asm',
                 # Non-generated asm
                 'openssl/crypto/bn/asm/x86_64-win32-masm.asm',