deps: update asm files for openssl-1.0.2b
author    Shigeki Ohtsu <ohtsu@iij.ad.jp>
          Thu, 11 Jun 2015 15:09:20 +0000 (00:09 +0900)
committer Shigeki Ohtsu <ohtsu@iij.ad.jp>
          Fri, 12 Jun 2015 00:47:45 +0000 (09:47 +0900)
The asm files are generated as follows (see the command sketch below):
  - In `deps/openssl/asm/`, run make with CC=gcc and ASM=nasm.
  - In `deps/openssl/asm_obsolete/`, run make with no compiler environment variables set.
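
A minimal sketch of those two invocations, assuming each directory's
Makefile drives the perlasm generators (the paths come from the file list
below; exact Makefile targets are not part of this commit):

    # regenerate deps/openssl/asm/ (output for current gcc and nasm)
    cd deps/openssl/asm
    CC=gcc ASM=nasm make

    # regenerate deps/openssl/asm_obsolete/ for older toolchains;
    # CC and ASM stay unset so the generators fall back to their defaults
    cd ../asm_obsolete
    make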

Fixes: https://github.com/nodejs/io.js/issues/1921
PR-URL: https://github.com/nodejs/io.js/pull/1950
Reviewed-By: Fedor Indutny <fedor@indutny.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
31 files changed:
deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S
deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S
deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S
deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S
deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S
deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S
deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s
deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s
deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s
deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s
deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm
deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm
deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s
deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s
deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm
deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S
deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S
deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S
deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S
deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S
deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S
deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s
deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s
deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s
deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s
deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm
deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm
deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm
deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s
deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s
deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm

index 732ba3d..fd979d0 100644
@@ -230,17 +230,17 @@ aes_v8_encrypt:
 
 .Loop_enc:
        .byte   0x00,0x43,0xb0,0xf3     @ aese q2,q0
-       vld1.32 {q0},[r2]!
        .byte   0x84,0x43,0xb0,0xf3     @ aesmc q2,q2
+       vld1.32 {q0},[r2]!
        subs    r3,r3,#2
        .byte   0x02,0x43,0xb0,0xf3     @ aese q2,q1
-       vld1.32 {q1},[r2]!
        .byte   0x84,0x43,0xb0,0xf3     @ aesmc q2,q2
+       vld1.32 {q1},[r2]!
        bgt     .Loop_enc
 
        .byte   0x00,0x43,0xb0,0xf3     @ aese q2,q0
-       vld1.32 {q0},[r2]
        .byte   0x84,0x43,0xb0,0xf3     @ aesmc q2,q2
+       vld1.32 {q0},[r2]
        .byte   0x02,0x43,0xb0,0xf3     @ aese q2,q1
        veor    q2,q2,q0
 
@@ -259,17 +259,17 @@ aes_v8_decrypt:
 
 .Loop_dec:
        .byte   0x40,0x43,0xb0,0xf3     @ aesd q2,q0
-       vld1.32 {q0},[r2]!
        .byte   0xc4,0x43,0xb0,0xf3     @ aesimc q2,q2
+       vld1.32 {q0},[r2]!
        subs    r3,r3,#2
        .byte   0x42,0x43,0xb0,0xf3     @ aesd q2,q1
-       vld1.32 {q1},[r2]!
        .byte   0xc4,0x43,0xb0,0xf3     @ aesimc q2,q2
+       vld1.32 {q1},[r2]!
        bgt     .Loop_dec
 
        .byte   0x40,0x43,0xb0,0xf3     @ aesd q2,q0
-       vld1.32 {q0},[r2]
        .byte   0xc4,0x43,0xb0,0xf3     @ aesimc q2,q2
+       vld1.32 {q0},[r2]
        .byte   0x42,0x43,0xb0,0xf3     @ aesd q2,q1
        veor    q2,q2,q0
 
@@ -313,16 +313,42 @@ aes_v8_cbc_encrypt:
        veor    q5,q8,q7
        beq     .Lcbc_enc128
 
+       vld1.32 {q2-q3},[r7]
+       add     r7,r3,#16
+       add     r6,r3,#16*4
+       add     r12,r3,#16*5
+       .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       add     r14,r3,#16*6
+       add     r3,r3,#16*7
+       b       .Lenter_cbc_enc
+
+.align 4
 .Loop_cbc_enc:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       vld1.32 {q8},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-       subs    r6,r6,#2
+        vst1.8 {q6},[r1]!
+.Lenter_cbc_enc:
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       vld1.32 {q9},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-       bgt     .Loop_cbc_enc
+       .byte   0x04,0x03,0xb0,0xf3     @ aese q0,q2
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q8},[r6]
+       cmp     r5,#4
+       .byte   0x06,0x03,0xb0,0xf3     @ aese q0,q3
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q9},[r12]
+       beq     .Lcbc_enc192
+
+       .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q8},[r14]
+       .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q9},[r3]
+       nop
 
+.Lcbc_enc192:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
         subs   r2,r2,#16
@@ -331,7 +357,6 @@ aes_v8_cbc_encrypt:
         moveq  r8,#0
        .byte   0x24,0x03,0xb0,0xf3     @ aese q0,q10
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-        add    r7,r3,#16
        .byte   0x26,0x03,0xb0,0xf3     @ aese q0,q11
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
         vld1.8 {q8},[r0],r8
@@ -340,16 +365,14 @@ aes_v8_cbc_encrypt:
         veor   q8,q8,q5
        .byte   0x2a,0x03,0xb0,0xf3     @ aese q0,q13
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-        vld1.32 {q9},[r7]!     @ re-pre-load rndkey[1]
+        vld1.32 {q9},[r7]              @ re-pre-load rndkey[1]
        .byte   0x2c,0x03,0xb0,0xf3     @ aese q0,q14
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
        .byte   0x2e,0x03,0xb0,0xf3     @ aese q0,q15
-
-        mov    r6,r5
        veor    q6,q0,q7
-       vst1.8  {q6},[r1]!
        bhs     .Loop_cbc_enc
 
+       vst1.8  {q6},[r1]!
        b       .Lcbc_done
 
 .align 5
@@ -407,79 +430,78 @@ aes_v8_cbc_encrypt:
 
 .Loop3x_cbc_dec:
        .byte   0x60,0x03,0xb0,0xf3     @ aesd q0,q8
-       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
-       vld1.32 {q8},[r7]!
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q8},[r7]!
        subs    r6,r6,#2
        .byte   0x62,0x03,0xb0,0xf3     @ aesd q0,q9
-       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
-       vld1.32 {q9},[r7]!
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q9},[r7]!
        bgt     .Loop3x_cbc_dec
 
        .byte   0x60,0x03,0xb0,0xf3     @ aesd q0,q8
-       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
-        veor   q4,q6,q7
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+        veor   q4,q6,q7
+        subs   r2,r2,#0x30
         veor   q5,q2,q7
+        movlo  r6,r2                   @ r6, r6, is zero at this point
        .byte   0x62,0x03,0xb0,0xf3     @ aesd q0,q9
-       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
-        veor   q9,q3,q7
-        subs   r2,r2,#0x30
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
-        vorr   q6,q11,q11
-        movlo  r6,r2                   @ r6, r6, is zero at this point
-       .byte   0x68,0x03,0xb0,0xf3     @ aesd q0,q12
-       .byte   0x68,0x23,0xb0,0xf3     @ aesd q1,q12
-       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
+        veor   q9,q3,q7
         add    r0,r0,r6                @ r0 is adjusted in such way that
                                        @ at exit from the loop q1-q10
                                        @ are loaded with last "words"
+        vorr   q6,q11,q11
+        mov    r7,r3
+       .byte   0x68,0x03,0xb0,0xf3     @ aesd q0,q12
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x68,0x23,0xb0,0xf3     @ aesd q1,q12
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
-        mov    r7,r3
-       .byte   0x6a,0x03,0xb0,0xf3     @ aesd q0,q13
-       .byte   0x6a,0x23,0xb0,0xf3     @ aesd q1,q13
-       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
         vld1.8 {q2},[r0]!
+       .byte   0x6a,0x03,0xb0,0xf3     @ aesd q0,q13
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x6a,0x23,0xb0,0xf3     @ aesd q1,q13
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         vld1.8 {q3},[r0]!
        .byte   0x6c,0x03,0xb0,0xf3     @ aesd q0,q14
-       .byte   0x6c,0x23,0xb0,0xf3     @ aesd q1,q14
-       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
-        vld1.8 {q11},[r0]!
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x6c,0x23,0xb0,0xf3     @ aesd q1,q14
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
-        vld1.32 {q8},[r7]!     @ re-pre-load rndkey[0]
+        vld1.8 {q11},[r0]!
        .byte   0x6e,0x03,0xb0,0xf3     @ aesd q0,q15
        .byte   0x6e,0x23,0xb0,0xf3     @ aesd q1,q15
        .byte   0x6e,0x43,0xf0,0xf3     @ aesd q10,q15
-
+        vld1.32 {q8},[r7]!     @ re-pre-load rndkey[0]
         add    r6,r5,#2
        veor    q4,q4,q0
        veor    q5,q5,q1
        veor    q10,q10,q9
         vld1.32 {q9},[r7]!     @ re-pre-load rndkey[1]
-        vorr   q0,q2,q2
        vst1.8  {q4},[r1]!
-        vorr   q1,q3,q3
+        vorr   q0,q2,q2
        vst1.8  {q5},[r1]!
+        vorr   q1,q3,q3
        vst1.8  {q10},[r1]!
         vorr   q10,q11,q11
        bhs     .Loop3x_cbc_dec
@@ -490,39 +512,39 @@ aes_v8_cbc_encrypt:
 
 .Lcbc_dec_tail:
        .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
-       vld1.32 {q8},[r7]!
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q8},[r7]!
        subs    r6,r6,#2
        .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
-       vld1.32 {q9},[r7]!
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q9},[r7]!
        bgt     .Lcbc_dec_tail
 
        .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
        .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
        .byte   0x68,0x23,0xb0,0xf3     @ aesd q1,q12
-       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         cmn    r2,#0x20
        .byte   0x6a,0x23,0xb0,0xf3     @ aesd q1,q13
-       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         veor   q5,q6,q7
        .byte   0x6c,0x23,0xb0,0xf3     @ aesd q1,q14
-       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         veor   q9,q3,q7
        .byte   0x6e,0x23,0xb0,0xf3     @ aesd q1,q15
@@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks:
 .align 4
 .Loop3x_ctr32:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
-       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
-       vld1.32         {q8},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
        .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
+       vld1.32         {q8},[r7]!
        subs            r6,r6,#2
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
-       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
-       vld1.32         {q9},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
        .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
+       vld1.32         {q9},[r7]!
        bgt             .Loop3x_ctr32
 
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
-       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
-        mov            r7,r3
        .byte   0x80,0x83,0xb0,0xf3     @ aesmc q4,q0
-        vld1.8         {q2},[r0]!
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0xa3,0xb0,0xf3     @ aesmc q5,q1
-       .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
+        vld1.8         {q2},[r0]!
         vorr           q0,q6,q6
-       .byte   0x22,0x83,0xb0,0xf3     @ aese q4,q9
+       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
+       .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
         vld1.8         {q3},[r0]!
-       .byte   0x22,0xa3,0xb0,0xf3     @ aese q5,q9
-       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
         vorr           q1,q6,q6
+       .byte   0x22,0x83,0xb0,0xf3     @ aese q4,q9
        .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-        vld1.8         {q11},[r0]!
+       .byte   0x22,0xa3,0xb0,0xf3     @ aese q5,q9
        .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+        vld1.8         {q11},[r0]!
+        mov            r7,r3
+       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
        .byte   0xa4,0x23,0xf0,0xf3     @ aesmc q9,q10
         vorr           q10,q6,q6
         add            r9,r8,#1
        .byte   0x28,0x83,0xb0,0xf3     @ aese q4,q12
+       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
        .byte   0x28,0xa3,0xb0,0xf3     @ aese q5,q12
-       .byte   0x28,0x23,0xf0,0xf3     @ aese q9,q12
+       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
         veor           q2,q2,q7
         add            r10,r8,#2
-       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+       .byte   0x28,0x23,0xf0,0xf3     @ aese q9,q12
        .byte   0xa2,0x23,0xf0,0xf3     @ aesmc q9,q9
         veor           q3,q3,q7
         add            r8,r8,#3
        .byte   0x2a,0x83,0xb0,0xf3     @ aese q4,q13
+       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
        .byte   0x2a,0xa3,0xb0,0xf3     @ aese q5,q13
-       .byte   0x2a,0x23,0xf0,0xf3     @ aese q9,q13
+       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
         veor           q11,q11,q7
         rev            r9,r9
-       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-        vld1.32         {q8},[r7]!     @ re-pre-load rndkey[0]
-       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+       .byte   0x2a,0x23,0xf0,0xf3     @ aese q9,q13
        .byte   0xa2,0x23,0xf0,0xf3     @ aesmc q9,q9
         vmov.32        d1[1], r9
         rev            r10,r10
        .byte   0x2c,0x83,0xb0,0xf3     @ aese q4,q14
+       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
        .byte   0x2c,0xa3,0xb0,0xf3     @ aese q5,q14
-       .byte   0x2c,0x23,0xf0,0xf3     @ aese q9,q14
+       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
         vmov.32        d3[1], r10
         rev            r12,r8
-       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+       .byte   0x2c,0x23,0xf0,0xf3     @ aese q9,q14
        .byte   0xa2,0x23,0xf0,0xf3     @ aesmc q9,q9
         vmov.32        d21[1], r12
         subs           r2,r2,#3
@@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks:
        .byte   0x2e,0xa3,0xb0,0xf3     @ aese q5,q15
        .byte   0x2e,0x23,0xf0,0xf3     @ aese q9,q15
 
-        mov            r6,r5
        veor            q2,q2,q4
+        vld1.32         {q8},[r7]!     @ re-pre-load rndkey[0]
+       vst1.8          {q2},[r1]!
        veor            q3,q3,q5
+        mov            r6,r5
+       vst1.8          {q3},[r1]!
        veor            q11,q11,q9
         vld1.32         {q9},[r7]!     @ re-pre-load rndkey[1]
-       vst1.8          {q2},[r1]!
-       vst1.8          {q3},[r1]!
        vst1.8          {q11},[r1]!
        bhs             .Loop3x_ctr32
 
@@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks:
 
 .Lctr32_tail:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
-       vld1.32         {q8},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       vld1.32         {q8},[r7]!
        subs            r6,r6,#2
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
-       vld1.32         {q9},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       vld1.32         {q9},[r7]!
        bgt             .Lctr32_tail
 
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
         vld1.8         {q2},[r0],r12
        .byte   0x28,0x03,0xb0,0xf3     @ aese q0,q12
-       .byte   0x28,0x23,0xb0,0xf3     @ aese q1,q12
-        vld1.8         {q3},[r0]
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x28,0x23,0xb0,0xf3     @ aese q1,q12
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+        vld1.8         {q3},[r0]
        .byte   0x2a,0x03,0xb0,0xf3     @ aese q0,q13
-       .byte   0x2a,0x23,0xb0,0xf3     @ aese q1,q13
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x2a,0x23,0xb0,0xf3     @ aese q1,q13
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
-       .byte   0x2c,0x03,0xb0,0xf3     @ aese q0,q14
-       .byte   0x2c,0x23,0xb0,0xf3     @ aese q1,q14
         veor           q2,q2,q7
+       .byte   0x2c,0x03,0xb0,0xf3     @ aese q0,q14
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x2c,0x23,0xb0,0xf3     @ aese q1,q14
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
         veor           q3,q3,q7
        .byte   0x2e,0x03,0xb0,0xf3     @ aese q0,q15
index d321235..c54f514 100644
@@ -495,7 +495,7 @@ gcm_ghash_neon:
        veor            q10,q10,q9              @
        vshl.i64        q9,q0,#63
        veor            q10, q10, q9            @
-       veor            d1,d1,d20       @
+       veor            d1,d1,d20       @
        veor            d4,d4,d21
 
        vshr.u64        q10,q0,#1               @ 2nd phase
index 570d917..2695749 100644
 .type  gcm_init_v8,%function
 .align 4
 gcm_init_v8:
-       vld1.64         {q9},[r1]               @ load H
-       vmov.i8         q8,#0xe1
+       vld1.64         {q9},[r1]               @ load input H
+       vmov.i8         q11,#0xe1
+       vshl.i64        q11,q11,#57             @ 0xc2.0
        vext.8          q3,q9,q9,#8
-       vshl.i64        q8,q8,#57
-       vshr.u64        q10,q8,#63
-       vext.8          q8,q10,q8,#8            @ t0=0xc2....01
+       vshr.u64        q10,q11,#63
        vdup.32 q9,d18[1]
-       vshr.u64        q11,q3,#63
+       vext.8          q8,q10,q11,#8           @ t0=0xc2....01
+       vshr.u64        q10,q3,#63
        vshr.s32        q9,q9,#31               @ broadcast carry bit
-       vand            q11,q11,q8
+       vand            q10,q10,q8
        vshl.i64        q3,q3,#1
-       vext.8          q11,q11,q11,#8
+       vext.8          q10,q10,q10,#8
        vand            q8,q8,q9
-       vorr            q3,q3,q11               @ H<<<=1
-       veor            q3,q3,q8                @ twisted H
-       vst1.64         {q3},[r0]
+       vorr            q3,q3,q10               @ H<<<=1
+       veor            q12,q3,q8               @ twisted H
+       vst1.64         {q12},[r0]!             @ store Htable[0]
+
+       @ calculate H^2
+       vext.8          q8,q12,q12,#8           @ Karatsuba pre-processing
+       .byte   0xa8,0x0e,0xa8,0xf2     @ pmull q0,q12,q12
+       veor            q8,q8,q12
+       .byte   0xa9,0x4e,0xa9,0xf2     @ pmull2 q2,q12,q12
+       .byte   0xa0,0x2e,0xa0,0xf2     @ pmull q1,q8,q8
+
+       vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
+       veor            q10,q0,q2
+       veor            q1,q1,q9
+       veor            q1,q1,q10
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase
+
+       vmov            d4,d3           @ Xh|Xm - 256-bit result
+       vmov            d3,d0           @ Xm is rotated Xl
+       veor            q0,q1,q10
+
+       vext.8          q10,q0,q0,#8            @ 2nd phase
+       .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
+       veor            q10,q10,q2
+       veor            q14,q0,q10
+
+       vext.8          q9,q14,q14,#8           @ Karatsuba pre-processing
+       veor            q9,q9,q14
+       vext.8          q13,q8,q9,#8            @ pack Karatsuba pre-processed
+       vst1.64         {q13-q14},[r0]          @ store Htable[1..2]
 
        bx      lr
 .size  gcm_init_v8,.-gcm_init_v8
-
 .global        gcm_gmult_v8
 .type  gcm_gmult_v8,%function
 .align 4
 gcm_gmult_v8:
        vld1.64         {q9},[r0]               @ load Xi
        vmov.i8         q11,#0xe1
-       vld1.64         {q12},[r1]              @ load twisted H
+       vld1.64         {q12-q13},[r1]  @ load twisted H, ...
        vshl.u64        q11,q11,#57
 #ifndef __ARMEB__
        vrev64.8        q9,q9
 #endif
-       vext.8          q13,q12,q12,#8
-       mov             r3,#0
        vext.8          q3,q9,q9,#8
-       mov             r12,#0
-       veor            q13,q13,q12             @ Karatsuba pre-processing
-       mov             r2,r0
-       b               .Lgmult_v8
-.size  gcm_gmult_v8,.-gcm_gmult_v8
 
+       .byte   0x86,0x0e,0xa8,0xf2     @ pmull q0,q12,q3               @ H.lo·Xi.lo
+       veor            q9,q9,q3                @ Karatsuba pre-processing
+       .byte   0x87,0x4e,0xa9,0xf2     @ pmull2 q2,q12,q3              @ H.hi·Xi.hi
+       .byte   0xa2,0x2e,0xaa,0xf2     @ pmull q1,q13,q9               @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+       vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
+       veor            q10,q0,q2
+       veor            q1,q1,q9
+       veor            q1,q1,q10
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase of reduction
+
+       vmov            d4,d3           @ Xh|Xm - 256-bit result
+       vmov            d3,d0           @ Xm is rotated Xl
+       veor            q0,q1,q10
+
+       vext.8          q10,q0,q0,#8            @ 2nd phase of reduction
+       .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
+       veor            q10,q10,q2
+       veor            q0,q0,q10
+
+#ifndef __ARMEB__
+       vrev64.8        q0,q0
+#endif
+       vext.8          q0,q0,q0,#8
+       vst1.64         {q0},[r0]               @ write out Xi
+
+       bx      lr
+.size  gcm_gmult_v8,.-gcm_gmult_v8
 .global        gcm_ghash_v8
 .type  gcm_ghash_v8,%function
 .align 4
 gcm_ghash_v8:
+       vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
        vld1.64         {q0},[r0]               @ load [rotated] Xi
-       subs            r3,r3,#16
+                                               @ "[rotated]" means that
+                                               @ loaded value would have
+                                               @ to be rotated in order to
+                                               @ make it appear as in
+                                               @ alorithm specification
+       subs            r3,r3,#32               @ see if r3 is 32 or larger
+       mov             r12,#16         @ r12 is used as post-
+                                               @ increment for input pointer;
+                                               @ as loop is modulo-scheduled
+                                               @ r12 is zeroed just in time
+                                               @ to preclude oversteping
+                                               @ inp[len], which means that
+                                               @ last block[s] are actually
+                                               @ loaded twice, but last
+                                               @ copy is not processed
+       vld1.64         {q12-q13},[r1]! @ load twisted H, ..., H^2
        vmov.i8         q11,#0xe1
-       mov             r12,#16
-       vld1.64         {q12},[r1]              @ load twisted H
-       moveq   r12,#0
-       vext.8          q0,q0,q0,#8
-       vshl.u64        q11,q11,#57
-       vld1.64         {q9},[r2],r12   @ load [rotated] inp
-       vext.8          q13,q12,q12,#8
+       vld1.64         {q14},[r1]
+       moveq   r12,#0                  @ is it time to zero r12?
+       vext.8          q0,q0,q0,#8             @ rotate Xi
+       vld1.64         {q8},[r2]!      @ load [rotated] I[0]
+       vshl.u64        q11,q11,#57             @ compose 0xc2.0 constant
 #ifndef __ARMEB__
+       vrev64.8        q8,q8
        vrev64.8        q0,q0
+#endif
+       vext.8          q3,q8,q8,#8             @ rotate I[0]
+       blo             .Lodd_tail_v8           @ r3 was less than 32
+       vld1.64         {q9},[r2],r12   @ load [rotated] I[1]
+#ifndef __ARMEB__
        vrev64.8        q9,q9
 #endif
-       veor            q13,q13,q12             @ Karatsuba pre-processing
-       vext.8          q3,q9,q9,#8
-       b               .Loop_v8
+       vext.8          q7,q9,q9,#8
+       veor            q3,q3,q0                @ I[i]^=Xi
+       .byte   0x8e,0x8e,0xa8,0xf2     @ pmull q4,q12,q7               @ H·Ii+1
+       veor            q9,q9,q7                @ Karatsuba pre-processing
+       .byte   0x8f,0xce,0xa9,0xf2     @ pmull2 q6,q12,q7
+       b               .Loop_mod2x_v8
 
 .align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+       vext.8          q10,q3,q3,#8
+       subs            r3,r3,#32               @ is there more data?
+       .byte   0x86,0x0e,0xac,0xf2     @ pmull q0,q14,q3               @ H^2.lo·Xi.lo
+       movlo   r12,#0                  @ is it time to zero r12?
+
+        .byte  0xa2,0xae,0xaa,0xf2     @ pmull q5,q13,q9
+       veor            q10,q10,q3              @ Karatsuba pre-processing
+       .byte   0x87,0x4e,0xad,0xf2     @ pmull2 q2,q14,q3              @ H^2.hi·Xi.hi
+       veor            q0,q0,q4                @ accumulate
+       .byte   0xa5,0x2e,0xab,0xf2     @ pmull2 q1,q13,q10             @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+        vld1.64        {q8},[r2],r12   @ load [rotated] I[i+2]
+
+       veor            q2,q2,q6
+        moveq  r12,#0                  @ is it time to zero r12?
+       veor            q1,q1,q5
+
+       vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
+       veor            q10,q0,q2
+       veor            q1,q1,q9
+        vld1.64        {q9},[r2],r12   @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+        vrev64.8       q8,q8
+#endif
+       veor            q1,q1,q10
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase of reduction
+
+#ifndef __ARMEB__
+        vrev64.8       q9,q9
+#endif
+       vmov            d4,d3           @ Xh|Xm - 256-bit result
+       vmov            d3,d0           @ Xm is rotated Xl
+        vext.8         q7,q9,q9,#8
+        vext.8         q3,q8,q8,#8
+       veor            q0,q1,q10
+        .byte  0x8e,0x8e,0xa8,0xf2     @ pmull q4,q12,q7               @ H·Ii+1
+       veor            q3,q3,q2                @ accumulate q3 early
+
+       vext.8          q10,q0,q0,#8            @ 2nd phase of reduction
+       .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
+       veor            q3,q3,q10
+        veor           q9,q9,q7                @ Karatsuba pre-processing
+       veor            q3,q3,q0
+        .byte  0x8f,0xce,0xa9,0xf2     @ pmull2 q6,q12,q7
+       bhs             .Loop_mod2x_v8          @ there was at least 32 more bytes
+
+       veor            q2,q2,q10
+       vext.8          q3,q8,q8,#8             @ re-construct q3
+       adds            r3,r3,#32               @ re-construct r3
+       veor            q0,q0,q2                @ re-construct q0
+       beq             .Ldone_v8               @ is r3 zero?
+.Lodd_tail_v8:
        vext.8          q10,q0,q0,#8
        veor            q3,q3,q0                @ inp^=Xi
-       veor            q9,q9,q10               @ q9 is rotated inp^Xi
+       veor            q9,q8,q10               @ q9 is rotated inp^Xi
 
-.Lgmult_v8:
        .byte   0x86,0x0e,0xa8,0xf2     @ pmull q0,q12,q3               @ H.lo·Xi.lo
        veor            q9,q9,q3                @ Karatsuba pre-processing
        .byte   0x87,0x4e,0xa9,0xf2     @ pmull2 q2,q12,q3              @ H.hi·Xi.hi
-       subs            r3,r3,#16
        .byte   0xa2,0x2e,0xaa,0xf2     @ pmull q1,q13,q9               @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
-       moveq   r12,#0
 
        vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
        veor            q10,q0,q2
        veor            q1,q1,q9
-        vld1.64        {q9},[r2],r12   @ load [rotated] inp
        veor            q1,q1,q10
-       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase of reduction
 
        vmov            d4,d3           @ Xh|Xm - 256-bit result
        vmov            d3,d0           @ Xm is rotated Xl
-#ifndef __ARMEB__
-        vrev64.8       q9,q9
-#endif
        veor            q0,q1,q10
-        vext.8         q3,q9,q9,#8
 
-       vext.8          q10,q0,q0,#8            @ 2nd phase
+       vext.8          q10,q0,q0,#8            @ 2nd phase of reduction
        .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
        veor            q10,q10,q2
        veor            q0,q0,q10
-       bhs             .Loop_v8
 
+.Ldone_v8:
 #ifndef __ARMEB__
        vrev64.8        q0,q0
 #endif
        vext.8          q0,q0,q0,#8
        vst1.64         {q0},[r0]               @ write out Xi
 
+       vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
        bx      lr
 .size  gcm_ghash_v8,.-gcm_ghash_v8
 .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
index bf1ce4f..683f1cc 100644
@@ -1,7 +1,59 @@
-#include "arm_arch.h"
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
 
 .text
+#if __ARM_ARCH__<7
 .code  32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
 
 .type  K256,%object
 .align 5
@@ -24,7 +76,7 @@ K256:
 .word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size  K256,.-K256
 .word  0                               @ terminator
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
 .word  OPENSSL_armcap_P-sha256_block_data_order
 #endif
@@ -33,9 +85,12 @@ K256:
 .global        sha256_block_data_order
 .type  sha256_block_data_order,%function
 sha256_block_data_order:
+#if __ARM_ARCH__<7
        sub     r3,pc,#8                @ sha256_block_data_order
-       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+       adr     r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
        tst     r12,#ARMV8_SHA256
@@ -43,6 +98,7 @@ sha256_block_data_order:
        tst     r12,#ARMV7_NEON
        bne     .LNEON
 #endif
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
        stmdb   sp!,{r0,r1,r2,r4-r11,lr}
        ldmia   r0,{r4,r5,r6,r7,r8,r9,r10,r11}
        sub     r14,r3,#256+32  @ K256
@@ -1736,6 +1792,9 @@ sha256_block_data_order:
        eor     r12,r12,r6                      @ Maj(a,b,c)
        add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
        @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       ite     eq                      @ Thumb2 thing, sanity check in ARM
+#endif
        ldreq   r3,[sp,#16*4]           @ pull ctx
        bne     .Lrounds_16_xx
 
@@ -1777,16 +1836,19 @@ sha256_block_data_order:
 .arch  armv7-a
 .fpu   neon
 
+.global        sha256_block_data_order_neon
 .type  sha256_block_data_order_neon,%function
 .align 4
 sha256_block_data_order_neon:
 .LNEON:
        stmdb   sp!,{r4-r12,lr}
 
+       sub     r11,sp,#16*4+16
+       adr     r14,K256
+       bic     r11,r11,#15             @ align for 128-bit stores
        mov     r12,sp
-       sub     sp,sp,#16*4+16          @ alloca
-       sub     r14,r3,#256+32  @ K256
-       bic     sp,sp,#15               @ align for 128-bit stores
+       mov     sp,r11                  @ alloca
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
 
        vld1.8          {q0},[r1]!
        vld1.8          {q1},[r1]!
@@ -2224,11 +2286,13 @@ sha256_block_data_order_neon:
        ldr             r0,[sp,#72]
        sub             r14,r14,#256    @ rewind r14
        teq             r1,r0
+       it              eq
        subeq           r1,r1,#64               @ avoid SEGV
        vld1.8          {q0},[r1]!              @ load next input block
        vld1.8          {q1},[r1]!
        vld1.8          {q2},[r1]!
        vld1.8          {q3},[r1]!
+       it              ne
        strne           r1,[sp,#68]
        mov             r1,sp
        add     r11,r11,r2
@@ -2542,23 +2606,38 @@ sha256_block_data_order_neon:
        str     r7,[r2],#4
        stmia   r2,{r8-r11}
 
+       ittte   ne
        movne   r1,sp
        ldrne   r2,[sp,#0]
        eorne   r12,r12,r12
        ldreq   sp,[sp,#76]                     @ restore original sp
+       itt     ne
        eorne   r3,r5,r6
        bne     .L_00_48
 
        ldmia   sp!,{r4-r12,pc}
 .size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
 #endif
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d
+# endif
+
 .type  sha256_block_data_order_armv8,%function
 .align 5
 sha256_block_data_order_armv8:
 .LARMv8:
        vld1.32 {q0,q1},[r0]
-       sub     r3,r3,#sha256_block_data_order-K256
+# ifdef __thumb2__
+       adr     r3,.LARMv8
+       sub     r3,r3,#.LARMv8-K256
+# else
+       adrl    r3,K256
+# endif
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
 
 .Loop_v8:
        vld1.8          {q8-q9},[r1]!
@@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8:
        teq             r1,r2
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
-       .byte   0xe2,0x03,0xfa,0xf3     @ sha256su0 q8,q9
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe6,0x0c,0x64,0xf3     @ sha256su1 q8,q10,q11
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
-       .byte   0xe4,0x23,0xfa,0xf3     @ sha256su0 q9,q10
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe0,0x2c,0x66,0xf3     @ sha256su1 q9,q11,q8
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q10
-       .byte   0xe6,0x43,0xfa,0xf3     @ sha256su0 q10,q11
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe2,0x4c,0x60,0xf3     @ sha256su1 q10,q8,q9
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q11
-       .byte   0xe0,0x63,0xfa,0xf3     @ sha256su0 q11,q8
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe4,0x6c,0x62,0xf3     @ sha256su1 q11,q9,q10
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
-       .byte   0xe2,0x03,0xfa,0xf3     @ sha256su0 q8,q9
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe6,0x0c,0x64,0xf3     @ sha256su1 q8,q10,q11
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
-       .byte   0xe4,0x23,0xfa,0xf3     @ sha256su0 q9,q10
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe0,0x2c,0x66,0xf3     @ sha256su1 q9,q11,q8
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q10
-       .byte   0xe6,0x43,0xfa,0xf3     @ sha256su0 q10,q11
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe2,0x4c,0x60,0xf3     @ sha256su1 q10,q8,q9
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q11
-       .byte   0xe0,0x63,0xfa,0xf3     @ sha256su0 q11,q8
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe4,0x6c,0x62,0xf3     @ sha256su1 q11,q9,q10
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
-       .byte   0xe2,0x03,0xfa,0xf3     @ sha256su0 q8,q9
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe6,0x0c,0x64,0xf3     @ sha256su1 q8,q10,q11
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
-       .byte   0xe4,0x23,0xfa,0xf3     @ sha256su0 q9,q10
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe0,0x2c,0x66,0xf3     @ sha256su1 q9,q11,q8
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q10
-       .byte   0xe6,0x43,0xfa,0xf3     @ sha256su0 q10,q11
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe2,0x4c,0x60,0xf3     @ sha256su1 q10,q8,q9
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q11
-       .byte   0xe0,0x63,0xfa,0xf3     @ sha256su0 q11,q8
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe4,0x6c,0x62,0xf3     @ sha256su1 q11,q9,q10
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
 
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
 
        vld1.32         {q13},[r3]
        vadd.i32        q12,q12,q10
        sub             r3,r3,#256-16   @ rewind
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
 
        vadd.i32        q13,q13,q11
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
 
        vadd.i32        q0,q0,q14
        vadd.i32        q1,q1,q15
+       it              ne
        bne             .Loop_v8
 
        vst1.32         {q0,q1},[r0]
@@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8:
 #endif
 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
 .align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .comm   OPENSSL_armcap_P,4,4
 #endif
index 0a4b1ac..f5dd6cb 100644
@@ -227,17 +227,17 @@ aes_v8_encrypt:
 
 .Loop_enc:
        aese    v2.16b,v0.16b
-       ld1     {v0.4s},[x2],#16
        aesmc   v2.16b,v2.16b
+       ld1     {v0.4s},[x2],#16
        subs    w3,w3,#2
        aese    v2.16b,v1.16b
-       ld1     {v1.4s},[x2],#16
        aesmc   v2.16b,v2.16b
+       ld1     {v1.4s},[x2],#16
        b.gt    .Loop_enc
 
        aese    v2.16b,v0.16b
-       ld1     {v0.4s},[x2]
        aesmc   v2.16b,v2.16b
+       ld1     {v0.4s},[x2]
        aese    v2.16b,v1.16b
        eor     v2.16b,v2.16b,v0.16b
 
@@ -256,17 +256,17 @@ aes_v8_decrypt:
 
 .Loop_dec:
        aesd    v2.16b,v0.16b
-       ld1     {v0.4s},[x2],#16
        aesimc  v2.16b,v2.16b
+       ld1     {v0.4s},[x2],#16
        subs    w3,w3,#2
        aesd    v2.16b,v1.16b
-       ld1     {v1.4s},[x2],#16
        aesimc  v2.16b,v2.16b
+       ld1     {v1.4s},[x2],#16
        b.gt    .Loop_dec
 
        aesd    v2.16b,v0.16b
-       ld1     {v0.4s},[x2]
        aesimc  v2.16b,v2.16b
+       ld1     {v0.4s},[x2]
        aesd    v2.16b,v1.16b
        eor     v2.16b,v2.16b,v0.16b
 
@@ -308,16 +308,42 @@ aes_v8_cbc_encrypt:
        eor     v5.16b,v16.16b,v7.16b
        b.eq    .Lcbc_enc128
 
+       ld1     {v2.4s-v3.4s},[x7]
+       add     x7,x3,#16
+       add     x6,x3,#16*4
+       add     x12,x3,#16*5
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+       add     x14,x3,#16*6
+       add     x3,x3,#16*7
+       b       .Lenter_cbc_enc
+
+.align 4
 .Loop_cbc_enc:
        aese    v0.16b,v16.16b
-       ld1     {v16.4s},[x7],#16
        aesmc   v0.16b,v0.16b
-       subs    w6,w6,#2
+        st1    {v6.16b},[x1],#16
+.Lenter_cbc_enc:
        aese    v0.16b,v17.16b
-       ld1     {v17.4s},[x7],#16
        aesmc   v0.16b,v0.16b
-       b.gt    .Loop_cbc_enc
+       aese    v0.16b,v2.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v16.4s},[x6]
+       cmp     w5,#4
+       aese    v0.16b,v3.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v17.4s},[x12]
+       b.eq    .Lcbc_enc192
+
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v16.4s},[x14]
+       aese    v0.16b,v17.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v17.4s},[x3]
+       nop
 
+.Lcbc_enc192:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
         subs   x2,x2,#16
@@ -326,7 +352,6 @@ aes_v8_cbc_encrypt:
         csel   x8,xzr,x8,eq
        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
-        add    x7,x3,#16
        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
         ld1    {v16.16b},[x0],x8
@@ -335,16 +360,14 @@ aes_v8_cbc_encrypt:
         eor    v16.16b,v16.16b,v5.16b
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
-        ld1 {v17.4s},[x7],#16  // re-pre-load rndkey[1]
+        ld1 {v17.4s},[x7]              // re-pre-load rndkey[1]
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v23.16b
-
-        mov    w6,w5
        eor     v6.16b,v0.16b,v7.16b
-       st1     {v6.16b},[x1],#16
        b.hs    .Loop_cbc_enc
 
+       st1     {v6.16b},[x1],#16
        b       .Lcbc_done
 
 .align 5
@@ -402,79 +425,78 @@ aes_v8_cbc_encrypt:
 
 .Loop3x_cbc_dec:
        aesd    v0.16b,v16.16b
-       aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
-       ld1     {v16.4s},[x7],#16
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
-       aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
-       ld1     {v17.4s},[x7],#16
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v17.4s},[x7],#16
        b.gt    .Loop3x_cbc_dec
 
        aesd    v0.16b,v16.16b
-       aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
-        eor    v4.16b,v6.16b,v7.16b
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
+        eor    v4.16b,v6.16b,v7.16b
+        subs   x2,x2,#0x30
         eor    v5.16b,v2.16b,v7.16b
+        csel   x6,x2,x6,lo                     // x6, w6, is zero at this point
        aesd    v0.16b,v17.16b
-       aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
-        eor    v17.16b,v3.16b,v7.16b
-        subs   x2,x2,#0x30
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
-        orr    v6.16b,v19.16b,v19.16b
-        csel   x6,x2,x6,lo                     // x6, w6, is zero at this point
-       aesd    v0.16b,v20.16b
-       aesd    v1.16b,v20.16b
-       aesd    v18.16b,v20.16b
+        eor    v17.16b,v3.16b,v7.16b
         add    x0,x0,x6                // x0 is adjusted in such way that
                                        // at exit from the loop v1.16b-v18.16b
                                        // are loaded with last "words"
+        orr    v6.16b,v19.16b,v19.16b
+        mov    x7,x3
+       aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v20.16b
        aesimc  v18.16b,v18.16b
-        mov    x7,x3
-       aesd    v0.16b,v21.16b
-       aesd    v1.16b,v21.16b
-       aesd    v18.16b,v21.16b
         ld1    {v2.16b},[x0],#16
+       aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v21.16b
        aesimc  v18.16b,v18.16b
         ld1    {v3.16b},[x0],#16
        aesd    v0.16b,v22.16b
-       aesd    v1.16b,v22.16b
-       aesd    v18.16b,v22.16b
-        ld1    {v19.16b},[x0],#16
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v22.16b
        aesimc  v18.16b,v18.16b
-        ld1 {v16.4s},[x7],#16  // re-pre-load rndkey[0]
+        ld1    {v19.16b},[x0],#16
        aesd    v0.16b,v23.16b
        aesd    v1.16b,v23.16b
        aesd    v18.16b,v23.16b
-
+        ld1 {v16.4s},[x7],#16  // re-pre-load rndkey[0]
         add    w6,w5,#2
        eor     v4.16b,v4.16b,v0.16b
        eor     v5.16b,v5.16b,v1.16b
        eor     v18.16b,v18.16b,v17.16b
         ld1 {v17.4s},[x7],#16  // re-pre-load rndkey[1]
-        orr    v0.16b,v2.16b,v2.16b
        st1     {v4.16b},[x1],#16
-        orr    v1.16b,v3.16b,v3.16b
+        orr    v0.16b,v2.16b,v2.16b
        st1     {v5.16b},[x1],#16
+        orr    v1.16b,v3.16b,v3.16b
        st1     {v18.16b},[x1],#16
         orr    v18.16b,v19.16b,v19.16b
        b.hs    .Loop3x_cbc_dec
@@ -485,39 +507,39 @@ aes_v8_cbc_encrypt:
 
 .Lcbc_dec_tail:
        aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
-       ld1     {v16.4s},[x7],#16
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
-       ld1     {v17.4s},[x7],#16
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v17.4s},[x7],#16
        b.gt    .Lcbc_dec_tail
 
        aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
        aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
        aesd    v1.16b,v20.16b
-       aesd    v18.16b,v20.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v20.16b
        aesimc  v18.16b,v18.16b
         cmn    x2,#0x20
        aesd    v1.16b,v21.16b
-       aesd    v18.16b,v21.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v21.16b
        aesimc  v18.16b,v18.16b
         eor    v5.16b,v6.16b,v7.16b
        aesd    v1.16b,v22.16b
-       aesd    v18.16b,v22.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v22.16b
        aesimc  v18.16b,v18.16b
         eor    v17.16b,v3.16b,v7.16b
        aesd    v1.16b,v23.16b
@@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks:
 .align 4
 .Loop3x_ctr32:
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
-       aese            v18.16b,v16.16b
-       ld1             {v16.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
        aesmc           v1.16b,v1.16b
+       aese            v18.16b,v16.16b
        aesmc           v18.16b,v18.16b
+       ld1             {v16.4s},[x7],#16
        subs            w6,w6,#2
        aese            v0.16b,v17.16b
-       aese            v1.16b,v17.16b
-       aese            v18.16b,v17.16b
-       ld1             {v17.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
        aesmc           v1.16b,v1.16b
+       aese            v18.16b,v17.16b
        aesmc           v18.16b,v18.16b
+       ld1             {v17.4s},[x7],#16
        b.gt            .Loop3x_ctr32
 
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
-       aese            v18.16b,v16.16b
-        mov            x7,x3
        aesmc           v4.16b,v0.16b
-        ld1            {v2.16b},[x0],#16
+       aese            v1.16b,v16.16b
        aesmc           v5.16b,v1.16b
-       aesmc           v18.16b,v18.16b
+        ld1            {v2.16b},[x0],#16
         orr            v0.16b,v6.16b,v6.16b
-       aese            v4.16b,v17.16b
+       aese            v18.16b,v16.16b
+       aesmc           v18.16b,v18.16b
         ld1            {v3.16b},[x0],#16
-       aese            v5.16b,v17.16b
-       aese            v18.16b,v17.16b
         orr            v1.16b,v6.16b,v6.16b
+       aese            v4.16b,v17.16b
        aesmc           v4.16b,v4.16b
-        ld1            {v19.16b},[x0],#16
+       aese            v5.16b,v17.16b
        aesmc           v5.16b,v5.16b
+        ld1            {v19.16b},[x0],#16
+        mov            x7,x3
+       aese            v18.16b,v17.16b
        aesmc           v17.16b,v18.16b
         orr            v18.16b,v6.16b,v6.16b
         add            w9,w8,#1
        aese            v4.16b,v20.16b
+       aesmc           v4.16b,v4.16b
        aese            v5.16b,v20.16b
-       aese            v17.16b,v20.16b
+       aesmc           v5.16b,v5.16b
         eor            v2.16b,v2.16b,v7.16b
         add            w10,w8,#2
-       aesmc           v4.16b,v4.16b
-       aesmc           v5.16b,v5.16b
+       aese            v17.16b,v20.16b
        aesmc           v17.16b,v17.16b
         eor            v3.16b,v3.16b,v7.16b
         add            w8,w8,#3
        aese            v4.16b,v21.16b
+       aesmc           v4.16b,v4.16b
        aese            v5.16b,v21.16b
-       aese            v17.16b,v21.16b
+       aesmc           v5.16b,v5.16b
         eor            v19.16b,v19.16b,v7.16b
         rev            w9,w9
-       aesmc           v4.16b,v4.16b
-        ld1     {v16.4s},[x7],#16      // re-pre-load rndkey[0]
-       aesmc           v5.16b,v5.16b
+       aese            v17.16b,v21.16b
        aesmc           v17.16b,v17.16b
         mov    v0.s[3], w9
         rev            w10,w10
        aese            v4.16b,v22.16b
+       aesmc           v4.16b,v4.16b
        aese            v5.16b,v22.16b
-       aese            v17.16b,v22.16b
+       aesmc           v5.16b,v5.16b
         mov    v1.s[3], w10
         rev            w12,w8
-       aesmc           v4.16b,v4.16b
-       aesmc           v5.16b,v5.16b
+       aese            v17.16b,v22.16b
        aesmc           v17.16b,v17.16b
         mov    v18.s[3], w12
         subs           x2,x2,#3
@@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks:
        aese            v5.16b,v23.16b
        aese            v17.16b,v23.16b
 
-        mov            w6,w5
        eor             v2.16b,v2.16b,v4.16b
+        ld1     {v16.4s},[x7],#16      // re-pre-load rndkey[0]
+       st1             {v2.16b},[x1],#16
        eor             v3.16b,v3.16b,v5.16b
+        mov            w6,w5
+       st1             {v3.16b},[x1],#16
        eor             v19.16b,v19.16b,v17.16b
         ld1     {v17.4s},[x7],#16      // re-pre-load rndkey[1]
-       st1             {v2.16b},[x1],#16
-       st1             {v3.16b},[x1],#16
        st1             {v19.16b},[x1],#16
        b.hs            .Loop3x_ctr32
 
@@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks:
 
 .Lctr32_tail:
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
-       ld1             {v16.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
        aesmc           v1.16b,v1.16b
+       ld1             {v16.4s},[x7],#16
        subs            w6,w6,#2
        aese            v0.16b,v17.16b
-       aese            v1.16b,v17.16b
-       ld1             {v17.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
        aesmc           v1.16b,v1.16b
+       ld1             {v17.4s},[x7],#16
        b.gt            .Lctr32_tail
 
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
        aesmc           v1.16b,v1.16b
        aese            v0.16b,v17.16b
-       aese            v1.16b,v17.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
        aesmc           v1.16b,v1.16b
         ld1            {v2.16b},[x0],x12
        aese            v0.16b,v20.16b
-       aese            v1.16b,v20.16b
-        ld1            {v3.16b},[x0]
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v20.16b
        aesmc           v1.16b,v1.16b
+        ld1            {v3.16b},[x0]
        aese            v0.16b,v21.16b
-       aese            v1.16b,v21.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v21.16b
        aesmc           v1.16b,v1.16b
-       aese            v0.16b,v22.16b
-       aese            v1.16b,v22.16b
         eor            v2.16b,v2.16b,v7.16b
+       aese            v0.16b,v22.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v22.16b
        aesmc           v1.16b,v1.16b
         eor            v3.16b,v3.16b,v7.16b
        aese            v0.16b,v23.16b
index 1bfb263..479007d 100644 (file)
 .type  gcm_init_v8,%function
 .align 4
 gcm_init_v8:
-       ld1             {v17.2d},[x1]           //load H
-       movi            v16.16b,#0xe1
+       ld1             {v17.2d},[x1]           //load input H
+       movi            v19.16b,#0xe1
+       shl     v19.2d,v19.2d,#57               //0xc2.0
        ext             v3.16b,v17.16b,v17.16b,#8
-       shl     v16.2d,v16.2d,#57
-       ushr    v18.2d,v16.2d,#63
-       ext             v16.16b,v18.16b,v16.16b,#8              //t0=0xc2....01
+       ushr    v18.2d,v19.2d,#63
        dup             v17.4s,v17.s[1]
-       ushr    v19.2d,v3.2d,#63
+       ext             v16.16b,v18.16b,v19.16b,#8              //t0=0xc2....01
+       ushr    v18.2d,v3.2d,#63
        sshr    v17.4s,v17.4s,#31               //broadcast carry bit
-       and             v19.16b,v19.16b,v16.16b
+       and             v18.16b,v18.16b,v16.16b
        shl     v3.2d,v3.2d,#1
-       ext             v19.16b,v19.16b,v19.16b,#8
+       ext             v18.16b,v18.16b,v18.16b,#8
        and             v16.16b,v16.16b,v17.16b
-       orr             v3.16b,v3.16b,v19.16b           //H<<<=1
-       eor             v3.16b,v3.16b,v16.16b           //twisted H
-       st1             {v3.2d},[x0]
+       orr             v3.16b,v3.16b,v18.16b           //H<<<=1
+       eor             v20.16b,v3.16b,v16.16b          //twisted H
+       st1             {v20.2d},[x0],#16               //store Htable[0]
+
+       //calculate H^2
+       ext             v16.16b,v20.16b,v20.16b,#8              //Karatsuba pre-processing
+       pmull   v0.1q,v20.1d,v20.1d
+       eor             v16.16b,v16.16b,v20.16b
+       pmull2  v2.1q,v20.2d,v20.2d
+       pmull   v1.1q,v16.1d,v16.1d
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase
+
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+       eor             v0.16b,v1.16b,v18.16b
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v18.16b,v18.16b,v2.16b
+       eor             v22.16b,v0.16b,v18.16b
+
+       ext             v17.16b,v22.16b,v22.16b,#8              //Karatsuba pre-processing
+       eor             v17.16b,v17.16b,v22.16b
+       ext             v21.16b,v16.16b,v17.16b,#8              //pack Karatsuba pre-processed
+       st1             {v21.2d-v22.2d},[x0]            //store Htable[1..2]
 
        ret
 .size  gcm_init_v8,.-gcm_init_v8
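The rewritten gcm_init_v8 above now precomputes H^2 and stores the Karatsuba-folded halves alongside it (the "store Htable[0]" / "store Htable[1..2]" stores), so gcm_ghash_v8 can fold two blocks per iteration. The pmull/pmull2 pairs plus the "Karatsuba pre-/post-processing" eors implement the standard three-multiplication split of a 128x128 carry-less product. A minimal C model of that split, where clmul64 is a hypothetical bit-by-bit stand-in for PMULL:

#include <stdint.h>

/* bit-by-bit carry-less 64x64 -> 128-bit multiply; stands in for PMULL */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t h = 0, l = 0;
    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) {
            l ^= a << i;
            if (i) h ^= a >> (64 - i);
        }
    *hi = h;
    *lo = l;
}

/* Karatsuba: one 128x128 carry-less product from three 64x64 products.
   mid mirrors the "Karatsuba pre-processing" eors; folding lo and hi
   back into mid is the "Karatsuba post-processing" step. */
static void clmul128(const uint64_t a[2], const uint64_t b[2], uint64_t r[4])
{
    uint64_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;
    clmul64(a[0], b[0], &lo_h, &lo_l);                 /* pmull  */
    clmul64(a[1], b[1], &hi_h, &hi_l);                 /* pmull2 */
    clmul64(a[0] ^ a[1], b[0] ^ b[1], &mid_h, &mid_l); /* pre-processing  */
    mid_l ^= lo_l ^ hi_l;                              /* post-processing */
    mid_h ^= lo_h ^ hi_h;
    r[0] = lo_l;
    r[1] = lo_h ^ mid_l;
    r[2] = hi_l ^ mid_h;
    r[3] = hi_h;
}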
-
 .global        gcm_gmult_v8
 .type  gcm_gmult_v8,%function
 .align 4
 gcm_gmult_v8:
        ld1             {v17.2d},[x0]           //load Xi
        movi            v19.16b,#0xe1
-       ld1             {v20.2d},[x1]           //load twisted H
+       ld1             {v20.2d-v21.2d},[x1]    //load twisted H, ...
        shl     v19.2d,v19.2d,#57
 #ifndef __ARMEB__
        rev64   v17.16b,v17.16b
 #endif
-       ext             v21.16b,v20.16b,v20.16b,#8
-       mov             x3,#0
        ext             v3.16b,v17.16b,v17.16b,#8
-       mov             x12,#0
-       eor             v21.16b,v21.16b,v20.16b         //Karatsuba pre-processing
-       mov             x2,x0
-       b               .Lgmult_v8
-.size  gcm_gmult_v8,.-gcm_gmult_v8
 
+       pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
+       eor             v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
+       pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
+       pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
+
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+       eor             v0.16b,v1.16b,v18.16b
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v18.16b,v18.16b,v2.16b
+       eor             v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+       rev64   v0.16b,v0.16b
+#endif
+       ext             v0.16b,v0.16b,v0.16b,#8
+       st1             {v0.2d},[x0]            //write out Xi
+
+       ret
+.size  gcm_gmult_v8,.-gcm_gmult_v8
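For reference, gcm_gmult_v8 computes Xi <- Xi * H in GF(2^128) with GCM's reflected bit order; the 0xe1 byte it splats comes from the reduction polynomial x^128 + x^7 + x^2 + x + 1. A hedged bit-by-bit C model in the SP 800-38D style (big-endian halves); the asm's two pmull "phases" are a faster lane-wise version of the same reduction, performed on a "twisted" representation of H, so the constants do not line up one-to-one:

#include <stdint.h>

typedef struct { uint64_t hi, lo; } be128;   /* big-endian 128-bit halves */

/* reference GF(2^128) multiply with the GCM bit ordering */
static be128 gf128_mul(be128 X, be128 H)
{
    be128 Z = { 0, 0 }, V = X;
    for (int i = 0; i < 128; i++) {
        /* bit i of H, counting from the most significant bit */
        uint64_t bit = (i < 64 ? H.hi >> (63 - i) : H.lo >> (127 - i)) & 1;
        if (bit) { Z.hi ^= V.hi; Z.lo ^= V.lo; }
        /* V <- V * x: shift right, fold the carry with 0xe1 || 0^120 */
        uint64_t carry = V.lo & 1;
        V.lo = (V.lo >> 1) | (V.hi << 63);
        V.hi >>= 1;
        if (carry) V.hi ^= 0xe100000000000000ULL;
    }
    return Z;
}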
 .global        gcm_ghash_v8
 .type  gcm_ghash_v8,%function
 .align 4
 gcm_ghash_v8:
        ld1             {v0.2d},[x0]            //load [rotated] Xi
-       subs            x3,x3,#16
+                                               //"[rotated]" means that
+                                               //loaded value would have
+                                               //to be rotated in order to
+                                               //make it appear as in
+                                               //algorithm specification
+       subs            x3,x3,#32               //see if x3 is 32 or larger
+       mov             x12,#16         //x12 is used as post-
+                                               //increment for input pointer;
+                                               //as loop is modulo-scheduled
+                                               //x12 is zeroed just in time
+                                               //to preclude overstepping
+                                               //inp[len], which means that
+                                               //last block[s] are actually
+                                               //loaded twice, but last
+                                               //copy is not processed
+       ld1             {v20.2d-v21.2d},[x1],#32        //load twisted H, ..., H^2
        movi            v19.16b,#0xe1
-       mov             x12,#16
-       ld1             {v20.2d},[x1]           //load twisted H
-       csel    x12,xzr,x12,eq
-       ext             v0.16b,v0.16b,v0.16b,#8
-       shl     v19.2d,v19.2d,#57
-       ld1             {v17.2d},[x2],x12       //load [rotated] inp
-       ext             v21.16b,v20.16b,v20.16b,#8
+       ld1             {v22.2d},[x1]
+       csel    x12,xzr,x12,eq                  //is it time to zero x12?
+       ext             v0.16b,v0.16b,v0.16b,#8         //rotate Xi
+       ld1             {v16.2d},[x2],#16       //load [rotated] I[0]
+       shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
 #ifndef __ARMEB__
+       rev64   v16.16b,v16.16b
        rev64   v0.16b,v0.16b
+#endif
+       ext             v3.16b,v16.16b,v16.16b,#8               //rotate I[0]
+       b.lo            .Lodd_tail_v8           //x3 was less than 32
+       ld1             {v17.2d},[x2],x12       //load [rotated] I[1]
+#ifndef __ARMEB__
        rev64   v17.16b,v17.16b
 #endif
-       eor             v21.16b,v21.16b,v20.16b         //Karatsuba pre-processing
-       ext             v3.16b,v17.16b,v17.16b,#8
-       b               .Loop_v8
+       ext             v7.16b,v17.16b,v17.16b,#8
+       eor             v3.16b,v3.16b,v0.16b            //I[i]^=Xi
+       pmull   v4.1q,v20.1d,v7.1d              //H·Ii+1
+       eor             v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
+       pmull2  v6.1q,v20.2d,v7.2d
+       b               .Loop_mod2x_v8
 
 .align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+       ext             v18.16b,v3.16b,v3.16b,#8
+       subs            x3,x3,#32               //is there more data?
+       pmull   v0.1q,v22.1d,v3.1d              //H^2.lo·Xi.lo
+       csel    x12,xzr,x12,lo                  //is it time to zero x12?
+
+        pmull  v5.1q,v21.1d,v17.1d
+       eor             v18.16b,v18.16b,v3.16b          //Karatsuba pre-processing
+       pmull2  v2.1q,v22.2d,v3.2d              //H^2.hi·Xi.hi
+       eor             v0.16b,v0.16b,v4.16b            //accumulate
+       pmull2  v1.1q,v21.2d,v18.2d             //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+        ld1    {v16.2d},[x2],x12       //load [rotated] I[i+2]
+
+       eor             v2.16b,v2.16b,v6.16b
+        csel   x12,xzr,x12,eq                  //is it time to zero x12?
+       eor             v1.16b,v1.16b,v5.16b
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+        ld1    {v17.2d},[x2],x12       //load [rotated] I[i+3]
+#ifndef __ARMEB__
+        rev64  v16.16b,v16.16b
+#endif
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
+
+#ifndef __ARMEB__
+        rev64  v17.16b,v17.16b
+#endif
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+        ext            v7.16b,v17.16b,v17.16b,#8
+        ext            v3.16b,v16.16b,v16.16b,#8
+       eor             v0.16b,v1.16b,v18.16b
+        pmull  v4.1q,v20.1d,v7.1d              //H·Ii+1
+       eor             v3.16b,v3.16b,v2.16b            //accumulate v3.16b early
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v3.16b,v3.16b,v18.16b
+        eor            v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
+       eor             v3.16b,v3.16b,v0.16b
+        pmull2 v6.1q,v20.2d,v7.2d
+       b.hs            .Loop_mod2x_v8          //there was at least 32 more bytes
+
+       eor             v2.16b,v2.16b,v18.16b
+       ext             v3.16b,v16.16b,v16.16b,#8               //re-construct v3.16b
+       adds            x3,x3,#32               //re-construct x3
+       eor             v0.16b,v0.16b,v2.16b            //re-construct v0.16b
+       b.eq            .Ldone_v8               //is x3 zero?
+.Lodd_tail_v8:
        ext             v18.16b,v0.16b,v0.16b,#8
        eor             v3.16b,v3.16b,v0.16b            //inp^=Xi
-       eor             v17.16b,v17.16b,v18.16b         //v17.16b is rotated inp^Xi
+       eor             v17.16b,v16.16b,v18.16b         //v17.16b is rotated inp^Xi
 
-.Lgmult_v8:
        pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
        eor             v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
-       subs            x3,x3,#16
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
-       csel    x12,xzr,x12,eq
 
        ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
        eor             v18.16b,v0.16b,v2.16b
        eor             v1.16b,v1.16b,v17.16b
-        ld1    {v17.2d},[x2],x12       //load [rotated] inp
        eor             v1.16b,v1.16b,v18.16b
-       pmull   v18.1q,v0.1d,v19.1d             //1st phase
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
 
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
-#ifndef __ARMEB__
-        rev64  v17.16b,v17.16b
-#endif
        eor             v0.16b,v1.16b,v18.16b
-        ext            v3.16b,v17.16b,v17.16b,#8
 
-       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor             v18.16b,v18.16b,v2.16b
        eor             v0.16b,v0.16b,v18.16b
-       b.hs            .Loop_v8
 
+.Ldone_v8:
 #ifndef __ARMEB__
        rev64   v0.16b,v0.16b
 #endif
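The x12 bookkeeping that the comments in gcm_ghash_v8 describe is easier to see in scalar form. A sketch under assumed semantics (not the exact asm): the loop pre-loads the next block on every iteration, so the post-increment is forced to zero one step early and the final block is simply read twice, in bounds, instead of stepping past inp[len]. It assumes len is a nonzero multiple of 16, as the asm does:

#include <string.h>

void ghash_blocks(unsigned char Xi[16], const unsigned char *inp, size_t len)
{
    const unsigned char *p = inp;
    size_t step = 16;                  /* x12 */
    unsigned char cur[16], nxt[16];

    memcpy(cur, p, 16);                /* load I[0] */
    if (len == 16)
        step = 0;                      /* csel x12,xzr,x12,eq */
    p += step;

    while (len) {
        memcpy(nxt, p, 16);            /* pre-load next block; once step */
        len -= 16;                     /* is zero this re-reads the last */
        if (len <= 16)                 /* block instead of overstepping  */
            step = 0;                  /* inp[len]                       */
        p += step;
        /* ... fold cur into Xi here (elided) ... */
        memcpy(cur, nxt, 16);
    }
    (void)Xi;
}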
index 84708af..6573fe4 100644 (file)
@@ -17,7 +17,10 @@ aesni_encrypt:
        leaq    16(%rdx),%rdx
        jnz     .Loop_enc1_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 .size  aesni_encrypt,.-aesni_encrypt
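The pxor instructions added throughout this file are register hygiene: each routine now zeroes the xmm registers that held key schedule or data material before returning, and the larger routines additionally scrub their stack frame with movaps stores to (%rsp). The reason this lives in assembly shows up in a C sketch of the same idea, where a compiler is free to delete the "dead" clearing stores (names here are illustrative, not OpenSSL API):

#include <emmintrin.h>

static void encrypt_one_block_sketch(void)
{
    __m128i rk  = _mm_setzero_si128();  /* imagine a round key loaded here */
    __m128i blk = _mm_setzero_si128();  /* and a block being processed     */

    /* ... AES rounds using rk and blk ... */

    /* analogue of "pxor %xmm0,%xmm0" before ret; in C the compiler may
       dead-store-eliminate these, which is why the scrub is done in asm */
    rk  = _mm_xor_si128(rk, rk);
    blk = _mm_xor_si128(blk, blk);
    (void)rk;
    (void)blk;
}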
 
@@ -38,7 +41,10 @@ aesni_decrypt:
        leaq    16(%rdx),%rdx
        jnz     .Loop_dec1_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 .size  aesni_decrypt, .-aesni_decrypt
 .type  _aesni_encrypt2,@function
@@ -264,21 +270,18 @@ _aesni_encrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     .Lenc_loop6_enter
 .align 16
 .Lenc_loop6:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+.Lenc_loop6_enter:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.Lenc_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     .Ldec_loop6_enter
 .align 16
 .Ldec_loop6:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+.Ldec_loop6_enter:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-.Ldec_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,220,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,220,225
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-.byte  102,68,15,56,220,193
-.byte  102,68,15,56,220,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     .Lenc_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     .Lenc_loop8_inner
 .align 16
 .Lenc_loop8:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
+.Lenc_loop8_inner:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,222,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,222,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-.byte  102,68,15,56,222,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     .Ldec_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     .Ldec_loop8_inner
 .align 16
 .Ldec_loop8:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
+.Ldec_loop8_inner:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
@@ -587,6 +577,7 @@ aesni_ecb_encrypt:
        movups  80(%rdi),%xmm7
        je      .Lecb_enc_six
        movdqu  96(%rdi),%xmm8
+       xorps   %xmm9,%xmm9
        call    _aesni_encrypt8
        movups  %xmm2,(%rsi)
        movups  %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ aesni_ecb_encrypt:
        jnc     .Lecb_dec_loop8
 
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movq    %r11,%rcx
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movl    %r10d,%eax
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
        movups  %xmm9,112(%rsi)
+       pxor    %xmm9,%xmm9
        leaq    128(%rsi),%rsi
        addq    $128,%rdx
        jz      .Lecb_ret
@@ -731,14 +730,23 @@ aesni_ecb_encrypt:
        je      .Lecb_dec_six
        movups  96(%rdi),%xmm8
        movups  (%rcx),%xmm0
+       xorps   %xmm9,%xmm9
        call    _aesni_decrypt8
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_one:
@@ -754,49 +762,73 @@ aesni_ecb_encrypt:
        jnz     .Loop_dec1_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_two:
        call    _aesni_decrypt2
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_five:
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_six:
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
 
 .Lecb_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        .byte   0xf3,0xc3
 .size  aesni_ecb_encrypt,.-aesni_ecb_encrypt
 .globl aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks:
        leaq    16(%rsi),%rsi
        jnz     .Lccm64_enc_outer
 
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 .size  aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
 .globl aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks:
        leaq    16(%r11),%r11
        jnz     .Loop_enc1_6
 .byte  102,15,56,221,217
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 .size  aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
 .globl aesni_ctr32_encrypt_blocks
 .type  aesni_ctr32_encrypt_blocks,@function
 .align 16
 aesni_ctr32_encrypt_blocks:
+       cmpq    $1,%rdx
+       jne     .Lctr32_bulk
+
+
+
+       movups  (%r8),%xmm2
+       movups  (%rdi),%xmm3
+       movl    240(%rcx),%edx
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+.Loop_enc1_7:
+.byte  102,15,56,220,209
+       decl    %edx
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_enc1_7
+.byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm2
+       jmp     .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $128,%rsp
        andq    $-16,%rsp
        leaq    -8(%rax),%rbp
 
-       cmpq    $1,%rdx
-       je      .Lctr32_one_shortcut
+
+
 
        movdqu  (%r8),%xmm2
        movdqu  (%rcx),%xmm0
@@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks:
        leaq    -128(%rcx),%rcx
 
 .Lctr32_tail:
+
+
        leaq    16(%rcx),%rcx
        cmpq    $4,%rdx
        jb      .Lctr32_loop3
        je      .Lctr32_loop4
 
+
        shll    $4,%eax
        movdqa  96(%rsp),%xmm8
        pxor    %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks:
        movups  32(%rdi),%xmm12
        xorps   %xmm12,%xmm4
        movups  %xmm4,32(%rsi)
-       jmp     .Lctr32_done
 
-.align 16
-.Lctr32_one_shortcut:
-       movups  (%r8),%xmm2
-       movups  (%rdi),%xmm10
-       movl    240(%rcx),%eax
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-.Loop_enc1_7:
-.byte  102,15,56,220,209
-       decl    %eax
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_7
-.byte  102,15,56,221,209
-       xorps   %xmm10,%xmm2
-       movups  %xmm2,(%rsi)
-       jmp     .Lctr32_done
-
-.align 16
 .Lctr32_done:
+       xorps   %xmm0,%xmm0
+       xorl    %r11d,%r11d
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       movaps  %xmm0,112(%rsp)
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lctr32_epilogue:
@@ -1750,6 +1829,7 @@ aesni_xts_encrypt:
        shrl    $4,%eax
 
 .Lxts_enc_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        addq    $96,%rdx
@@ -1778,6 +1858,7 @@ aesni_xts_encrypt:
        pxor    %xmm12,%xmm4
        pxor    %xmm13,%xmm5
        pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm7
 
        call    _aesni_encrypt6
 
@@ -1920,6 +2001,29 @@ aesni_xts_encrypt:
        movups  %xmm2,-16(%rsi)
 
 .Lxts_enc_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lxts_enc_epilogue:
@@ -2196,6 +2300,7 @@ aesni_xts_decrypt:
        shrl    $4,%eax
 
 .Lxts_dec_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        pxor    %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ aesni_xts_decrypt:
        movups  %xmm2,(%rsi)
 
 .Lxts_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lxts_dec_epilogue:
@@ -2446,7 +2574,11 @@ aesni_cbc_encrypt:
        jnc     .Lcbc_enc_loop
        addq    $16,%rdx
        jnz     .Lcbc_enc_tail
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%r8)
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
        jmp     .Lcbc_ret
 
 .Lcbc_enc_tail:
@@ -2466,6 +2598,35 @@ aesni_cbc_encrypt:
 
 .align 16
 .Lcbc_decrypt:
+       cmpq    $16,%rdx
+       jne     .Lcbc_decrypt_bulk
+
+
+
+       movdqu  (%rdi),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  %xmm2,%xmm4
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+.Loop_dec1_16:
+.byte  102,15,56,222,209
+       decl    %r10d
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_dec1_16
+.byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movdqu  %xmm4,(%r8)
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
+       jmp     .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $16,%rsp
@@ -2702,7 +2863,7 @@ aesni_cbc_encrypt:
        movaps  %xmm9,%xmm2
        leaq    -112(%rcx),%rcx
        addq    $112,%rdx
-       jle     .Lcbc_dec_tail_collected
+       jle     .Lcbc_dec_clear_tail_collected
        movups  %xmm9,(%rsi)
        leaq    16(%rsi),%rsi
        cmpq    $80,%rdx
@@ -2721,14 +2882,19 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        leaq    80(%rsi),%rsi
        movdqa  %xmm7,%xmm2
+       pxor    %xmm7,%xmm7
        jmp     .Lcbc_dec_tail_collected
 
 .align 16
@@ -2743,16 +2909,23 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        pxor    %xmm9,%xmm8
        movdqu  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        leaq    96(%rsi),%rsi
        movdqa  %xmm8,%xmm2
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     .Lcbc_dec_tail_collected
 
 .align 16
@@ -2796,7 +2969,7 @@ aesni_cbc_encrypt:
 
        movdqa  %xmm7,%xmm2
        addq    $80,%rdx
-       jle     .Lcbc_dec_tail_collected
+       jle     .Lcbc_dec_clear_tail_collected
        movups  %xmm7,(%rsi)
        leaq    16(%rsi),%rsi
 
@@ -2831,12 +3004,17 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        leaq    64(%rsi),%rsi
        movdqa  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        subq    $16,%rdx
        jmp     .Lcbc_dec_tail_collected
 
@@ -2847,12 +3025,12 @@ aesni_cbc_encrypt:
        movups  16(%rcx),%xmm1
        leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
 .byte  102,15,56,222,209
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     .Loop_dec1_16
+       jnz     .Loop_dec1_17
 .byte  102,15,56,223,209
        xorps   %xmm10,%xmm2
        movaps  %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ aesni_cbc_encrypt:
        pxor    %xmm11,%xmm3
        movdqu  %xmm2,(%rsi)
        movdqa  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leaq    16(%rsi),%rsi
        jmp     .Lcbc_dec_tail_collected
 .align 16
@@ -2878,7 +3057,9 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movdqa  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        leaq    32(%rsi),%rsi
        jmp     .Lcbc_dec_tail_collected
 .align 16
@@ -2891,29 +3072,45 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movdqa  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        leaq    48(%rsi),%rsi
        jmp     .Lcbc_dec_tail_collected
 
 .align 16
+.Lcbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
 .Lcbc_dec_tail_collected:
        movups  %xmm10,(%r8)
        andq    $15,%rdx
        jnz     .Lcbc_dec_tail_partial
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     .Lcbc_dec_ret
 .align 16
 .Lcbc_dec_tail_partial:
        movaps  %xmm2,(%rsp)
+       pxor    %xmm2,%xmm2
        movq    $16,%rcx
        movq    %rsi,%rdi
        subq    %rdx,%rcx
        leaq    (%rsp),%rsi
 .long  0x9066A4F3
+       movdqa  %xmm2,(%rsp)
 
 .Lcbc_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lcbc_ret:
@@ -2951,7 +3148,9 @@ aesni_set_decrypt_key:
 
        movups  (%rdx),%xmm0
 .byte  102,15,56,219,192
+       pxor    %xmm1,%xmm1
        movups  %xmm0,(%rdi)
+       pxor    %xmm0,%xmm0
 .Ldec_key_ret:
        addq    $8,%rsp
        .byte   0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
        testq   %rdx,%rdx
        jz      .Lenc_key_ret
 
+       movl    $268437504,%r10d
        movups  (%rdi),%xmm0
        xorps   %xmm4,%xmm4
+       andl    OPENSSL_ia32cap_P+4(%rip),%r10d
        leaq    16(%rdx),%rax
        cmpl    $256,%esi
        je      .L14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
 
 .L10rounds:
        movl    $9,%esi
+       cmpl    $268435456,%r10d
+       je      .L10rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,200,1
        call    .Lkey_expansion_128_cold
@@ -3008,9 +3212,79 @@ __aesni_set_encrypt_key:
        jmp     .Lenc_key_ret
 
 .align 16
+.L10rounds_alt:
+       movdqa  .Lkey_rotate(%rip),%xmm5
+       movl    $8,%r10d
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,(%rdx)
+       jmp     .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leaq    16(%rax),%rax
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%rax)
+       movdqa  %xmm0,%xmm2
+
+       decl    %r10d
+       jnz     .Loop_key128
+
+       movdqa  .Lkey_rcon1b(%rip),%xmm4
+
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%rax)
+
+       movl    %esi,96(%rax)
+       xorl    %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
 .L12rounds:
        movq    16(%rdi),%xmm2
        movl    $11,%esi
+       cmpl    $268435456,%r10d
+       je      .L12rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,202,1
        call    .Lkey_expansion_192a_cold
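The new .L10rounds_alt/.L12rounds_alt/.L14rounds_alt paths, taken when the mask loaded into %r10d matches the OPENSSL_ia32cap_P+4 capability word, expand the key with pshufb and aesenclast (the .byte 102,15,56,0,... and 102,15,56,221,... opcodes) instead of aeskeygenassist. The repeated pslldq $4 / pxor cascade inside each loop computes a running XOR prefix across the four 32-bit words of the previous round key, which is exactly the recurrence the AES key schedule needs before xoring in the substituted word. A plain-C model of just that cascade:

#include <stdint.h>

/* After three shift-xor steps the lane holds the running XOR prefix
   of its four 32-bit words: { w0, w0^w1, w0^w1^w2, w0^w1^w2^w3 }. */
static void xor_prefix(uint32_t w[4])
{
    w[1] ^= w[0];
    w[2] ^= w[1];
    w[3] ^= w[2];
}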
@@ -3034,10 +3308,54 @@ __aesni_set_encrypt_key:
        jmp     .Lenc_key_ret
 
 .align 16
+.L12rounds_alt:
+       movdqa  .Lkey_rotate192(%rip),%xmm5
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       movl    $8,%r10d
+       movdqu  %xmm0,(%rdx)
+       jmp     .Loop_key192
+
+.align 16
+.Loop_key192:
+       movq    %xmm2,0(%rax)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leaq    24(%rax),%rax
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%rax)
+
+       decl    %r10d
+       jnz     .Loop_key192
+
+       movl    %esi,32(%rax)
+       xorl    %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
 .L14rounds:
        movups  16(%rdi),%xmm2
        movl    $13,%esi
        leaq    16(%rax),%rax
+       cmpl    $268435456,%r10d
+       je      .L14rounds_alt
+
        movups  %xmm0,(%rdx)
        movups  %xmm2,16(%rdx)
 .byte  102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ __aesni_set_encrypt_key:
        jmp     .Lenc_key_ret
 
 .align 16
+.L14rounds_alt:
+       movdqa  .Lkey_rotate(%rip),%xmm5
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       movl    $7,%r10d
+       movdqu  %xmm0,0(%rdx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,16(%rdx)
+       jmp     .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       decl    %r10d
+       jz      .Ldone_key256
+
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%rax)
+       leaq    32(%rax),%rax
+       movdqa  %xmm2,%xmm1
+
+       jmp     .Loop_key256
+
+.Ldone_key256:
+       movl    %esi,16(%rax)
+       xorl    %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
 .Lbad_keybits:
        movq    $-2,%rax
 .Lenc_key_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
        addq    $8,%rsp
        .byte   0xf3,0xc3
 .LSEH_end_set_encrypt_key:
@@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key:
 .long  0x87,0,1,0
 .Lincrement1:
 .byte  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long  0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long  0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long  1,1,1,1
+.Lkey_rcon1b:
+.long  0x1b,0x1b,0x1b,0x1b
 
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 64
index 84dd720..db3fe39 100644 (file)
@@ -2884,11 +2884,16 @@ sqrx8x_reduction:
 .type  bn_get_bits5,@function
 .align 16
 bn_get_bits5:
-       movq    %rdi,%r10
+       leaq    0(%rdi),%r10
+       leaq    1(%rdi),%r11
        movl    %esi,%ecx
-       shrl    $3,%esi
-       movzwl  (%r10,%rsi,1),%eax
-       andl    $7,%ecx
+       shrl    $4,%esi
+       andl    $15,%ecx
+       leal    -8(%rcx),%eax
+       cmpl    $11,%ecx
+       cmovaq  %r11,%r10
+       cmoval  %eax,%ecx
+       movzwl  (%r10,%rsi,2),%eax
        shrl    %cl,%eax
        andl    $31,%eax
        .byte   0xf3,0xc3
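The bn_get_bits5 rewrite is an overread fix: the old code fetched a 16-bit word at byte offset bitnum/8, which can read one byte past the end of the number, while the new code fetches aligned 16-bit words and, when the five-bit window would straddle a word boundary (in-word offset above 11), moves the base pointer up one byte and adjusts the shift instead. A C model of the new logic, assuming a little-endian load as on x86:

#include <stdint.h>
#include <string.h>

static unsigned int get_bits5(const unsigned char *ap, int bitnum)
{
    const unsigned char *p = ap;         /* %r10, with %r11 = ap + 1     */
    int word = bitnum >> 4;              /* which 16-bit word            */
    int off  = bitnum & 15;              /* bit offset inside that word  */
    uint16_t w;

    if (off > 11) {                      /* window would straddle the    */
        p++;                             /* word: shift the base up one  */
        off -= 8;                        /* byte and adjust the offset   */
    }
    memcpy(&w, p + 2 * word, sizeof(w)); /* little-endian 16-bit load    */
    return (w >> off) & 31;
}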
index 57509ae..41ad80e 100644 (file)
@@ -17,7 +17,10 @@ L$oop_enc1_1:
        leaq    16(%rdx),%rdx
        jnz     L$oop_enc1_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 
 
@@ -38,7 +41,10 @@ L$oop_dec1_2:
        leaq    16(%rdx),%rdx
        jnz     L$oop_dec1_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 
 
@@ -264,21 +270,18 @@ _aesni_encrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     L$enc_loop6_enter
 .p2align       4
 L$enc_loop6:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+L$enc_loop6_enter:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-L$enc_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     L$dec_loop6_enter
 .p2align       4
 L$dec_loop6:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+L$dec_loop6_enter:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-L$dec_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,220,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,220,225
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-.byte  102,68,15,56,220,193
-.byte  102,68,15,56,220,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     L$enc_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     L$enc_loop8_inner
 .p2align       4
 L$enc_loop8:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
+L$enc_loop8_inner:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,222,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,222,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-.byte  102,68,15,56,222,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     L$dec_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     L$dec_loop8_inner
 .p2align       4
 L$dec_loop8:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
+L$dec_loop8_inner:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
@@ -587,6 +577,7 @@ L$ecb_enc_tail:
        movups  80(%rdi),%xmm7
        je      L$ecb_enc_six
        movdqu  96(%rdi),%xmm8
+       xorps   %xmm9,%xmm9
        call    _aesni_encrypt8
        movups  %xmm2,(%rsi)
        movups  %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter:
        jnc     L$ecb_dec_loop8
 
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movq    %r11,%rcx
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movl    %r10d,%eax
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
        movups  %xmm9,112(%rsi)
+       pxor    %xmm9,%xmm9
        leaq    128(%rsi),%rsi
        addq    $128,%rdx
        jz      L$ecb_ret
@@ -731,14 +730,23 @@ L$ecb_dec_tail:
        je      L$ecb_dec_six
        movups  96(%rdi),%xmm8
        movups  (%rcx),%xmm0
+       xorps   %xmm9,%xmm9
        call    _aesni_decrypt8
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_one:
@@ -754,49 +762,73 @@ L$oop_dec1_4:
        jnz     L$oop_dec1_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_two:
        call    _aesni_decrypt2
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_five:
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_six:
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
 
 L$ecb_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        .byte   0xf3,0xc3
 
 .globl _aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ L$ccm64_enc2_loop:
        leaq    16(%rsi),%rsi
        jnz     L$ccm64_enc_outer
 
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 
 .globl _aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ L$oop_enc1_6:
        leaq    16(%r11),%r11
        jnz     L$oop_enc1_6
 .byte  102,15,56,221,217
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 
 .globl _aesni_ctr32_encrypt_blocks
 
 .p2align       4
 _aesni_ctr32_encrypt_blocks:
+       cmpq    $1,%rdx
+       jne     L$ctr32_bulk
+
+
+
+       movups  (%r8),%xmm2
+       movups  (%rdi),%xmm3
+       movl    240(%rcx),%edx
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+L$oop_enc1_7:
+.byte  102,15,56,220,209
+       decl    %edx
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     L$oop_enc1_7
+.byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm2
+       jmp     L$ctr32_epilogue
+
+.p2align       4
+L$ctr32_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $128,%rsp
        andq    $-16,%rsp
        leaq    -8(%rax),%rbp
 
-       cmpq    $1,%rdx
-       je      L$ctr32_one_shortcut
+
+
 
        movdqu  (%r8),%xmm2
        movdqu  (%rcx),%xmm0
@@ -1349,11 +1422,14 @@ L$ctr32_enc_done:
        leaq    -128(%rcx),%rcx
 
 L$ctr32_tail:
+
+
        leaq    16(%rcx),%rcx
        cmpq    $4,%rdx
        jb      L$ctr32_loop3
        je      L$ctr32_loop4
 
+
        shll    $4,%eax
        movdqa  96(%rsp),%xmm8
        pxor    %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ L$ctr32_loop3:
        movups  32(%rdi),%xmm12
        xorps   %xmm12,%xmm4
        movups  %xmm4,32(%rsi)
-       jmp     L$ctr32_done
 
-.p2align       4
-L$ctr32_one_shortcut:
-       movups  (%r8),%xmm2
-       movups  (%rdi),%xmm10
-       movl    240(%rcx),%eax
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-L$oop_enc1_7:
-.byte  102,15,56,220,209
-       decl    %eax
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     L$oop_enc1_7
-.byte  102,15,56,221,209
-       xorps   %xmm10,%xmm2
-       movups  %xmm2,(%rsi)
-       jmp     L$ctr32_done
-
-.p2align       4
 L$ctr32_done:
+       xorps   %xmm0,%xmm0
+       xorl    %r11d,%r11d
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       movaps  %xmm0,112(%rsp)
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 L$ctr32_epilogue:
@@ -1750,6 +1829,7 @@ L$xts_enc_loop6:
        shrl    $4,%eax
 
 L$xts_enc_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        addq    $96,%rdx
@@ -1778,6 +1858,7 @@ L$xts_enc_short:
        pxor    %xmm12,%xmm4
        pxor    %xmm13,%xmm5
        pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm7
 
        call    _aesni_encrypt6
 
@@ -1920,6 +2001,29 @@ L$oop_enc1_10:
        movups  %xmm2,-16(%rsi)
 
 L$xts_enc_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 L$xts_enc_epilogue:
@@ -2196,6 +2300,7 @@ L$xts_dec_loop6:
        shrl    $4,%eax
 
 L$xts_dec_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        pxor    %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ L$oop_dec1_14:
        movups  %xmm2,(%rsi)
 
 L$xts_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 L$xts_dec_epilogue:
@@ -2446,7 +2574,11 @@ L$oop_enc1_15:
        jnc     L$cbc_enc_loop
        addq    $16,%rdx
        jnz     L$cbc_enc_tail
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%r8)
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
        jmp     L$cbc_ret
 
 L$cbc_enc_tail:
@@ -2466,6 +2598,35 @@ L$cbc_enc_tail:
 
 .p2align       4
 L$cbc_decrypt:
+       cmpq    $16,%rdx
+       jne     L$cbc_decrypt_bulk
+
+
+
+       movdqu  (%rdi),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  %xmm2,%xmm4
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+L$oop_dec1_16:
+.byte  102,15,56,222,209
+       decl    %r10d
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     L$oop_dec1_16
+.byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movdqu  %xmm4,(%r8)
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
+       jmp     L$cbc_ret
+.p2align       4
+L$cbc_decrypt_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $16,%rsp
@@ -2702,7 +2863,7 @@ L$cbc_dec_done:
        movaps  %xmm9,%xmm2
        leaq    -112(%rcx),%rcx
        addq    $112,%rdx
-       jle     L$cbc_dec_tail_collected
+       jle     L$cbc_dec_clear_tail_collected
        movups  %xmm9,(%rsi)
        leaq    16(%rsi),%rsi
        cmpq    $80,%rdx
@@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        leaq    80(%rsi),%rsi
        movdqa  %xmm7,%xmm2
+       pxor    %xmm7,%xmm7
        jmp     L$cbc_dec_tail_collected
 
 .p2align       4
@@ -2743,16 +2909,23 @@ L$cbc_dec_seven:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        pxor    %xmm9,%xmm8
        movdqu  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        leaq    96(%rsi),%rsi
        movdqa  %xmm8,%xmm2
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     L$cbc_dec_tail_collected
 
 .p2align       4
@@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter:
 
        movdqa  %xmm7,%xmm2
        addq    $80,%rdx
-       jle     L$cbc_dec_tail_collected
+       jle     L$cbc_dec_clear_tail_collected
        movups  %xmm7,(%rsi)
        leaq    16(%rsi),%rsi
 
@@ -2831,12 +3004,17 @@ L$cbc_dec_tail:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        leaq    64(%rsi),%rsi
        movdqa  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        subq    $16,%rdx
        jmp     L$cbc_dec_tail_collected
 
@@ -2847,12 +3025,12 @@ L$cbc_dec_one:
        movups  16(%rcx),%xmm1
        leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
-L$oop_dec1_16:
+L$oop_dec1_17:
 .byte  102,15,56,222,209
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     L$oop_dec1_16
+       jnz     L$oop_dec1_17
 .byte  102,15,56,223,209
        xorps   %xmm10,%xmm2
        movaps  %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ L$cbc_dec_two:
        pxor    %xmm11,%xmm3
        movdqu  %xmm2,(%rsi)
        movdqa  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leaq    16(%rsi),%rsi
        jmp     L$cbc_dec_tail_collected
 .p2align       4
@@ -2878,7 +3057,9 @@ L$cbc_dec_three:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movdqa  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        leaq    32(%rsi),%rsi
        jmp     L$cbc_dec_tail_collected
 .p2align       4
@@ -2891,29 +3072,45 @@ L$cbc_dec_four:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movdqa  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        leaq    48(%rsi),%rsi
        jmp     L$cbc_dec_tail_collected
 
 .p2align       4
+L$cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
 L$cbc_dec_tail_collected:
        movups  %xmm10,(%r8)
        andq    $15,%rdx
        jnz     L$cbc_dec_tail_partial
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     L$cbc_dec_ret
 .p2align       4
 L$cbc_dec_tail_partial:
        movaps  %xmm2,(%rsp)
+       pxor    %xmm2,%xmm2
        movq    $16,%rcx
        movq    %rsi,%rdi
        subq    %rdx,%rcx
        leaq    (%rsp),%rsi
 .long  0x9066A4F3
+       movdqa  %xmm2,(%rsp)
 
 L$cbc_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        leaq    (%rbp),%rsp
        popq    %rbp
 L$cbc_ret:
@@ -2951,7 +3148,9 @@ L$dec_key_inverse:
 
        movups  (%rdx),%xmm0
 .byte  102,15,56,219,192
+       pxor    %xmm1,%xmm1
        movups  %xmm0,(%rdi)
+       pxor    %xmm0,%xmm0
 L$dec_key_ret:
        addq    $8,%rsp
        .byte   0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
        testq   %rdx,%rdx
        jz      L$enc_key_ret
 
+       movl    $268437504,%r10d
        movups  (%rdi),%xmm0
        xorps   %xmm4,%xmm4
+       andl    _OPENSSL_ia32cap_P+4(%rip),%r10d
        leaq    16(%rdx),%rax
        cmpl    $256,%esi
        je      L$14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
 
 L$10rounds:
        movl    $9,%esi
+       cmpl    $268435456,%r10d
+       je      L$10rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,200,1
        call    L$key_expansion_128_cold
@@ -3008,9 +3212,79 @@ L$10rounds:
        jmp     L$enc_key_ret
 
 .p2align       4
+L$10rounds_alt:
+       movdqa  L$key_rotate(%rip),%xmm5
+       movl    $8,%r10d
+       movdqa  L$key_rcon1(%rip),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,(%rdx)
+       jmp     L$oop_key128
+
+.p2align       4
+L$oop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leaq    16(%rax),%rax
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%rax)
+       movdqa  %xmm0,%xmm2
+
+       decl    %r10d
+       jnz     L$oop_key128
+
+       movdqa  L$key_rcon1b(%rip),%xmm4
+
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%rax)
+
+       movl    %esi,96(%rax)
+       xorl    %eax,%eax
+       jmp     L$enc_key_ret
+
+.p2align       4
 L$12rounds:
        movq    16(%rdi),%xmm2
        movl    $11,%esi
+       cmpl    $268435456,%r10d
+       je      L$12rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,202,1
        call    L$key_expansion_192a_cold
@@ -3034,10 +3308,54 @@ L$12rounds:
        jmp     L$enc_key_ret
 
 .p2align       4
+L$12rounds_alt:
+       movdqa  L$key_rotate192(%rip),%xmm5
+       movdqa  L$key_rcon1(%rip),%xmm4
+       movl    $8,%r10d
+       movdqu  %xmm0,(%rdx)
+       jmp     L$oop_key192
+
+.p2align       4
+L$oop_key192:
+       movq    %xmm2,0(%rax)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leaq    24(%rax),%rax
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%rax)
+
+       decl    %r10d
+       jnz     L$oop_key192
+
+       movl    %esi,32(%rax)
+       xorl    %eax,%eax
+       jmp     L$enc_key_ret
+
+.p2align       4
 L$14rounds:
        movups  16(%rdi),%xmm2
        movl    $13,%esi
        leaq    16(%rax),%rax
+       cmpl    $268435456,%r10d
+       je      L$14rounds_alt
+
        movups  %xmm0,(%rdx)
        movups  %xmm2,16(%rdx)
 .byte  102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ L$14rounds:
        jmp     L$enc_key_ret
 
 .p2align       4
+L$14rounds_alt:
+       movdqa  L$key_rotate(%rip),%xmm5
+       movdqa  L$key_rcon1(%rip),%xmm4
+       movl    $7,%r10d
+       movdqu  %xmm0,0(%rdx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,16(%rdx)
+       jmp     L$oop_key256
+
+.p2align       4
+L$oop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       decl    %r10d
+       jz      L$done_key256
+
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%rax)
+       leaq    32(%rax),%rax
+       movdqa  %xmm2,%xmm1
+
+       jmp     L$oop_key256
+
+L$done_key256:
+       movl    %esi,16(%rax)
+       xorl    %eax,%eax
+       jmp     L$enc_key_ret
+
+.p2align       4
 L$bad_keybits:
        movq    $-2,%rax
 L$enc_key_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
        addq    $8,%rsp
        .byte   0xf3,0xc3
 L$SEH_end_set_encrypt_key:
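
__aesni_set_encrypt_key now probes _OPENSSL_ia32cap_P+4 with mask 268437504 (0x10000800, bits 28 and 11 of the second capability word) and branches to the new L$10rounds_alt / L$12rounds_alt / L$14rounds_alt expanders when the masked value equals 268435456 (0x10000000, bit 28, the AVX bit, alone). Those paths build the key schedule from pshufb plus aesenclast (the 102,15,56,0 / 102,15,56,221 byte sequences) instead of aeskeygenassist, driven by the L$key_rotate / L$key_rcon1 constants appended at the end of the file; what bit 11 encodes is a detail of OpenSSL's capability-vector layout that the assembly alone does not reveal. A C intrinsics sketch of one L$oop_key128 iteration, as I read the generated code:

    #include <tmmintrin.h>   /* _mm_shuffle_epi8 (pshufb) */
    #include <wmmintrin.h>   /* _mm_aesenclast_si128      */

    /* One AES-128 key-schedule step without aeskeygenassist: broadcast
     * RotWord of the key's last word to all lanes via pshufb, apply
     * SubBytes and the round constant via aesenclast, then fold in the
     * shifted copies of the old key. */
    static __m128i key128_step(__m128i key, __m128i *rcon)
    {
        const __m128i rotate = _mm_set1_epi32(0x0c0f0e0d); /* L$key_rotate */

        __m128i t = _mm_shuffle_epi8(key, rotate);  /* .byte 102,15,56,0,197   */
        t = _mm_aesenclast_si128(t, *rcon);         /* .byte 102,15,56,221,196 */
        *rcon = _mm_slli_epi32(*rcon, 1);           /* pslld $1,%xmm4          */

        /* equivalent to key ^ key<<32 ^ key<<64 ^ key<<96,
         * the pslldq/pxor ladder in the loop body */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

        return _mm_xor_si128(key, t);               /* pxor %xmm2,%xmm0 */
    }

Doubling rcon with pslld $1 yields the round constants 0x01 through 0x80 across the eight loop iterations; L$key_rcon1b (0x1b) supplies the two final rounds where doubling would otherwise need the GF(2^8) reduction, completing the standard sequence ending 0x1b, 0x36. The 192- and 256-bit variants follow the same scheme, with L$key_rotate192 and interleaved key halves.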
@@ -3160,6 +3538,14 @@ L$xts_magic:
 .long  0x87,0,1,0
 L$increment1:
 .byte  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long  0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long  0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long  1,1,1,1
+L$key_rcon1b:
+.long  0x1b,0x1b,0x1b,0x1b
 
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align       6
index 65cf999..5470fb0 100644
@@ -2884,11 +2884,16 @@ L$sqrx4x_sub:
 
 .p2align       4
 _bn_get_bits5:
-       movq    %rdi,%r10
+       leaq    0(%rdi),%r10
+       leaq    1(%rdi),%r11
        movl    %esi,%ecx
-       shrl    $3,%esi
-       movzwl  (%r10,%rsi,1),%eax
-       andl    $7,%ecx
+       shrl    $4,%esi
+       andl    $15,%ecx
+       leal    -8(%rcx),%eax
+       cmpl    $11,%ecx
+       cmovaq  %r11,%r10
+       cmoval  %eax,%ecx
+       movzwl  (%r10,%rsi,2),%eax
        shrl    %cl,%eax
        andl    $31,%eax
        .byte   0xf3,0xc3
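
_bn_get_bits5 fetches the 5-bit exponent window at an arbitrary bit offset for the mont5 power-table walk. The old body loaded a 16-bit word at byte offset bits/8, which for windows near the top of the buffer can read past the number's last byte. The rewrite indexes aligned 16-bit words (shrl $4 with scale 2) and, when the window starts above bit 11 of its word, the cmov pair starts the load one byte later and shifts eight bits less, so the read stays within the bytes that actually hold the window. A C sketch of the new logic (names are mine; the assembly selects with cmov rather than branching, keeping the access pattern free of a data-dependent branch):

    /* Extract the 5-bit window starting at bit |off| of |ap| without
     * touching bytes beyond the ones containing the window. */
    static unsigned int get_bits5(const unsigned char *ap, unsigned int off)
    {
        const unsigned char *p = ap;        /* leaq 0(%rdi),%r10 */
        unsigned int word  = off >> 4;      /* shrl $4,%esi      */
        unsigned int shift = off & 15;      /* andl $15,%ecx     */

        if (shift > 11) {                   /* cmpl $11,%ecx + cmovaq/cmoval */
            p++;                            /* leaq 1(%rdi),%r11  */
            shift -= 8;                     /* leal -8(%rcx),%eax */
        }
        unsigned int w = p[2 * word] | ((unsigned int)p[2 * word + 1] << 8);
        return (w >> shift) & 31;           /* shrl %cl; andl $31,%eax */
    }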
index 53d8afc..5e84812 100644
@@ -18,7 +18,10 @@ DB   102,15,56,220,209
        lea     r8,QWORD PTR[16+r8]
        jnz     $L$oop_enc1_1
 DB     102,15,56,221,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[rdx],xmm2
+       pxor    xmm2,xmm2
        DB      0F3h,0C3h               ;repret
 aesni_encrypt  ENDP
 
@@ -39,7 +42,10 @@ DB   102,15,56,222,209
        lea     r8,QWORD PTR[16+r8]
        jnz     $L$oop_dec1_2
 DB     102,15,56,223,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[rdx],xmm2
+       pxor    xmm2,xmm2
        DB      0F3h,0C3h               ;repret
 aesni_decrypt  ENDP
 
@@ -265,21 +271,18 @@ DB        102,15,56,220,217
        pxor    xmm6,xmm0
 DB     102,15,56,220,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
        add     rax,16
-DB     102,15,56,220,233
-DB     102,15,56,220,241
-DB     102,15,56,220,249
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
        jmp     $L$enc_loop6_enter
 ALIGN  16
 $L$enc_loop6::
 DB     102,15,56,220,209
 DB     102,15,56,220,217
 DB     102,15,56,220,225
+$L$enc_loop6_enter::
 DB     102,15,56,220,233
 DB     102,15,56,220,241
 DB     102,15,56,220,249
-$L$enc_loop6_enter::
        movups  xmm1,XMMWORD PTR[rax*1+rcx]
        add     rax,32
 DB     102,15,56,220,208
@@ -322,21 +325,18 @@ DB        102,15,56,222,217
        pxor    xmm6,xmm0
 DB     102,15,56,222,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
        add     rax,16
-DB     102,15,56,222,233
-DB     102,15,56,222,241
-DB     102,15,56,222,249
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
        jmp     $L$dec_loop6_enter
 ALIGN  16
 $L$dec_loop6::
 DB     102,15,56,222,209
 DB     102,15,56,222,217
 DB     102,15,56,222,225
+$L$dec_loop6_enter::
 DB     102,15,56,222,233
 DB     102,15,56,222,241
 DB     102,15,56,222,249
-$L$dec_loop6_enter::
        movups  xmm1,XMMWORD PTR[rax*1+rcx]
        add     rax,32
 DB     102,15,56,222,208
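
In _aesni_encrypt6/_aesni_decrypt6 here, and _aesni_encrypt8/_aesni_decrypt8 just below, the load of the next round key is hoisted above the add rax,16 (so the address loses its -16 bias) and the loop-entry label moves inside the round body ($L$enc_loop6_enter, $L$enc_loop8_inner). The prologue can then drop the duplicated aesenc/aesdec byte sequences it used to execute before falling into the loop: the first pass now jumps straight to the matching point inside the loop instead.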
@@ -376,23 +376,18 @@ _aesni_encrypt8   PROC PRIVATE
        lea     rcx,QWORD PTR[32+rax*1+rcx]
        neg     rax
 DB     102,15,56,220,209
-       add     rax,16
        pxor    xmm7,xmm0
-DB     102,15,56,220,217
        pxor    xmm8,xmm0
+DB     102,15,56,220,217
        pxor    xmm9,xmm0
-DB     102,15,56,220,225
-DB     102,15,56,220,233
-DB     102,15,56,220,241
-DB     102,15,56,220,249
-DB     102,68,15,56,220,193
-DB     102,68,15,56,220,201
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
-       jmp     $L$enc_loop8_enter
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
+       add     rax,16
+       jmp     $L$enc_loop8_inner
 ALIGN  16
 $L$enc_loop8::
 DB     102,15,56,220,209
 DB     102,15,56,220,217
+$L$enc_loop8_inner::
 DB     102,15,56,220,225
 DB     102,15,56,220,233
 DB     102,15,56,220,241
@@ -445,23 +440,18 @@ _aesni_decrypt8   PROC PRIVATE
        lea     rcx,QWORD PTR[32+rax*1+rcx]
        neg     rax
 DB     102,15,56,222,209
-       add     rax,16
        pxor    xmm7,xmm0
-DB     102,15,56,222,217
        pxor    xmm8,xmm0
+DB     102,15,56,222,217
        pxor    xmm9,xmm0
-DB     102,15,56,222,225
-DB     102,15,56,222,233
-DB     102,15,56,222,241
-DB     102,15,56,222,249
-DB     102,68,15,56,222,193
-DB     102,68,15,56,222,201
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
-       jmp     $L$dec_loop8_enter
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
+       add     rax,16
+       jmp     $L$dec_loop8_inner
 ALIGN  16
 $L$dec_loop8::
 DB     102,15,56,222,209
 DB     102,15,56,222,217
+$L$dec_loop8_inner::
 DB     102,15,56,222,225
 DB     102,15,56,222,233
 DB     102,15,56,222,241
@@ -605,6 +595,7 @@ $L$ecb_enc_tail::
        movups  xmm7,XMMWORD PTR[80+rdi]
        je      $L$ecb_enc_six
        movdqu  xmm8,XMMWORD PTR[96+rdi]
+       xorps   xmm9,xmm9
        call    _aesni_encrypt8
        movups  XMMWORD PTR[rsi],xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
@@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter::
        jnc     $L$ecb_dec_loop8
 
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        mov     rcx,r11
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        mov     eax,r10d
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        movups  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
        movups  XMMWORD PTR[96+rsi],xmm8
+       pxor    xmm8,xmm8
        movups  XMMWORD PTR[112+rsi],xmm9
+       pxor    xmm9,xmm9
        lea     rsi,QWORD PTR[128+rsi]
        add     rdx,080h
        jz      $L$ecb_ret
@@ -749,14 +748,23 @@ $L$ecb_dec_tail::
        je      $L$ecb_dec_six
        movups  xmm8,XMMWORD PTR[96+rdi]
        movups  xmm0,XMMWORD PTR[rcx]
+       xorps   xmm9,xmm9
        call    _aesni_decrypt8
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        movups  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
        movups  XMMWORD PTR[96+rsi],xmm8
+       pxor    xmm8,xmm8
+       pxor    xmm9,xmm9
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_one::
@@ -772,53 +780,81 @@ DB        102,15,56,222,209
        jnz     $L$oop_dec1_4
 DB     102,15,56,223,209
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_two::
        call    _aesni_decrypt2
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_three::
        call    _aesni_decrypt3
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_four::
        call    _aesni_decrypt4
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_five::
        xorps   xmm7,xmm7
        call    _aesni_decrypt6
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_six::
        call    _aesni_decrypt6
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        movups  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
 
 $L$ecb_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
        movaps  xmm6,XMMWORD PTR[rsp]
+       movaps  XMMWORD PTR[rsp],xmm0
        movaps  xmm7,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        lea     rsp,QWORD PTR[88+rsp]
 $L$ecb_enc_ret::
        mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
@@ -898,11 +934,21 @@ DB        102,15,56,0,215
        lea     rsi,QWORD PTR[16+rsi]
        jnz     $L$ccm64_enc_outer
 
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[r9],xmm3
+       pxor    xmm3,xmm3
+       pxor    xmm8,xmm8
+       pxor    xmm6,xmm6
        movaps  xmm6,XMMWORD PTR[rsp]
+       movaps  XMMWORD PTR[rsp],xmm0
        movaps  xmm7,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        lea     rsp,QWORD PTR[88+rsp]
 $L$ccm64_enc_ret::
        mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
@@ -1016,11 +1062,21 @@ DB      102,15,56,220,217
        lea     r11,QWORD PTR[16+r11]
        jnz     $L$oop_enc1_6
 DB     102,15,56,221,217
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[r9],xmm3
+       pxor    xmm3,xmm3
+       pxor    xmm8,xmm8
+       pxor    xmm6,xmm6
        movaps  xmm6,XMMWORD PTR[rsp]
+       movaps  XMMWORD PTR[rsp],xmm0
        movaps  xmm7,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        lea     rsp,QWORD PTR[88+rsp]
 $L$ccm64_dec_ret::
        mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
@@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
        mov     r8,QWORD PTR[40+rsp]
 
 
+       cmp     rdx,1
+       jne     $L$ctr32_bulk
+
+
+
+       movups  xmm2,XMMWORD PTR[r8]
+       movups  xmm3,XMMWORD PTR[rdi]
+       mov     edx,DWORD PTR[240+rcx]
+       movups  xmm0,XMMWORD PTR[rcx]
+       movups  xmm1,XMMWORD PTR[16+rcx]
+       lea     rcx,QWORD PTR[32+rcx]
+       xorps   xmm2,xmm0
+$L$oop_enc1_7::
+DB     102,15,56,220,209
+       dec     edx
+       movups  xmm1,XMMWORD PTR[rcx]
+       lea     rcx,QWORD PTR[16+rcx]
+       jnz     $L$oop_enc1_7
+DB     102,15,56,221,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       xorps   xmm2,xmm3
+       pxor    xmm3,xmm3
+       movups  XMMWORD PTR[rsi],xmm2
+       xorps   xmm2,xmm2
+       jmp     $L$ctr32_epilogue
+
+ALIGN  16
+$L$ctr32_bulk::
        lea     rax,QWORD PTR[rsp]
        push    rbp
        sub     rsp,288
@@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
 $L$ctr32_body::
        lea     rbp,QWORD PTR[((-8))+rax]
 
-       cmp     rdx,1
-       je      $L$ctr32_one_shortcut
+
+
 
        movdqu  xmm2,XMMWORD PTR[r8]
        movdqu  xmm0,XMMWORD PTR[rcx]
@@ -1452,11 +1537,14 @@ DB      102,69,15,56,221,202
        lea     rcx,QWORD PTR[((-128))+rcx]
 
 $L$ctr32_tail::
+
+
        lea     rcx,QWORD PTR[16+rcx]
        cmp     rdx,4
        jb      $L$ctr32_loop3
        je      $L$ctr32_loop4
 
+
        shl     eax,4
        movdqa  xmm8,XMMWORD PTR[96+rsp]
        pxor    xmm9,xmm9
@@ -1559,40 +1647,43 @@ DB      102,15,56,221,225
        movups  xmm12,XMMWORD PTR[32+rdi]
        xorps   xmm4,xmm12
        movups  XMMWORD PTR[32+rsi],xmm4
-       jmp     $L$ctr32_done
 
-ALIGN  16
-$L$ctr32_one_shortcut::
-       movups  xmm2,XMMWORD PTR[r8]
-       movups  xmm10,XMMWORD PTR[rdi]
-       mov     eax,DWORD PTR[240+rcx]
-       movups  xmm0,XMMWORD PTR[rcx]
-       movups  xmm1,XMMWORD PTR[16+rcx]
-       lea     rcx,QWORD PTR[32+rcx]
-       xorps   xmm2,xmm0
-$L$oop_enc1_7::
-DB     102,15,56,220,209
-       dec     eax
-       movups  xmm1,XMMWORD PTR[rcx]
-       lea     rcx,QWORD PTR[16+rcx]
-       jnz     $L$oop_enc1_7
-DB     102,15,56,221,209
-       xorps   xmm2,xmm10
-       movups  XMMWORD PTR[rsi],xmm2
-       jmp     $L$ctr32_done
-
-ALIGN  16
 $L$ctr32_done::
+       xorps   xmm0,xmm0
+       xor     r11d,r11d
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        movaps  xmm6,XMMWORD PTR[((-160))+rbp]
+       movaps  XMMWORD PTR[(-160)+rbp],xmm0
        movaps  xmm7,XMMWORD PTR[((-144))+rbp]
+       movaps  XMMWORD PTR[(-144)+rbp],xmm0
        movaps  xmm8,XMMWORD PTR[((-128))+rbp]
+       movaps  XMMWORD PTR[(-128)+rbp],xmm0
        movaps  xmm9,XMMWORD PTR[((-112))+rbp]
+       movaps  XMMWORD PTR[(-112)+rbp],xmm0
        movaps  xmm10,XMMWORD PTR[((-96))+rbp]
+       movaps  XMMWORD PTR[(-96)+rbp],xmm0
        movaps  xmm11,XMMWORD PTR[((-80))+rbp]
+       movaps  XMMWORD PTR[(-80)+rbp],xmm0
        movaps  xmm12,XMMWORD PTR[((-64))+rbp]
+       movaps  XMMWORD PTR[(-64)+rbp],xmm0
        movaps  xmm13,XMMWORD PTR[((-48))+rbp]
+       movaps  XMMWORD PTR[(-48)+rbp],xmm0
        movaps  xmm14,XMMWORD PTR[((-32))+rbp]
+       movaps  XMMWORD PTR[(-32)+rbp],xmm0
        movaps  xmm15,XMMWORD PTR[((-16))+rbp]
+       movaps  XMMWORD PTR[(-16)+rbp],xmm0
+       movaps  XMMWORD PTR[rsp],xmm0
+       movaps  XMMWORD PTR[16+rsp],xmm0
+       movaps  XMMWORD PTR[32+rsp],xmm0
+       movaps  XMMWORD PTR[48+rsp],xmm0
+       movaps  XMMWORD PTR[64+rsp],xmm0
+       movaps  XMMWORD PTR[80+rsp],xmm0
+       movaps  XMMWORD PTR[96+rsp],xmm0
+       movaps  XMMWORD PTR[112+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$ctr32_epilogue::
@@ -1889,6 +1980,7 @@ DB        102,15,56,221,124,36,80
        shr     eax,4
 
 $L$xts_enc_short::
+
        mov     r10d,eax
        pxor    xmm10,xmm0
        add     rdx,16*6
@@ -1917,6 +2009,7 @@ $L$xts_enc_short::
        pxor    xmm4,xmm12
        pxor    xmm5,xmm13
        pxor    xmm6,xmm14
+       pxor    xmm7,xmm7
 
        call    _aesni_encrypt6
 
@@ -2059,16 +2152,39 @@ DB      102,15,56,221,209
        movups  XMMWORD PTR[(-16)+rsi],xmm2
 
 $L$xts_enc_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        movaps  xmm6,XMMWORD PTR[((-160))+rbp]
+       movaps  XMMWORD PTR[(-160)+rbp],xmm0
        movaps  xmm7,XMMWORD PTR[((-144))+rbp]
+       movaps  XMMWORD PTR[(-144)+rbp],xmm0
        movaps  xmm8,XMMWORD PTR[((-128))+rbp]
+       movaps  XMMWORD PTR[(-128)+rbp],xmm0
        movaps  xmm9,XMMWORD PTR[((-112))+rbp]
+       movaps  XMMWORD PTR[(-112)+rbp],xmm0
        movaps  xmm10,XMMWORD PTR[((-96))+rbp]
+       movaps  XMMWORD PTR[(-96)+rbp],xmm0
        movaps  xmm11,XMMWORD PTR[((-80))+rbp]
+       movaps  XMMWORD PTR[(-80)+rbp],xmm0
        movaps  xmm12,XMMWORD PTR[((-64))+rbp]
+       movaps  XMMWORD PTR[(-64)+rbp],xmm0
        movaps  xmm13,XMMWORD PTR[((-48))+rbp]
+       movaps  XMMWORD PTR[(-48)+rbp],xmm0
        movaps  xmm14,XMMWORD PTR[((-32))+rbp]
+       movaps  XMMWORD PTR[(-32)+rbp],xmm0
        movaps  xmm15,XMMWORD PTR[((-16))+rbp]
+       movaps  XMMWORD PTR[(-16)+rbp],xmm0
+       movaps  XMMWORD PTR[rsp],xmm0
+       movaps  XMMWORD PTR[16+rsp],xmm0
+       movaps  XMMWORD PTR[32+rsp],xmm0
+       movaps  XMMWORD PTR[48+rsp],xmm0
+       movaps  XMMWORD PTR[64+rsp],xmm0
+       movaps  XMMWORD PTR[80+rsp],xmm0
+       movaps  XMMWORD PTR[96+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$xts_enc_epilogue::
@@ -2371,6 +2487,7 @@ DB        102,15,56,223,124,36,80
        shr     eax,4
 
 $L$xts_dec_short::
+
        mov     r10d,eax
        pxor    xmm10,xmm0
        pxor    xmm11,xmm0
@@ -2573,16 +2690,39 @@ DB      102,15,56,223,209
        movups  XMMWORD PTR[rsi],xmm2
 
 $L$xts_dec_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        movaps  xmm6,XMMWORD PTR[((-160))+rbp]
+       movaps  XMMWORD PTR[(-160)+rbp],xmm0
        movaps  xmm7,XMMWORD PTR[((-144))+rbp]
+       movaps  XMMWORD PTR[(-144)+rbp],xmm0
        movaps  xmm8,XMMWORD PTR[((-128))+rbp]
+       movaps  XMMWORD PTR[(-128)+rbp],xmm0
        movaps  xmm9,XMMWORD PTR[((-112))+rbp]
+       movaps  XMMWORD PTR[(-112)+rbp],xmm0
        movaps  xmm10,XMMWORD PTR[((-96))+rbp]
+       movaps  XMMWORD PTR[(-96)+rbp],xmm0
        movaps  xmm11,XMMWORD PTR[((-80))+rbp]
+       movaps  XMMWORD PTR[(-80)+rbp],xmm0
        movaps  xmm12,XMMWORD PTR[((-64))+rbp]
+       movaps  XMMWORD PTR[(-64)+rbp],xmm0
        movaps  xmm13,XMMWORD PTR[((-48))+rbp]
+       movaps  XMMWORD PTR[(-48)+rbp],xmm0
        movaps  xmm14,XMMWORD PTR[((-32))+rbp]
+       movaps  XMMWORD PTR[(-32)+rbp],xmm0
        movaps  xmm15,XMMWORD PTR[((-16))+rbp]
+       movaps  XMMWORD PTR[(-16)+rbp],xmm0
+       movaps  XMMWORD PTR[rsp],xmm0
+       movaps  XMMWORD PTR[16+rsp],xmm0
+       movaps  XMMWORD PTR[32+rsp],xmm0
+       movaps  XMMWORD PTR[48+rsp],xmm0
+       movaps  XMMWORD PTR[64+rsp],xmm0
+       movaps  XMMWORD PTR[80+rsp],xmm0
+       movaps  XMMWORD PTR[96+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$xts_dec_epilogue::
@@ -2646,7 +2786,11 @@ DB       102,15,56,221,209
        jnc     $L$cbc_enc_loop
        add     rdx,16
        jnz     $L$cbc_enc_tail
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[r8],xmm2
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
        jmp     $L$cbc_ret
 
 $L$cbc_enc_tail::
@@ -2666,6 +2810,35 @@ $L$cbc_enc_tail::
 
 ALIGN  16
 $L$cbc_decrypt::
+       cmp     rdx,16
+       jne     $L$cbc_decrypt_bulk
+
+
+
+       movdqu  xmm2,XMMWORD PTR[rdi]
+       movdqu  xmm3,XMMWORD PTR[r8]
+       movdqa  xmm4,xmm2
+       movups  xmm0,XMMWORD PTR[rcx]
+       movups  xmm1,XMMWORD PTR[16+rcx]
+       lea     rcx,QWORD PTR[32+rcx]
+       xorps   xmm2,xmm0
+$L$oop_dec1_16::
+DB     102,15,56,222,209
+       dec     r10d
+       movups  xmm1,XMMWORD PTR[rcx]
+       lea     rcx,QWORD PTR[16+rcx]
+       jnz     $L$oop_dec1_16
+DB     102,15,56,223,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       movdqu  XMMWORD PTR[r8],xmm4
+       xorps   xmm2,xmm3
+       pxor    xmm3,xmm3
+       movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
+       jmp     $L$cbc_ret
+ALIGN  16
+$L$cbc_decrypt_bulk::
        lea     rax,QWORD PTR[rsp]
        push    rbp
        sub     rsp,176
@@ -2913,7 +3086,7 @@ DB        102,69,15,56,223,202
        movaps  xmm2,xmm9
        lea     rcx,QWORD PTR[((-112))+rcx]
        add     rdx,070h
-       jle     $L$cbc_dec_tail_collected
+       jle     $L$cbc_dec_clear_tail_collected
        movups  XMMWORD PTR[rsi],xmm9
        lea     rsi,QWORD PTR[16+rsi]
        cmp     rdx,050h
@@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        pxor    xmm6,xmm14
        movdqu  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        pxor    xmm7,xmm15
        movdqu  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        lea     rsi,QWORD PTR[80+rsi]
        movdqa  xmm2,xmm7
+       pxor    xmm7,xmm7
        jmp     $L$cbc_dec_tail_collected
 
 ALIGN  16
@@ -2954,16 +3132,23 @@ $L$cbc_dec_seven::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        pxor    xmm6,xmm14
        movdqu  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        pxor    xmm7,xmm15
        movdqu  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        pxor    xmm8,xmm9
        movdqu  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
        lea     rsi,QWORD PTR[96+rsi]
        movdqa  xmm2,xmm8
+       pxor    xmm8,xmm8
+       pxor    xmm9,xmm9
        jmp     $L$cbc_dec_tail_collected
 
 ALIGN  16
@@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter::
 
        movdqa  xmm2,xmm7
        add     rdx,050h
-       jle     $L$cbc_dec_tail_collected
+       jle     $L$cbc_dec_clear_tail_collected
        movups  XMMWORD PTR[rsi],xmm7
        lea     rsi,QWORD PTR[16+rsi]
 
@@ -3042,12 +3227,17 @@ $L$cbc_dec_tail::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        pxor    xmm6,xmm14
        movdqu  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        lea     rsi,QWORD PTR[64+rsi]
        movdqa  xmm2,xmm6
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        sub     rdx,010h
        jmp     $L$cbc_dec_tail_collected
 
@@ -3058,12 +3248,12 @@ $L$cbc_dec_one::
        movups  xmm1,XMMWORD PTR[16+rcx]
        lea     rcx,QWORD PTR[32+rcx]
        xorps   xmm2,xmm0
-$L$oop_dec1_16::
+$L$oop_dec1_17::
 DB     102,15,56,222,209
        dec     eax
        movups  xmm1,XMMWORD PTR[rcx]
        lea     rcx,QWORD PTR[16+rcx]
-       jnz     $L$oop_dec1_16
+       jnz     $L$oop_dec1_17
 DB     102,15,56,223,209
        xorps   xmm2,xmm10
        movaps  xmm10,xmm11
@@ -3077,6 +3267,7 @@ $L$cbc_dec_two::
        pxor    xmm3,xmm11
        movdqu  XMMWORD PTR[rsi],xmm2
        movdqa  xmm2,xmm3
+       pxor    xmm3,xmm3
        lea     rsi,QWORD PTR[16+rsi]
        jmp     $L$cbc_dec_tail_collected
 ALIGN  16
@@ -3089,7 +3280,9 @@ $L$cbc_dec_three::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movdqa  xmm2,xmm4
+       pxor    xmm4,xmm4
        lea     rsi,QWORD PTR[32+rsi]
        jmp     $L$cbc_dec_tail_collected
 ALIGN  16
@@ -3102,39 +3295,61 @@ $L$cbc_dec_four::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movdqa  xmm2,xmm5
+       pxor    xmm5,xmm5
        lea     rsi,QWORD PTR[48+rsi]
        jmp     $L$cbc_dec_tail_collected
 
 ALIGN  16
+$L$cbc_dec_clear_tail_collected::
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
 $L$cbc_dec_tail_collected::
        movups  XMMWORD PTR[r8],xmm10
        and     rdx,15
        jnz     $L$cbc_dec_tail_partial
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        jmp     $L$cbc_dec_ret
 ALIGN  16
 $L$cbc_dec_tail_partial::
        movaps  XMMWORD PTR[rsp],xmm2
+       pxor    xmm2,xmm2
        mov     rcx,16
        mov     rdi,rsi
        sub     rcx,rdx
        lea     rsi,QWORD PTR[rsp]
        DD      09066A4F3h
+       movdqa  XMMWORD PTR[rsp],xmm2
 
 $L$cbc_dec_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
        movaps  xmm6,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm7,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[64+rsp]
+       movaps  XMMWORD PTR[64+rsp],xmm0
        movaps  xmm10,XMMWORD PTR[80+rsp]
+       movaps  XMMWORD PTR[80+rsp],xmm0
        movaps  xmm11,XMMWORD PTR[96+rsp]
+       movaps  XMMWORD PTR[96+rsp],xmm0
        movaps  xmm12,XMMWORD PTR[112+rsp]
+       movaps  XMMWORD PTR[112+rsp],xmm0
        movaps  xmm13,XMMWORD PTR[128+rsp]
+       movaps  XMMWORD PTR[128+rsp],xmm0
        movaps  xmm14,XMMWORD PTR[144+rsp]
+       movaps  XMMWORD PTR[144+rsp],xmm0
        movaps  xmm15,XMMWORD PTR[160+rsp]
+       movaps  XMMWORD PTR[160+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$cbc_ret::
@@ -3175,7 +3390,9 @@ DB        102,15,56,219,201
 
        movups  xmm0,XMMWORD PTR[r8]
 DB     102,15,56,219,192
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[rcx],xmm0
+       pxor    xmm0,xmm0
 $L$dec_key_ret::
        add     rsp,8
        DB      0F3h,0C3h               ;repret
@@ -3193,8 +3410,10 @@ DB       048h,083h,0ECh,008h
        test    r8,r8
        jz      $L$enc_key_ret
 
+       mov     r10d,268437504
        movups  xmm0,XMMWORD PTR[rcx]
        xorps   xmm4,xmm4
+       and     r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))]
        lea     rax,QWORD PTR[16+r8]
        cmp     edx,256
        je      $L$14rounds
@@ -3205,6 +3424,9 @@ DB        048h,083h,0ECh,008h
 
 $L$10rounds::
        mov     edx,9
+       cmp     r10d,268435456
+       je      $L$10rounds_alt
+
        movups  XMMWORD PTR[r8],xmm0
 DB     102,15,58,223,200,1
        call    $L$key_expansion_128_cold
@@ -3232,9 +3454,79 @@ DB       102,15,58,223,200,54
        jmp     $L$enc_key_ret
 
 ALIGN  16
+$L$10rounds_alt::
+       movdqa  xmm5,XMMWORD PTR[$L$key_rotate]
+       mov     r10d,8
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1]
+       movdqa  xmm2,xmm0
+       movdqu  XMMWORD PTR[r8],xmm0
+       jmp     $L$oop_key128
+
+ALIGN  16
+$L$oop_key128::
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+       lea     rax,QWORD PTR[16+rax]
+
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[(-16)+rax],xmm0
+       movdqa  xmm2,xmm0
+
+       dec     r10d
+       jnz     $L$oop_key128
+
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1b]
+
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[rax],xmm0
+
+       movdqa  xmm2,xmm0
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[16+rax],xmm0
+
+       mov     DWORD PTR[96+rax],edx
+       xor     eax,eax
+       jmp     $L$enc_key_ret
+
+ALIGN  16
 $L$12rounds::
        movq    xmm2,QWORD PTR[16+rcx]
        mov     edx,11
+       cmp     r10d,268435456
+       je      $L$12rounds_alt
+
        movups  XMMWORD PTR[r8],xmm0
 DB     102,15,58,223,202,1
        call    $L$key_expansion_192a_cold
@@ -3258,10 +3550,54 @@ DB      102,15,58,223,202,128
        jmp     $L$enc_key_ret
 
 ALIGN  16
+$L$12rounds_alt::
+       movdqa  xmm5,XMMWORD PTR[$L$key_rotate192]
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1]
+       mov     r10d,8
+       movdqu  XMMWORD PTR[r8],xmm0
+       jmp     $L$oop_key192
+
+ALIGN  16
+$L$oop_key192::
+       movq    QWORD PTR[rax],xmm2
+       movdqa  xmm1,xmm2
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+       pslld   xmm4,1
+       lea     rax,QWORD PTR[24+rax]
+
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+
+       pshufd  xmm3,xmm0,0ffh
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+
+       pxor    xmm0,xmm2
+       pxor    xmm2,xmm3
+       movdqu  XMMWORD PTR[(-16)+rax],xmm0
+
+       dec     r10d
+       jnz     $L$oop_key192
+
+       mov     DWORD PTR[32+rax],edx
+       xor     eax,eax
+       jmp     $L$enc_key_ret
+
+ALIGN  16
 $L$14rounds::
        movups  xmm2,XMMWORD PTR[16+rcx]
        mov     edx,13
        lea     rax,QWORD PTR[16+rax]
+       cmp     r10d,268435456
+       je      $L$14rounds_alt
+
        movups  XMMWORD PTR[r8],xmm0
        movups  XMMWORD PTR[16+r8],xmm2
 DB     102,15,58,223,202,1
@@ -3296,9 +3632,69 @@ DB       102,15,58,223,202,64
        jmp     $L$enc_key_ret
 
 ALIGN  16
+$L$14rounds_alt::
+       movdqa  xmm5,XMMWORD PTR[$L$key_rotate]
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1]
+       mov     r10d,7
+       movdqu  XMMWORD PTR[r8],xmm0
+       movdqa  xmm1,xmm2
+       movdqu  XMMWORD PTR[16+r8],xmm2
+       jmp     $L$oop_key256
+
+ALIGN  16
+$L$oop_key256::
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+       pslld   xmm4,1
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[rax],xmm0
+
+       dec     r10d
+       jz      $L$done_key256
+
+       pshufd  xmm2,xmm0,0ffh
+       pxor    xmm3,xmm3
+DB     102,15,56,221,211
+
+       movdqa  xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm1,xmm3
+
+       pxor    xmm2,xmm1
+       movdqu  XMMWORD PTR[16+rax],xmm2
+       lea     rax,QWORD PTR[32+rax]
+       movdqa  xmm1,xmm2
+
+       jmp     $L$oop_key256
+
+$L$done_key256::
+       mov     DWORD PTR[16+rax],edx
+       xor     eax,eax
+       jmp     $L$enc_key_ret
+
+ALIGN  16
 $L$bad_keybits::
        mov     rax,-2
 $L$enc_key_ret::
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        add     rsp,8
        DB      0F3h,0C3h               ;repret
 $L$SEH_end_set_encrypt_key::
@@ -3384,6 +3780,14 @@ $L$xts_magic::
        DD      087h,0,1,0
 $L$increment1::
 DB     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$key_rotate::
+       DD      00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh
+$L$key_rotate192::
+       DD      004070605h,004070605h,004070605h,004070605h
+$L$key_rcon1::
+       DD      1,1,1,1
+$L$key_rcon1b::
+       DD      01bh,01bh,01bh,01bh
 
 DB     65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 DB     83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
@@ -3489,7 +3893,7 @@ cbc_se_handler    PROC PRIVATE
        mov     rax,QWORD PTR[152+r8]
        mov     rbx,QWORD PTR[248+r8]
 
-       lea     r10,QWORD PTR[$L$cbc_decrypt]
+       lea     r10,QWORD PTR[$L$cbc_decrypt_bulk]
        cmp     rbx,r10
        jb      $L$common_seh_tail
 
index 64a1b42..9fdd91d 100644
@@ -3001,11 +3001,16 @@ PUBLIC  bn_get_bits5
 
 ALIGN  16
 bn_get_bits5   PROC PUBLIC
-       mov     r10,rcx
+       lea     r10,QWORD PTR[rcx]
+       lea     r11,QWORD PTR[1+rcx]
        mov     ecx,edx
-       shr     edx,3
-       movzx   eax,WORD PTR[rdx*1+r10]
-       and     ecx,7
+       shr     edx,4
+       and     ecx,15
+       lea     eax,DWORD PTR[((-8))+rcx]
+       cmp     ecx,11
+       cmova   r10,r11
+       cmova   ecx,eax
+       movzx   eax,WORD PTR[rdx*2+r10]
        shr     eax,cl
        and     eax,31
        DB      0F3h,0C3h               ;repret
index a68f7cd..3bbc4e4 100644
@@ -21,7 +21,10 @@ aesni_encrypt:
        leal    16(%edx),%edx
        jnz     .L000enc1_loop_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .size  aesni_encrypt,.-.L_aesni_encrypt_begin
 .globl aesni_decrypt
@@ -45,7 +48,10 @@ aesni_decrypt:
        leal    16(%edx),%edx
        jnz     .L001dec1_loop_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .size  aesni_decrypt,.-.L_aesni_decrypt_begin
 .type  _aesni_encrypt2,@function
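
From here on, most of this file's churn is mechanical: the perl generator numbers local labels (.LNNN...) from a single counter, so the labels added above shift every later autogenerated label by a fixed amount (.L008enc6_loop becomes .L009enc6_loop, and so on). Only the hunks that add pxor/movdqa scrubbing or restructure the loop entries change behavior.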
@@ -259,17 +265,15 @@ _aesni_encrypt6:
        negl    %ecx
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     .L_aesni_encrypt6_enter
+       jmp     .L008_aesni_encrypt6_inner
 .align 16
-.L008enc6_loop:
+.L009enc6_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+.L008_aesni_encrypt6_inner:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
@@ -283,7 +287,7 @@ _aesni_encrypt6:
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L008enc6_loop
+       jnz     .L009enc6_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -315,17 +319,15 @@ _aesni_decrypt6:
        negl    %ecx
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     .L_aesni_decrypt6_enter
+       jmp     .L010_aesni_decrypt6_inner
 .align 16
-.L009dec6_loop:
+.L011dec6_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+.L010_aesni_decrypt6_inner:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
@@ -339,7 +341,7 @@ _aesni_decrypt6:
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L009dec6_loop
+       jnz     .L011dec6_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -369,14 +371,14 @@ aesni_ecb_encrypt:
        movl    32(%esp),%edx
        movl    36(%esp),%ebx
        andl    $-16,%eax
-       jz      .L010ecb_ret
+       jz      .L012ecb_ret
        movl    240(%edx),%ecx
        testl   %ebx,%ebx
-       jz      .L011ecb_decrypt
+       jz      .L013ecb_decrypt
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L012ecb_enc_tail
+       jb      .L014ecb_enc_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -385,9 +387,9 @@ aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L013ecb_enc_loop6_enter
+       jmp     .L015ecb_enc_loop6_enter
 .align 16
-.L014ecb_enc_loop6:
+.L016ecb_enc_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -402,12 +404,12 @@ aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L013ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
        call    _aesni_encrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L014ecb_enc_loop6
+       jnc     .L016ecb_enc_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -416,18 +418,18 @@ aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L010ecb_ret
-.L012ecb_enc_tail:
+       jz      .L012ecb_ret
+.L014ecb_enc_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L015ecb_enc_one
+       jb      .L017ecb_enc_one
        movups  16(%esi),%xmm3
-       je      .L016ecb_enc_two
+       je      .L018ecb_enc_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L017ecb_enc_three
+       jb      .L019ecb_enc_three
        movups  48(%esi),%xmm5
-       je      .L018ecb_enc_four
+       je      .L020ecb_enc_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    _aesni_encrypt6
@@ -436,49 +438,49 @@ aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L015ecb_enc_one:
+.L017ecb_enc_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L019enc1_loop_3:
+.L021enc1_loop_3:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L019enc1_loop_3
+       jnz     .L021enc1_loop_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L016ecb_enc_two:
+.L018ecb_enc_two:
        call    _aesni_encrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L017ecb_enc_three:
+.L019ecb_enc_three:
        call    _aesni_encrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L018ecb_enc_four:
+.L020ecb_enc_four:
        call    _aesni_encrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L011ecb_decrypt:
+.L013ecb_decrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L020ecb_dec_tail
+       jb      .L022ecb_dec_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -487,9 +489,9 @@ aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L021ecb_dec_loop6_enter
+       jmp     .L023ecb_dec_loop6_enter
 .align 16
-.L022ecb_dec_loop6:
+.L024ecb_dec_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -504,12 +506,12 @@ aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L021ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
        call    _aesni_decrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L022ecb_dec_loop6
+       jnc     .L024ecb_dec_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -518,18 +520,18 @@ aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L010ecb_ret
-.L020ecb_dec_tail:
+       jz      .L012ecb_ret
+.L022ecb_dec_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L023ecb_dec_one
+       jb      .L025ecb_dec_one
        movups  16(%esi),%xmm3
-       je      .L024ecb_dec_two
+       je      .L026ecb_dec_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L025ecb_dec_three
+       jb      .L027ecb_dec_three
        movups  48(%esi),%xmm5
-       je      .L026ecb_dec_four
+       je      .L028ecb_dec_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
@@ -538,43 +540,51 @@ aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L023ecb_dec_one:
+.L025ecb_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L027dec1_loop_4:
+.L029dec1_loop_4:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L027dec1_loop_4
+       jnz     .L029dec1_loop_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L024ecb_dec_two:
+.L026ecb_dec_two:
        call    _aesni_decrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L025ecb_dec_three:
+.L027ecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L026ecb_dec_four:
+.L028ecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L010ecb_ret:
+.L012ecb_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -621,7 +631,7 @@ aesni_ccm64_encrypt_blocks:
        leal    32(%edx,%ecx,1),%edx
        subl    %ecx,%ebx
 .byte  102,15,56,0,253
-.L028ccm64_enc_outer:
+.L030ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  (%esi),%xmm6
@@ -630,7 +640,7 @@ aesni_ccm64_encrypt_blocks:
        xorps   %xmm6,%xmm0
        xorps   %xmm0,%xmm3
        movups  32(%ebp),%xmm0
-.L029ccm64_enc2_loop:
+.L031ccm64_enc2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -638,7 +648,7 @@ aesni_ccm64_encrypt_blocks:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L029ccm64_enc2_loop
+       jnz     .L031ccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
@@ -651,10 +661,18 @@ aesni_ccm64_encrypt_blocks:
        movups  %xmm6,(%edi)
 .byte  102,15,56,0,213
        leal    16(%edi),%edi
-       jnz     .L028ccm64_enc_outer
+       jnz     .L030ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -702,12 +720,12 @@ aesni_ccm64_decrypt_blocks:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L030enc1_loop_5:
+.L032enc1_loop_5:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L030enc1_loop_5
+       jnz     .L032enc1_loop_5
 .byte  102,15,56,221,209
        shll    $4,%ebx
        movl    $16,%ecx
@@ -717,16 +735,16 @@ aesni_ccm64_decrypt_blocks:
        subl    %ebx,%ecx
        leal    32(%ebp,%ebx,1),%edx
        movl    %ecx,%ebx
-       jmp     .L031ccm64_dec_outer
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L031ccm64_dec_outer:
+.L033ccm64_dec_outer:
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
 .byte  102,15,56,0,213
        subl    $1,%eax
-       jz      .L032ccm64_dec_break
+       jz      .L034ccm64_dec_break
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  16(%ebp),%xmm1
@@ -734,7 +752,7 @@ aesni_ccm64_decrypt_blocks:
        xorps   %xmm0,%xmm2
        xorps   %xmm6,%xmm3
        movups  32(%ebp),%xmm0
-.L033ccm64_dec2_loop:
+.L035ccm64_dec2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -742,7 +760,7 @@ aesni_ccm64_decrypt_blocks:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L033ccm64_dec2_loop
+       jnz     .L035ccm64_dec2_loop
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
@@ -750,9 +768,9 @@ aesni_ccm64_decrypt_blocks:
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
        leal    16(%esi),%esi
-       jmp     .L031ccm64_dec_outer
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L032ccm64_dec_break:
+.L034ccm64_dec_break:
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movups  (%edx),%xmm0
@@ -760,16 +778,24 @@ aesni_ccm64_decrypt_blocks:
        xorps   %xmm0,%xmm6
        leal    32(%edx),%edx
        xorps   %xmm6,%xmm3
-.L034enc1_loop_6:
+.L036enc1_loop_6:
 .byte  102,15,56,220,217
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L034enc1_loop_6
+       jnz     .L036enc1_loop_6
 .byte  102,15,56,221,217
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -795,7 +821,7 @@ aesni_ctr32_encrypt_blocks:
        andl    $-16,%esp
        movl    %ebp,80(%esp)
        cmpl    $1,%eax
-       je      .L035ctr32_one_shortcut
+       je      .L037ctr32_one_shortcut
        movdqu  (%ebx),%xmm7
        movl    $202182159,(%esp)
        movl    $134810123,4(%esp)
@@ -833,7 +859,7 @@ aesni_ctr32_encrypt_blocks:
        pshufd  $192,%xmm0,%xmm2
        pshufd  $128,%xmm0,%xmm3
        cmpl    $6,%eax
-       jb      .L036ctr32_tail
+       jb      .L038ctr32_tail
        pxor    %xmm6,%xmm7
        shll    $4,%ecx
        movl    $16,%ebx
@@ -842,9 +868,9 @@ aesni_ctr32_encrypt_blocks:
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
        subl    $6,%eax
-       jmp     .L037ctr32_loop6
+       jmp     .L039ctr32_loop6
 .align 16
-.L037ctr32_loop6:
+.L039ctr32_loop6:
        pshufd  $64,%xmm0,%xmm4
        movdqa  32(%esp),%xmm0
        pshufd  $192,%xmm1,%xmm5
@@ -898,27 +924,27 @@ aesni_ctr32_encrypt_blocks:
        leal    96(%edi),%edi
        pshufd  $128,%xmm0,%xmm3
        subl    $6,%eax
-       jnc     .L037ctr32_loop6
+       jnc     .L039ctr32_loop6
        addl    $6,%eax
-       jz      .L038ctr32_ret
+       jz      .L040ctr32_ret
        movdqu  (%ebp),%xmm7
        movl    %ebp,%edx
        pxor    32(%esp),%xmm7
        movl    240(%ebp),%ecx
-.L036ctr32_tail:
+.L038ctr32_tail:
        por     %xmm7,%xmm2
        cmpl    $2,%eax
-       jb      .L039ctr32_one
+       jb      .L041ctr32_one
        pshufd  $64,%xmm0,%xmm4
        por     %xmm7,%xmm3
-       je      .L040ctr32_two
+       je      .L042ctr32_two
        pshufd  $192,%xmm1,%xmm5
        por     %xmm7,%xmm4
        cmpl    $4,%eax
-       jb      .L041ctr32_three
+       jb      .L043ctr32_three
        pshufd  $128,%xmm1,%xmm6
        por     %xmm7,%xmm5
-       je      .L042ctr32_four
+       je      .L044ctr32_four
        por     %xmm7,%xmm6
        call    _aesni_encrypt6
        movups  (%esi),%xmm1
@@ -936,29 +962,29 @@ aesni_ctr32_encrypt_blocks:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L035ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
        movups  (%ebx),%xmm2
        movl    240(%edx),%ecx
-.L039ctr32_one:
+.L041ctr32_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L043enc1_loop_7:
+.L045enc1_loop_7:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L043enc1_loop_7
+       jnz     .L045enc1_loop_7
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        xorps   %xmm2,%xmm6
        movups  %xmm6,(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L040ctr32_two:
+.L042ctr32_two:
        call    _aesni_encrypt2
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -966,9 +992,9 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L041ctr32_three:
+.L043ctr32_three:
        call    _aesni_encrypt3
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -979,9 +1005,9 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm7,%xmm4
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L042ctr32_four:
+.L044ctr32_four:
        call    _aesni_encrypt4
        movups  (%esi),%xmm6
        movups  16(%esi),%xmm7
@@ -995,7 +1021,18 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L038ctr32_ret:
+.L040ctr32_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
        movl    80(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1020,12 +1057,12 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L044enc1_loop_8:
+.L046enc1_loop_8:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L044enc1_loop_8
+       jnz     .L046enc1_loop_8
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1049,14 +1086,14 @@ aesni_xts_encrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        subl    $96,%eax
-       jc      .L045xts_enc_short
+       jc      .L047xts_enc_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     .L046xts_enc_loop6
+       jmp     .L048xts_enc_loop6
 .align 16
-.L046xts_enc_loop6:
+.L048xts_enc_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1145,23 +1182,23 @@ aesni_xts_encrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L046xts_enc_loop6
+       jnc     .L048xts_enc_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L045xts_enc_short:
+.L047xts_enc_short:
        addl    $96,%eax
-       jz      .L047xts_enc_done6x
+       jz      .L049xts_enc_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L048xts_enc_one
+       jb      .L050xts_enc_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L049xts_enc_two
+       je      .L051xts_enc_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1170,7 +1207,7 @@ aesni_xts_encrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L050xts_enc_three
+       jb      .L052xts_enc_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1180,7 +1217,7 @@ aesni_xts_encrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L051xts_enc_four
+       je      .L053xts_enc_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1212,9 +1249,9 @@ aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L048xts_enc_one:
+.L050xts_enc_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1222,20 +1259,20 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L053enc1_loop_9:
+.L055enc1_loop_9:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L053enc1_loop_9
+       jnz     .L055enc1_loop_9
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L049xts_enc_two:
+.L051xts_enc_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1249,9 +1286,9 @@ aesni_xts_encrypt:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L050xts_enc_three:
+.L052xts_enc_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1269,9 +1306,9 @@ aesni_xts_encrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L051xts_enc_four:
+.L053xts_enc_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1293,28 +1330,28 @@ aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L047xts_enc_done6x:
+.L049xts_enc_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L054xts_enc_ret
+       jz      .L056xts_enc_ret
        movdqa  %xmm1,%xmm5
        movl    %eax,112(%esp)
-       jmp     .L055xts_enc_steal
+       jmp     .L057xts_enc_steal
 .align 16
-.L052xts_enc_done:
+.L054xts_enc_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L054xts_enc_ret
+       jz      .L056xts_enc_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm5
        paddq   %xmm1,%xmm1
        pand    96(%esp),%xmm5
        pxor    %xmm1,%xmm5
-.L055xts_enc_steal:
+.L057xts_enc_steal:
        movzbl  (%esi),%ecx
        movzbl  -16(%edi),%edx
        leal    1(%esi),%esi
@@ -1322,7 +1359,7 @@ aesni_xts_encrypt:
        movb    %dl,(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L055xts_enc_steal
+       jnz     .L057xts_enc_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1332,16 +1369,30 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L056enc1_loop_10:
+.L058enc1_loop_10:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L056enc1_loop_10
+       jnz     .L058enc1_loop_10
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,-16(%edi)
-.L054xts_enc_ret:
+.L056xts_enc_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1366,12 +1417,12 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L057enc1_loop_11:
+.L059enc1_loop_11:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L057enc1_loop_11
+       jnz     .L059enc1_loop_11
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1400,14 +1451,14 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        andl    $-16,%eax
        subl    $96,%eax
-       jc      .L058xts_dec_short
+       jc      .L060xts_dec_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     .L059xts_dec_loop6
+       jmp     .L061xts_dec_loop6
 .align 16
-.L059xts_dec_loop6:
+.L061xts_dec_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1496,23 +1547,23 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L059xts_dec_loop6
+       jnc     .L061xts_dec_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L058xts_dec_short:
+.L060xts_dec_short:
        addl    $96,%eax
-       jz      .L060xts_dec_done6x
+       jz      .L062xts_dec_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L061xts_dec_one
+       jb      .L063xts_dec_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L062xts_dec_two
+       je      .L064xts_dec_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1521,7 +1572,7 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L063xts_dec_three
+       jb      .L065xts_dec_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1531,7 +1582,7 @@ aesni_xts_decrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L064xts_dec_four
+       je      .L066xts_dec_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1563,9 +1614,9 @@ aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L061xts_dec_one:
+.L063xts_dec_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1573,20 +1624,20 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L066dec1_loop_12:
+.L068dec1_loop_12:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L066dec1_loop_12
+       jnz     .L068dec1_loop_12
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L062xts_dec_two:
+.L064xts_dec_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1600,9 +1651,9 @@ aesni_xts_decrypt:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L063xts_dec_three:
+.L065xts_dec_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1620,9 +1671,9 @@ aesni_xts_decrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L064xts_dec_four:
+.L066xts_dec_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1644,20 +1695,20 @@ aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L060xts_dec_done6x:
+.L062xts_dec_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L067xts_dec_ret
+       jz      .L069xts_dec_ret
        movl    %eax,112(%esp)
-       jmp     .L068xts_dec_only_one_more
+       jmp     .L070xts_dec_only_one_more
 .align 16
-.L065xts_dec_done:
+.L067xts_dec_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L067xts_dec_ret
+       jz      .L069xts_dec_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm2
@@ -1667,7 +1718,7 @@ aesni_xts_decrypt:
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-.L068xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
        pshufd  $19,%xmm0,%xmm5
        movdqa  %xmm1,%xmm6
        paddq   %xmm1,%xmm1
@@ -1681,16 +1732,16 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L069dec1_loop_13:
+.L071dec1_loop_13:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L069dec1_loop_13
+       jnz     .L071dec1_loop_13
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
-.L070xts_dec_steal:
+.L072xts_dec_steal:
        movzbl  16(%esi),%ecx
        movzbl  (%edi),%edx
        leal    1(%esi),%esi
@@ -1698,7 +1749,7 @@ aesni_xts_decrypt:
        movb    %dl,16(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L070xts_dec_steal
+       jnz     .L072xts_dec_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1708,16 +1759,30 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L071dec1_loop_14:
+.L073dec1_loop_14:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L071dec1_loop_14
+       jnz     .L073dec1_loop_14
 .byte  102,15,56,223,209
        xorps   %xmm6,%xmm2
        movups  %xmm2,(%edi)
-.L067xts_dec_ret:
+.L069xts_dec_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1743,7 +1808,7 @@ aesni_cbc_encrypt:
        movl    32(%esp),%edx
        movl    36(%esp),%ebp
        testl   %eax,%eax
-       jz      .L072cbc_abort
+       jz      .L074cbc_abort
        cmpl    $0,40(%esp)
        xchgl   %esp,%ebx
        movups  (%ebp),%xmm7
@@ -1751,14 +1816,14 @@ aesni_cbc_encrypt:
        movl    %edx,%ebp
        movl    %ebx,16(%esp)
        movl    %ecx,%ebx
-       je      .L073cbc_decrypt
+       je      .L075cbc_decrypt
        movaps  %xmm7,%xmm2
        cmpl    $16,%eax
-       jb      .L074cbc_enc_tail
+       jb      .L076cbc_enc_tail
        subl    $16,%eax
-       jmp     .L075cbc_enc_loop
+       jmp     .L077cbc_enc_loop
 .align 16
-.L075cbc_enc_loop:
+.L077cbc_enc_loop:
        movups  (%esi),%xmm7
        leal    16(%esi),%esi
        movups  (%edx),%xmm0
@@ -1766,24 +1831,25 @@ aesni_cbc_encrypt:
        xorps   %xmm0,%xmm7
        leal    32(%edx),%edx
        xorps   %xmm7,%xmm2
-.L076enc1_loop_15:
+.L078enc1_loop_15:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L076enc1_loop_15
+       jnz     .L078enc1_loop_15
 .byte  102,15,56,221,209
        movl    %ebx,%ecx
        movl    %ebp,%edx
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        subl    $16,%eax
-       jnc     .L075cbc_enc_loop
+       jnc     .L077cbc_enc_loop
        addl    $16,%eax
-       jnz     .L074cbc_enc_tail
+       jnz     .L076cbc_enc_tail
        movaps  %xmm2,%xmm7
-       jmp     .L077cbc_ret
-.L074cbc_enc_tail:
+       pxor    %xmm2,%xmm2
+       jmp     .L079cbc_ret
+.L076cbc_enc_tail:
        movl    %eax,%ecx
 .long  2767451785
        movl    $16,%ecx
@@ -1794,20 +1860,20 @@ aesni_cbc_encrypt:
        movl    %ebx,%ecx
        movl    %edi,%esi
        movl    %ebp,%edx
-       jmp     .L075cbc_enc_loop
+       jmp     .L077cbc_enc_loop
 .align 16
-.L073cbc_decrypt:
+.L075cbc_decrypt:
        cmpl    $80,%eax
-       jbe     .L078cbc_dec_tail
+       jbe     .L080cbc_dec_tail
        movaps  %xmm7,(%esp)
        subl    $80,%eax
-       jmp     .L079cbc_dec_loop6_enter
+       jmp     .L081cbc_dec_loop6_enter
 .align 16
-.L080cbc_dec_loop6:
+.L082cbc_dec_loop6:
        movaps  %xmm0,(%esp)
        movups  %xmm7,(%edi)
        leal    16(%edi),%edi
-.L079cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -1837,28 +1903,28 @@ aesni_cbc_encrypt:
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
        subl    $96,%eax
-       ja      .L080cbc_dec_loop6
+       ja      .L082cbc_dec_loop6
        movaps  %xmm7,%xmm2
        movaps  %xmm0,%xmm7
        addl    $80,%eax
-       jle     .L081cbc_dec_tail_collected
+       jle     .L083cbc_dec_clear_tail_collected
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-.L078cbc_dec_tail:
+.L080cbc_dec_tail:
        movups  (%esi),%xmm2
        movaps  %xmm2,%xmm6
        cmpl    $16,%eax
-       jbe     .L082cbc_dec_one
+       jbe     .L084cbc_dec_one
        movups  16(%esi),%xmm3
        movaps  %xmm3,%xmm5
        cmpl    $32,%eax
-       jbe     .L083cbc_dec_two
+       jbe     .L085cbc_dec_two
        movups  32(%esi),%xmm4
        cmpl    $48,%eax
-       jbe     .L084cbc_dec_three
+       jbe     .L086cbc_dec_three
        movups  48(%esi),%xmm5
        cmpl    $64,%eax
-       jbe     .L085cbc_dec_four
+       jbe     .L087cbc_dec_four
        movups  64(%esi),%xmm6
        movaps  %xmm7,(%esp)
        movups  (%esi),%xmm2
@@ -1876,55 +1942,62 @@ aesni_cbc_encrypt:
        xorps   %xmm0,%xmm6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%edi)
+       pxor    %xmm5,%xmm5
        leal    64(%edi),%edi
        movaps  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
        subl    $80,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L082cbc_dec_one:
+.L084cbc_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L086dec1_loop_16:
+.L089dec1_loop_16:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L086dec1_loop_16
+       jnz     .L089dec1_loop_16
 .byte  102,15,56,223,209
        xorps   %xmm7,%xmm2
        movaps  %xmm6,%xmm7
        subl    $16,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L083cbc_dec_two:
+.L085cbc_dec_two:
        call    _aesni_decrypt2
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movaps  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leal    16(%edi),%edi
        movaps  %xmm5,%xmm7
        subl    $32,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L084cbc_dec_three:
+.L086cbc_dec_three:
        call    _aesni_decrypt3
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        xorps   %xmm5,%xmm4
        movups  %xmm2,(%edi)
        movaps  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        leal    32(%edi),%edi
        movups  32(%esi),%xmm7
        subl    $48,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L085cbc_dec_four:
+.L087cbc_dec_four:
        call    _aesni_decrypt4
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm0
@@ -1934,28 +2007,44 @@ aesni_cbc_encrypt:
        movups  %xmm2,(%edi)
        xorps   %xmm1,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        leal    48(%edi),%edi
        movaps  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        subl    $64,%eax
-.L081cbc_dec_tail_collected:
+       jmp     .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
        andl    $15,%eax
-       jnz     .L087cbc_dec_tail_partial
+       jnz     .L090cbc_dec_tail_partial
        movups  %xmm2,(%edi)
-       jmp     .L077cbc_ret
+       pxor    %xmm0,%xmm0
+       jmp     .L079cbc_ret
 .align 16
-.L087cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
        movaps  %xmm2,(%esp)
+       pxor    %xmm0,%xmm0
        movl    $16,%ecx
        movl    %esp,%esi
        subl    %eax,%ecx
 .long  2767451785
-.L077cbc_ret:
+       movdqa  %xmm2,(%esp)
+.L079cbc_ret:
        movl    16(%esp),%esp
        movl    36(%esp),%ebp
+       pxor    %xmm2,%xmm2
+       pxor    %xmm1,%xmm1
        movups  %xmm7,(%ebp)
-.L072cbc_abort:
+       pxor    %xmm7,%xmm7
+.L074cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -1965,52 +2054,62 @@ aesni_cbc_encrypt:
 .type  _aesni_set_encrypt_key,@function
 .align 16
 _aesni_set_encrypt_key:
+       pushl   %ebp
+       pushl   %ebx
        testl   %eax,%eax
-       jz      .L088bad_pointer
+       jz      .L091bad_pointer
        testl   %edx,%edx
-       jz      .L088bad_pointer
+       jz      .L091bad_pointer
+       call    .L092pic
+.L092pic:
+       popl    %ebx
+       leal    .Lkey_const-.L092pic(%ebx),%ebx
+       leal    OPENSSL_ia32cap_P,%ebp
        movups  (%eax),%xmm0
        xorps   %xmm4,%xmm4
+       movl    4(%ebp),%ebp
        leal    16(%edx),%edx
+       andl    $268437504,%ebp
        cmpl    $256,%ecx
-       je      .L08914rounds
+       je      .L09314rounds
        cmpl    $192,%ecx
-       je      .L09012rounds
+       je      .L09412rounds
        cmpl    $128,%ecx
-       jne     .L091bad_keybits
+       jne     .L095bad_keybits
 .align 16
-.L09210rounds:
+.L09610rounds:
+       cmpl    $268435456,%ebp
+       je      .L09710rounds_alt
        movl    $9,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,200,1
-       call    .L093key_128_cold
+       call    .L098key_128_cold
 .byte  102,15,58,223,200,2
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,4
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,8
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,16
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,32
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,64
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,128
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,27
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,54
-       call    .L094key_128
+       call    .L099key_128
        movups  %xmm0,(%edx)
        movl    %ecx,80(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L100good_key
 .align 16
-.L094key_128:
+.L099key_128:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
-.L093key_128_cold:
+.L098key_128_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2019,38 +2118,91 @@ _aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L09012rounds:
+.L09710rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movl    $8,%ecx
+       movdqa  32(%ebx),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,-16(%edx)
+.L101loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leal    16(%edx),%edx
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%edx)
+       movdqa  %xmm0,%xmm2
+       decl    %ecx
+       jnz     .L101loop_key128
+       movdqa  48(%ebx),%xmm4
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%edx)
+       movl    $9,%ecx
+       movl    %ecx,96(%edx)
+       jmp     .L100good_key
+.align 16
+.L09412rounds:
        movq    16(%eax),%xmm2
+       cmpl    $268435456,%ebp
+       je      .L10212rounds_alt
        movl    $11,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L095key_192a_cold
+       call    .L103key_192a_cold
 .byte  102,15,58,223,202,2
-       call    .L096key_192b
+       call    .L104key_192b
 .byte  102,15,58,223,202,4
-       call    .L097key_192a
+       call    .L105key_192a
 .byte  102,15,58,223,202,8
-       call    .L096key_192b
+       call    .L104key_192b
 .byte  102,15,58,223,202,16
-       call    .L097key_192a
+       call    .L105key_192a
 .byte  102,15,58,223,202,32
-       call    .L096key_192b
+       call    .L104key_192b
 .byte  102,15,58,223,202,64
-       call    .L097key_192a
+       call    .L105key_192a
 .byte  102,15,58,223,202,128
-       call    .L096key_192b
+       call    .L104key_192b
        movups  %xmm0,(%edx)
        movl    %ecx,48(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L100good_key
 .align 16
-.L097key_192a:
+.L105key_192a:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
 .align 16
-.L095key_192a_cold:
+.L103key_192a_cold:
        movaps  %xmm2,%xmm5
-.L098key_192b_warm:
+.L106key_192b_warm:
        shufps  $16,%xmm0,%xmm4
        movdqa  %xmm2,%xmm3
        xorps   %xmm4,%xmm0
@@ -2064,56 +2216,90 @@ _aesni_set_encrypt_key:
        pxor    %xmm3,%xmm2
        ret
 .align 16
-.L096key_192b:
+.L104key_192b:
        movaps  %xmm0,%xmm3
        shufps  $68,%xmm0,%xmm5
        movups  %xmm5,(%edx)
        shufps  $78,%xmm2,%xmm3
        movups  %xmm3,16(%edx)
        leal    32(%edx),%edx
-       jmp     .L098key_192b_warm
+       jmp     .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+       movdqa  16(%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $8,%ecx
+       movdqu  %xmm0,-16(%edx)
+.L107loop_key192:
+       movq    %xmm2,(%edx)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leal    24(%edx),%edx
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%edx)
+       decl    %ecx
+       jnz     .L107loop_key192
+       movl    $11,%ecx
+       movl    %ecx,32(%edx)
+       jmp     .L100good_key
 .align 16
-.L08914rounds:
+.L09314rounds:
        movups  16(%eax),%xmm2
-       movl    $13,%ecx
        leal    16(%edx),%edx
+       cmpl    $268435456,%ebp
+       je      .L10814rounds_alt
+       movl    $13,%ecx
        movups  %xmm0,-32(%edx)
        movups  %xmm2,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L099key_256a_cold
+       call    .L109key_256a_cold
 .byte  102,15,58,223,200,1
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,2
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,2
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,4
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,4
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,8
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,8
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,16
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,16
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,32
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,32
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,64
-       call    .L101key_256a
+       call    .L111key_256a
        movups  %xmm0,(%edx)
        movl    %ecx,16(%edx)
        xorl    %eax,%eax
-       ret
+       jmp     .L100good_key
 .align 16
-.L101key_256a:
+.L111key_256a:
        movups  %xmm2,(%edx)
        leal    16(%edx),%edx
-.L099key_256a_cold:
+.L109key_256a_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2122,7 +2308,7 @@ _aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L100key_256b:
+.L110key_256b:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
        shufps  $16,%xmm2,%xmm4
@@ -2132,13 +2318,70 @@ _aesni_set_encrypt_key:
        shufps  $170,%xmm1,%xmm1
        xorps   %xmm1,%xmm2
        ret
+.align 16
+.L10814rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $7,%ecx
+       movdqu  %xmm0,-32(%edx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,-16(%edx)
+.L112loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       decl    %ecx
+       jz      .L113done_key256
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%edx)
+       leal    32(%edx),%edx
+       movdqa  %xmm2,%xmm1
+       jmp     .L112loop_key256
+.L113done_key256:
+       movl    $13,%ecx
+       movl    %ecx,16(%edx)
+.L100good_key:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorl    %eax,%eax
+       popl    %ebx
+       popl    %ebp
+       ret
 .align 4
-.L088bad_pointer:
+.L091bad_pointer:
        movl    $-1,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .align 4
-.L091bad_keybits:
+.L095bad_keybits:
+       pxor    %xmm0,%xmm0
        movl    $-2,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .size  _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
 .globl aesni_set_encrypt_key
@@ -2164,7 +2407,7 @@ aesni_set_decrypt_key:
        movl    12(%esp),%edx
        shll    $4,%ecx
        testl   %eax,%eax
-       jnz     .L102dec_key_ret
+       jnz     .L114dec_key_ret
        leal    16(%edx,%ecx,1),%eax
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
@@ -2172,7 +2415,7 @@ aesni_set_decrypt_key:
        movups  %xmm1,(%edx)
        leal    16(%edx),%edx
        leal    -16(%eax),%eax
-.L103dec_key_inverse:
+.L115dec_key_inverse:
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
 .byte  102,15,56,219,192
@@ -2182,15 +2425,24 @@ aesni_set_decrypt_key:
        movups  %xmm0,16(%eax)
        movups  %xmm1,-16(%edx)
        cmpl    %edx,%eax
-       ja      .L103dec_key_inverse
+       ja      .L115dec_key_inverse
        movups  (%edx),%xmm0
 .byte  102,15,56,219,192
        movups  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        xorl    %eax,%eax
-.L102dec_key_ret:
+.L114dec_key_ret:
        ret
 .size  aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long  202313229,202313229,202313229,202313229
+.long  67569157,67569157,67569157,67569157
+.long  1,1,1,1
+.long  27,27,27,27
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte  83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
+.comm  OPENSSL_ia32cap_P,16,4
index cecd5f8..c1f5aec 100644
@@ -20,7 +20,10 @@ L000enc1_loop_1:
        leal    16(%edx),%edx
        jnz     L000enc1_loop_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .globl _aesni_decrypt
 .align 4
@@ -42,7 +45,10 @@ L001dec1_loop_2:
        leal    16(%edx),%edx
        jnz     L001dec1_loop_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .align 4
 __aesni_encrypt2:
@@ -242,17 +248,15 @@ __aesni_encrypt6:
        negl    %ecx
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     L_aesni_encrypt6_enter
+       jmp     L008_aesni_encrypt6_inner
 .align 4,0x90
-L008enc6_loop:
+L009enc6_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+L008_aesni_encrypt6_inner:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
@@ -266,7 +270,7 @@ L_aesni_encrypt6_enter:
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L008enc6_loop
+       jnz     L009enc6_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -296,17 +300,15 @@ __aesni_decrypt6:
        negl    %ecx
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     L_aesni_decrypt6_enter
+       jmp     L010_aesni_decrypt6_inner
 .align 4,0x90
-L009dec6_loop:
+L011dec6_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+L010_aesni_decrypt6_inner:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
@@ -320,7 +322,7 @@ L_aesni_decrypt6_enter:
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L009dec6_loop
+       jnz     L011dec6_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -348,14 +350,14 @@ L_aesni_ecb_encrypt_begin:
        movl    32(%esp),%edx
        movl    36(%esp),%ebx
        andl    $-16,%eax
-       jz      L010ecb_ret
+       jz      L012ecb_ret
        movl    240(%edx),%ecx
        testl   %ebx,%ebx
-       jz      L011ecb_decrypt
+       jz      L013ecb_decrypt
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      L012ecb_enc_tail
+       jb      L014ecb_enc_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -364,9 +366,9 @@ L_aesni_ecb_encrypt_begin:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     L013ecb_enc_loop6_enter
+       jmp     L015ecb_enc_loop6_enter
 .align 4,0x90
-L014ecb_enc_loop6:
+L016ecb_enc_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -381,12 +383,12 @@ L014ecb_enc_loop6:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-L013ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
        call    __aesni_encrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     L014ecb_enc_loop6
+       jnc     L016ecb_enc_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -395,18 +397,18 @@ L013ecb_enc_loop6_enter:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      L010ecb_ret
-L012ecb_enc_tail:
+       jz      L012ecb_ret
+L014ecb_enc_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      L015ecb_enc_one
+       jb      L017ecb_enc_one
        movups  16(%esi),%xmm3
-       je      L016ecb_enc_two
+       je      L018ecb_enc_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      L017ecb_enc_three
+       jb      L019ecb_enc_three
        movups  48(%esi),%xmm5
-       je      L018ecb_enc_four
+       je      L020ecb_enc_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_encrypt6
@@ -415,49 +417,49 @@ L012ecb_enc_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L015ecb_enc_one:
+L017ecb_enc_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L019enc1_loop_3:
+L021enc1_loop_3:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L019enc1_loop_3
+       jnz     L021enc1_loop_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L016ecb_enc_two:
+L018ecb_enc_two:
        call    __aesni_encrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L017ecb_enc_three:
+L019ecb_enc_three:
        call    __aesni_encrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L018ecb_enc_four:
+L020ecb_enc_four:
        call    __aesni_encrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L011ecb_decrypt:
+L013ecb_decrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      L020ecb_dec_tail
+       jb      L022ecb_dec_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -466,9 +468,9 @@ L011ecb_decrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     L021ecb_dec_loop6_enter
+       jmp     L023ecb_dec_loop6_enter
 .align 4,0x90
-L022ecb_dec_loop6:
+L024ecb_dec_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -483,12 +485,12 @@ L022ecb_dec_loop6:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-L021ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
        call    __aesni_decrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     L022ecb_dec_loop6
+       jnc     L024ecb_dec_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -497,18 +499,18 @@ L021ecb_dec_loop6_enter:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      L010ecb_ret
-L020ecb_dec_tail:
+       jz      L012ecb_ret
+L022ecb_dec_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      L023ecb_dec_one
+       jb      L025ecb_dec_one
        movups  16(%esi),%xmm3
-       je      L024ecb_dec_two
+       je      L026ecb_dec_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      L025ecb_dec_three
+       jb      L027ecb_dec_three
        movups  48(%esi),%xmm5
-       je      L026ecb_dec_four
+       je      L028ecb_dec_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_decrypt6
@@ -517,43 +519,51 @@ L020ecb_dec_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L023ecb_dec_one:
+L025ecb_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L027dec1_loop_4:
+L029dec1_loop_4:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L027dec1_loop_4
+       jnz     L029dec1_loop_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L024ecb_dec_two:
+L026ecb_dec_two:
        call    __aesni_decrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L025ecb_dec_three:
+L027ecb_dec_three:
        call    __aesni_decrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L026ecb_dec_four:
+L028ecb_dec_four:
        call    __aesni_decrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-L010ecb_ret:
+L012ecb_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -598,7 +608,7 @@ L_aesni_ccm64_encrypt_blocks_begin:
        leal    32(%edx,%ecx,1),%edx
        subl    %ecx,%ebx
 .byte  102,15,56,0,253
-L028ccm64_enc_outer:
+L030ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  (%esi),%xmm6
@@ -607,7 +617,7 @@ L028ccm64_enc_outer:
        xorps   %xmm6,%xmm0
        xorps   %xmm0,%xmm3
        movups  32(%ebp),%xmm0
-L029ccm64_enc2_loop:
+L031ccm64_enc2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -615,7 +625,7 @@ L029ccm64_enc2_loop:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L029ccm64_enc2_loop
+       jnz     L031ccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
@@ -628,10 +638,18 @@ L029ccm64_enc2_loop:
        movups  %xmm6,(%edi)
 .byte  102,15,56,0,213
        leal    16(%edi),%edi
-       jnz     L028ccm64_enc_outer
+       jnz     L030ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -677,12 +695,12 @@ L_aesni_ccm64_decrypt_blocks_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L030enc1_loop_5:
+L032enc1_loop_5:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L030enc1_loop_5
+       jnz     L032enc1_loop_5
 .byte  102,15,56,221,209
        shll    $4,%ebx
        movl    $16,%ecx
@@ -692,16 +710,16 @@ L030enc1_loop_5:
        subl    %ebx,%ecx
        leal    32(%ebp,%ebx,1),%edx
        movl    %ecx,%ebx
-       jmp     L031ccm64_dec_outer
+       jmp     L033ccm64_dec_outer
 .align 4,0x90
-L031ccm64_dec_outer:
+L033ccm64_dec_outer:
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
 .byte  102,15,56,0,213
        subl    $1,%eax
-       jz      L032ccm64_dec_break
+       jz      L034ccm64_dec_break
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  16(%ebp),%xmm1
@@ -709,7 +727,7 @@ L031ccm64_dec_outer:
        xorps   %xmm0,%xmm2
        xorps   %xmm6,%xmm3
        movups  32(%ebp),%xmm0
-L033ccm64_dec2_loop:
+L035ccm64_dec2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -717,7 +735,7 @@ L033ccm64_dec2_loop:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L033ccm64_dec2_loop
+       jnz     L035ccm64_dec2_loop
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
@@ -725,9 +743,9 @@ L033ccm64_dec2_loop:
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
        leal    16(%esi),%esi
-       jmp     L031ccm64_dec_outer
+       jmp     L033ccm64_dec_outer
 .align 4,0x90
-L032ccm64_dec_break:
+L034ccm64_dec_break:
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movups  (%edx),%xmm0
@@ -735,16 +753,24 @@ L032ccm64_dec_break:
        xorps   %xmm0,%xmm6
        leal    32(%edx),%edx
        xorps   %xmm6,%xmm3
-L034enc1_loop_6:
+L036enc1_loop_6:
 .byte  102,15,56,220,217
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L034enc1_loop_6
+       jnz     L036enc1_loop_6
 .byte  102,15,56,221,217
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -768,7 +794,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
        andl    $-16,%esp
        movl    %ebp,80(%esp)
        cmpl    $1,%eax
-       je      L035ctr32_one_shortcut
+       je      L037ctr32_one_shortcut
        movdqu  (%ebx),%xmm7
        movl    $202182159,(%esp)
        movl    $134810123,4(%esp)
@@ -806,7 +832,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
        pshufd  $192,%xmm0,%xmm2
        pshufd  $128,%xmm0,%xmm3
        cmpl    $6,%eax
-       jb      L036ctr32_tail
+       jb      L038ctr32_tail
        pxor    %xmm6,%xmm7
        shll    $4,%ecx
        movl    $16,%ebx
@@ -815,9 +841,9 @@ L_aesni_ctr32_encrypt_blocks_begin:
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
        subl    $6,%eax
-       jmp     L037ctr32_loop6
+       jmp     L039ctr32_loop6
 .align 4,0x90
-L037ctr32_loop6:
+L039ctr32_loop6:
        pshufd  $64,%xmm0,%xmm4
        movdqa  32(%esp),%xmm0
        pshufd  $192,%xmm1,%xmm5
@@ -871,27 +897,27 @@ L037ctr32_loop6:
        leal    96(%edi),%edi
        pshufd  $128,%xmm0,%xmm3
        subl    $6,%eax
-       jnc     L037ctr32_loop6
+       jnc     L039ctr32_loop6
        addl    $6,%eax
-       jz      L038ctr32_ret
+       jz      L040ctr32_ret
        movdqu  (%ebp),%xmm7
        movl    %ebp,%edx
        pxor    32(%esp),%xmm7
        movl    240(%ebp),%ecx
-L036ctr32_tail:
+L038ctr32_tail:
        por     %xmm7,%xmm2
        cmpl    $2,%eax
-       jb      L039ctr32_one
+       jb      L041ctr32_one
        pshufd  $64,%xmm0,%xmm4
        por     %xmm7,%xmm3
-       je      L040ctr32_two
+       je      L042ctr32_two
        pshufd  $192,%xmm1,%xmm5
        por     %xmm7,%xmm4
        cmpl    $4,%eax
-       jb      L041ctr32_three
+       jb      L043ctr32_three
        pshufd  $128,%xmm1,%xmm6
        por     %xmm7,%xmm5
-       je      L042ctr32_four
+       je      L044ctr32_four
        por     %xmm7,%xmm6
        call    __aesni_encrypt6
        movups  (%esi),%xmm1
@@ -909,29 +935,29 @@ L036ctr32_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L035ctr32_one_shortcut:
+L037ctr32_one_shortcut:
        movups  (%ebx),%xmm2
        movl    240(%edx),%ecx
-L039ctr32_one:
+L041ctr32_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L043enc1_loop_7:
+L045enc1_loop_7:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L043enc1_loop_7
+       jnz     L045enc1_loop_7
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        xorps   %xmm2,%xmm6
        movups  %xmm6,(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L040ctr32_two:
+L042ctr32_two:
        call    __aesni_encrypt2
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -939,9 +965,9 @@ L040ctr32_two:
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L041ctr32_three:
+L043ctr32_three:
        call    __aesni_encrypt3
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -952,9 +978,9 @@ L041ctr32_three:
        xorps   %xmm7,%xmm4
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L042ctr32_four:
+L044ctr32_four:
        call    __aesni_encrypt4
        movups  (%esi),%xmm6
        movups  16(%esi),%xmm7
@@ -968,7 +994,18 @@ L042ctr32_four:
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-L038ctr32_ret:
+L040ctr32_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
        movl    80(%esp),%esp
        popl    %edi
        popl    %esi
@@ -991,12 +1028,12 @@ L_aesni_xts_encrypt_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L044enc1_loop_8:
+L046enc1_loop_8:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L044enc1_loop_8
+       jnz     L046enc1_loop_8
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1020,14 +1057,14 @@ L044enc1_loop_8:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        subl    $96,%eax
-       jc      L045xts_enc_short
+       jc      L047xts_enc_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     L046xts_enc_loop6
+       jmp     L048xts_enc_loop6
 .align 4,0x90
-L046xts_enc_loop6:
+L048xts_enc_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1116,23 +1153,23 @@ L046xts_enc_loop6:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     L046xts_enc_loop6
+       jnc     L048xts_enc_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-L045xts_enc_short:
+L047xts_enc_short:
        addl    $96,%eax
-       jz      L047xts_enc_done6x
+       jz      L049xts_enc_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      L048xts_enc_one
+       jb      L050xts_enc_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      L049xts_enc_two
+       je      L051xts_enc_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1141,7 +1178,7 @@ L045xts_enc_short:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      L050xts_enc_three
+       jb      L052xts_enc_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1151,7 +1188,7 @@ L045xts_enc_short:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      L051xts_enc_four
+       je      L053xts_enc_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1183,9 +1220,9 @@ L045xts_enc_short:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L048xts_enc_one:
+L050xts_enc_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1193,20 +1230,20 @@ L048xts_enc_one:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L053enc1_loop_9:
+L055enc1_loop_9:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L053enc1_loop_9
+       jnz     L055enc1_loop_9
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L049xts_enc_two:
+L051xts_enc_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1220,9 +1257,9 @@ L049xts_enc_two:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L050xts_enc_three:
+L052xts_enc_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1240,9 +1277,9 @@ L050xts_enc_three:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L051xts_enc_four:
+L053xts_enc_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1264,28 +1301,28 @@ L051xts_enc_four:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L047xts_enc_done6x:
+L049xts_enc_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      L054xts_enc_ret
+       jz      L056xts_enc_ret
        movdqa  %xmm1,%xmm5
        movl    %eax,112(%esp)
-       jmp     L055xts_enc_steal
+       jmp     L057xts_enc_steal
 .align 4,0x90
-L052xts_enc_done:
+L054xts_enc_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      L054xts_enc_ret
+       jz      L056xts_enc_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm5
        paddq   %xmm1,%xmm1
        pand    96(%esp),%xmm5
        pxor    %xmm1,%xmm5
-L055xts_enc_steal:
+L057xts_enc_steal:
        movzbl  (%esi),%ecx
        movzbl  -16(%edi),%edx
        leal    1(%esi),%esi
@@ -1293,7 +1330,7 @@ L055xts_enc_steal:
        movb    %dl,(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     L055xts_enc_steal
+       jnz     L057xts_enc_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1303,16 +1340,30 @@ L055xts_enc_steal:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L056enc1_loop_10:
+L058enc1_loop_10:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L056enc1_loop_10
+       jnz     L058enc1_loop_10
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,-16(%edi)
-L054xts_enc_ret:
+L056xts_enc_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1335,12 +1386,12 @@ L_aesni_xts_decrypt_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L057enc1_loop_11:
+L059enc1_loop_11:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L057enc1_loop_11
+       jnz     L059enc1_loop_11
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1369,14 +1420,14 @@ L057enc1_loop_11:
        pcmpgtd %xmm1,%xmm0
        andl    $-16,%eax
        subl    $96,%eax
-       jc      L058xts_dec_short
+       jc      L060xts_dec_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     L059xts_dec_loop6
+       jmp     L061xts_dec_loop6
 .align 4,0x90
-L059xts_dec_loop6:
+L061xts_dec_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1465,23 +1516,23 @@ L059xts_dec_loop6:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     L059xts_dec_loop6
+       jnc     L061xts_dec_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-L058xts_dec_short:
+L060xts_dec_short:
        addl    $96,%eax
-       jz      L060xts_dec_done6x
+       jz      L062xts_dec_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      L061xts_dec_one
+       jb      L063xts_dec_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      L062xts_dec_two
+       je      L064xts_dec_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1490,7 +1541,7 @@ L058xts_dec_short:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      L063xts_dec_three
+       jb      L065xts_dec_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1500,7 +1551,7 @@ L058xts_dec_short:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      L064xts_dec_four
+       je      L066xts_dec_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1532,9 +1583,9 @@ L058xts_dec_short:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L061xts_dec_one:
+L063xts_dec_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1542,20 +1593,20 @@ L061xts_dec_one:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L066dec1_loop_12:
+L068dec1_loop_12:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L066dec1_loop_12
+       jnz     L068dec1_loop_12
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L062xts_dec_two:
+L064xts_dec_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1569,9 +1620,9 @@ L062xts_dec_two:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L063xts_dec_three:
+L065xts_dec_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1589,9 +1640,9 @@ L063xts_dec_three:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L064xts_dec_four:
+L066xts_dec_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1613,20 +1664,20 @@ L064xts_dec_four:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L060xts_dec_done6x:
+L062xts_dec_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      L067xts_dec_ret
+       jz      L069xts_dec_ret
        movl    %eax,112(%esp)
-       jmp     L068xts_dec_only_one_more
+       jmp     L070xts_dec_only_one_more
 .align 4,0x90
-L065xts_dec_done:
+L067xts_dec_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      L067xts_dec_ret
+       jz      L069xts_dec_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm2
@@ -1636,7 +1687,7 @@ L065xts_dec_done:
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-L068xts_dec_only_one_more:
+L070xts_dec_only_one_more:
        pshufd  $19,%xmm0,%xmm5
        movdqa  %xmm1,%xmm6
        paddq   %xmm1,%xmm1
@@ -1650,16 +1701,16 @@ L068xts_dec_only_one_more:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L069dec1_loop_13:
+L071dec1_loop_13:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L069dec1_loop_13
+       jnz     L071dec1_loop_13
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
-L070xts_dec_steal:
+L072xts_dec_steal:
        movzbl  16(%esi),%ecx
        movzbl  (%edi),%edx
        leal    1(%esi),%esi
@@ -1667,7 +1718,7 @@ L070xts_dec_steal:
        movb    %dl,16(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     L070xts_dec_steal
+       jnz     L072xts_dec_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1677,16 +1728,30 @@ L070xts_dec_steal:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L071dec1_loop_14:
+L073dec1_loop_14:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L071dec1_loop_14
+       jnz     L073dec1_loop_14
 .byte  102,15,56,223,209
        xorps   %xmm6,%xmm2
        movups  %xmm2,(%edi)
-L067xts_dec_ret:
+L069xts_dec_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1710,7 +1775,7 @@ L_aesni_cbc_encrypt_begin:
        movl    32(%esp),%edx
        movl    36(%esp),%ebp
        testl   %eax,%eax
-       jz      L072cbc_abort
+       jz      L074cbc_abort
        cmpl    $0,40(%esp)
        xchgl   %esp,%ebx
        movups  (%ebp),%xmm7
@@ -1718,14 +1783,14 @@ L_aesni_cbc_encrypt_begin:
        movl    %edx,%ebp
        movl    %ebx,16(%esp)
        movl    %ecx,%ebx
-       je      L073cbc_decrypt
+       je      L075cbc_decrypt
        movaps  %xmm7,%xmm2
        cmpl    $16,%eax
-       jb      L074cbc_enc_tail
+       jb      L076cbc_enc_tail
        subl    $16,%eax
-       jmp     L075cbc_enc_loop
+       jmp     L077cbc_enc_loop
 .align 4,0x90
-L075cbc_enc_loop:
+L077cbc_enc_loop:
        movups  (%esi),%xmm7
        leal    16(%esi),%esi
        movups  (%edx),%xmm0
@@ -1733,24 +1798,25 @@ L075cbc_enc_loop:
        xorps   %xmm0,%xmm7
        leal    32(%edx),%edx
        xorps   %xmm7,%xmm2
-L076enc1_loop_15:
+L078enc1_loop_15:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L076enc1_loop_15
+       jnz     L078enc1_loop_15
 .byte  102,15,56,221,209
        movl    %ebx,%ecx
        movl    %ebp,%edx
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        subl    $16,%eax
-       jnc     L075cbc_enc_loop
+       jnc     L077cbc_enc_loop
        addl    $16,%eax
-       jnz     L074cbc_enc_tail
+       jnz     L076cbc_enc_tail
        movaps  %xmm2,%xmm7
-       jmp     L077cbc_ret
-L074cbc_enc_tail:
+       pxor    %xmm2,%xmm2
+       jmp     L079cbc_ret
+L076cbc_enc_tail:
        movl    %eax,%ecx
 .long  2767451785
        movl    $16,%ecx
@@ -1761,20 +1827,20 @@ L074cbc_enc_tail:
        movl    %ebx,%ecx
        movl    %edi,%esi
        movl    %ebp,%edx
-       jmp     L075cbc_enc_loop
+       jmp     L077cbc_enc_loop
 .align 4,0x90
-L073cbc_decrypt:
+L075cbc_decrypt:
        cmpl    $80,%eax
-       jbe     L078cbc_dec_tail
+       jbe     L080cbc_dec_tail
        movaps  %xmm7,(%esp)
        subl    $80,%eax
-       jmp     L079cbc_dec_loop6_enter
+       jmp     L081cbc_dec_loop6_enter
 .align 4,0x90
-L080cbc_dec_loop6:
+L082cbc_dec_loop6:
        movaps  %xmm0,(%esp)
        movups  %xmm7,(%edi)
        leal    16(%edi),%edi
-L079cbc_dec_loop6_enter:
+L081cbc_dec_loop6_enter:
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -1804,28 +1870,28 @@ L079cbc_dec_loop6_enter:
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
        subl    $96,%eax
-       ja      L080cbc_dec_loop6
+       ja      L082cbc_dec_loop6
        movaps  %xmm7,%xmm2
        movaps  %xmm0,%xmm7
        addl    $80,%eax
-       jle     L081cbc_dec_tail_collected
+       jle     L083cbc_dec_clear_tail_collected
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-L078cbc_dec_tail:
+L080cbc_dec_tail:
        movups  (%esi),%xmm2
        movaps  %xmm2,%xmm6
        cmpl    $16,%eax
-       jbe     L082cbc_dec_one
+       jbe     L084cbc_dec_one
        movups  16(%esi),%xmm3
        movaps  %xmm3,%xmm5
        cmpl    $32,%eax
-       jbe     L083cbc_dec_two
+       jbe     L085cbc_dec_two
        movups  32(%esi),%xmm4
        cmpl    $48,%eax
-       jbe     L084cbc_dec_three
+       jbe     L086cbc_dec_three
        movups  48(%esi),%xmm5
        cmpl    $64,%eax
-       jbe     L085cbc_dec_four
+       jbe     L087cbc_dec_four
        movups  64(%esi),%xmm6
        movaps  %xmm7,(%esp)
        movups  (%esi),%xmm2
@@ -1843,55 +1909,62 @@ L078cbc_dec_tail:
        xorps   %xmm0,%xmm6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%edi)
+       pxor    %xmm5,%xmm5
        leal    64(%edi),%edi
        movaps  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
        subl    $80,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L082cbc_dec_one:
+L084cbc_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L086dec1_loop_16:
+L089dec1_loop_16:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L086dec1_loop_16
+       jnz     L089dec1_loop_16
 .byte  102,15,56,223,209
        xorps   %xmm7,%xmm2
        movaps  %xmm6,%xmm7
        subl    $16,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L083cbc_dec_two:
+L085cbc_dec_two:
        call    __aesni_decrypt2
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movaps  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leal    16(%edi),%edi
        movaps  %xmm5,%xmm7
        subl    $32,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L084cbc_dec_three:
+L086cbc_dec_three:
        call    __aesni_decrypt3
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        xorps   %xmm5,%xmm4
        movups  %xmm2,(%edi)
        movaps  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        leal    32(%edi),%edi
        movups  32(%esi),%xmm7
        subl    $48,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L085cbc_dec_four:
+L087cbc_dec_four:
        call    __aesni_decrypt4
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm0
@@ -1901,28 +1974,44 @@ L085cbc_dec_four:
        movups  %xmm2,(%edi)
        xorps   %xmm1,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        leal    48(%edi),%edi
        movaps  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        subl    $64,%eax
-L081cbc_dec_tail_collected:
+       jmp     L088cbc_dec_tail_collected
+.align 4,0x90
+L083cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+L088cbc_dec_tail_collected:
        andl    $15,%eax
-       jnz     L087cbc_dec_tail_partial
+       jnz     L090cbc_dec_tail_partial
        movups  %xmm2,(%edi)
-       jmp     L077cbc_ret
+       pxor    %xmm0,%xmm0
+       jmp     L079cbc_ret
 .align 4,0x90
-L087cbc_dec_tail_partial:
+L090cbc_dec_tail_partial:
        movaps  %xmm2,(%esp)
+       pxor    %xmm0,%xmm0
        movl    $16,%ecx
        movl    %esp,%esi
        subl    %eax,%ecx
 .long  2767451785
-L077cbc_ret:
+       movdqa  %xmm2,(%esp)
+L079cbc_ret:
        movl    16(%esp),%esp
        movl    36(%esp),%ebp
+       pxor    %xmm2,%xmm2
+       pxor    %xmm1,%xmm1
        movups  %xmm7,(%ebp)
-L072cbc_abort:
+       pxor    %xmm7,%xmm7
+L074cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -1930,52 +2019,62 @@ L072cbc_abort:
        ret
 .align 4
 __aesni_set_encrypt_key:
+       pushl   %ebp
+       pushl   %ebx
        testl   %eax,%eax
-       jz      L088bad_pointer
+       jz      L091bad_pointer
        testl   %edx,%edx
-       jz      L088bad_pointer
+       jz      L091bad_pointer
+       call    L092pic
+L092pic:
+       popl    %ebx
+       leal    Lkey_const-L092pic(%ebx),%ebx
+       movl    L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
        movups  (%eax),%xmm0
        xorps   %xmm4,%xmm4
+       movl    4(%ebp),%ebp
        leal    16(%edx),%edx
+       andl    $268437504,%ebp
        cmpl    $256,%ecx
-       je      L08914rounds
+       je      L09314rounds
        cmpl    $192,%ecx
-       je      L09012rounds
+       je      L09412rounds
        cmpl    $128,%ecx
-       jne     L091bad_keybits
+       jne     L095bad_keybits
 .align 4,0x90
-L09210rounds:
+L09610rounds:
+       cmpl    $268435456,%ebp
+       je      L09710rounds_alt
        movl    $9,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,200,1
-       call    L093key_128_cold
+       call    L098key_128_cold
 .byte  102,15,58,223,200,2
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,4
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,8
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,16
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,32
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,64
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,128
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,27
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,54
-       call    L094key_128
+       call    L099key_128
        movups  %xmm0,(%edx)
        movl    %ecx,80(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     L100good_key
 .align 4,0x90
-L094key_128:
+L099key_128:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
-L093key_128_cold:
+L098key_128_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -1984,38 +2083,91 @@ L093key_128_cold:
        xorps   %xmm1,%xmm0
        ret
 .align 4,0x90
-L09012rounds:
+L09710rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movl    $8,%ecx
+       movdqa  32(%ebx),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,-16(%edx)
+L101loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leal    16(%edx),%edx
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%edx)
+       movdqa  %xmm0,%xmm2
+       decl    %ecx
+       jnz     L101loop_key128
+       movdqa  48(%ebx),%xmm4
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%edx)
+       movl    $9,%ecx
+       movl    %ecx,96(%edx)
+       jmp     L100good_key
+.align 4,0x90
+L09412rounds:
        movq    16(%eax),%xmm2
+       cmpl    $268435456,%ebp
+       je      L10212rounds_alt
        movl    $11,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    L095key_192a_cold
+       call    L103key_192a_cold
 .byte  102,15,58,223,202,2
-       call    L096key_192b
+       call    L104key_192b
 .byte  102,15,58,223,202,4
-       call    L097key_192a
+       call    L105key_192a
 .byte  102,15,58,223,202,8
-       call    L096key_192b
+       call    L104key_192b
 .byte  102,15,58,223,202,16
-       call    L097key_192a
+       call    L105key_192a
 .byte  102,15,58,223,202,32
-       call    L096key_192b
+       call    L104key_192b
 .byte  102,15,58,223,202,64
-       call    L097key_192a
+       call    L105key_192a
 .byte  102,15,58,223,202,128
-       call    L096key_192b
+       call    L104key_192b
        movups  %xmm0,(%edx)
        movl    %ecx,48(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     L100good_key
 .align 4,0x90
-L097key_192a:
+L105key_192a:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
 .align 4,0x90
-L095key_192a_cold:
+L103key_192a_cold:
        movaps  %xmm2,%xmm5
-L098key_192b_warm:
+L106key_192b_warm:
        shufps  $16,%xmm0,%xmm4
        movdqa  %xmm2,%xmm3
        xorps   %xmm4,%xmm0
@@ -2029,56 +2181,90 @@ L098key_192b_warm:
        pxor    %xmm3,%xmm2
        ret
 .align 4,0x90
-L096key_192b:
+L104key_192b:
        movaps  %xmm0,%xmm3
        shufps  $68,%xmm0,%xmm5
        movups  %xmm5,(%edx)
        shufps  $78,%xmm2,%xmm3
        movups  %xmm3,16(%edx)
        leal    32(%edx),%edx
-       jmp     L098key_192b_warm
+       jmp     L106key_192b_warm
 .align 4,0x90
-L08914rounds:
+L10212rounds_alt:
+       movdqa  16(%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $8,%ecx
+       movdqu  %xmm0,-16(%edx)
+L107loop_key192:
+       movq    %xmm2,(%edx)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leal    24(%edx),%edx
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%edx)
+       decl    %ecx
+       jnz     L107loop_key192
+       movl    $11,%ecx
+       movl    %ecx,32(%edx)
+       jmp     L100good_key
+.align 4,0x90
+L09314rounds:
        movups  16(%eax),%xmm2
-       movl    $13,%ecx
        leal    16(%edx),%edx
+       cmpl    $268435456,%ebp
+       je      L10814rounds_alt
+       movl    $13,%ecx
        movups  %xmm0,-32(%edx)
        movups  %xmm2,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    L099key_256a_cold
+       call    L109key_256a_cold
 .byte  102,15,58,223,200,1
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,2
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,2
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,4
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,4
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,8
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,8
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,16
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,16
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,32
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,32
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,64
-       call    L101key_256a
+       call    L111key_256a
        movups  %xmm0,(%edx)
        movl    %ecx,16(%edx)
        xorl    %eax,%eax
-       ret
+       jmp     L100good_key
 .align 4,0x90
-L101key_256a:
+L111key_256a:
        movups  %xmm2,(%edx)
        leal    16(%edx),%edx
-L099key_256a_cold:
+L109key_256a_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2087,7 +2273,7 @@ L099key_256a_cold:
        xorps   %xmm1,%xmm0
        ret
 .align 4,0x90
-L100key_256b:
+L110key_256b:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
        shufps  $16,%xmm2,%xmm4
@@ -2097,13 +2283,70 @@ L100key_256b:
        shufps  $170,%xmm1,%xmm1
        xorps   %xmm1,%xmm2
        ret
+.align 4,0x90
+L10814rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $7,%ecx
+       movdqu  %xmm0,-32(%edx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,-16(%edx)
+L112loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       decl    %ecx
+       jz      L113done_key256
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%edx)
+       leal    32(%edx),%edx
+       movdqa  %xmm2,%xmm1
+       jmp     L112loop_key256
+L113done_key256:
+       movl    $13,%ecx
+       movl    %ecx,16(%edx)
+L100good_key:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorl    %eax,%eax
+       popl    %ebx
+       popl    %ebp
+       ret
 .align 2,0x90
-L088bad_pointer:
+L091bad_pointer:
        movl    $-1,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .align 2,0x90
-L091bad_keybits:
+L095bad_keybits:
+       pxor    %xmm0,%xmm0
        movl    $-2,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .globl _aesni_set_encrypt_key
 .align 4
@@ -2125,7 +2368,7 @@ L_aesni_set_decrypt_key_begin:
        movl    12(%esp),%edx
        shll    $4,%ecx
        testl   %eax,%eax
-       jnz     L102dec_key_ret
+       jnz     L114dec_key_ret
        leal    16(%edx,%ecx,1),%eax
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
@@ -2133,7 +2376,7 @@ L_aesni_set_decrypt_key_begin:
        movups  %xmm1,(%edx)
        leal    16(%edx),%edx
        leal    -16(%eax),%eax
-L103dec_key_inverse:
+L115dec_key_inverse:
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
 .byte  102,15,56,219,192
@@ -2143,14 +2386,27 @@ L103dec_key_inverse:
        movups  %xmm0,16(%eax)
        movups  %xmm1,-16(%edx)
        cmpl    %edx,%eax
-       ja      L103dec_key_inverse
+       ja      L115dec_key_inverse
        movups  (%edx),%xmm0
 .byte  102,15,56,219,192
        movups  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        xorl    %eax,%eax
-L102dec_key_ret:
+L114dec_key_ret:
        ret
+.align 6,0x90
+Lkey_const:
+.long  202313229,202313229,202313229,202313229
+.long  67569157,67569157,67569157,67569157
+.long  1,1,1,1
+.long  27,27,27,27
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte  83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol       _OPENSSL_ia32cap_P
+.long  0
+.comm  _OPENSSL_ia32cap_P,16,2
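The hunks above, and the x86-win32-masm hunks that follow, make two kinds of change throughout the AES-NI code: every return path now wipes the XMM register file (and any stack scratch area) with pxor/movdqa so round keys and intermediate data do not survive the call, and the key-schedule routine gains alternative paths (L09710rounds_alt, L10212rounds_alt, L10814rounds_alt) selected at run time from the OPENSSL_ia32cap_P capability vector; the wholesale renumbering of local labels (L048 to L050 and so on) is a mechanical side effect of the inserted code. The unchanged context lines around the XTS hunks keep recomputing the tweak with a pshufd/pcmpgtd/paddq/pand/pxor sequence, which is a branch-free doubling in GF(2^128). A minimal C sketch of that operation, with a helper name of our choosing rather than anything in the generated code:

    #include <stdint.h>

    /* Multiply a 128-bit XTS tweak by x in GF(2^128): shift left one
     * bit and, if a bit fell off the top, fold in 0x87. This is what
     * the SSE pcmpgtd/pshufd/pand/paddq/pxor idiom computes without
     * a data-dependent branch. */
    static void xts_double_tweak(uint64_t t[2])
    {
        uint64_t carry = t[1] >> 63;          /* top bit of the tweak */
        t[1] = (t[1] << 1) | (t[0] >> 63);    /* 128-bit left shift */
        t[0] = (t[0] << 1) ^ (carry * 0x87);  /* reduce mod x^128+x^7+x^2+x+1 */
    }

In the assembly, pcmpgtd against an all-zero register turns each dword's sign bit into a mask, pshufd $19 routes the mask for dword 3 (the 0x87 reduction) and the mask for dword 1 (the carry between the two qwords) into place, and pand against the {0x87,0,1,0} constant (kept in %xmm3, or reloaded from 96(%esp)) selects the bits that paddq's per-qword shift cannot supply.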
index 43fdb5a..6511c21 100644 (file)
@@ -17,6 +17,7 @@ IF @Version LT 800
 ELSE
 .text$ SEGMENT ALIGN(64) 'CODE'
 ENDIF
+;EXTERN        _OPENSSL_ia32cap_P:NEAR
 ALIGN  16
 _aesni_encrypt PROC PUBLIC
 $L_aesni_encrypt_begin::
@@ -36,7 +37,10 @@ DB   102,15,56,220,209
        lea     edx,DWORD PTR 16[edx]
        jnz     $L000enc1_loop_1
 DB     102,15,56,221,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR [eax],xmm2
+       pxor    xmm2,xmm2
        ret
 _aesni_encrypt ENDP
 ALIGN  16
@@ -58,7 +62,10 @@ DB   102,15,56,222,209
        lea     edx,DWORD PTR 16[edx]
        jnz     $L001dec1_loop_2
 DB     102,15,56,223,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR [eax],xmm2
+       pxor    xmm2,xmm2
        ret
 _aesni_decrypt ENDP
 ALIGN  16
@@ -265,17 +272,15 @@ DB        102,15,56,220,217
        neg     ecx
 DB     102,15,56,220,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR [ecx*1+edx]
        add     ecx,16
-DB     102,15,56,220,233
-DB     102,15,56,220,241
-DB     102,15,56,220,249
-       movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jmp     $L_aesni_encrypt6_enter
+       jmp     $L008_aesni_encrypt6_inner
 ALIGN  16
-$L008enc6_loop:
+$L009enc6_loop:
 DB     102,15,56,220,209
 DB     102,15,56,220,217
 DB     102,15,56,220,225
+$L008_aesni_encrypt6_inner:
 DB     102,15,56,220,233
 DB     102,15,56,220,241
 DB     102,15,56,220,249
@@ -289,7 +294,7 @@ DB  102,15,56,220,232
 DB     102,15,56,220,240
 DB     102,15,56,220,248
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L008enc6_loop
+       jnz     $L009enc6_loop
 DB     102,15,56,220,209
 DB     102,15,56,220,217
 DB     102,15,56,220,225
@@ -320,17 +325,15 @@ DB        102,15,56,222,217
        neg     ecx
 DB     102,15,56,222,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR [ecx*1+edx]
        add     ecx,16
-DB     102,15,56,222,233
-DB     102,15,56,222,241
-DB     102,15,56,222,249
-       movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jmp     $L_aesni_decrypt6_enter
+       jmp     $L010_aesni_decrypt6_inner
 ALIGN  16
-$L009dec6_loop:
+$L011dec6_loop:
 DB     102,15,56,222,209
 DB     102,15,56,222,217
 DB     102,15,56,222,225
+$L010_aesni_decrypt6_inner:
 DB     102,15,56,222,233
 DB     102,15,56,222,241
 DB     102,15,56,222,249
@@ -344,7 +347,7 @@ DB  102,15,56,222,232
 DB     102,15,56,222,240
 DB     102,15,56,222,248
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L009dec6_loop
+       jnz     $L011dec6_loop
 DB     102,15,56,222,209
 DB     102,15,56,222,217
 DB     102,15,56,222,225
@@ -372,14 +375,14 @@ $L_aesni_ecb_encrypt_begin::
        mov     edx,DWORD PTR 32[esp]
        mov     ebx,DWORD PTR 36[esp]
        and     eax,-16
-       jz      $L010ecb_ret
+       jz      $L012ecb_ret
        mov     ecx,DWORD PTR 240[edx]
        test    ebx,ebx
-       jz      $L011ecb_decrypt
+       jz      $L013ecb_decrypt
        mov     ebp,edx
        mov     ebx,ecx
        cmp     eax,96
-       jb      $L012ecb_enc_tail
+       jb      $L014ecb_enc_tail
        movdqu  xmm2,XMMWORD PTR [esi]
        movdqu  xmm3,XMMWORD PTR 16[esi]
        movdqu  xmm4,XMMWORD PTR 32[esi]
@@ -388,9 +391,9 @@ $L_aesni_ecb_encrypt_begin::
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
        sub     eax,96
-       jmp     $L013ecb_enc_loop6_enter
+       jmp     $L015ecb_enc_loop6_enter
 ALIGN  16
-$L014ecb_enc_loop6:
+$L016ecb_enc_loop6:
        movups  XMMWORD PTR [edi],xmm2
        movdqu  xmm2,XMMWORD PTR [esi]
        movups  XMMWORD PTR 16[edi],xmm3
@@ -405,12 +408,12 @@ $L014ecb_enc_loop6:
        lea     edi,DWORD PTR 96[edi]
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
-$L013ecb_enc_loop6_enter:
+$L015ecb_enc_loop6_enter:
        call    __aesni_encrypt6
        mov     edx,ebp
        mov     ecx,ebx
        sub     eax,96
-       jnc     $L014ecb_enc_loop6
+       jnc     $L016ecb_enc_loop6
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
@@ -419,18 +422,18 @@ $L013ecb_enc_loop6_enter:
        movups  XMMWORD PTR 80[edi],xmm7
        lea     edi,DWORD PTR 96[edi]
        add     eax,96
-       jz      $L010ecb_ret
-$L012ecb_enc_tail:
+       jz      $L012ecb_ret
+$L014ecb_enc_tail:
        movups  xmm2,XMMWORD PTR [esi]
        cmp     eax,32
-       jb      $L015ecb_enc_one
+       jb      $L017ecb_enc_one
        movups  xmm3,XMMWORD PTR 16[esi]
-       je      $L016ecb_enc_two
+       je      $L018ecb_enc_two
        movups  xmm4,XMMWORD PTR 32[esi]
        cmp     eax,64
-       jb      $L017ecb_enc_three
+       jb      $L019ecb_enc_three
        movups  xmm5,XMMWORD PTR 48[esi]
-       je      $L018ecb_enc_four
+       je      $L020ecb_enc_four
        movups  xmm6,XMMWORD PTR 64[esi]
        xorps   xmm7,xmm7
        call    __aesni_encrypt6
@@ -439,49 +442,49 @@ $L012ecb_enc_tail:
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L015ecb_enc_one:
+$L017ecb_enc_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L019enc1_loop_3:
+$L021enc1_loop_3:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L019enc1_loop_3
+       jnz     $L021enc1_loop_3
 DB     102,15,56,221,209
        movups  XMMWORD PTR [edi],xmm2
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L016ecb_enc_two:
+$L018ecb_enc_two:
        call    __aesni_encrypt2
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L017ecb_enc_three:
+$L019ecb_enc_three:
        call    __aesni_encrypt3
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L018ecb_enc_four:
+$L020ecb_enc_four:
        call    __aesni_encrypt4
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L011ecb_decrypt:
+$L013ecb_decrypt:
        mov     ebp,edx
        mov     ebx,ecx
        cmp     eax,96
-       jb      $L020ecb_dec_tail
+       jb      $L022ecb_dec_tail
        movdqu  xmm2,XMMWORD PTR [esi]
        movdqu  xmm3,XMMWORD PTR 16[esi]
        movdqu  xmm4,XMMWORD PTR 32[esi]
@@ -490,9 +493,9 @@ $L011ecb_decrypt:
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
        sub     eax,96
-       jmp     $L021ecb_dec_loop6_enter
+       jmp     $L023ecb_dec_loop6_enter
 ALIGN  16
-$L022ecb_dec_loop6:
+$L024ecb_dec_loop6:
        movups  XMMWORD PTR [edi],xmm2
        movdqu  xmm2,XMMWORD PTR [esi]
        movups  XMMWORD PTR 16[edi],xmm3
@@ -507,12 +510,12 @@ $L022ecb_dec_loop6:
        lea     edi,DWORD PTR 96[edi]
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
-$L021ecb_dec_loop6_enter:
+$L023ecb_dec_loop6_enter:
        call    __aesni_decrypt6
        mov     edx,ebp
        mov     ecx,ebx
        sub     eax,96
-       jnc     $L022ecb_dec_loop6
+       jnc     $L024ecb_dec_loop6
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
@@ -521,18 +524,18 @@ $L021ecb_dec_loop6_enter:
        movups  XMMWORD PTR 80[edi],xmm7
        lea     edi,DWORD PTR 96[edi]
        add     eax,96
-       jz      $L010ecb_ret
-$L020ecb_dec_tail:
+       jz      $L012ecb_ret
+$L022ecb_dec_tail:
        movups  xmm2,XMMWORD PTR [esi]
        cmp     eax,32
-       jb      $L023ecb_dec_one
+       jb      $L025ecb_dec_one
        movups  xmm3,XMMWORD PTR 16[esi]
-       je      $L024ecb_dec_two
+       je      $L026ecb_dec_two
        movups  xmm4,XMMWORD PTR 32[esi]
        cmp     eax,64
-       jb      $L025ecb_dec_three
+       jb      $L027ecb_dec_three
        movups  xmm5,XMMWORD PTR 48[esi]
-       je      $L026ecb_dec_four
+       je      $L028ecb_dec_four
        movups  xmm6,XMMWORD PTR 64[esi]
        xorps   xmm7,xmm7
        call    __aesni_decrypt6
@@ -541,43 +544,51 @@ $L020ecb_dec_tail:
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L023ecb_dec_one:
+$L025ecb_dec_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L027dec1_loop_4:
+$L029dec1_loop_4:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L027dec1_loop_4
+       jnz     $L029dec1_loop_4
 DB     102,15,56,223,209
        movups  XMMWORD PTR [edi],xmm2
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L024ecb_dec_two:
+$L026ecb_dec_two:
        call    __aesni_decrypt2
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L025ecb_dec_three:
+$L027ecb_dec_three:
        call    __aesni_decrypt3
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L026ecb_dec_four:
+$L028ecb_dec_four:
        call    __aesni_decrypt4
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
-$L010ecb_ret:
+$L012ecb_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        pop     edi
        pop     esi
        pop     ebx
@@ -622,7 +633,7 @@ $L_aesni_ccm64_encrypt_blocks_begin::
        lea     edx,DWORD PTR 32[ecx*1+edx]
        sub     ebx,ecx
 DB     102,15,56,0,253
-$L028ccm64_enc_outer:
+$L030ccm64_enc_outer:
        movups  xmm0,XMMWORD PTR [ebp]
        mov     ecx,ebx
        movups  xmm6,XMMWORD PTR [esi]
@@ -631,7 +642,7 @@ $L028ccm64_enc_outer:
        xorps   xmm0,xmm6
        xorps   xmm3,xmm0
        movups  xmm0,XMMWORD PTR 32[ebp]
-$L029ccm64_enc2_loop:
+$L031ccm64_enc2_loop:
 DB     102,15,56,220,209
 DB     102,15,56,220,217
        movups  xmm1,XMMWORD PTR [ecx*1+edx]
@@ -639,7 +650,7 @@ DB  102,15,56,220,217
 DB     102,15,56,220,208
 DB     102,15,56,220,216
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L029ccm64_enc2_loop
+       jnz     $L031ccm64_enc2_loop
 DB     102,15,56,220,209
 DB     102,15,56,220,217
        paddq   xmm7,XMMWORD PTR 16[esp]
@@ -652,10 +663,18 @@ DB        102,15,56,221,216
        movups  XMMWORD PTR [edi],xmm6
 DB     102,15,56,0,213
        lea     edi,DWORD PTR 16[edi]
-       jnz     $L028ccm64_enc_outer
+       jnz     $L030ccm64_enc_outer
        mov     esp,DWORD PTR 48[esp]
        mov     edi,DWORD PTR 40[esp]
        movups  XMMWORD PTR [edi],xmm3
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        pop     edi
        pop     esi
        pop     ebx
@@ -701,12 +720,12 @@ DB        102,15,56,0,253
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L030enc1_loop_5:
+$L032enc1_loop_5:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L030enc1_loop_5
+       jnz     $L032enc1_loop_5
 DB     102,15,56,221,209
        shl     ebx,4
        mov     ecx,16
@@ -716,16 +735,16 @@ DB        102,15,56,221,209
        sub     ecx,ebx
        lea     edx,DWORD PTR 32[ebx*1+ebp]
        mov     ebx,ecx
-       jmp     $L031ccm64_dec_outer
+       jmp     $L033ccm64_dec_outer
 ALIGN  16
-$L031ccm64_dec_outer:
+$L033ccm64_dec_outer:
        xorps   xmm6,xmm2
        movdqa  xmm2,xmm7
        movups  XMMWORD PTR [edi],xmm6
        lea     edi,DWORD PTR 16[edi]
 DB     102,15,56,0,213
        sub     eax,1
-       jz      $L032ccm64_dec_break
+       jz      $L034ccm64_dec_break
        movups  xmm0,XMMWORD PTR [ebp]
        mov     ecx,ebx
        movups  xmm1,XMMWORD PTR 16[ebp]
@@ -733,7 +752,7 @@ DB  102,15,56,0,213
        xorps   xmm2,xmm0
        xorps   xmm3,xmm6
        movups  xmm0,XMMWORD PTR 32[ebp]
-$L033ccm64_dec2_loop:
+$L035ccm64_dec2_loop:
 DB     102,15,56,220,209
 DB     102,15,56,220,217
        movups  xmm1,XMMWORD PTR [ecx*1+edx]
@@ -741,7 +760,7 @@ DB  102,15,56,220,217
 DB     102,15,56,220,208
 DB     102,15,56,220,216
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L033ccm64_dec2_loop
+       jnz     $L035ccm64_dec2_loop
        movups  xmm6,XMMWORD PTR [esi]
        paddq   xmm7,XMMWORD PTR 16[esp]
 DB     102,15,56,220,209
@@ -749,9 +768,9 @@ DB  102,15,56,220,217
 DB     102,15,56,221,208
 DB     102,15,56,221,216
        lea     esi,QWORD PTR 16[esi]
-       jmp     $L031ccm64_dec_outer
+       jmp     $L033ccm64_dec_outer
 ALIGN  16
-$L032ccm64_dec_break:
+$L034ccm64_dec_break:
        mov     ecx,DWORD PTR 240[ebp]
        mov     edx,ebp
        movups  xmm0,XMMWORD PTR [edx]
@@ -759,16 +778,24 @@ $L032ccm64_dec_break:
        xorps   xmm6,xmm0
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm3,xmm6
-$L034enc1_loop_6:
+$L036enc1_loop_6:
 DB     102,15,56,220,217
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L034enc1_loop_6
+       jnz     $L036enc1_loop_6
 DB     102,15,56,221,217
        mov     esp,DWORD PTR 48[esp]
        mov     edi,DWORD PTR 40[esp]
        movups  XMMWORD PTR [edi],xmm3
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        pop     edi
        pop     esi
        pop     ebx
@@ -792,7 +819,7 @@ $L_aesni_ctr32_encrypt_blocks_begin::
        and     esp,-16
        mov     DWORD PTR 80[esp],ebp
        cmp     eax,1
-       je      $L035ctr32_one_shortcut
+       je      $L037ctr32_one_shortcut
        movdqu  xmm7,XMMWORD PTR [ebx]
        mov     DWORD PTR [esp],202182159
        mov     DWORD PTR 4[esp],134810123
@@ -830,7 +857,7 @@ DB  102,15,56,0,202
        pshufd  xmm2,xmm0,192
        pshufd  xmm3,xmm0,128
        cmp     eax,6
-       jb      $L036ctr32_tail
+       jb      $L038ctr32_tail
        pxor    xmm7,xmm6
        shl     ecx,4
        mov     ebx,16
@@ -839,9 +866,9 @@ DB  102,15,56,0,202
        sub     ebx,ecx
        lea     edx,DWORD PTR 32[ecx*1+edx]
        sub     eax,6
-       jmp     $L037ctr32_loop6
+       jmp     $L039ctr32_loop6
 ALIGN  16
-$L037ctr32_loop6:
+$L039ctr32_loop6:
        pshufd  xmm4,xmm0,64
        movdqa  xmm0,XMMWORD PTR 32[esp]
        pshufd  xmm5,xmm1,192
@@ -895,27 +922,27 @@ DB        102,15,56,0,202
        lea     edi,DWORD PTR 96[edi]
        pshufd  xmm3,xmm0,128
        sub     eax,6
-       jnc     $L037ctr32_loop6
+       jnc     $L039ctr32_loop6
        add     eax,6
-       jz      $L038ctr32_ret
+       jz      $L040ctr32_ret
        movdqu  xmm7,XMMWORD PTR [ebp]
        mov     edx,ebp
        pxor    xmm7,XMMWORD PTR 32[esp]
        mov     ecx,DWORD PTR 240[ebp]
-$L036ctr32_tail:
+$L038ctr32_tail:
        por     xmm2,xmm7
        cmp     eax,2
-       jb      $L039ctr32_one
+       jb      $L041ctr32_one
        pshufd  xmm4,xmm0,64
        por     xmm3,xmm7
-       je      $L040ctr32_two
+       je      $L042ctr32_two
        pshufd  xmm5,xmm1,192
        por     xmm4,xmm7
        cmp     eax,4
-       jb      $L041ctr32_three
+       jb      $L043ctr32_three
        pshufd  xmm6,xmm1,128
        por     xmm5,xmm7
-       je      $L042ctr32_four
+       je      $L044ctr32_four
        por     xmm6,xmm7
        call    __aesni_encrypt6
        movups  xmm1,XMMWORD PTR [esi]
@@ -933,29 +960,29 @@ $L036ctr32_tail:
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L035ctr32_one_shortcut:
+$L037ctr32_one_shortcut:
        movups  xmm2,XMMWORD PTR [ebx]
        mov     ecx,DWORD PTR 240[edx]
-$L039ctr32_one:
+$L041ctr32_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L043enc1_loop_7:
+$L045enc1_loop_7:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L043enc1_loop_7
+       jnz     $L045enc1_loop_7
 DB     102,15,56,221,209
        movups  xmm6,XMMWORD PTR [esi]
        xorps   xmm6,xmm2
        movups  XMMWORD PTR [edi],xmm6
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L040ctr32_two:
+$L042ctr32_two:
        call    __aesni_encrypt2
        movups  xmm5,XMMWORD PTR [esi]
        movups  xmm6,XMMWORD PTR 16[esi]
@@ -963,9 +990,9 @@ $L040ctr32_two:
        xorps   xmm3,xmm6
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L041ctr32_three:
+$L043ctr32_three:
        call    __aesni_encrypt3
        movups  xmm5,XMMWORD PTR [esi]
        movups  xmm6,XMMWORD PTR 16[esi]
@@ -976,9 +1003,9 @@ $L041ctr32_three:
        xorps   xmm4,xmm7
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L042ctr32_four:
+$L044ctr32_four:
        call    __aesni_encrypt4
        movups  xmm6,XMMWORD PTR [esi]
        movups  xmm7,XMMWORD PTR 16[esi]
@@ -992,7 +1019,18 @@ $L042ctr32_four:
        xorps   xmm5,xmm0
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
-$L038ctr32_ret:
+$L040ctr32_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       movdqa  XMMWORD PTR 32[esp],xmm0
+       pxor    xmm5,xmm5
+       movdqa  XMMWORD PTR 48[esp],xmm0
+       pxor    xmm6,xmm6
+       movdqa  XMMWORD PTR 64[esp],xmm0
+       pxor    xmm7,xmm7
        mov     esp,DWORD PTR 80[esp]
        pop     edi
        pop     esi
@@ -1015,12 +1053,12 @@ $L_aesni_xts_encrypt_begin::
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L044enc1_loop_8:
+$L046enc1_loop_8:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L044enc1_loop_8
+       jnz     $L046enc1_loop_8
 DB     102,15,56,221,209
        mov     esi,DWORD PTR 20[esp]
        mov     edi,DWORD PTR 24[esp]
@@ -1044,14 +1082,14 @@ DB      102,15,56,221,209
        mov     ebp,edx
        mov     ebx,ecx
        sub     eax,96
-       jc      $L045xts_enc_short
+       jc      $L047xts_enc_short
        shl     ecx,4
        mov     ebx,16
        sub     ebx,ecx
        lea     edx,DWORD PTR 32[ecx*1+edx]
-       jmp     $L046xts_enc_loop6
+       jmp     $L048xts_enc_loop6
 ALIGN  16
-$L046xts_enc_loop6:
+$L048xts_enc_loop6:
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  XMMWORD PTR [esp],xmm1
@@ -1140,23 +1178,23 @@ DB      102,15,56,220,249
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        sub     eax,96
-       jnc     $L046xts_enc_loop6
+       jnc     $L048xts_enc_loop6
        mov     ecx,DWORD PTR 240[ebp]
        mov     edx,ebp
        mov     ebx,ecx
-$L045xts_enc_short:
+$L047xts_enc_short:
        add     eax,96
-       jz      $L047xts_enc_done6x
+       jz      $L049xts_enc_done6x
        movdqa  xmm5,xmm1
        cmp     eax,32
-       jb      $L048xts_enc_one
+       jb      $L050xts_enc_one
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        paddq   xmm1,xmm1
        pand    xmm2,xmm3
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
-       je      $L049xts_enc_two
+       je      $L051xts_enc_two
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm6,xmm1
@@ -1165,7 +1203,7 @@ $L045xts_enc_short:
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        cmp     eax,64
-       jb      $L050xts_enc_three
+       jb      $L052xts_enc_three
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm7,xmm1
@@ -1175,7 +1213,7 @@ $L045xts_enc_short:
        pxor    xmm1,xmm2
        movdqa  XMMWORD PTR [esp],xmm5
        movdqa  XMMWORD PTR 16[esp],xmm6
-       je      $L051xts_enc_four
+       je      $L053xts_enc_four
        movdqa  XMMWORD PTR 32[esp],xmm7
        pshufd  xmm7,xmm0,19
        movdqa  XMMWORD PTR 48[esp],xmm1
@@ -1207,9 +1245,9 @@ $L045xts_enc_short:
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
        lea     edi,DWORD PTR 80[edi]
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L048xts_enc_one:
+$L050xts_enc_one:
        movups  xmm2,XMMWORD PTR [esi]
        lea     esi,DWORD PTR 16[esi]
        xorps   xmm2,xmm5
@@ -1217,20 +1255,20 @@ $L048xts_enc_one:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L053enc1_loop_9:
+$L055enc1_loop_9:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L053enc1_loop_9
+       jnz     $L055enc1_loop_9
 DB     102,15,56,221,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
        movdqa  xmm1,xmm5
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L049xts_enc_two:
+$L051xts_enc_two:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1244,9 +1282,9 @@ $L049xts_enc_two:
        movups  XMMWORD PTR 16[edi],xmm3
        lea     edi,DWORD PTR 32[edi]
        movdqa  xmm1,xmm6
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L050xts_enc_three:
+$L052xts_enc_three:
        movaps  xmm7,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1264,9 +1302,9 @@ $L050xts_enc_three:
        movups  XMMWORD PTR 32[edi],xmm4
        lea     edi,DWORD PTR 48[edi]
        movdqa  xmm1,xmm7
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L051xts_enc_four:
+$L053xts_enc_four:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1288,28 +1326,28 @@ $L051xts_enc_four:
        movups  XMMWORD PTR 48[edi],xmm5
        lea     edi,DWORD PTR 64[edi]
        movdqa  xmm1,xmm6
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L047xts_enc_done6x:
+$L049xts_enc_done6x:
        mov     eax,DWORD PTR 112[esp]
        and     eax,15
-       jz      $L054xts_enc_ret
+       jz      $L056xts_enc_ret
        movdqa  xmm5,xmm1
        mov     DWORD PTR 112[esp],eax
-       jmp     $L055xts_enc_steal
+       jmp     $L057xts_enc_steal
 ALIGN  16
-$L052xts_enc_done:
+$L054xts_enc_done:
        mov     eax,DWORD PTR 112[esp]
        pxor    xmm0,xmm0
        and     eax,15
-       jz      $L054xts_enc_ret
+       jz      $L056xts_enc_ret
        pcmpgtd xmm0,xmm1
        mov     DWORD PTR 112[esp],eax
        pshufd  xmm5,xmm0,19
        paddq   xmm1,xmm1
        pand    xmm5,XMMWORD PTR 96[esp]
        pxor    xmm5,xmm1
-$L055xts_enc_steal:
+$L057xts_enc_steal:
        movzx   ecx,BYTE PTR [esi]
        movzx   edx,BYTE PTR [edi-16]
        lea     esi,DWORD PTR 1[esi]
@@ -1317,7 +1355,7 @@ $L055xts_enc_steal:
        mov     BYTE PTR [edi],dl
        lea     edi,DWORD PTR 1[edi]
        sub     eax,1
-       jnz     $L055xts_enc_steal
+       jnz     $L057xts_enc_steal
        sub     edi,DWORD PTR 112[esp]
        mov     edx,ebp
        mov     ecx,ebx
@@ -1327,16 +1365,30 @@ $L055xts_enc_steal:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L056enc1_loop_10:
+$L058enc1_loop_10:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L056enc1_loop_10
+       jnz     $L058enc1_loop_10
 DB     102,15,56,221,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi-16],xmm2
-$L054xts_enc_ret:
+$L056xts_enc_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       movdqa  XMMWORD PTR [esp],xmm0
+       pxor    xmm3,xmm3
+       movdqa  XMMWORD PTR 16[esp],xmm0
+       pxor    xmm4,xmm4
+       movdqa  XMMWORD PTR 32[esp],xmm0
+       pxor    xmm5,xmm5
+       movdqa  XMMWORD PTR 48[esp],xmm0
+       pxor    xmm6,xmm6
+       movdqa  XMMWORD PTR 64[esp],xmm0
+       pxor    xmm7,xmm7
+       movdqa  XMMWORD PTR 80[esp],xmm0
        mov     esp,DWORD PTR 116[esp]
        pop     edi
        pop     esi
@@ -1359,12 +1411,12 @@ $L_aesni_xts_decrypt_begin::
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L057enc1_loop_11:
+$L059enc1_loop_11:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L057enc1_loop_11
+       jnz     $L059enc1_loop_11
 DB     102,15,56,221,209
        mov     esi,DWORD PTR 20[esp]
        mov     edi,DWORD PTR 24[esp]
@@ -1393,14 +1445,14 @@ DB      102,15,56,221,209
        pcmpgtd xmm0,xmm1
        and     eax,-16
        sub     eax,96
-       jc      $L058xts_dec_short
+       jc      $L060xts_dec_short
        shl     ecx,4
        mov     ebx,16
        sub     ebx,ecx
        lea     edx,DWORD PTR 32[ecx*1+edx]
-       jmp     $L059xts_dec_loop6
+       jmp     $L061xts_dec_loop6
 ALIGN  16
-$L059xts_dec_loop6:
+$L061xts_dec_loop6:
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  XMMWORD PTR [esp],xmm1
@@ -1489,23 +1541,23 @@ DB      102,15,56,222,249
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        sub     eax,96
-       jnc     $L059xts_dec_loop6
+       jnc     $L061xts_dec_loop6
        mov     ecx,DWORD PTR 240[ebp]
        mov     edx,ebp
        mov     ebx,ecx
-$L058xts_dec_short:
+$L060xts_dec_short:
        add     eax,96
-       jz      $L060xts_dec_done6x
+       jz      $L062xts_dec_done6x
        movdqa  xmm5,xmm1
        cmp     eax,32
-       jb      $L061xts_dec_one
+       jb      $L063xts_dec_one
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        paddq   xmm1,xmm1
        pand    xmm2,xmm3
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
-       je      $L062xts_dec_two
+       je      $L064xts_dec_two
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm6,xmm1
@@ -1514,7 +1566,7 @@ $L058xts_dec_short:
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        cmp     eax,64
-       jb      $L063xts_dec_three
+       jb      $L065xts_dec_three
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm7,xmm1
@@ -1524,7 +1576,7 @@ $L058xts_dec_short:
        pxor    xmm1,xmm2
        movdqa  XMMWORD PTR [esp],xmm5
        movdqa  XMMWORD PTR 16[esp],xmm6
-       je      $L064xts_dec_four
+       je      $L066xts_dec_four
        movdqa  XMMWORD PTR 32[esp],xmm7
        pshufd  xmm7,xmm0,19
        movdqa  XMMWORD PTR 48[esp],xmm1
@@ -1556,9 +1608,9 @@ $L058xts_dec_short:
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
        lea     edi,DWORD PTR 80[edi]
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L061xts_dec_one:
+$L063xts_dec_one:
        movups  xmm2,XMMWORD PTR [esi]
        lea     esi,DWORD PTR 16[esi]
        xorps   xmm2,xmm5
@@ -1566,20 +1618,20 @@ $L061xts_dec_one:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L066dec1_loop_12:
+$L068dec1_loop_12:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L066dec1_loop_12
+       jnz     $L068dec1_loop_12
 DB     102,15,56,223,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
        movdqa  xmm1,xmm5
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L062xts_dec_two:
+$L064xts_dec_two:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1593,9 +1645,9 @@ $L062xts_dec_two:
        movups  XMMWORD PTR 16[edi],xmm3
        lea     edi,DWORD PTR 32[edi]
        movdqa  xmm1,xmm6
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L063xts_dec_three:
+$L065xts_dec_three:
        movaps  xmm7,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1613,9 +1665,9 @@ $L063xts_dec_three:
        movups  XMMWORD PTR 32[edi],xmm4
        lea     edi,DWORD PTR 48[edi]
        movdqa  xmm1,xmm7
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L064xts_dec_four:
+$L066xts_dec_four:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1637,20 +1689,20 @@ $L064xts_dec_four:
        movups  XMMWORD PTR 48[edi],xmm5
        lea     edi,DWORD PTR 64[edi]
        movdqa  xmm1,xmm6
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L060xts_dec_done6x:
+$L062xts_dec_done6x:
        mov     eax,DWORD PTR 112[esp]
        and     eax,15
-       jz      $L067xts_dec_ret
+       jz      $L069xts_dec_ret
        mov     DWORD PTR 112[esp],eax
-       jmp     $L068xts_dec_only_one_more
+       jmp     $L070xts_dec_only_one_more
 ALIGN  16
-$L065xts_dec_done:
+$L067xts_dec_done:
        mov     eax,DWORD PTR 112[esp]
        pxor    xmm0,xmm0
        and     eax,15
-       jz      $L067xts_dec_ret
+       jz      $L069xts_dec_ret
        pcmpgtd xmm0,xmm1
        mov     DWORD PTR 112[esp],eax
        pshufd  xmm2,xmm0,19
@@ -1660,7 +1712,7 @@ $L065xts_dec_done:
        pand    xmm2,xmm3
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
-$L068xts_dec_only_one_more:
+$L070xts_dec_only_one_more:
        pshufd  xmm5,xmm0,19
        movdqa  xmm6,xmm1
        paddq   xmm1,xmm1
@@ -1674,16 +1726,16 @@ $L068xts_dec_only_one_more:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L069dec1_loop_13:
+$L071dec1_loop_13:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L069dec1_loop_13
+       jnz     $L071dec1_loop_13
 DB     102,15,56,223,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi],xmm2
-$L070xts_dec_steal:
+$L072xts_dec_steal:
        movzx   ecx,BYTE PTR 16[esi]
        movzx   edx,BYTE PTR [edi]
        lea     esi,DWORD PTR 1[esi]
@@ -1691,7 +1743,7 @@ $L070xts_dec_steal:
        mov     BYTE PTR 16[edi],dl
        lea     edi,DWORD PTR 1[edi]
        sub     eax,1
-       jnz     $L070xts_dec_steal
+       jnz     $L072xts_dec_steal
        sub     edi,DWORD PTR 112[esp]
        mov     edx,ebp
        mov     ecx,ebx
@@ -1701,16 +1753,30 @@ $L070xts_dec_steal:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L071dec1_loop_14:
+$L073dec1_loop_14:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L071dec1_loop_14
+       jnz     $L073dec1_loop_14
 DB     102,15,56,223,209
        xorps   xmm2,xmm6
        movups  XMMWORD PTR [edi],xmm2
-$L067xts_dec_ret:
+$L069xts_dec_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       movdqa  XMMWORD PTR [esp],xmm0
+       pxor    xmm3,xmm3
+       movdqa  XMMWORD PTR 16[esp],xmm0
+       pxor    xmm4,xmm4
+       movdqa  XMMWORD PTR 32[esp],xmm0
+       pxor    xmm5,xmm5
+       movdqa  XMMWORD PTR 48[esp],xmm0
+       pxor    xmm6,xmm6
+       movdqa  XMMWORD PTR 64[esp],xmm0
+       pxor    xmm7,xmm7
+       movdqa  XMMWORD PTR 80[esp],xmm0
        mov     esp,DWORD PTR 116[esp]
        pop     edi
        pop     esi
@@ -1734,7 +1800,7 @@ $L_aesni_cbc_encrypt_begin::
        mov     edx,DWORD PTR 32[esp]
        mov     ebp,DWORD PTR 36[esp]
        test    eax,eax
-       jz      $L072cbc_abort
+       jz      $L074cbc_abort
        cmp     DWORD PTR 40[esp],0
        xchg    ebx,esp
        movups  xmm7,XMMWORD PTR [ebp]
@@ -1742,14 +1808,14 @@ $L_aesni_cbc_encrypt_begin::
        mov     ebp,edx
        mov     DWORD PTR 16[esp],ebx
        mov     ebx,ecx
-       je      $L073cbc_decrypt
+       je      $L075cbc_decrypt
        movaps  xmm2,xmm7
        cmp     eax,16
-       jb      $L074cbc_enc_tail
+       jb      $L076cbc_enc_tail
        sub     eax,16
-       jmp     $L075cbc_enc_loop
+       jmp     $L077cbc_enc_loop
 ALIGN  16
-$L075cbc_enc_loop:
+$L077cbc_enc_loop:
        movups  xmm7,XMMWORD PTR [esi]
        lea     esi,DWORD PTR 16[esi]
        movups  xmm0,XMMWORD PTR [edx]
@@ -1757,24 +1823,25 @@ $L075cbc_enc_loop:
        xorps   xmm7,xmm0
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm7
-$L076enc1_loop_15:
+$L078enc1_loop_15:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L076enc1_loop_15
+       jnz     $L078enc1_loop_15
 DB     102,15,56,221,209
        mov     ecx,ebx
        mov     edx,ebp
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
        sub     eax,16
-       jnc     $L075cbc_enc_loop
+       jnc     $L077cbc_enc_loop
        add     eax,16
-       jnz     $L074cbc_enc_tail
+       jnz     $L076cbc_enc_tail
        movaps  xmm7,xmm2
-       jmp     $L077cbc_ret
-$L074cbc_enc_tail:
+       pxor    xmm2,xmm2
+       jmp     $L079cbc_ret
+$L076cbc_enc_tail:
        mov     ecx,eax
 DD     2767451785
        mov     ecx,16
@@ -1785,20 +1852,20 @@ DD      2868115081
        mov     ecx,ebx
        mov     esi,edi
        mov     edx,ebp
-       jmp     $L075cbc_enc_loop
+       jmp     $L077cbc_enc_loop
 ALIGN  16
-$L073cbc_decrypt:
+$L075cbc_decrypt:
        cmp     eax,80
-       jbe     $L078cbc_dec_tail
+       jbe     $L080cbc_dec_tail
        movaps  XMMWORD PTR [esp],xmm7
        sub     eax,80
-       jmp     $L079cbc_dec_loop6_enter
+       jmp     $L081cbc_dec_loop6_enter
 ALIGN  16
-$L080cbc_dec_loop6:
+$L082cbc_dec_loop6:
        movaps  XMMWORD PTR [esp],xmm0
        movups  XMMWORD PTR [edi],xmm7
        lea     edi,DWORD PTR 16[edi]
-$L079cbc_dec_loop6_enter:
+$L081cbc_dec_loop6_enter:
        movdqu  xmm2,XMMWORD PTR [esi]
        movdqu  xmm3,XMMWORD PTR 16[esi]
        movdqu  xmm4,XMMWORD PTR 32[esi]
@@ -1828,28 +1895,28 @@ $L079cbc_dec_loop6_enter:
        movups  XMMWORD PTR 64[edi],xmm6
        lea     edi,DWORD PTR 80[edi]
        sub     eax,96
-       ja      $L080cbc_dec_loop6
+       ja      $L082cbc_dec_loop6
        movaps  xmm2,xmm7
        movaps  xmm7,xmm0
        add     eax,80
-       jle     $L081cbc_dec_tail_collected
+       jle     $L083cbc_dec_clear_tail_collected
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
-$L078cbc_dec_tail:
+$L080cbc_dec_tail:
        movups  xmm2,XMMWORD PTR [esi]
        movaps  xmm6,xmm2
        cmp     eax,16
-       jbe     $L082cbc_dec_one
+       jbe     $L084cbc_dec_one
        movups  xmm3,XMMWORD PTR 16[esi]
        movaps  xmm5,xmm3
        cmp     eax,32
-       jbe     $L083cbc_dec_two
+       jbe     $L085cbc_dec_two
        movups  xmm4,XMMWORD PTR 32[esi]
        cmp     eax,48
-       jbe     $L084cbc_dec_three
+       jbe     $L086cbc_dec_three
        movups  xmm5,XMMWORD PTR 48[esi]
        cmp     eax,64
-       jbe     $L085cbc_dec_four
+       jbe     $L087cbc_dec_four
        movups  xmm6,XMMWORD PTR 64[esi]
        movaps  XMMWORD PTR [esp],xmm7
        movups  xmm2,XMMWORD PTR [esi]
@@ -1867,55 +1934,62 @@ $L078cbc_dec_tail:
        xorps   xmm6,xmm0
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR 32[edi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR 48[edi],xmm5
+       pxor    xmm5,xmm5
        lea     edi,DWORD PTR 64[edi]
        movaps  xmm2,xmm6
+       pxor    xmm6,xmm6
        sub     eax,80
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L082cbc_dec_one:
+$L084cbc_dec_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L086dec1_loop_16:
+$L089dec1_loop_16:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L086dec1_loop_16
+       jnz     $L089dec1_loop_16
 DB     102,15,56,223,209
        xorps   xmm2,xmm7
        movaps  xmm7,xmm6
        sub     eax,16
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L083cbc_dec_two:
+$L085cbc_dec_two:
        call    __aesni_decrypt2
        xorps   xmm2,xmm7
        xorps   xmm3,xmm6
        movups  XMMWORD PTR [edi],xmm2
        movaps  xmm2,xmm3
+       pxor    xmm3,xmm3
        lea     edi,DWORD PTR 16[edi]
        movaps  xmm7,xmm5
        sub     eax,32
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L084cbc_dec_three:
+$L086cbc_dec_three:
        call    __aesni_decrypt3
        xorps   xmm2,xmm7
        xorps   xmm3,xmm6
        xorps   xmm4,xmm5
        movups  XMMWORD PTR [edi],xmm2
        movaps  xmm2,xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR 16[edi],xmm3
+       pxor    xmm3,xmm3
        lea     edi,DWORD PTR 32[edi]
        movups  xmm7,XMMWORD PTR 32[esi]
        sub     eax,48
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L085cbc_dec_four:
+$L087cbc_dec_four:
        call    __aesni_decrypt4
        movups  xmm1,XMMWORD PTR 16[esi]
        movups  xmm0,XMMWORD PTR 32[esi]
@@ -1925,28 +1999,44 @@ $L085cbc_dec_four:
        movups  XMMWORD PTR [edi],xmm2
        xorps   xmm4,xmm1
        movups  XMMWORD PTR 16[edi],xmm3
+       pxor    xmm3,xmm3
        xorps   xmm5,xmm0
        movups  XMMWORD PTR 32[edi],xmm4
+       pxor    xmm4,xmm4
        lea     edi,DWORD PTR 48[edi]
        movaps  xmm2,xmm5
+       pxor    xmm5,xmm5
        sub     eax,64
-$L081cbc_dec_tail_collected:
+       jmp     $L088cbc_dec_tail_collected
+ALIGN  16
+$L083cbc_dec_clear_tail_collected:
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+$L088cbc_dec_tail_collected:
        and     eax,15
-       jnz     $L087cbc_dec_tail_partial
+       jnz     $L090cbc_dec_tail_partial
        movups  XMMWORD PTR [edi],xmm2
-       jmp     $L077cbc_ret
+       pxor    xmm0,xmm0
+       jmp     $L079cbc_ret
 ALIGN  16
-$L087cbc_dec_tail_partial:
+$L090cbc_dec_tail_partial:
        movaps  XMMWORD PTR [esp],xmm2
+       pxor    xmm0,xmm0
        mov     ecx,16
        mov     esi,esp
        sub     ecx,eax
 DD     2767451785
-$L077cbc_ret:
+       movdqa  XMMWORD PTR [esp],xmm2
+$L079cbc_ret:
        mov     esp,DWORD PTR 16[esp]
        mov     ebp,DWORD PTR 36[esp]
+       pxor    xmm2,xmm2
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR [ebp],xmm7
-$L072cbc_abort:
+       pxor    xmm7,xmm7
+$L074cbc_abort:
        pop     edi
        pop     esi
        pop     ebx
@@ -1955,52 +2045,62 @@ $L072cbc_abort:
 _aesni_cbc_encrypt ENDP
 ALIGN  16
 __aesni_set_encrypt_key        PROC PRIVATE
+       push    ebp
+       push    ebx
        test    eax,eax
-       jz      $L088bad_pointer
+       jz      $L091bad_pointer
        test    edx,edx
-       jz      $L088bad_pointer
+       jz      $L091bad_pointer
+       call    $L092pic
+$L092pic:
+       pop     ebx
+       lea     ebx,DWORD PTR ($Lkey_const-$L092pic)[ebx]
+       lea     ebp,DWORD PTR _OPENSSL_ia32cap_P
        movups  xmm0,XMMWORD PTR [eax]
        xorps   xmm4,xmm4
+       mov     ebp,DWORD PTR 4[ebp]
        lea     edx,DWORD PTR 16[edx]
+       and     ebp,268437504
        cmp     ecx,256
-       je      $L08914rounds
+       je      $L09314rounds
        cmp     ecx,192
-       je      $L09012rounds
+       je      $L09412rounds
        cmp     ecx,128
-       jne     $L091bad_keybits
+       jne     $L095bad_keybits
 ALIGN  16
-$L09210rounds:
+$L09610rounds:
+       cmp     ebp,268435456
+       je      $L09710rounds_alt
        mov     ecx,9
        movups  XMMWORD PTR [edx-16],xmm0
 DB     102,15,58,223,200,1
-       call    $L093key_128_cold
+       call    $L098key_128_cold
 DB     102,15,58,223,200,2
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,4
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,8
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,16
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,32
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,64
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,128
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,27
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,54
-       call    $L094key_128
+       call    $L099key_128
        movups  XMMWORD PTR [edx],xmm0
        mov     DWORD PTR 80[edx],ecx
-       xor     eax,eax
-       ret
+       jmp     $L100good_key
 ALIGN  16
-$L094key_128:
+$L099key_128:
        movups  XMMWORD PTR [edx],xmm0
        lea     edx,DWORD PTR 16[edx]
-$L093key_128_cold:
+$L098key_128_cold:
        shufps  xmm4,xmm0,16
        xorps   xmm0,xmm4
        shufps  xmm4,xmm0,140
@@ -2009,38 +2109,91 @@ $L093key_128_cold:
        xorps   xmm0,xmm1
        ret
 ALIGN  16
-$L09012rounds:
+$L09710rounds_alt:
+       movdqa  xmm5,XMMWORD PTR [ebx]
+       mov     ecx,8
+       movdqa  xmm4,XMMWORD PTR 32[ebx]
+       movdqa  xmm2,xmm0
+       movdqu  XMMWORD PTR [edx-16],xmm0
+$L101loop_key128:
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+       lea     edx,DWORD PTR 16[edx]
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR [edx-16],xmm0
+       movdqa  xmm2,xmm0
+       dec     ecx
+       jnz     $L101loop_key128
+       movdqa  xmm4,XMMWORD PTR 48[ebx]
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR [edx],xmm0
+       movdqa  xmm2,xmm0
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR 16[edx],xmm0
+       mov     ecx,9
+       mov     DWORD PTR 96[edx],ecx
+       jmp     $L100good_key
+ALIGN  16
+$L09412rounds:
        movq    xmm2,QWORD PTR 16[eax]
+       cmp     ebp,268435456
+       je      $L10212rounds_alt
        mov     ecx,11
        movups  XMMWORD PTR [edx-16],xmm0
 DB     102,15,58,223,202,1
-       call    $L095key_192a_cold
+       call    $L103key_192a_cold
 DB     102,15,58,223,202,2
-       call    $L096key_192b
+       call    $L104key_192b
 DB     102,15,58,223,202,4
-       call    $L097key_192a
+       call    $L105key_192a
 DB     102,15,58,223,202,8
-       call    $L096key_192b
+       call    $L104key_192b
 DB     102,15,58,223,202,16
-       call    $L097key_192a
+       call    $L105key_192a
 DB     102,15,58,223,202,32
-       call    $L096key_192b
+       call    $L104key_192b
 DB     102,15,58,223,202,64
-       call    $L097key_192a
+       call    $L105key_192a
 DB     102,15,58,223,202,128
-       call    $L096key_192b
+       call    $L104key_192b
        movups  XMMWORD PTR [edx],xmm0
        mov     DWORD PTR 48[edx],ecx
-       xor     eax,eax
-       ret
+       jmp     $L100good_key
 ALIGN  16
-$L097key_192a:
+$L105key_192a:
        movups  XMMWORD PTR [edx],xmm0
        lea     edx,DWORD PTR 16[edx]
 ALIGN  16
-$L095key_192a_cold:
+$L103key_192a_cold:
        movaps  xmm5,xmm2
-$L098key_192b_warm:
+$L106key_192b_warm:
        shufps  xmm4,xmm0,16
        movdqa  xmm3,xmm2
        xorps   xmm0,xmm4
@@ -2054,56 +2207,90 @@ $L098key_192b_warm:
        pxor    xmm2,xmm3
        ret
 ALIGN  16
-$L096key_192b:
+$L104key_192b:
        movaps  xmm3,xmm0
        shufps  xmm5,xmm0,68
        movups  XMMWORD PTR [edx],xmm5
        shufps  xmm3,xmm2,78
        movups  XMMWORD PTR 16[edx],xmm3
        lea     edx,DWORD PTR 32[edx]
-       jmp     $L098key_192b_warm
+       jmp     $L106key_192b_warm
+ALIGN  16
+$L10212rounds_alt:
+       movdqa  xmm5,XMMWORD PTR 16[ebx]
+       movdqa  xmm4,XMMWORD PTR 32[ebx]
+       mov     ecx,8
+       movdqu  XMMWORD PTR [edx-16],xmm0
+$L107loop_key192:
+       movq    QWORD PTR [edx],xmm2
+       movdqa  xmm1,xmm2
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+       pslld   xmm4,1
+       lea     edx,DWORD PTR 24[edx]
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+       pshufd  xmm3,xmm0,255
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pxor    xmm0,xmm2
+       pxor    xmm2,xmm3
+       movdqu  XMMWORD PTR [edx-16],xmm0
+       dec     ecx
+       jnz     $L107loop_key192
+       mov     ecx,11
+       mov     DWORD PTR 32[edx],ecx
+       jmp     $L100good_key
 ALIGN  16
-$L08914rounds:
+$L09314rounds:
        movups  xmm2,XMMWORD PTR 16[eax]
-       mov     ecx,13
        lea     edx,DWORD PTR 16[edx]
+       cmp     ebp,268435456
+       je      $L10814rounds_alt
+       mov     ecx,13
        movups  XMMWORD PTR [edx-32],xmm0
        movups  XMMWORD PTR [edx-16],xmm2
 DB     102,15,58,223,202,1
-       call    $L099key_256a_cold
+       call    $L109key_256a_cold
 DB     102,15,58,223,200,1
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,2
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,2
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,4
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,4
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,8
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,8
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,16
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,16
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,32
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,32
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,64
-       call    $L101key_256a
+       call    $L111key_256a
        movups  XMMWORD PTR [edx],xmm0
        mov     DWORD PTR 16[edx],ecx
        xor     eax,eax
-       ret
+       jmp     $L100good_key
 ALIGN  16
-$L101key_256a:
+$L111key_256a:
        movups  XMMWORD PTR [edx],xmm2
        lea     edx,DWORD PTR 16[edx]
-$L099key_256a_cold:
+$L109key_256a_cold:
        shufps  xmm4,xmm0,16
        xorps   xmm0,xmm4
        shufps  xmm4,xmm0,140
@@ -2112,7 +2299,7 @@ $L099key_256a_cold:
        xorps   xmm0,xmm1
        ret
 ALIGN  16
-$L100key_256b:
+$L110key_256b:
        movups  XMMWORD PTR [edx],xmm0
        lea     edx,DWORD PTR 16[edx]
        shufps  xmm4,xmm2,16
@@ -2122,13 +2309,70 @@ $L100key_256b:
        shufps  xmm1,xmm1,170
        xorps   xmm2,xmm1
        ret
+ALIGN  16
+$L10814rounds_alt:
+       movdqa  xmm5,XMMWORD PTR [ebx]
+       movdqa  xmm4,XMMWORD PTR 32[ebx]
+       mov     ecx,7
+       movdqu  XMMWORD PTR [edx-32],xmm0
+       movdqa  xmm1,xmm2
+       movdqu  XMMWORD PTR [edx-16],xmm2
+$L112loop_key256:
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+       pslld   xmm4,1
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR [edx],xmm0
+       dec     ecx
+       jz      $L113done_key256
+       pshufd  xmm2,xmm0,255
+       pxor    xmm3,xmm3
+DB     102,15,56,221,211
+       movdqa  xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm1,xmm3
+       pxor    xmm2,xmm1
+       movdqu  XMMWORD PTR 16[edx],xmm2
+       lea     edx,DWORD PTR 32[edx]
+       movdqa  xmm1,xmm2
+       jmp     $L112loop_key256
+$L113done_key256:
+       mov     ecx,13
+       mov     DWORD PTR 16[edx],ecx
+$L100good_key:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       xor     eax,eax
+       pop     ebx
+       pop     ebp
+       ret
 ALIGN  4
-$L088bad_pointer:
+$L091bad_pointer:
        mov     eax,-1
+       pop     ebx
+       pop     ebp
        ret
 ALIGN  4
-$L091bad_keybits:
+$L095bad_keybits:
+       pxor    xmm0,xmm0
        mov     eax,-2
+       pop     ebx
+       pop     ebp
        ret
 __aesni_set_encrypt_key ENDP
 ALIGN  16
@@ -2150,7 +2394,7 @@ $L_aesni_set_decrypt_key_begin::
        mov     edx,DWORD PTR 12[esp]
        shl     ecx,4
        test    eax,eax
-       jnz     $L102dec_key_ret
+       jnz     $L114dec_key_ret
        lea     eax,DWORD PTR 16[ecx*1+edx]
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR [eax]
@@ -2158,7 +2402,7 @@ $L_aesni_set_decrypt_key_begin::
        movups  XMMWORD PTR [edx],xmm1
        lea     edx,DWORD PTR 16[edx]
        lea     eax,DWORD PTR [eax-16]
-$L103dec_key_inverse:
+$L115dec_key_inverse:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR [eax]
 DB     102,15,56,219,192
@@ -2168,17 +2412,28 @@ DB      102,15,56,219,201
        movups  XMMWORD PTR 16[eax],xmm0
        movups  XMMWORD PTR [edx-16],xmm1
        cmp     eax,edx
-       ja      $L103dec_key_inverse
+       ja      $L115dec_key_inverse
        movups  xmm0,XMMWORD PTR [edx]
 DB     102,15,56,219,192
        movups  XMMWORD PTR [edx],xmm0
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        xor     eax,eax
-$L102dec_key_ret:
+$L114dec_key_ret:
        ret
 _aesni_set_decrypt_key ENDP
+ALIGN  64
+$Lkey_const::
+DD     202313229,202313229,202313229,202313229
+DD     67569157,67569157,67569157,67569157
+DD     1,1,1,1
+DD     27,27,27,27
 DB     65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 DB     83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 DB     32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 DB     115,108,46,111,114,103,62,0
 .text$ ENDS
+.bss   SEGMENT 'BSS'
+COMM   _OPENSSL_ia32cap_P:DWORD:4
+.bss   ENDS
 END
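
The bulk of the new lines in the aesni-x86 hunks above are defensive: every return path now scrubs the XMM registers and the stack scratch area that held round keys or tweak values (the added runs of pxor and movdqa-to-[esp]), and __aesni_set_encrypt_key grew an OPENSSL_ia32cap_P-gated alternate key-schedule path plus a shared $L100good_key exit that zeroes xmm0-xmm5 before returning. A minimal C sketch of the scrubbing idea only, using a hypothetical helper (OpenSSL itself uses OPENSSL_cleanse for this purpose; nothing below is the actual OpenSSL code):

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical scrub helper: a plain memset can be optimized away,
     * so write through a volatile pointer to force the stores. */
    static void scrub(void *p, size_t n)
    {
        volatile uint8_t *v = (volatile uint8_t *)p;
        while (n--)
            *v++ = 0;
    }

    /* Illustrative only: key expansion with scratch wiped on every
     * exit, mirroring the pxor/movdqa runs added at $L069xts_dec_ret
     * and $L100good_key above. */
    int expand_key_sketch(const uint8_t *user_key, size_t bits,
                          uint8_t *out)
    {
        uint8_t schedule[15 * 16];          /* stack scratch, like [esp] */
        if (user_key == NULL || out == NULL)
            return -1;                      /* cf. $L091bad_pointer */
        if (bits != 128 && bits != 192 && bits != 256)
            return -2;                      /* cf. $L095bad_keybits */
        /* ... expand into schedule, copy round keys to out ... */
        scrub(schedule, sizeof(schedule));  /* wipe before returning */
        return 0;
    }

In the assembly the same effect is achieved register-by-register, since the XMM state and the [esp] scratch area are the only places the sensitive material lived.
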
index 732ba3d..fd979d0 100644 (file)
@@ -230,17 +230,17 @@ aes_v8_encrypt:
 
 .Loop_enc:
        .byte   0x00,0x43,0xb0,0xf3     @ aese q2,q0
-       vld1.32 {q0},[r2]!
        .byte   0x84,0x43,0xb0,0xf3     @ aesmc q2,q2
+       vld1.32 {q0},[r2]!
        subs    r3,r3,#2
        .byte   0x02,0x43,0xb0,0xf3     @ aese q2,q1
-       vld1.32 {q1},[r2]!
        .byte   0x84,0x43,0xb0,0xf3     @ aesmc q2,q2
+       vld1.32 {q1},[r2]!
        bgt     .Loop_enc
 
        .byte   0x00,0x43,0xb0,0xf3     @ aese q2,q0
-       vld1.32 {q0},[r2]
        .byte   0x84,0x43,0xb0,0xf3     @ aesmc q2,q2
+       vld1.32 {q0},[r2]
        .byte   0x02,0x43,0xb0,0xf3     @ aese q2,q1
        veor    q2,q2,q0
 
@@ -259,17 +259,17 @@ aes_v8_decrypt:
 
 .Loop_dec:
        .byte   0x40,0x43,0xb0,0xf3     @ aesd q2,q0
-       vld1.32 {q0},[r2]!
        .byte   0xc4,0x43,0xb0,0xf3     @ aesimc q2,q2
+       vld1.32 {q0},[r2]!
        subs    r3,r3,#2
        .byte   0x42,0x43,0xb0,0xf3     @ aesd q2,q1
-       vld1.32 {q1},[r2]!
        .byte   0xc4,0x43,0xb0,0xf3     @ aesimc q2,q2
+       vld1.32 {q1},[r2]!
        bgt     .Loop_dec
 
        .byte   0x40,0x43,0xb0,0xf3     @ aesd q2,q0
-       vld1.32 {q0},[r2]
        .byte   0xc4,0x43,0xb0,0xf3     @ aesimc q2,q2
+       vld1.32 {q0},[r2]
        .byte   0x42,0x43,0xb0,0xf3     @ aesd q2,q1
        veor    q2,q2,q0
 
@@ -313,16 +313,42 @@ aes_v8_cbc_encrypt:
        veor    q5,q8,q7
        beq     .Lcbc_enc128
 
+       vld1.32 {q2-q3},[r7]
+       add     r7,r3,#16
+       add     r6,r3,#16*4
+       add     r12,r3,#16*5
+       .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       add     r14,r3,#16*6
+       add     r3,r3,#16*7
+       b       .Lenter_cbc_enc
+
+.align 4
 .Loop_cbc_enc:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       vld1.32 {q8},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-       subs    r6,r6,#2
+        vst1.8 {q6},[r1]!
+.Lenter_cbc_enc:
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       vld1.32 {q9},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-       bgt     .Loop_cbc_enc
+       .byte   0x04,0x03,0xb0,0xf3     @ aese q0,q2
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q8},[r6]
+       cmp     r5,#4
+       .byte   0x06,0x03,0xb0,0xf3     @ aese q0,q3
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q9},[r12]
+       beq     .Lcbc_enc192
+
+       .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q8},[r14]
+       .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
+       .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       vld1.32 {q9},[r3]
+       nop
 
+.Lcbc_enc192:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
         subs   r2,r2,#16
@@ -331,7 +357,6 @@ aes_v8_cbc_encrypt:
         moveq  r8,#0
        .byte   0x24,0x03,0xb0,0xf3     @ aese q0,q10
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-        add    r7,r3,#16
        .byte   0x26,0x03,0xb0,0xf3     @ aese q0,q11
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
         vld1.8 {q8},[r0],r8
@@ -340,16 +365,14 @@ aes_v8_cbc_encrypt:
         veor   q8,q8,q5
        .byte   0x2a,0x03,0xb0,0xf3     @ aese q0,q13
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
-        vld1.32 {q9},[r7]!     @ re-pre-load rndkey[1]
+        vld1.32 {q9},[r7]              @ re-pre-load rndkey[1]
        .byte   0x2c,0x03,0xb0,0xf3     @ aese q0,q14
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
        .byte   0x2e,0x03,0xb0,0xf3     @ aese q0,q15
-
-        mov    r6,r5
        veor    q6,q0,q7
-       vst1.8  {q6},[r1]!
        bhs     .Loop_cbc_enc
 
+       vst1.8  {q6},[r1]!
        b       .Lcbc_done
 
 .align 5
@@ -407,79 +430,78 @@ aes_v8_cbc_encrypt:
 
 .Loop3x_cbc_dec:
        .byte   0x60,0x03,0xb0,0xf3     @ aesd q0,q8
-       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
-       vld1.32 {q8},[r7]!
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q8},[r7]!
        subs    r6,r6,#2
        .byte   0x62,0x03,0xb0,0xf3     @ aesd q0,q9
-       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
-       vld1.32 {q9},[r7]!
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q9},[r7]!
        bgt     .Loop3x_cbc_dec
 
        .byte   0x60,0x03,0xb0,0xf3     @ aesd q0,q8
-       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
-        veor   q4,q6,q7
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+        veor   q4,q6,q7
+        subs   r2,r2,#0x30
         veor   q5,q2,q7
+        movlo  r6,r2                   @ r6, r6, is zero at this point
        .byte   0x62,0x03,0xb0,0xf3     @ aesd q0,q9
-       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
-        veor   q9,q3,q7
-        subs   r2,r2,#0x30
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
-        vorr   q6,q11,q11
-        movlo  r6,r2                   @ r6, r6, is zero at this point
-       .byte   0x68,0x03,0xb0,0xf3     @ aesd q0,q12
-       .byte   0x68,0x23,0xb0,0xf3     @ aesd q1,q12
-       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
+        veor   q9,q3,q7
         add    r0,r0,r6                @ r0 is adjusted in such way that
                                        @ at exit from the loop q1-q10
                                        @ are loaded with last "words"
+        vorr   q6,q11,q11
+        mov    r7,r3
+       .byte   0x68,0x03,0xb0,0xf3     @ aesd q0,q12
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x68,0x23,0xb0,0xf3     @ aesd q1,q12
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
-        mov    r7,r3
-       .byte   0x6a,0x03,0xb0,0xf3     @ aesd q0,q13
-       .byte   0x6a,0x23,0xb0,0xf3     @ aesd q1,q13
-       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
         vld1.8 {q2},[r0]!
+       .byte   0x6a,0x03,0xb0,0xf3     @ aesd q0,q13
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x6a,0x23,0xb0,0xf3     @ aesd q1,q13
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         vld1.8 {q3},[r0]!
        .byte   0x6c,0x03,0xb0,0xf3     @ aesd q0,q14
-       .byte   0x6c,0x23,0xb0,0xf3     @ aesd q1,q14
-       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
-        vld1.8 {q11},[r0]!
        .byte   0xc0,0x03,0xb0,0xf3     @ aesimc q0,q0
+       .byte   0x6c,0x23,0xb0,0xf3     @ aesd q1,q14
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
-        vld1.32 {q8},[r7]!     @ re-pre-load rndkey[0]
+        vld1.8 {q11},[r0]!
        .byte   0x6e,0x03,0xb0,0xf3     @ aesd q0,q15
        .byte   0x6e,0x23,0xb0,0xf3     @ aesd q1,q15
        .byte   0x6e,0x43,0xf0,0xf3     @ aesd q10,q15
-
+        vld1.32 {q8},[r7]!     @ re-pre-load rndkey[0]
         add    r6,r5,#2
        veor    q4,q4,q0
        veor    q5,q5,q1
        veor    q10,q10,q9
         vld1.32 {q9},[r7]!     @ re-pre-load rndkey[1]
-        vorr   q0,q2,q2
        vst1.8  {q4},[r1]!
-        vorr   q1,q3,q3
+        vorr   q0,q2,q2
        vst1.8  {q5},[r1]!
+        vorr   q1,q3,q3
        vst1.8  {q10},[r1]!
         vorr   q10,q11,q11
        bhs     .Loop3x_cbc_dec
@@ -490,39 +512,39 @@ aes_v8_cbc_encrypt:
 
 .Lcbc_dec_tail:
        .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
-       vld1.32 {q8},[r7]!
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q8},[r7]!
        subs    r6,r6,#2
        .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
-       vld1.32 {q9},[r7]!
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
+       vld1.32 {q9},[r7]!
        bgt     .Lcbc_dec_tail
 
        .byte   0x60,0x23,0xb0,0xf3     @ aesd q1,q8
-       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x60,0x43,0xf0,0xf3     @ aesd q10,q8
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
        .byte   0x62,0x23,0xb0,0xf3     @ aesd q1,q9
-       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x62,0x43,0xf0,0xf3     @ aesd q10,q9
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
        .byte   0x68,0x23,0xb0,0xf3     @ aesd q1,q12
-       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x68,0x43,0xf0,0xf3     @ aesd q10,q12
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         cmn    r2,#0x20
        .byte   0x6a,0x23,0xb0,0xf3     @ aesd q1,q13
-       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6a,0x43,0xf0,0xf3     @ aesd q10,q13
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         veor   q5,q6,q7
        .byte   0x6c,0x23,0xb0,0xf3     @ aesd q1,q14
-       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
        .byte   0xc2,0x23,0xb0,0xf3     @ aesimc q1,q1
+       .byte   0x6c,0x43,0xf0,0xf3     @ aesd q10,q14
        .byte   0xe4,0x43,0xf0,0xf3     @ aesimc q10,q10
         veor   q9,q3,q7
        .byte   0x6e,0x23,0xb0,0xf3     @ aesd q1,q15
@@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks:
 .align 4
 .Loop3x_ctr32:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
-       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
-       vld1.32         {q8},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
        .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
+       vld1.32         {q8},[r7]!
        subs            r6,r6,#2
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
-       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
-       vld1.32         {q9},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
        .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
+       vld1.32         {q9},[r7]!
        bgt             .Loop3x_ctr32
 
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
-       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
-        mov            r7,r3
        .byte   0x80,0x83,0xb0,0xf3     @ aesmc q4,q0
-        vld1.8         {q2},[r0]!
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0xa3,0xb0,0xf3     @ aesmc q5,q1
-       .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
+        vld1.8         {q2},[r0]!
         vorr           q0,q6,q6
-       .byte   0x22,0x83,0xb0,0xf3     @ aese q4,q9
+       .byte   0x20,0x43,0xf0,0xf3     @ aese q10,q8
+       .byte   0xa4,0x43,0xf0,0xf3     @ aesmc q10,q10
         vld1.8         {q3},[r0]!
-       .byte   0x22,0xa3,0xb0,0xf3     @ aese q5,q9
-       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
         vorr           q1,q6,q6
+       .byte   0x22,0x83,0xb0,0xf3     @ aese q4,q9
        .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-        vld1.8         {q11},[r0]!
+       .byte   0x22,0xa3,0xb0,0xf3     @ aese q5,q9
        .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+        vld1.8         {q11},[r0]!
+        mov            r7,r3
+       .byte   0x22,0x43,0xf0,0xf3     @ aese q10,q9
        .byte   0xa4,0x23,0xf0,0xf3     @ aesmc q9,q10
         vorr           q10,q6,q6
         add            r9,r8,#1
        .byte   0x28,0x83,0xb0,0xf3     @ aese q4,q12
+       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
        .byte   0x28,0xa3,0xb0,0xf3     @ aese q5,q12
-       .byte   0x28,0x23,0xf0,0xf3     @ aese q9,q12
+       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
         veor           q2,q2,q7
         add            r10,r8,#2
-       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+       .byte   0x28,0x23,0xf0,0xf3     @ aese q9,q12
        .byte   0xa2,0x23,0xf0,0xf3     @ aesmc q9,q9
         veor           q3,q3,q7
         add            r8,r8,#3
        .byte   0x2a,0x83,0xb0,0xf3     @ aese q4,q13
+       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
        .byte   0x2a,0xa3,0xb0,0xf3     @ aese q5,q13
-       .byte   0x2a,0x23,0xf0,0xf3     @ aese q9,q13
+       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
         veor           q11,q11,q7
         rev            r9,r9
-       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-        vld1.32         {q8},[r7]!     @ re-pre-load rndkey[0]
-       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+       .byte   0x2a,0x23,0xf0,0xf3     @ aese q9,q13
        .byte   0xa2,0x23,0xf0,0xf3     @ aesmc q9,q9
         vmov.32        d1[1], r9
         rev            r10,r10
        .byte   0x2c,0x83,0xb0,0xf3     @ aese q4,q14
+       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
        .byte   0x2c,0xa3,0xb0,0xf3     @ aese q5,q14
-       .byte   0x2c,0x23,0xf0,0xf3     @ aese q9,q14
+       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
         vmov.32        d3[1], r10
         rev            r12,r8
-       .byte   0x88,0x83,0xb0,0xf3     @ aesmc q4,q4
-       .byte   0x8a,0xa3,0xb0,0xf3     @ aesmc q5,q5
+       .byte   0x2c,0x23,0xf0,0xf3     @ aese q9,q14
        .byte   0xa2,0x23,0xf0,0xf3     @ aesmc q9,q9
         vmov.32        d21[1], r12
         subs           r2,r2,#3
@@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks:
        .byte   0x2e,0xa3,0xb0,0xf3     @ aese q5,q15
        .byte   0x2e,0x23,0xf0,0xf3     @ aese q9,q15
 
-        mov            r6,r5
        veor            q2,q2,q4
+        vld1.32         {q8},[r7]!     @ re-pre-load rndkey[0]
+       vst1.8          {q2},[r1]!
        veor            q3,q3,q5
+        mov            r6,r5
+       vst1.8          {q3},[r1]!
        veor            q11,q11,q9
         vld1.32         {q9},[r7]!     @ re-pre-load rndkey[1]
-       vst1.8          {q2},[r1]!
-       vst1.8          {q3},[r1]!
        vst1.8          {q11},[r1]!
        bhs             .Loop3x_ctr32
 
@@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks:
 
 .Lctr32_tail:
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
-       vld1.32         {q8},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       vld1.32         {q8},[r7]!
        subs            r6,r6,#2
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
-       vld1.32         {q9},[r7]!
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+       vld1.32         {q9},[r7]!
        bgt             .Lctr32_tail
 
        .byte   0x20,0x03,0xb0,0xf3     @ aese q0,q8
-       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x20,0x23,0xb0,0xf3     @ aese q1,q8
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
        .byte   0x22,0x03,0xb0,0xf3     @ aese q0,q9
-       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x22,0x23,0xb0,0xf3     @ aese q1,q9
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
         vld1.8         {q2},[r0],r12
        .byte   0x28,0x03,0xb0,0xf3     @ aese q0,q12
-       .byte   0x28,0x23,0xb0,0xf3     @ aese q1,q12
-        vld1.8         {q3},[r0]
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x28,0x23,0xb0,0xf3     @ aese q1,q12
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
+        vld1.8         {q3},[r0]
        .byte   0x2a,0x03,0xb0,0xf3     @ aese q0,q13
-       .byte   0x2a,0x23,0xb0,0xf3     @ aese q1,q13
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x2a,0x23,0xb0,0xf3     @ aese q1,q13
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
-       .byte   0x2c,0x03,0xb0,0xf3     @ aese q0,q14
-       .byte   0x2c,0x23,0xb0,0xf3     @ aese q1,q14
         veor           q2,q2,q7
+       .byte   0x2c,0x03,0xb0,0xf3     @ aese q0,q14
        .byte   0x80,0x03,0xb0,0xf3     @ aesmc q0,q0
+       .byte   0x2c,0x23,0xb0,0xf3     @ aese q1,q14
        .byte   0x82,0x23,0xb0,0xf3     @ aesmc q1,q1
         veor           q3,q3,q7
        .byte   0x2e,0x03,0xb0,0xf3     @ aese q0,q15
index d321235..c54f514 100644 (file)
@@ -495,7 +495,7 @@ gcm_ghash_neon:
        veor            q10,q10,q9              @
        vshl.i64        q9,q0,#63
        veor            q10, q10, q9            @
-       veor            d1,d1,d20       @
+       veor            d1,d1,d20       @
        veor            d4,d4,d21
 
        vshr.u64        q10,q0,#1               @ 2nd phase
index 570d917..2695749 100644 (file)
 .type  gcm_init_v8,%function
 .align 4
 gcm_init_v8:
-       vld1.64         {q9},[r1]               @ load H
-       vmov.i8         q8,#0xe1
+       vld1.64         {q9},[r1]               @ load input H
+       vmov.i8         q11,#0xe1
+       vshl.i64        q11,q11,#57             @ 0xc2.0
        vext.8          q3,q9,q9,#8
-       vshl.i64        q8,q8,#57
-       vshr.u64        q10,q8,#63
-       vext.8          q8,q10,q8,#8            @ t0=0xc2....01
+       vshr.u64        q10,q11,#63
        vdup.32 q9,d18[1]
-       vshr.u64        q11,q3,#63
+       vext.8          q8,q10,q11,#8           @ t0=0xc2....01
+       vshr.u64        q10,q3,#63
        vshr.s32        q9,q9,#31               @ broadcast carry bit
-       vand            q11,q11,q8
+       vand            q10,q10,q8
        vshl.i64        q3,q3,#1
-       vext.8          q11,q11,q11,#8
+       vext.8          q10,q10,q10,#8
        vand            q8,q8,q9
-       vorr            q3,q3,q11               @ H<<<=1
-       veor            q3,q3,q8                @ twisted H
-       vst1.64         {q3},[r0]
+       vorr            q3,q3,q10               @ H<<<=1
+       veor            q12,q3,q8               @ twisted H
+       vst1.64         {q12},[r0]!             @ store Htable[0]
+
+       @ calculate H^2
+       vext.8          q8,q12,q12,#8           @ Karatsuba pre-processing
+       .byte   0xa8,0x0e,0xa8,0xf2     @ pmull q0,q12,q12
+       veor            q8,q8,q12
+       .byte   0xa9,0x4e,0xa9,0xf2     @ pmull2 q2,q12,q12
+       .byte   0xa0,0x2e,0xa0,0xf2     @ pmull q1,q8,q8
+
+       vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
+       veor            q10,q0,q2
+       veor            q1,q1,q9
+       veor            q1,q1,q10
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase
+
+       vmov            d4,d3           @ Xh|Xm - 256-bit result
+       vmov            d3,d0           @ Xm is rotated Xl
+       veor            q0,q1,q10
+
+       vext.8          q10,q0,q0,#8            @ 2nd phase
+       .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
+       veor            q10,q10,q2
+       veor            q14,q0,q10
+
+       vext.8          q9,q14,q14,#8           @ Karatsuba pre-processing
+       veor            q9,q9,q14
+       vext.8          q13,q8,q9,#8            @ pack Karatsuba pre-processed
+       vst1.64         {q13-q14},[r0]          @ store Htable[1..2]
 
        bx      lr
 .size  gcm_init_v8,.-gcm_init_v8
-
 .global        gcm_gmult_v8
 .type  gcm_gmult_v8,%function
 .align 4
 gcm_gmult_v8:
        vld1.64         {q9},[r0]               @ load Xi
        vmov.i8         q11,#0xe1
-       vld1.64         {q12},[r1]              @ load twisted H
+       vld1.64         {q12-q13},[r1]  @ load twisted H, ...
        vshl.u64        q11,q11,#57
 #ifndef __ARMEB__
        vrev64.8        q9,q9
 #endif
-       vext.8          q13,q12,q12,#8
-       mov             r3,#0
        vext.8          q3,q9,q9,#8
-       mov             r12,#0
-       veor            q13,q13,q12             @ Karatsuba pre-processing
-       mov             r2,r0
-       b               .Lgmult_v8
-.size  gcm_gmult_v8,.-gcm_gmult_v8
 
+       .byte   0x86,0x0e,0xa8,0xf2     @ pmull q0,q12,q3               @ H.lo·Xi.lo
+       veor            q9,q9,q3                @ Karatsuba pre-processing
+       .byte   0x87,0x4e,0xa9,0xf2     @ pmull2 q2,q12,q3              @ H.hi·Xi.hi
+       .byte   0xa2,0x2e,0xaa,0xf2     @ pmull q1,q13,q9               @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+       vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
+       veor            q10,q0,q2
+       veor            q1,q1,q9
+       veor            q1,q1,q10
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase of reduction
+
+       vmov            d4,d3           @ Xh|Xm - 256-bit result
+       vmov            d3,d0           @ Xm is rotated Xl
+       veor            q0,q1,q10
+
+       vext.8          q10,q0,q0,#8            @ 2nd phase of reduction
+       .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
+       veor            q10,q10,q2
+       veor            q0,q0,q10
+
+#ifndef __ARMEB__
+       vrev64.8        q0,q0
+#endif
+       vext.8          q0,q0,q0,#8
+       vst1.64         {q0},[r0]               @ write out Xi
+
+       bx      lr
+.size  gcm_gmult_v8,.-gcm_gmult_v8
 .global        gcm_ghash_v8
 .type  gcm_ghash_v8,%function
 .align 4
 gcm_ghash_v8:
+       vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
        vld1.64         {q0},[r0]               @ load [rotated] Xi
-       subs            r3,r3,#16
+                                               @ "[rotated]" means that
+                                               @ loaded value would have
+                                               @ to be rotated in order to
+                                               @ make it appear as in
+                                               @ algorithm specification
+       subs            r3,r3,#32               @ see if r3 is 32 or larger
+       mov             r12,#16         @ r12 is used as post-
+                                               @ increment for input pointer;
+                                               @ as loop is modulo-scheduled
+                                               @ r12 is zeroed just in time
+                                               @ to preclude overstepping
+                                               @ inp[len], which means that
+                                               @ last block[s] are actually
+                                               @ loaded twice, but last
+                                               @ copy is not processed
+       vld1.64         {q12-q13},[r1]! @ load twisted H, ..., H^2
        vmov.i8         q11,#0xe1
-       mov             r12,#16
-       vld1.64         {q12},[r1]              @ load twisted H
-       moveq   r12,#0
-       vext.8          q0,q0,q0,#8
-       vshl.u64        q11,q11,#57
-       vld1.64         {q9},[r2],r12   @ load [rotated] inp
-       vext.8          q13,q12,q12,#8
+       vld1.64         {q14},[r1]
+       moveq   r12,#0                  @ is it time to zero r12?
+       vext.8          q0,q0,q0,#8             @ rotate Xi
+       vld1.64         {q8},[r2]!      @ load [rotated] I[0]
+       vshl.u64        q11,q11,#57             @ compose 0xc2.0 constant
 #ifndef __ARMEB__
+       vrev64.8        q8,q8
        vrev64.8        q0,q0
+#endif
+       vext.8          q3,q8,q8,#8             @ rotate I[0]
+       blo             .Lodd_tail_v8           @ r3 was less than 32
+       vld1.64         {q9},[r2],r12   @ load [rotated] I[1]
+#ifndef __ARMEB__
        vrev64.8        q9,q9
 #endif
-       veor            q13,q13,q12             @ Karatsuba pre-processing
-       vext.8          q3,q9,q9,#8
-       b               .Loop_v8
+       vext.8          q7,q9,q9,#8
+       veor            q3,q3,q0                @ I[i]^=Xi
+       .byte   0x8e,0x8e,0xa8,0xf2     @ pmull q4,q12,q7               @ H·Ii+1
+       veor            q9,q9,q7                @ Karatsuba pre-processing
+       .byte   0x8f,0xce,0xa9,0xf2     @ pmull2 q6,q12,q7
+       b               .Loop_mod2x_v8
 
 .align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+       vext.8          q10,q3,q3,#8
+       subs            r3,r3,#32               @ is there more data?
+       .byte   0x86,0x0e,0xac,0xf2     @ pmull q0,q14,q3               @ H^2.lo·Xi.lo
+       movlo   r12,#0                  @ is it time to zero r12?
+
+        .byte  0xa2,0xae,0xaa,0xf2     @ pmull q5,q13,q9
+       veor            q10,q10,q3              @ Karatsuba pre-processing
+       .byte   0x87,0x4e,0xad,0xf2     @ pmull2 q2,q14,q3              @ H^2.hi·Xi.hi
+       veor            q0,q0,q4                @ accumulate
+       .byte   0xa5,0x2e,0xab,0xf2     @ pmull2 q1,q13,q10             @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+        vld1.64        {q8},[r2],r12   @ load [rotated] I[i+2]
+
+       veor            q2,q2,q6
+        moveq  r12,#0                  @ is it time to zero r12?
+       veor            q1,q1,q5
+
+       vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
+       veor            q10,q0,q2
+       veor            q1,q1,q9
+        vld1.64        {q9},[r2],r12   @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+        vrev64.8       q8,q8
+#endif
+       veor            q1,q1,q10
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase of reduction
+
+#ifndef __ARMEB__
+        vrev64.8       q9,q9
+#endif
+       vmov            d4,d3           @ Xh|Xm - 256-bit result
+       vmov            d3,d0           @ Xm is rotated Xl
+        vext.8         q7,q9,q9,#8
+        vext.8         q3,q8,q8,#8
+       veor            q0,q1,q10
+        .byte  0x8e,0x8e,0xa8,0xf2     @ pmull q4,q12,q7               @ H·Ii+1
+       veor            q3,q3,q2                @ accumulate q3 early
+
+       vext.8          q10,q0,q0,#8            @ 2nd phase of reduction
+       .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
+       veor            q3,q3,q10
+        veor           q9,q9,q7                @ Karatsuba pre-processing
+       veor            q3,q3,q0
+        .byte  0x8f,0xce,0xa9,0xf2     @ pmull2 q6,q12,q7
+       bhs             .Loop_mod2x_v8          @ there was at least 32 more bytes
+
+       veor            q2,q2,q10
+       vext.8          q3,q8,q8,#8             @ re-construct q3
+       adds            r3,r3,#32               @ re-construct r3
+       veor            q0,q0,q2                @ re-construct q0
+       beq             .Ldone_v8               @ is r3 zero?
+.Lodd_tail_v8:
        vext.8          q10,q0,q0,#8
        veor            q3,q3,q0                @ inp^=Xi
-       veor            q9,q9,q10               @ q9 is rotated inp^Xi
+       veor            q9,q8,q10               @ q9 is rotated inp^Xi
 
-.Lgmult_v8:
        .byte   0x86,0x0e,0xa8,0xf2     @ pmull q0,q12,q3               @ H.lo·Xi.lo
        veor            q9,q9,q3                @ Karatsuba pre-processing
        .byte   0x87,0x4e,0xa9,0xf2     @ pmull2 q2,q12,q3              @ H.hi·Xi.hi
-       subs            r3,r3,#16
        .byte   0xa2,0x2e,0xaa,0xf2     @ pmull q1,q13,q9               @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
-       moveq   r12,#0
 
        vext.8          q9,q0,q2,#8             @ Karatsuba post-processing
        veor            q10,q0,q2
        veor            q1,q1,q9
-        vld1.64        {q9},[r2],r12   @ load [rotated] inp
        veor            q1,q1,q10
-       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase
+       .byte   0x26,0x4e,0xe0,0xf2     @ pmull q10,q0,q11              @ 1st phase of reduction
 
        vmov            d4,d3           @ Xh|Xm - 256-bit result
        vmov            d3,d0           @ Xm is rotated Xl
-#ifndef __ARMEB__
-        vrev64.8       q9,q9
-#endif
        veor            q0,q1,q10
-        vext.8         q3,q9,q9,#8
 
-       vext.8          q10,q0,q0,#8            @ 2nd phase
+       vext.8          q10,q0,q0,#8            @ 2nd phase of reduction
        .byte   0x26,0x0e,0xa0,0xf2     @ pmull q0,q0,q11
        veor            q10,q10,q2
        veor            q0,q0,q10
-       bhs             .Loop_v8
 
+.Ldone_v8:
 #ifndef __ARMEB__
        vrev64.8        q0,q0
 #endif
        vext.8          q0,q0,q0,#8
        vst1.64         {q0},[r0]               @ write out Xi
 
+       vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
        bx      lr
 .size  gcm_ghash_v8,.-gcm_ghash_v8
 .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
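
The rewritten gcm_ghash_v8 above folds two blocks per iteration against a precomputed H^2 (gcm_init_v8 now stores Htable[0..2]: twisted H, the packed Karatsuba term, and H^2), and the loop is modulo-scheduled: as its comments note, the post-increment register r12 is zeroed just in time so the loads issued one iteration ahead never step past inp[len]; the last block ends up loaded twice, but the duplicate copy is discarded. A hedged C sketch of that pointer-increment trick in isolation (names invented for illustration; assumes len is a nonzero multiple of 16):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Walk 16-byte blocks with one block of lookahead, without ever
     * reading past in + len: the stride drops to 0 on the final
     * block, the same trick the r12 post-increment plays above. */
    void ghash_walk_sketch(const uint8_t *in, size_t len,
                           void (*absorb)(const uint8_t block[16]))
    {
        size_t stride = 16;
        const uint8_t *p = in;
        uint8_t cur[16], next[16];

        if (len < 16 || (len & 15) != 0)
            return;                      /* sketch: full blocks only */
        memcpy(cur, p, 16);              /* pre-load I[0] */
        while (len) {
            len -= 16;
            if (len == 0)
                stride = 0;              /* zero the increment just in time */
            p += stride;
            memcpy(next, p, 16);         /* may re-read the final block */
            absorb(cur);                 /* consume the previous load */
            memcpy(cur, next, 16);       /* duplicate copy goes unused at exit */
        }
    }

Zeroing the stride instead of branching around the load keeps the load in flight on every iteration, which is the point of the modulo-scheduling.
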
index bf1ce4f..683f1cc 100644 (file)
@@ -1,7 +1,59 @@
-#include "arm_arch.h"
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
 
 .text
+#if __ARM_ARCH__<7
 .code  32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
 
 .type  K256,%object
 .align 5
@@ -24,7 +76,7 @@ K256:
 .word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size  K256,.-K256
 .word  0                               @ terminator
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
 .word  OPENSSL_armcap_P-sha256_block_data_order
 #endif
@@ -33,9 +85,12 @@ K256:
 .global        sha256_block_data_order
 .type  sha256_block_data_order,%function
 sha256_block_data_order:
+#if __ARM_ARCH__<7
        sub     r3,pc,#8                @ sha256_block_data_order
-       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+       adr     r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
        tst     r12,#ARMV8_SHA256
@@ -43,6 +98,7 @@ sha256_block_data_order:
        tst     r12,#ARMV7_NEON
        bne     .LNEON
 #endif
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
        stmdb   sp!,{r0,r1,r2,r4-r11,lr}
        ldmia   r0,{r4,r5,r6,r7,r8,r9,r10,r11}
        sub     r14,r3,#256+32  @ K256
@@ -1736,6 +1792,9 @@ sha256_block_data_order:
        eor     r12,r12,r6                      @ Maj(a,b,c)
        add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
        @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       ite     eq                      @ Thumb2 thing, sanity check in ARM
+#endif
        ldreq   r3,[sp,#16*4]           @ pull ctx
        bne     .Lrounds_16_xx
 
@@ -1777,16 +1836,19 @@ sha256_block_data_order:
 .arch  armv7-a
 .fpu   neon
 
+.global        sha256_block_data_order_neon
 .type  sha256_block_data_order_neon,%function
 .align 4
 sha256_block_data_order_neon:
 .LNEON:
        stmdb   sp!,{r4-r12,lr}
 
+       sub     r11,sp,#16*4+16
+       adr     r14,K256
+       bic     r11,r11,#15             @ align for 128-bit stores
        mov     r12,sp
-       sub     sp,sp,#16*4+16          @ alloca
-       sub     r14,r3,#256+32  @ K256
-       bic     sp,sp,#15               @ align for 128-bit stores
+       mov     sp,r11                  @ alloca
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
 
        vld1.8          {q0},[r1]!
        vld1.8          {q1},[r1]!
@@ -2224,11 +2286,13 @@ sha256_block_data_order_neon:
        ldr             r0,[sp,#72]
        sub             r14,r14,#256    @ rewind r14
        teq             r1,r0
+       it              eq
        subeq           r1,r1,#64               @ avoid SEGV
        vld1.8          {q0},[r1]!              @ load next input block
        vld1.8          {q1},[r1]!
        vld1.8          {q2},[r1]!
        vld1.8          {q3},[r1]!
+       it              ne
        strne           r1,[sp,#68]
        mov             r1,sp
        add     r11,r11,r2
@@ -2542,23 +2606,38 @@ sha256_block_data_order_neon:
        str     r7,[r2],#4
        stmia   r2,{r8-r11}
 
+       ittte   ne
        movne   r1,sp
        ldrne   r2,[sp,#0]
        eorne   r12,r12,r12
        ldreq   sp,[sp,#76]                     @ restore original sp
+       itt     ne
        eorne   r3,r5,r6
        bne     .L_00_48
 
        ldmia   sp!,{r4-r12,pc}
 .size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
 #endif
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d
+# endif
+
 .type  sha256_block_data_order_armv8,%function
 .align 5
 sha256_block_data_order_armv8:
 .LARMv8:
        vld1.32 {q0,q1},[r0]
-       sub     r3,r3,#sha256_block_data_order-K256
+# ifdef __thumb2__
+       adr     r3,.LARMv8
+       sub     r3,r3,#.LARMv8-K256
+# else
+       adrl    r3,K256
+# endif
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
 
 .Loop_v8:
        vld1.8          {q8-q9},[r1]!
@@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8:
        teq             r1,r2
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
-       .byte   0xe2,0x03,0xfa,0xf3     @ sha256su0 q8,q9
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe6,0x0c,0x64,0xf3     @ sha256su1 q8,q10,q11
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
-       .byte   0xe4,0x23,0xfa,0xf3     @ sha256su0 q9,q10
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe0,0x2c,0x66,0xf3     @ sha256su1 q9,q11,q8
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q10
-       .byte   0xe6,0x43,0xfa,0xf3     @ sha256su0 q10,q11
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe2,0x4c,0x60,0xf3     @ sha256su1 q10,q8,q9
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q11
-       .byte   0xe0,0x63,0xfa,0xf3     @ sha256su0 q11,q8
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe4,0x6c,0x62,0xf3     @ sha256su1 q11,q9,q10
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
-       .byte   0xe2,0x03,0xfa,0xf3     @ sha256su0 q8,q9
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe6,0x0c,0x64,0xf3     @ sha256su1 q8,q10,q11
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
-       .byte   0xe4,0x23,0xfa,0xf3     @ sha256su0 q9,q10
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe0,0x2c,0x66,0xf3     @ sha256su1 q9,q11,q8
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q10
-       .byte   0xe6,0x43,0xfa,0xf3     @ sha256su0 q10,q11
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe2,0x4c,0x60,0xf3     @ sha256su1 q10,q8,q9
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q11
-       .byte   0xe0,0x63,0xfa,0xf3     @ sha256su0 q11,q8
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe4,0x6c,0x62,0xf3     @ sha256su1 q11,q9,q10
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
-       .byte   0xe2,0x03,0xfa,0xf3     @ sha256su0 q8,q9
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe6,0x0c,0x64,0xf3     @ sha256su1 q8,q10,q11
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
-       .byte   0xe4,0x23,0xfa,0xf3     @ sha256su0 q9,q10
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe0,0x2c,0x66,0xf3     @ sha256su1 q9,q11,q8
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q10
-       .byte   0xe6,0x43,0xfa,0xf3     @ sha256su0 q10,q11
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
-       .byte   0xe2,0x4c,0x60,0xf3     @ sha256su1 q10,q8,q9
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q11
-       .byte   0xe0,0x63,0xfa,0xf3     @ sha256su0 q11,q8
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
-       .byte   0xe4,0x6c,0x62,0xf3     @ sha256su1 q11,q9,q10
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
        vld1.32         {q13},[r3]!
        vadd.i32        q12,q12,q8
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
 
        vld1.32         {q12},[r3]!
        vadd.i32        q13,q13,q9
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
 
        vld1.32         {q13},[r3]
        vadd.i32        q12,q12,q10
        sub             r3,r3,#256-16   @ rewind
        vmov            q2,q0
-       .byte   0x68,0x0c,0x02,0xf3     @ sha256h q0,q1,q12
-       .byte   0x68,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q12
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
 
        vadd.i32        q13,q13,q11
        vmov            q2,q0
-       .byte   0x6a,0x0c,0x02,0xf3     @ sha256h q0,q1,q13
-       .byte   0x6a,0x2c,0x14,0xf3     @ sha256h2 q1,q2,q13
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
 
        vadd.i32        q0,q0,q14
        vadd.i32        q1,q1,q15
+       it              ne
        bne             .Loop_v8
 
        vst1.32         {q0,q1},[r0]
@@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8:
 #endif
 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
 .align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .comm   OPENSSL_armcap_P,4,4
 #endif
index 0a4b1ac..f5dd6cb 100644 (file)
@@ -227,17 +227,17 @@ aes_v8_encrypt:
 
 .Loop_enc:
        aese    v2.16b,v0.16b
-       ld1     {v0.4s},[x2],#16
        aesmc   v2.16b,v2.16b
+       ld1     {v0.4s},[x2],#16
        subs    w3,w3,#2
        aese    v2.16b,v1.16b
-       ld1     {v1.4s},[x2],#16
        aesmc   v2.16b,v2.16b
+       ld1     {v1.4s},[x2],#16
        b.gt    .Loop_enc
 
        aese    v2.16b,v0.16b
-       ld1     {v0.4s},[x2]
        aesmc   v2.16b,v2.16b
+       ld1     {v0.4s},[x2]
        aese    v2.16b,v1.16b
        eor     v2.16b,v2.16b,v0.16b
 
@@ -256,17 +256,17 @@ aes_v8_decrypt:
 
 .Loop_dec:
        aesd    v2.16b,v0.16b
-       ld1     {v0.4s},[x2],#16
        aesimc  v2.16b,v2.16b
+       ld1     {v0.4s},[x2],#16
        subs    w3,w3,#2
        aesd    v2.16b,v1.16b
-       ld1     {v1.4s},[x2],#16
        aesimc  v2.16b,v2.16b
+       ld1     {v1.4s},[x2],#16
        b.gt    .Loop_dec
 
        aesd    v2.16b,v0.16b
-       ld1     {v0.4s},[x2]
        aesimc  v2.16b,v2.16b
+       ld1     {v0.4s},[x2]
        aesd    v2.16b,v1.16b
        eor     v2.16b,v2.16b,v0.16b
 
@@ -308,16 +308,42 @@ aes_v8_cbc_encrypt:
        eor     v5.16b,v16.16b,v7.16b
        b.eq    .Lcbc_enc128
 
+       ld1     {v2.4s-v3.4s},[x7]
+       add     x7,x3,#16
+       add     x6,x3,#16*4
+       add     x12,x3,#16*5
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+       add     x14,x3,#16*6
+       add     x3,x3,#16*7
+       b       .Lenter_cbc_enc
+
+.align 4
 .Loop_cbc_enc:
        aese    v0.16b,v16.16b
-       ld1     {v16.4s},[x7],#16
        aesmc   v0.16b,v0.16b
-       subs    w6,w6,#2
+        st1    {v6.16b},[x1],#16
+.Lenter_cbc_enc:
        aese    v0.16b,v17.16b
-       ld1     {v17.4s},[x7],#16
        aesmc   v0.16b,v0.16b
-       b.gt    .Loop_cbc_enc
+       aese    v0.16b,v2.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v16.4s},[x6]
+       cmp     w5,#4
+       aese    v0.16b,v3.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v17.4s},[x12]
+       b.eq    .Lcbc_enc192
+
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v16.4s},[x14]
+       aese    v0.16b,v17.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v17.4s},[x3]
+       nop
 
+.Lcbc_enc192:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
         subs   x2,x2,#16
@@ -326,7 +352,6 @@ aes_v8_cbc_encrypt:
         csel   x8,xzr,x8,eq
        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
-        add    x7,x3,#16
        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
         ld1    {v16.16b},[x0],x8
@@ -335,16 +360,14 @@ aes_v8_cbc_encrypt:
         eor    v16.16b,v16.16b,v5.16b
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
-        ld1 {v17.4s},[x7],#16  // re-pre-load rndkey[1]
+        ld1 {v17.4s},[x7]              // re-pre-load rndkey[1]
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v23.16b
-
-        mov    w6,w5
        eor     v6.16b,v0.16b,v7.16b
-       st1     {v6.16b},[x1],#16
        b.hs    .Loop_cbc_enc
 
+       st1     {v6.16b},[x1],#16
        b       .Lcbc_done
 
 .align 5
@@ -402,79 +425,78 @@ aes_v8_cbc_encrypt:
 
 .Loop3x_cbc_dec:
        aesd    v0.16b,v16.16b
-       aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
-       ld1     {v16.4s},[x7],#16
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
-       aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
-       ld1     {v17.4s},[x7],#16
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v17.4s},[x7],#16
        b.gt    .Loop3x_cbc_dec
 
        aesd    v0.16b,v16.16b
-       aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
-        eor    v4.16b,v6.16b,v7.16b
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
+        eor    v4.16b,v6.16b,v7.16b
+        subs   x2,x2,#0x30
         eor    v5.16b,v2.16b,v7.16b
+        csel   x6,x2,x6,lo                     // x6, w6, is zero at this point
        aesd    v0.16b,v17.16b
-       aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
-        eor    v17.16b,v3.16b,v7.16b
-        subs   x2,x2,#0x30
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
-        orr    v6.16b,v19.16b,v19.16b
-        csel   x6,x2,x6,lo                     // x6, w6, is zero at this point
-       aesd    v0.16b,v20.16b
-       aesd    v1.16b,v20.16b
-       aesd    v18.16b,v20.16b
+        eor    v17.16b,v3.16b,v7.16b
         add    x0,x0,x6                // x0 is adjusted in such way that
                                        // at exit from the loop v1.16b-v18.16b
                                        // are loaded with last "words"
+        orr    v6.16b,v19.16b,v19.16b
+        mov    x7,x3
+       aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v20.16b
        aesimc  v18.16b,v18.16b
-        mov    x7,x3
-       aesd    v0.16b,v21.16b
-       aesd    v1.16b,v21.16b
-       aesd    v18.16b,v21.16b
         ld1    {v2.16b},[x0],#16
+       aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v21.16b
        aesimc  v18.16b,v18.16b
         ld1    {v3.16b},[x0],#16
        aesd    v0.16b,v22.16b
-       aesd    v1.16b,v22.16b
-       aesd    v18.16b,v22.16b
-        ld1    {v19.16b},[x0],#16
        aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v22.16b
        aesimc  v18.16b,v18.16b
-        ld1 {v16.4s},[x7],#16  // re-pre-load rndkey[0]
+        ld1    {v19.16b},[x0],#16
        aesd    v0.16b,v23.16b
        aesd    v1.16b,v23.16b
        aesd    v18.16b,v23.16b
-
+        ld1 {v16.4s},[x7],#16  // re-pre-load rndkey[0]
         add    w6,w5,#2
        eor     v4.16b,v4.16b,v0.16b
        eor     v5.16b,v5.16b,v1.16b
        eor     v18.16b,v18.16b,v17.16b
         ld1 {v17.4s},[x7],#16  // re-pre-load rndkey[1]
-        orr    v0.16b,v2.16b,v2.16b
        st1     {v4.16b},[x1],#16
-        orr    v1.16b,v3.16b,v3.16b
+        orr    v0.16b,v2.16b,v2.16b
        st1     {v5.16b},[x1],#16
+        orr    v1.16b,v3.16b,v3.16b
        st1     {v18.16b},[x1],#16
         orr    v18.16b,v19.16b,v19.16b
        b.hs    .Loop3x_cbc_dec
@@ -485,39 +507,39 @@ aes_v8_cbc_encrypt:
 
 .Lcbc_dec_tail:
        aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
-       ld1     {v16.4s},[x7],#16
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
-       ld1     {v17.4s},[x7],#16
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
+       ld1     {v17.4s},[x7],#16
        b.gt    .Lcbc_dec_tail
 
        aesd    v1.16b,v16.16b
-       aesd    v18.16b,v16.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
        aesimc  v18.16b,v18.16b
        aesd    v1.16b,v17.16b
-       aesd    v18.16b,v17.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
        aesimc  v18.16b,v18.16b
        aesd    v1.16b,v20.16b
-       aesd    v18.16b,v20.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v20.16b
        aesimc  v18.16b,v18.16b
         cmn    x2,#0x20
        aesd    v1.16b,v21.16b
-       aesd    v18.16b,v21.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v21.16b
        aesimc  v18.16b,v18.16b
         eor    v5.16b,v6.16b,v7.16b
        aesd    v1.16b,v22.16b
-       aesd    v18.16b,v22.16b
        aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v22.16b
        aesimc  v18.16b,v18.16b
         eor    v17.16b,v3.16b,v7.16b
        aesd    v1.16b,v23.16b
@@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks:
 .align 4
 .Loop3x_ctr32:
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
-       aese            v18.16b,v16.16b
-       ld1             {v16.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
        aesmc           v1.16b,v1.16b
+       aese            v18.16b,v16.16b
        aesmc           v18.16b,v18.16b
+       ld1             {v16.4s},[x7],#16
        subs            w6,w6,#2
        aese            v0.16b,v17.16b
-       aese            v1.16b,v17.16b
-       aese            v18.16b,v17.16b
-       ld1             {v17.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
        aesmc           v1.16b,v1.16b
+       aese            v18.16b,v17.16b
        aesmc           v18.16b,v18.16b
+       ld1             {v17.4s},[x7],#16
        b.gt            .Loop3x_ctr32
 
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
-       aese            v18.16b,v16.16b
-        mov            x7,x3
        aesmc           v4.16b,v0.16b
-        ld1            {v2.16b},[x0],#16
+       aese            v1.16b,v16.16b
        aesmc           v5.16b,v1.16b
-       aesmc           v18.16b,v18.16b
+        ld1            {v2.16b},[x0],#16
         orr            v0.16b,v6.16b,v6.16b
-       aese            v4.16b,v17.16b
+       aese            v18.16b,v16.16b
+       aesmc           v18.16b,v18.16b
         ld1            {v3.16b},[x0],#16
-       aese            v5.16b,v17.16b
-       aese            v18.16b,v17.16b
         orr            v1.16b,v6.16b,v6.16b
+       aese            v4.16b,v17.16b
        aesmc           v4.16b,v4.16b
-        ld1            {v19.16b},[x0],#16
+       aese            v5.16b,v17.16b
        aesmc           v5.16b,v5.16b
+        ld1            {v19.16b},[x0],#16
+        mov            x7,x3
+       aese            v18.16b,v17.16b
        aesmc           v17.16b,v18.16b
         orr            v18.16b,v6.16b,v6.16b
         add            w9,w8,#1
        aese            v4.16b,v20.16b
+       aesmc           v4.16b,v4.16b
        aese            v5.16b,v20.16b
-       aese            v17.16b,v20.16b
+       aesmc           v5.16b,v5.16b
         eor            v2.16b,v2.16b,v7.16b
         add            w10,w8,#2
-       aesmc           v4.16b,v4.16b
-       aesmc           v5.16b,v5.16b
+       aese            v17.16b,v20.16b
        aesmc           v17.16b,v17.16b
         eor            v3.16b,v3.16b,v7.16b
         add            w8,w8,#3
        aese            v4.16b,v21.16b
+       aesmc           v4.16b,v4.16b
        aese            v5.16b,v21.16b
-       aese            v17.16b,v21.16b
+       aesmc           v5.16b,v5.16b
         eor            v19.16b,v19.16b,v7.16b
         rev            w9,w9
-       aesmc           v4.16b,v4.16b
-        ld1     {v16.4s},[x7],#16      // re-pre-load rndkey[0]
-       aesmc           v5.16b,v5.16b
+       aese            v17.16b,v21.16b
        aesmc           v17.16b,v17.16b
         mov    v0.s[3], w9
         rev            w10,w10
        aese            v4.16b,v22.16b
+       aesmc           v4.16b,v4.16b
        aese            v5.16b,v22.16b
-       aese            v17.16b,v22.16b
+       aesmc           v5.16b,v5.16b
         mov    v1.s[3], w10
         rev            w12,w8
-       aesmc           v4.16b,v4.16b
-       aesmc           v5.16b,v5.16b
+       aese            v17.16b,v22.16b
        aesmc           v17.16b,v17.16b
         mov    v18.s[3], w12
         subs           x2,x2,#3
@@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks:
        aese            v5.16b,v23.16b
        aese            v17.16b,v23.16b
 
-        mov            w6,w5
        eor             v2.16b,v2.16b,v4.16b
+        ld1     {v16.4s},[x7],#16      // re-pre-load rndkey[0]
+       st1             {v2.16b},[x1],#16
        eor             v3.16b,v3.16b,v5.16b
+        mov            w6,w5
+       st1             {v3.16b},[x1],#16
        eor             v19.16b,v19.16b,v17.16b
         ld1     {v17.4s},[x7],#16      // re-pre-load rndkey[1]
-       st1             {v2.16b},[x1],#16
-       st1             {v3.16b},[x1],#16
        st1             {v19.16b},[x1],#16
        b.hs            .Loop3x_ctr32
 
@@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks:
 
 .Lctr32_tail:
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
-       ld1             {v16.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
        aesmc           v1.16b,v1.16b
+       ld1             {v16.4s},[x7],#16
        subs            w6,w6,#2
        aese            v0.16b,v17.16b
-       aese            v1.16b,v17.16b
-       ld1             {v17.4s},[x7],#16
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
        aesmc           v1.16b,v1.16b
+       ld1             {v17.4s},[x7],#16
        b.gt            .Lctr32_tail
 
        aese            v0.16b,v16.16b
-       aese            v1.16b,v16.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
        aesmc           v1.16b,v1.16b
        aese            v0.16b,v17.16b
-       aese            v1.16b,v17.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
        aesmc           v1.16b,v1.16b
         ld1            {v2.16b},[x0],x12
        aese            v0.16b,v20.16b
-       aese            v1.16b,v20.16b
-        ld1            {v3.16b},[x0]
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v20.16b
        aesmc           v1.16b,v1.16b
+        ld1            {v3.16b},[x0]
        aese            v0.16b,v21.16b
-       aese            v1.16b,v21.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v21.16b
        aesmc           v1.16b,v1.16b
-       aese            v0.16b,v22.16b
-       aese            v1.16b,v22.16b
         eor            v2.16b,v2.16b,v7.16b
+       aese            v0.16b,v22.16b
        aesmc           v0.16b,v0.16b
+       aese            v1.16b,v22.16b
        aesmc           v1.16b,v1.16b
         eor            v3.16b,v3.16b,v7.16b
        aese            v0.16b,v23.16b
index 1bfb263..479007d 100644 (file)
 .type  gcm_init_v8,%function
 .align 4
 gcm_init_v8:
-       ld1             {v17.2d},[x1]           //load H
-       movi            v16.16b,#0xe1
+       ld1             {v17.2d},[x1]           //load input H
+       movi            v19.16b,#0xe1
+       shl     v19.2d,v19.2d,#57               //0xc2.0
        ext             v3.16b,v17.16b,v17.16b,#8
-       shl     v16.2d,v16.2d,#57
-       ushr    v18.2d,v16.2d,#63
-       ext             v16.16b,v18.16b,v16.16b,#8              //t0=0xc2....01
+       ushr    v18.2d,v19.2d,#63
        dup             v17.4s,v17.s[1]
-       ushr    v19.2d,v3.2d,#63
+       ext             v16.16b,v18.16b,v19.16b,#8              //t0=0xc2....01
+       ushr    v18.2d,v3.2d,#63
        sshr    v17.4s,v17.4s,#31               //broadcast carry bit
-       and             v19.16b,v19.16b,v16.16b
+       and             v18.16b,v18.16b,v16.16b
        shl     v3.2d,v3.2d,#1
-       ext             v19.16b,v19.16b,v19.16b,#8
+       ext             v18.16b,v18.16b,v18.16b,#8
        and             v16.16b,v16.16b,v17.16b
-       orr             v3.16b,v3.16b,v19.16b           //H<<<=1
-       eor             v3.16b,v3.16b,v16.16b           //twisted H
-       st1             {v3.2d},[x0]
+       orr             v3.16b,v3.16b,v18.16b           //H<<<=1
+       eor             v20.16b,v3.16b,v16.16b          //twisted H
+       st1             {v20.2d},[x0],#16               //store Htable[0]
+
+       //calculate H^2
+       ext             v16.16b,v20.16b,v20.16b,#8              //Karatsuba pre-processing
+       pmull   v0.1q,v20.1d,v20.1d
+       eor             v16.16b,v16.16b,v20.16b
+       pmull2  v2.1q,v20.2d,v20.2d
+       pmull   v1.1q,v16.1d,v16.1d
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase
+
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+       eor             v0.16b,v1.16b,v18.16b
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v18.16b,v18.16b,v2.16b
+       eor             v22.16b,v0.16b,v18.16b
+
+       ext             v17.16b,v22.16b,v22.16b,#8              //Karatsuba pre-processing
+       eor             v17.16b,v17.16b,v22.16b
+       ext             v21.16b,v16.16b,v17.16b,#8              //pack Karatsuba pre-processed
+       st1             {v21.2d-v22.2d},[x0]            //store Htable[1..2]
 
        ret
 .size  gcm_init_v8,.-gcm_init_v8
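
gcm_init_v8 now precomputes a small power table instead of a single value:
Htable[0] is the twisted H, Htable[1] packs the Karatsuba-folded halves, and
Htable[2] is H^2, which lets the hash loop absorb two blocks per reduction.
Each 128-bit product is built from three 64-bit carry-less multiplies via the
usual Karatsuba split (sketch; clmul64 stands in for pmull/pmull2 and is an
assumed helper, not a real intrinsic name):

    #include <stdint.h>

    typedef struct { uint64_t lo, hi; } u128;
    u128 clmul64(uint64_t a, uint64_t b);  /* hypothetical 64x64->128 clmul */

    /* X*H in GF(2)[t] as three multiplies; the 256-bit product (hi:mid:lo)
       is then reduced mod t^128 + t^7 + t^2 + t + 1. */
    void karatsuba_clmul(uint64_t x0, uint64_t x1, uint64_t h0, uint64_t h1,
                         u128 *lo, u128 *mid, u128 *hi)
    {
        *lo  = clmul64(x0, h0);             /* pmull  : low halves  */
        *hi  = clmul64(x1, h1);             /* pmull2 : high halves */
        *mid = clmul64(x0 ^ x1, h0 ^ h1);   /* folded halves        */
        mid->lo ^= lo->lo ^ hi->lo;         /* Karatsuba post-processing */
        mid->hi ^= lo->hi ^ hi->hi;
    }
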
-
 .global        gcm_gmult_v8
 .type  gcm_gmult_v8,%function
 .align 4
 gcm_gmult_v8:
        ld1             {v17.2d},[x0]           //load Xi
        movi            v19.16b,#0xe1
-       ld1             {v20.2d},[x1]           //load twisted H
+       ld1             {v20.2d-v21.2d},[x1]    //load twisted H, ...
        shl     v19.2d,v19.2d,#57
 #ifndef __ARMEB__
        rev64   v17.16b,v17.16b
 #endif
-       ext             v21.16b,v20.16b,v20.16b,#8
-       mov             x3,#0
        ext             v3.16b,v17.16b,v17.16b,#8
-       mov             x12,#0
-       eor             v21.16b,v21.16b,v20.16b         //Karatsuba pre-processing
-       mov             x2,x0
-       b               .Lgmult_v8
-.size  gcm_gmult_v8,.-gcm_gmult_v8
 
+       pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
+       eor             v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
+       pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
+       pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
+
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+       eor             v0.16b,v1.16b,v18.16b
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v18.16b,v18.16b,v2.16b
+       eor             v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+       rev64   v0.16b,v0.16b
+#endif
+       ext             v0.16b,v0.16b,v0.16b,#8
+       st1             {v0.2d},[x0]            //write out Xi
+
+       ret
+.size  gcm_gmult_v8,.-gcm_gmult_v8
 .global        gcm_ghash_v8
 .type  gcm_ghash_v8,%function
 .align 4
 gcm_ghash_v8:
        ld1             {v0.2d},[x0]            //load [rotated] Xi
-       subs            x3,x3,#16
+                                               //"[rotated]" means that
+                                               //loaded value would have
+                                               //to be rotated in order to
+                                               //make it appear as in
+                                               //algorithm specification
+       subs            x3,x3,#32               //see if x3 is 32 or larger
+       mov             x12,#16         //x12 is used as post-
+                                               //increment for input pointer;
+                                               //as loop is modulo-scheduled
+                                               //x12 is zeroed just in time
+                                               //to preclude overstepping
+                                               //inp[len], which means the
+                                               //last block[s] are actually
+                                               //loaded twice, but the last
+                                               //copy is not processed
+       ld1             {v20.2d-v21.2d},[x1],#32        //load twisted H, ..., H^2
        movi            v19.16b,#0xe1
-       mov             x12,#16
-       ld1             {v20.2d},[x1]           //load twisted H
-       csel    x12,xzr,x12,eq
-       ext             v0.16b,v0.16b,v0.16b,#8
-       shl     v19.2d,v19.2d,#57
-       ld1             {v17.2d},[x2],x12       //load [rotated] inp
-       ext             v21.16b,v20.16b,v20.16b,#8
+       ld1             {v22.2d},[x1]
+       csel    x12,xzr,x12,eq                  //is it time to zero x12?
+       ext             v0.16b,v0.16b,v0.16b,#8         //rotate Xi
+       ld1             {v16.2d},[x2],#16       //load [rotated] I[0]
+       shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
 #ifndef __ARMEB__
+       rev64   v16.16b,v16.16b
        rev64   v0.16b,v0.16b
+#endif
+       ext             v3.16b,v16.16b,v16.16b,#8               //rotate I[0]
+       b.lo            .Lodd_tail_v8           //x3 was less than 32
+       ld1             {v17.2d},[x2],x12       //load [rotated] I[1]
+#ifndef __ARMEB__
        rev64   v17.16b,v17.16b
 #endif
-       eor             v21.16b,v21.16b,v20.16b         //Karatsuba pre-processing
-       ext             v3.16b,v17.16b,v17.16b,#8
-       b               .Loop_v8
+       ext             v7.16b,v17.16b,v17.16b,#8
+       eor             v3.16b,v3.16b,v0.16b            //I[i]^=Xi
+       pmull   v4.1q,v20.1d,v7.1d              //H·Ii+1
+       eor             v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
+       pmull2  v6.1q,v20.2d,v7.2d
+       b               .Loop_mod2x_v8
 
 .align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+       ext             v18.16b,v3.16b,v3.16b,#8
+       subs            x3,x3,#32               //is there more data?
+       pmull   v0.1q,v22.1d,v3.1d              //H^2.lo·Xi.lo
+       csel    x12,xzr,x12,lo                  //is it time to zero x12?
+
+        pmull  v5.1q,v21.1d,v17.1d
+       eor             v18.16b,v18.16b,v3.16b          //Karatsuba pre-processing
+       pmull2  v2.1q,v22.2d,v3.2d              //H^2.hi·Xi.hi
+       eor             v0.16b,v0.16b,v4.16b            //accumulate
+       pmull2  v1.1q,v21.2d,v18.2d             //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+        ld1    {v16.2d},[x2],x12       //load [rotated] I[i+2]
+
+       eor             v2.16b,v2.16b,v6.16b
+        csel   x12,xzr,x12,eq                  //is it time to zero x12?
+       eor             v1.16b,v1.16b,v5.16b
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+        ld1    {v17.2d},[x2],x12       //load [rotated] I[i+3]
+#ifndef __ARMEB__
+        rev64  v16.16b,v16.16b
+#endif
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
+
+#ifndef __ARMEB__
+        rev64  v17.16b,v17.16b
+#endif
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+        ext            v7.16b,v17.16b,v17.16b,#8
+        ext            v3.16b,v16.16b,v16.16b,#8
+       eor             v0.16b,v1.16b,v18.16b
+        pmull  v4.1q,v20.1d,v7.1d              //H·Ii+1
+       eor             v3.16b,v3.16b,v2.16b            //accumulate v3.16b early
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v3.16b,v3.16b,v18.16b
+        eor            v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
+       eor             v3.16b,v3.16b,v0.16b
+        pmull2 v6.1q,v20.2d,v7.2d
+       b.hs            .Loop_mod2x_v8          //there were at least 32 more bytes
+
+       eor             v2.16b,v2.16b,v18.16b
+       ext             v3.16b,v16.16b,v16.16b,#8               //re-construct v3.16b
+       adds            x3,x3,#32               //re-construct x3
+       eor             v0.16b,v0.16b,v2.16b            //re-construct v0.16b
+       b.eq            .Ldone_v8               //is x3 zero?
+.Lodd_tail_v8:
        ext             v18.16b,v0.16b,v0.16b,#8
        eor             v3.16b,v3.16b,v0.16b            //inp^=Xi
-       eor             v17.16b,v17.16b,v18.16b         //v17.16b is rotated inp^Xi
+       eor             v17.16b,v16.16b,v18.16b         //v17.16b is rotated inp^Xi
 
-.Lgmult_v8:
        pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
        eor             v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
-       subs            x3,x3,#16
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
-       csel    x12,xzr,x12,eq
 
        ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
        eor             v18.16b,v0.16b,v2.16b
        eor             v1.16b,v1.16b,v17.16b
-        ld1    {v17.2d},[x2],x12       //load [rotated] inp
        eor             v1.16b,v1.16b,v18.16b
-       pmull   v18.1q,v0.1d,v19.1d             //1st phase
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
 
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
-#ifndef __ARMEB__
-        rev64  v17.16b,v17.16b
-#endif
        eor             v0.16b,v1.16b,v18.16b
-        ext            v3.16b,v17.16b,v17.16b,#8
 
-       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor             v18.16b,v18.16b,v2.16b
        eor             v0.16b,v0.16b,v18.16b
-       b.hs            .Loop_v8
 
+.Ldone_v8:
 #ifndef __ARMEB__
        rev64   v0.16b,v0.16b
 #endif
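
The rewritten hash loop (.Loop_mod2x_v8) is the payoff of the H^2 table: each
iteration folds two input blocks with a single reduction, computing
Xi = (Xi ^ I[i])*H^2 ^ I[i+1]*H, while .Lodd_tail_v8 keeps the original
one-block step for a trailing block. In GF(2^128) pseudocode (gf128_mul and
the block type are hypothetical helpers):

    #include <stdint.h>

    typedef struct { uint64_t lo, hi; } block128;

    block128 gf128_mul(block128 a, block128 b);  /* hypothetical field multiply */

    static block128 xor128(block128 a, block128 b)
    {
        a.lo ^= b.lo; a.hi ^= b.hi;
        return a;
    }

    /* One trip through .Loop_mod2x_v8: two blocks, one reduction. */
    block128 ghash_2blocks(block128 Xi, block128 I0, block128 I1,
                           block128 H, block128 H2)
    {
        return xor128(gf128_mul(xor128(Xi, I0), H2), gf128_mul(I1, H));
    }
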
index 84708af..6573fe4 100644 (file)
@@ -17,7 +17,10 @@ aesni_encrypt:
        leaq    16(%rdx),%rdx
        jnz     .Loop_enc1_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 .size  aesni_encrypt,.-aesni_encrypt
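
A recurring pattern in the x86-64 hunks below is the block of pxor/xorps
instructions added before each ret: registers that held round keys or cipher
state are zeroed so the material does not survive past the call. The C-level
equivalent is scrubbing sensitive locals before returning (a sketch;
OPENSSL_cleanse is the real OpenSSL helper, the surrounding function is
illustrative):

    #include <stdint.h>
    #include <string.h>
    #include <openssl/crypto.h>   /* OPENSSL_cleanse() */

    /* hypothetical wrapper illustrating the cleanup discipline */
    void encrypt_then_scrub(uint8_t block[16], const uint8_t key[16])
    {
        uint8_t schedule[176];               /* AES-128 round keys */
        memcpy(schedule, key, 16);           /* ... expand and encrypt ... */
        /* zero key material before returning, like the added pxor's */
        OPENSSL_cleanse(schedule, sizeof(schedule));
    }
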
 
@@ -38,7 +41,10 @@ aesni_decrypt:
        leaq    16(%rdx),%rdx
        jnz     .Loop_dec1_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 .size  aesni_decrypt, .-aesni_decrypt
 .type  _aesni_encrypt2,@function
@@ -264,21 +270,18 @@ _aesni_encrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     .Lenc_loop6_enter
 .align 16
 .Lenc_loop6:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+.Lenc_loop6_enter:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-.Lenc_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     .Ldec_loop6_enter
 .align 16
 .Ldec_loop6:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+.Ldec_loop6_enter:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-.Ldec_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,220,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,220,225
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-.byte  102,68,15,56,220,193
-.byte  102,68,15,56,220,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     .Lenc_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     .Lenc_loop8_inner
 .align 16
 .Lenc_loop8:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
+.Lenc_loop8_inner:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,222,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,222,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-.byte  102,68,15,56,222,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     .Ldec_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     .Ldec_loop8_inner
 .align 16
 .Ldec_loop8:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
+.Ldec_loop8_inner:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
@@ -587,6 +577,7 @@ aesni_ecb_encrypt:
        movups  80(%rdi),%xmm7
        je      .Lecb_enc_six
        movdqu  96(%rdi),%xmm8
+       xorps   %xmm9,%xmm9
        call    _aesni_encrypt8
        movups  %xmm2,(%rsi)
        movups  %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ aesni_ecb_encrypt:
        jnc     .Lecb_dec_loop8
 
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movq    %r11,%rcx
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movl    %r10d,%eax
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
        movups  %xmm9,112(%rsi)
+       pxor    %xmm9,%xmm9
        leaq    128(%rsi),%rsi
        addq    $128,%rdx
        jz      .Lecb_ret
@@ -731,14 +730,23 @@ aesni_ecb_encrypt:
        je      .Lecb_dec_six
        movups  96(%rdi),%xmm8
        movups  (%rcx),%xmm0
+       xorps   %xmm9,%xmm9
        call    _aesni_decrypt8
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_one:
@@ -754,49 +762,73 @@ aesni_ecb_encrypt:
        jnz     .Loop_dec1_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_two:
        call    _aesni_decrypt2
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_five:
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        jmp     .Lecb_ret
 .align 16
 .Lecb_dec_six:
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
 
 .Lecb_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        .byte   0xf3,0xc3
 .size  aesni_ecb_encrypt,.-aesni_ecb_encrypt
 .globl aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks:
        leaq    16(%rsi),%rsi
        jnz     .Lccm64_enc_outer
 
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 .size  aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
 .globl aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks:
        leaq    16(%r11),%r11
        jnz     .Loop_enc1_6
 .byte  102,15,56,221,217
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 .size  aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
 .globl aesni_ctr32_encrypt_blocks
 .type  aesni_ctr32_encrypt_blocks,@function
 .align 16
 aesni_ctr32_encrypt_blocks:
+       cmpq    $1,%rdx
+       jne     .Lctr32_bulk
+
+
+
+       movups  (%r8),%xmm2
+       movups  (%rdi),%xmm3
+       movl    240(%rcx),%edx
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+.Loop_enc1_7:
+.byte  102,15,56,220,209
+       decl    %edx
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_enc1_7
+.byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm2
+       jmp     .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $128,%rsp
        andq    $-16,%rsp
        leaq    -8(%rax),%rbp
 
-       cmpq    $1,%rdx
-       je      .Lctr32_one_shortcut
+
+
 
        movdqu  (%r8),%xmm2
        movdqu  (%rcx),%xmm0
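
aesni_ctr32_encrypt_blocks now peels off the one-block case before building
the 128-byte stack frame: a single counter block is encrypted entirely in
registers (.Loop_enc1_7), the temporaries are cleared, and only longer inputs
enter the bulk path at .Lctr32_bulk. The control shape, roughly (all names
other than the entry point are hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    void ctr32_one_block(const uint8_t *in, uint8_t *out,
                         const void *key, const uint8_t ivec[16]);
    void ctr32_bulk(const uint8_t *in, uint8_t *out, size_t blocks,
                    const void *key, const uint8_t ivec[16]);

    void aesni_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                    size_t blocks, const void *key,
                                    const uint8_t ivec[16])
    {
        if (blocks == 1) {              /* no stack frame, fewer spills */
            ctr32_one_block(in, out, key, ivec);
            return;
        }
        ctr32_bulk(in, out, blocks, key, ivec);
    }
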
@@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks:
        leaq    -128(%rcx),%rcx
 
 .Lctr32_tail:
+
+
        leaq    16(%rcx),%rcx
        cmpq    $4,%rdx
        jb      .Lctr32_loop3
        je      .Lctr32_loop4
 
+
        shll    $4,%eax
        movdqa  96(%rsp),%xmm8
        pxor    %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks:
        movups  32(%rdi),%xmm12
        xorps   %xmm12,%xmm4
        movups  %xmm4,32(%rsi)
-       jmp     .Lctr32_done
 
-.align 16
-.Lctr32_one_shortcut:
-       movups  (%r8),%xmm2
-       movups  (%rdi),%xmm10
-       movl    240(%rcx),%eax
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-.Loop_enc1_7:
-.byte  102,15,56,220,209
-       decl    %eax
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     .Loop_enc1_7
-.byte  102,15,56,221,209
-       xorps   %xmm10,%xmm2
-       movups  %xmm2,(%rsi)
-       jmp     .Lctr32_done
-
-.align 16
 .Lctr32_done:
+       xorps   %xmm0,%xmm0
+       xorl    %r11d,%r11d
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       movaps  %xmm0,112(%rsp)
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lctr32_epilogue:
@@ -1750,6 +1829,7 @@ aesni_xts_encrypt:
        shrl    $4,%eax
 
 .Lxts_enc_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        addq    $96,%rdx
@@ -1778,6 +1858,7 @@ aesni_xts_encrypt:
        pxor    %xmm12,%xmm4
        pxor    %xmm13,%xmm5
        pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm7
 
        call    _aesni_encrypt6
 
@@ -1920,6 +2001,29 @@ aesni_xts_encrypt:
        movups  %xmm2,-16(%rsi)
 
 .Lxts_enc_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lxts_enc_epilogue:
@@ -2196,6 +2300,7 @@ aesni_xts_decrypt:
        shrl    $4,%eax
 
 .Lxts_dec_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        pxor    %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ aesni_xts_decrypt:
        movups  %xmm2,(%rsi)
 
 .Lxts_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lxts_dec_epilogue:
@@ -2446,7 +2574,11 @@ aesni_cbc_encrypt:
        jnc     .Lcbc_enc_loop
        addq    $16,%rdx
        jnz     .Lcbc_enc_tail
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%r8)
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
        jmp     .Lcbc_ret
 
 .Lcbc_enc_tail:
@@ -2466,6 +2598,35 @@ aesni_cbc_encrypt:
 
 .align 16
 .Lcbc_decrypt:
+       cmpq    $16,%rdx
+       jne     .Lcbc_decrypt_bulk
+
+
+
+       movdqu  (%rdi),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  %xmm2,%xmm4
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+.Loop_dec1_16:
+.byte  102,15,56,222,209
+       decl    %r10d
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     .Loop_dec1_16
+.byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movdqu  %xmm4,(%r8)
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
+       jmp     .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $16,%rsp
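
The CBC decrypt entry gets the same treatment as CTR above: a single 16-byte
input is decrypted entirely in registers (.Loop_dec1_16) with the temporaries
cleared on exit, and only longer inputs fall through to .Lcbc_decrypt_bulk and
its stack frame. The control shape matches the CTR sketch shown earlier.
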
@@ -2702,7 +2863,7 @@ aesni_cbc_encrypt:
        movaps  %xmm9,%xmm2
        leaq    -112(%rcx),%rcx
        addq    $112,%rdx
-       jle     .Lcbc_dec_tail_collected
+       jle     .Lcbc_dec_clear_tail_collected
        movups  %xmm9,(%rsi)
        leaq    16(%rsi),%rsi
        cmpq    $80,%rdx
@@ -2721,14 +2882,19 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        leaq    80(%rsi),%rsi
        movdqa  %xmm7,%xmm2
+       pxor    %xmm7,%xmm7
        jmp     .Lcbc_dec_tail_collected
 
 .align 16
@@ -2743,16 +2909,23 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        pxor    %xmm9,%xmm8
        movdqu  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        leaq    96(%rsi),%rsi
        movdqa  %xmm8,%xmm2
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     .Lcbc_dec_tail_collected
 
 .align 16
@@ -2796,7 +2969,7 @@ aesni_cbc_encrypt:
 
        movdqa  %xmm7,%xmm2
        addq    $80,%rdx
-       jle     .Lcbc_dec_tail_collected
+       jle     .Lcbc_dec_clear_tail_collected
        movups  %xmm7,(%rsi)
        leaq    16(%rsi),%rsi
 
@@ -2831,12 +3004,17 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        leaq    64(%rsi),%rsi
        movdqa  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        subq    $16,%rdx
        jmp     .Lcbc_dec_tail_collected
 
@@ -2847,12 +3025,12 @@ aesni_cbc_encrypt:
        movups  16(%rcx),%xmm1
        leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
 .byte  102,15,56,222,209
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     .Loop_dec1_16
+       jnz     .Loop_dec1_17
 .byte  102,15,56,223,209
        xorps   %xmm10,%xmm2
        movaps  %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ aesni_cbc_encrypt:
        pxor    %xmm11,%xmm3
        movdqu  %xmm2,(%rsi)
        movdqa  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leaq    16(%rsi),%rsi
        jmp     .Lcbc_dec_tail_collected
 .align 16
@@ -2878,7 +3057,9 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movdqa  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        leaq    32(%rsi),%rsi
        jmp     .Lcbc_dec_tail_collected
 .align 16
@@ -2891,29 +3072,45 @@ aesni_cbc_encrypt:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movdqa  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        leaq    48(%rsi),%rsi
        jmp     .Lcbc_dec_tail_collected
 
 .align 16
+.Lcbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
 .Lcbc_dec_tail_collected:
        movups  %xmm10,(%r8)
        andq    $15,%rdx
        jnz     .Lcbc_dec_tail_partial
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     .Lcbc_dec_ret
 .align 16
 .Lcbc_dec_tail_partial:
        movaps  %xmm2,(%rsp)
+       pxor    %xmm2,%xmm2
        movq    $16,%rcx
        movq    %rsi,%rdi
        subq    %rdx,%rcx
        leaq    (%rsp),%rsi
 .long  0x9066A4F3
+       movdqa  %xmm2,(%rsp)
 
 .Lcbc_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        leaq    (%rbp),%rsp
        popq    %rbp
 .Lcbc_ret:
@@ -2951,7 +3148,9 @@ aesni_set_decrypt_key:
 
        movups  (%rdx),%xmm0
 .byte  102,15,56,219,192
+       pxor    %xmm1,%xmm1
        movups  %xmm0,(%rdi)
+       pxor    %xmm0,%xmm0
 .Ldec_key_ret:
        addq    $8,%rsp
        .byte   0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
        testq   %rdx,%rdx
        jz      .Lenc_key_ret
 
+       movl    $268437504,%r10d
        movups  (%rdi),%xmm0
        xorps   %xmm4,%xmm4
+       andl    OPENSSL_ia32cap_P+4(%rip),%r10d
        leaq    16(%rdx),%rax
        cmpl    $256,%esi
        je      .L14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
 
 .L10rounds:
        movl    $9,%esi
+       cmpl    $268435456,%r10d
+       je      .L10rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,200,1
        call    .Lkey_expansion_128_cold
@@ -3008,9 +3212,79 @@ __aesni_set_encrypt_key:
        jmp     .Lenc_key_ret
 
 .align 16
+.L10rounds_alt:
+       movdqa  .Lkey_rotate(%rip),%xmm5
+       movl    $8,%r10d
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,(%rdx)
+       jmp     .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leaq    16(%rax),%rax
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%rax)
+       movdqa  %xmm0,%xmm2
+
+       decl    %r10d
+       jnz     .Loop_key128
+
+       movdqa  .Lkey_rcon1b(%rip),%xmm4
+
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%rax)
+
+       movl    %esi,96(%rax)
+       xorl    %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
 .L12rounds:
        movq    16(%rdi),%xmm2
        movl    $11,%esi
+       cmpl    $268435456,%r10d
+       je      .L12rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,202,1
        call    .Lkey_expansion_192a_cold
@@ -3034,10 +3308,54 @@ __aesni_set_encrypt_key:
        jmp     .Lenc_key_ret
 
 .align 16
+.L12rounds_alt:
+       movdqa  .Lkey_rotate192(%rip),%xmm5
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       movl    $8,%r10d
+       movdqu  %xmm0,(%rdx)
+       jmp     .Loop_key192
+
+.align 16
+.Loop_key192:
+       movq    %xmm2,0(%rax)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leaq    24(%rax),%rax
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%rax)
+
+       decl    %r10d
+       jnz     .Loop_key192
+
+       movl    %esi,32(%rax)
+       xorl    %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
 .L14rounds:
        movups  16(%rdi),%xmm2
        movl    $13,%esi
        leaq    16(%rax),%rax
+       cmpl    $268435456,%r10d
+       je      .L14rounds_alt
+
        movups  %xmm0,(%rdx)
        movups  %xmm2,16(%rdx)
 .byte  102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ __aesni_set_encrypt_key:
        jmp     .Lenc_key_ret
 
 .align 16
+.L14rounds_alt:
+       movdqa  .Lkey_rotate(%rip),%xmm5
+       movdqa  .Lkey_rcon1(%rip),%xmm4
+       movl    $7,%r10d
+       movdqu  %xmm0,0(%rdx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,16(%rdx)
+       jmp     .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       decl    %r10d
+       jz      .Ldone_key256
+
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%rax)
+       leaq    32(%rax),%rax
+       movdqa  %xmm2,%xmm1
+
+       jmp     .Loop_key256
+
+.Ldone_key256:
+       movl    %esi,16(%rax)
+       xorl    %eax,%eax
+       jmp     .Lenc_key_ret
+
+.align 16
 .Lbad_keybits:
        movq    $-2,%rax
 .Lenc_key_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
        addq    $8,%rsp
        .byte   0xf3,0xc3
 .LSEH_end_set_encrypt_key:
@@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key:
 .long  0x87,0,1,0
 .Lincrement1:
 .byte  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long  0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long  0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long  1,1,1,1
+.Lkey_rcon1b:
+.long  0x1b,0x1b,0x1b,0x1b
 
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 64
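
For reference: the .L10rounds_alt/.L12rounds_alt/.L14rounds_alt paths added above are taken when the OPENSSL_ia32cap_P test at the top of __aesni_set_encrypt_key selects them. They expand the key with pshufb (102,15,56,0,...) and aesenclast (102,15,56,221,...) rather than aeskeygenassist (102,15,58,223,...), doubling the round constant held in %xmm4 with pslld $1 each pass and loading .Lkey_rcon1b (0x1b) at the one step where plain doubling would overflow. The new .Lkey_rotate*/.Lkey_rcon* constants thus walk the standard AES round-constant sequence; an illustrative C sketch (not part of the diff) of that sequence:

    #include <stdint.h>
    #include <stdio.h>

    /* Doubling in GF(2^8) with reduction by 0x1b reproduces the AES
     * rcon sequence 01,02,04,...,80,1b,36.  The assembly above avoids
     * the conditional by loading 0x1b from .Lkey_rcon1b at exactly the
     * iteration where reduction is needed. */
    int main(void)
    {
        uint8_t rcon = 1;
        for (int i = 1; i <= 10; i++) {
            printf("rcon[%2d] = 0x%02x\n", i, rcon);
            rcon = (uint8_t)((rcon << 1) ^ ((rcon & 0x80) ? 0x1b : 0));
        }
        return 0;
    }
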
index 1bf368c..5f98ff2 100644 (file)
@@ -1755,11 +1755,16 @@ bn_from_mont8x:
 .type  bn_get_bits5,@function
 .align 16
 bn_get_bits5:
-       movq    %rdi,%r10
+       leaq    0(%rdi),%r10
+       leaq    1(%rdi),%r11
        movl    %esi,%ecx
-       shrl    $3,%esi
-       movzwl  (%r10,%rsi,1),%eax
-       andl    $7,%ecx
+       shrl    $4,%esi
+       andl    $15,%ecx
+       leal    -8(%rcx),%eax
+       cmpl    $11,%ecx
+       cmovaq  %r11,%r10
+       cmoval  %eax,%ecx
+       movzwl  (%r10,%rsi,2),%eax
        shrl    %cl,%eax
        andl    $31,%eax
        .byte   0xf3,0xc3
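
The bn_get_bits5 rewrite above replaces a 16-bit load at byte offset n/8, which could read one byte past the end of the array when the requested window sits near the top of the number, with a 16-bit load at word offset n/16, stepping the base pointer one byte forward (and the shift eight bits back) when the 5-bit window would cross the 16-bit boundary. An illustrative C rendering of the new logic (names are illustrative only):

    #include <stdint.h>

    /* Return the 5-bit window starting at bit n of the little-endian
     * byte array ap, never touching memory beyond the bytes that
     * actually hold the window -- mirrors the rewritten bn_get_bits5. */
    static unsigned int get_bits5(const uint8_t *ap, unsigned int n)
    {
        const uint8_t *p = ap;
        unsigned int word  = n >> 4;   /* index of the 16-bit word    */
        unsigned int shift = n & 15;   /* bit offset inside the word  */

        if (shift > 11) {              /* window crosses the word, so */
            p += 1;                    /* start one byte later and    */
            shift -= 8;                /* shift eight bits less       */
        }
        return ((p[2 * word] | (p[2 * word + 1] << 8)) >> shift) & 31;
    }
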
index 57509ae..41ad80e 100644 (file)
@@ -17,7 +17,10 @@ L$oop_enc1_1:
        leaq    16(%rdx),%rdx
        jnz     L$oop_enc1_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 
 
@@ -38,7 +41,10 @@ L$oop_dec1_2:
        leaq    16(%rdx),%rdx
        jnz     L$oop_dec1_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        .byte   0xf3,0xc3
 
 
@@ -264,21 +270,18 @@ _aesni_encrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     L$enc_loop6_enter
 .p2align       4
 L$enc_loop6:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+L$enc_loop6_enter:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
-L$enc_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
        pxor    %xmm0,%xmm6
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%rcx,%rax,1),%xmm0
        addq    $16,%rax
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%rcx,%rax,1),%xmm0
        jmp     L$dec_loop6_enter
 .p2align       4
 L$dec_loop6:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+L$dec_loop6_enter:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
-L$dec_loop6_enter:
        movups  (%rcx,%rax,1),%xmm1
        addq    $32,%rax
 .byte  102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,220,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,220,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,220,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,220,225
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-.byte  102,68,15,56,220,193
-.byte  102,68,15,56,220,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     L$enc_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     L$enc_loop8_inner
 .p2align       4
 L$enc_loop8:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
+L$enc_loop8_inner:
 .byte  102,15,56,220,225
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
        leaq    32(%rcx,%rax,1),%rcx
        negq    %rax
 .byte  102,15,56,222,209
-       addq    $16,%rax
        pxor    %xmm0,%xmm7
-.byte  102,15,56,222,217
        pxor    %xmm0,%xmm8
+.byte  102,15,56,222,217
        pxor    %xmm0,%xmm9
-.byte  102,15,56,222,225
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-.byte  102,68,15,56,222,193
-.byte  102,68,15,56,222,201
-       movups  -16(%rcx,%rax,1),%xmm0
-       jmp     L$dec_loop8_enter
+       movups  (%rcx,%rax,1),%xmm0
+       addq    $16,%rax
+       jmp     L$dec_loop8_inner
 .p2align       4
 L$dec_loop8:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
+L$dec_loop8_inner:
 .byte  102,15,56,222,225
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
@@ -587,6 +577,7 @@ L$ecb_enc_tail:
        movups  80(%rdi),%xmm7
        je      L$ecb_enc_six
        movdqu  96(%rdi),%xmm8
+       xorps   %xmm9,%xmm9
        call    _aesni_encrypt8
        movups  %xmm2,(%rsi)
        movups  %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter:
        jnc     L$ecb_dec_loop8
 
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movq    %r11,%rcx
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movl    %r10d,%eax
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
        movups  %xmm9,112(%rsi)
+       pxor    %xmm9,%xmm9
        leaq    128(%rsi),%rsi
        addq    $128,%rdx
        jz      L$ecb_ret
@@ -731,14 +730,23 @@ L$ecb_dec_tail:
        je      L$ecb_dec_six
        movups  96(%rdi),%xmm8
        movups  (%rcx),%xmm0
+       xorps   %xmm9,%xmm9
        call    _aesni_decrypt8
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        movups  %xmm8,96(%rsi)
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_one:
@@ -754,49 +762,73 @@ L$oop_dec1_4:
        jnz     L$oop_dec1_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_two:
        call    _aesni_decrypt2
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_five:
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        jmp     L$ecb_ret
 .p2align       4
 L$ecb_dec_six:
        call    _aesni_decrypt6
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        movups  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        movups  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        movups  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
 
 L$ecb_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        .byte   0xf3,0xc3
 
 .globl _aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ L$ccm64_enc2_loop:
        leaq    16(%rsi),%rsi
        jnz     L$ccm64_enc_outer
 
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 
 .globl _aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ L$oop_enc1_6:
        leaq    16(%r11),%r11
        jnz     L$oop_enc1_6
 .byte  102,15,56,221,217
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
        movups  %xmm3,(%r9)
+       pxor    %xmm3,%xmm3
+       pxor    %xmm8,%xmm8
+       pxor    %xmm6,%xmm6
        .byte   0xf3,0xc3
 
 .globl _aesni_ctr32_encrypt_blocks
 
 .p2align       4
 _aesni_ctr32_encrypt_blocks:
+       cmpq    $1,%rdx
+       jne     L$ctr32_bulk
+
+
+
+       movups  (%r8),%xmm2
+       movups  (%rdi),%xmm3
+       movl    240(%rcx),%edx
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+L$oop_enc1_7:
+.byte  102,15,56,220,209
+       decl    %edx
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     L$oop_enc1_7
+.byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       xorps   %xmm2,%xmm2
+       jmp     L$ctr32_epilogue
+
+.p2align       4
+L$ctr32_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $128,%rsp
        andq    $-16,%rsp
        leaq    -8(%rax),%rbp
 
-       cmpq    $1,%rdx
-       je      L$ctr32_one_shortcut
+
+
 
        movdqu  (%r8),%xmm2
        movdqu  (%rcx),%xmm0
@@ -1349,11 +1422,14 @@ L$ctr32_enc_done:
        leaq    -128(%rcx),%rcx
 
 L$ctr32_tail:
+
+
        leaq    16(%rcx),%rcx
        cmpq    $4,%rdx
        jb      L$ctr32_loop3
        je      L$ctr32_loop4
 
+
        shll    $4,%eax
        movdqa  96(%rsp),%xmm8
        pxor    %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ L$ctr32_loop3:
        movups  32(%rdi),%xmm12
        xorps   %xmm12,%xmm4
        movups  %xmm4,32(%rsi)
-       jmp     L$ctr32_done
 
-.p2align       4
-L$ctr32_one_shortcut:
-       movups  (%r8),%xmm2
-       movups  (%rdi),%xmm10
-       movl    240(%rcx),%eax
-       movups  (%rcx),%xmm0
-       movups  16(%rcx),%xmm1
-       leaq    32(%rcx),%rcx
-       xorps   %xmm0,%xmm2
-L$oop_enc1_7:
-.byte  102,15,56,220,209
-       decl    %eax
-       movups  (%rcx),%xmm1
-       leaq    16(%rcx),%rcx
-       jnz     L$oop_enc1_7
-.byte  102,15,56,221,209
-       xorps   %xmm10,%xmm2
-       movups  %xmm2,(%rsi)
-       jmp     L$ctr32_done
-
-.p2align       4
 L$ctr32_done:
+       xorps   %xmm0,%xmm0
+       xorl    %r11d,%r11d
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       movaps  %xmm0,112(%rsp)
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 L$ctr32_epilogue:
@@ -1750,6 +1829,7 @@ L$xts_enc_loop6:
        shrl    $4,%eax
 
 L$xts_enc_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        addq    $96,%rdx
@@ -1778,6 +1858,7 @@ L$xts_enc_short:
        pxor    %xmm12,%xmm4
        pxor    %xmm13,%xmm5
        pxor    %xmm14,%xmm6
+       pxor    %xmm7,%xmm7
 
        call    _aesni_encrypt6
 
@@ -1920,6 +2001,29 @@ L$oop_enc1_10:
        movups  %xmm2,-16(%rsi)
 
 L$xts_enc_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 L$xts_enc_epilogue:
@@ -2196,6 +2300,7 @@ L$xts_dec_loop6:
        shrl    $4,%eax
 
 L$xts_dec_short:
+
        movl    %eax,%r10d
        pxor    %xmm0,%xmm10
        pxor    %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ L$oop_dec1_14:
        movups  %xmm2,(%rsi)
 
 L$xts_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       movaps  %xmm0,0(%rsp)
+       pxor    %xmm8,%xmm8
+       movaps  %xmm0,16(%rsp)
+       pxor    %xmm9,%xmm9
+       movaps  %xmm0,32(%rsp)
+       pxor    %xmm10,%xmm10
+       movaps  %xmm0,48(%rsp)
+       pxor    %xmm11,%xmm11
+       movaps  %xmm0,64(%rsp)
+       pxor    %xmm12,%xmm12
+       movaps  %xmm0,80(%rsp)
+       pxor    %xmm13,%xmm13
+       movaps  %xmm0,96(%rsp)
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
        leaq    (%rbp),%rsp
        popq    %rbp
 L$xts_dec_epilogue:
@@ -2446,7 +2574,11 @@ L$oop_enc1_15:
        jnc     L$cbc_enc_loop
        addq    $16,%rdx
        jnz     L$cbc_enc_tail
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%r8)
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
        jmp     L$cbc_ret
 
 L$cbc_enc_tail:
@@ -2466,6 +2598,35 @@ L$cbc_enc_tail:
 
 .p2align       4
 L$cbc_decrypt:
+       cmpq    $16,%rdx
+       jne     L$cbc_decrypt_bulk
+
+
+
+       movdqu  (%rdi),%xmm2
+       movdqu  (%r8),%xmm3
+       movdqa  %xmm2,%xmm4
+       movups  (%rcx),%xmm0
+       movups  16(%rcx),%xmm1
+       leaq    32(%rcx),%rcx
+       xorps   %xmm0,%xmm2
+L$oop_dec1_16:
+.byte  102,15,56,222,209
+       decl    %r10d
+       movups  (%rcx),%xmm1
+       leaq    16(%rcx),%rcx
+       jnz     L$oop_dec1_16
+.byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       movdqu  %xmm4,(%r8)
+       xorps   %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
+       movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
+       jmp     L$cbc_ret
+.p2align       4
+L$cbc_decrypt_bulk:
        leaq    (%rsp),%rax
        pushq   %rbp
        subq    $16,%rsp
@@ -2702,7 +2863,7 @@ L$cbc_dec_done:
        movaps  %xmm9,%xmm2
        leaq    -112(%rcx),%rcx
        addq    $112,%rdx
-       jle     L$cbc_dec_tail_collected
+       jle     L$cbc_dec_clear_tail_collected
        movups  %xmm9,(%rsi)
        leaq    16(%rsi),%rsi
        cmpq    $80,%rdx
@@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        leaq    80(%rsi),%rsi
        movdqa  %xmm7,%xmm2
+       pxor    %xmm7,%xmm7
        jmp     L$cbc_dec_tail_collected
 
 .p2align       4
@@ -2743,16 +2909,23 @@ L$cbc_dec_seven:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        pxor    %xmm15,%xmm7
        movdqu  %xmm6,64(%rsi)
+       pxor    %xmm6,%xmm6
        pxor    %xmm9,%xmm8
        movdqu  %xmm7,80(%rsi)
+       pxor    %xmm7,%xmm7
        leaq    96(%rsi),%rsi
        movdqa  %xmm8,%xmm2
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
        jmp     L$cbc_dec_tail_collected
 
 .p2align       4
@@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter:
 
        movdqa  %xmm7,%xmm2
        addq    $80,%rdx
-       jle     L$cbc_dec_tail_collected
+       jle     L$cbc_dec_clear_tail_collected
        movups  %xmm7,(%rsi)
        leaq    16(%rsi),%rsi
 
@@ -2831,12 +3004,17 @@ L$cbc_dec_tail:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        pxor    %xmm14,%xmm6
        movdqu  %xmm5,48(%rsi)
+       pxor    %xmm5,%xmm5
        leaq    64(%rsi),%rsi
        movdqa  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        subq    $16,%rdx
        jmp     L$cbc_dec_tail_collected
 
@@ -2847,12 +3025,12 @@ L$cbc_dec_one:
        movups  16(%rcx),%xmm1
        leaq    32(%rcx),%rcx
        xorps   %xmm0,%xmm2
-L$oop_dec1_16:
+L$oop_dec1_17:
 .byte  102,15,56,222,209
        decl    %eax
        movups  (%rcx),%xmm1
        leaq    16(%rcx),%rcx
-       jnz     L$oop_dec1_16
+       jnz     L$oop_dec1_17
 .byte  102,15,56,223,209
        xorps   %xmm10,%xmm2
        movaps  %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ L$cbc_dec_two:
        pxor    %xmm11,%xmm3
        movdqu  %xmm2,(%rsi)
        movdqa  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leaq    16(%rsi),%rsi
        jmp     L$cbc_dec_tail_collected
 .p2align       4
@@ -2878,7 +3057,9 @@ L$cbc_dec_three:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        movdqa  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        leaq    32(%rsi),%rsi
        jmp     L$cbc_dec_tail_collected
 .p2align       4
@@ -2891,29 +3072,45 @@ L$cbc_dec_four:
        movdqu  %xmm2,(%rsi)
        pxor    %xmm12,%xmm4
        movdqu  %xmm3,16(%rsi)
+       pxor    %xmm3,%xmm3
        pxor    %xmm13,%xmm5
        movdqu  %xmm4,32(%rsi)
+       pxor    %xmm4,%xmm4
        movdqa  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        leaq    48(%rsi),%rsi
        jmp     L$cbc_dec_tail_collected
 
 .p2align       4
+L$cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
 L$cbc_dec_tail_collected:
        movups  %xmm10,(%r8)
        andq    $15,%rdx
        jnz     L$cbc_dec_tail_partial
        movups  %xmm2,(%rsi)
+       pxor    %xmm2,%xmm2
        jmp     L$cbc_dec_ret
 .p2align       4
 L$cbc_dec_tail_partial:
        movaps  %xmm2,(%rsp)
+       pxor    %xmm2,%xmm2
        movq    $16,%rcx
        movq    %rsi,%rdi
        subq    %rdx,%rcx
        leaq    (%rsp),%rsi
 .long  0x9066A4F3
+       movdqa  %xmm2,(%rsp)
 
 L$cbc_dec_ret:
+       xorps   %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        leaq    (%rbp),%rsp
        popq    %rbp
 L$cbc_ret:
@@ -2951,7 +3148,9 @@ L$dec_key_inverse:
 
        movups  (%rdx),%xmm0
 .byte  102,15,56,219,192
+       pxor    %xmm1,%xmm1
        movups  %xmm0,(%rdi)
+       pxor    %xmm0,%xmm0
 L$dec_key_ret:
        addq    $8,%rsp
        .byte   0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
        testq   %rdx,%rdx
        jz      L$enc_key_ret
 
+       movl    $268437504,%r10d
        movups  (%rdi),%xmm0
        xorps   %xmm4,%xmm4
+       andl    _OPENSSL_ia32cap_P+4(%rip),%r10d
        leaq    16(%rdx),%rax
        cmpl    $256,%esi
        je      L$14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
 
 L$10rounds:
        movl    $9,%esi
+       cmpl    $268435456,%r10d
+       je      L$10rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,200,1
        call    L$key_expansion_128_cold
@@ -3008,9 +3212,79 @@ L$10rounds:
        jmp     L$enc_key_ret
 
 .p2align       4
+L$10rounds_alt:
+       movdqa  L$key_rotate(%rip),%xmm5
+       movl    $8,%r10d
+       movdqa  L$key_rcon1(%rip),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,(%rdx)
+       jmp     L$oop_key128
+
+.p2align       4
+L$oop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leaq    16(%rax),%rax
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%rax)
+       movdqa  %xmm0,%xmm2
+
+       decl    %r10d
+       jnz     L$oop_key128
+
+       movdqa  L$key_rcon1b(%rip),%xmm4
+
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%rax)
+
+       movl    %esi,96(%rax)
+       xorl    %eax,%eax
+       jmp     L$enc_key_ret
+
+.p2align       4
 L$12rounds:
        movq    16(%rdi),%xmm2
        movl    $11,%esi
+       cmpl    $268435456,%r10d
+       je      L$12rounds_alt
+
        movups  %xmm0,(%rdx)
 .byte  102,15,58,223,202,1
        call    L$key_expansion_192a_cold
@@ -3034,10 +3308,54 @@ L$12rounds:
        jmp     L$enc_key_ret
 
 .p2align       4
+L$12rounds_alt:
+       movdqa  L$key_rotate192(%rip),%xmm5
+       movdqa  L$key_rcon1(%rip),%xmm4
+       movl    $8,%r10d
+       movdqu  %xmm0,(%rdx)
+       jmp     L$oop_key192
+
+.p2align       4
+L$oop_key192:
+       movq    %xmm2,0(%rax)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leaq    24(%rax),%rax
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%rax)
+
+       decl    %r10d
+       jnz     L$oop_key192
+
+       movl    %esi,32(%rax)
+       xorl    %eax,%eax
+       jmp     L$enc_key_ret
+
+.p2align       4
 L$14rounds:
        movups  16(%rdi),%xmm2
        movl    $13,%esi
        leaq    16(%rax),%rax
+       cmpl    $268435456,%r10d
+       je      L$14rounds_alt
+
        movups  %xmm0,(%rdx)
        movups  %xmm2,16(%rdx)
 .byte  102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ L$14rounds:
        jmp     L$enc_key_ret
 
 .p2align       4
+L$14rounds_alt:
+       movdqa  L$key_rotate(%rip),%xmm5
+       movdqa  L$key_rcon1(%rip),%xmm4
+       movl    $7,%r10d
+       movdqu  %xmm0,0(%rdx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,16(%rdx)
+       jmp     L$oop_key256
+
+.p2align       4
+L$oop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%rax)
+
+       decl    %r10d
+       jz      L$done_key256
+
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%rax)
+       leaq    32(%rax),%rax
+       movdqa  %xmm2,%xmm1
+
+       jmp     L$oop_key256
+
+L$done_key256:
+       movl    %esi,16(%rax)
+       xorl    %eax,%eax
+       jmp     L$enc_key_ret
+
+.p2align       4
 L$bad_keybits:
        movq    $-2,%rax
 L$enc_key_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
        addq    $8,%rsp
        .byte   0xf3,0xc3
 L$SEH_end_set_encrypt_key:
@@ -3160,6 +3538,14 @@ L$xts_magic:
 .long  0x87,0,1,0
 L$increment1:
 .byte  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long  0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long  0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long  1,1,1,1
+L$key_rcon1b:
+.long  0x1b,0x1b,0x1b,0x1b
 
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align       6
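
As in the ELF variant, most additions in this Mac OS X file clear xmm registers with pxor/xorps and overwrite the stack scratch area with movaps stores before returning, so round keys and data blocks do not survive the call; the new cmpq $1/L$ctr32_bulk and cmpq $16/L$cbc_decrypt_bulk tests likewise give single-block CTR and CBC-decrypt requests a short path that skips the bulk stack frame. A C-level sketch of the scrubbing idea (the helper name is illustrative; OpenSSL's own equivalent is OPENSSL_cleanse):

    #include <stddef.h>

    /* Zero sensitive scratch through a volatile pointer so the stores
     * cannot be optimized away -- the C analogue of the pxor/movaps
     * clears added throughout this file. */
    static void cleanse(void *p, size_t n)
    {
        volatile unsigned char *v = (volatile unsigned char *)p;
        while (n--)
            *v++ = 0;
    }
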
index ba4d621..049bf06 100644 (file)
@@ -1755,11 +1755,16 @@ L$from_epilogue:
 
 .p2align       4
 _bn_get_bits5:
-       movq    %rdi,%r10
+       leaq    0(%rdi),%r10
+       leaq    1(%rdi),%r11
        movl    %esi,%ecx
-       shrl    $3,%esi
-       movzwl  (%r10,%rsi,1),%eax
-       andl    $7,%ecx
+       shrl    $4,%esi
+       andl    $15,%ecx
+       leal    -8(%rcx),%eax
+       cmpl    $11,%ecx
+       cmovaq  %r11,%r10
+       cmoval  %eax,%ecx
+       movzwl  (%r10,%rsi,2),%eax
        shrl    %cl,%eax
        andl    $31,%eax
        .byte   0xf3,0xc3
index 9473352..34b554f 100644 (file)
@@ -60,77 +60,6 @@ DB   54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98
 DB     121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108
 DB     46,111,114,103,62,0
 ALIGN  64
-       mov     rsi,rax
-       mov     rax,QWORD PTR[((64+56))+rax]
-       lea     rax,QWORD PTR[48+rax]
-
-       mov     rbx,QWORD PTR[((-8))+rax]
-       mov     rbp,QWORD PTR[((-16))+rax]
-       mov     r12,QWORD PTR[((-24))+rax]
-       mov     r13,QWORD PTR[((-32))+rax]
-       mov     r14,QWORD PTR[((-40))+rax]
-       mov     r15,QWORD PTR[((-48))+rax]
-       mov     QWORD PTR[144+r8],rbx
-       mov     QWORD PTR[160+r8],rbp
-       mov     QWORD PTR[216+r8],r12
-       mov     QWORD PTR[224+r8],r13
-       mov     QWORD PTR[232+r8],r14
-       mov     QWORD PTR[240+r8],r15
-
-       lea     rsi,QWORD PTR[((64+64))+rsi]
-       lea     rdi,QWORD PTR[512+r8]
-       mov     ecx,20
-       DD      0a548f3fch
-
-$L$in_prologue::
-       mov     rdi,QWORD PTR[8+rax]
-       mov     rsi,QWORD PTR[16+rax]
-       mov     QWORD PTR[152+r8],rax
-       mov     QWORD PTR[168+r8],rsi
-       mov     QWORD PTR[176+r8],rdi
-
-       mov     rdi,QWORD PTR[40+r9]
-       mov     rsi,r8
-       mov     ecx,154
-       DD      0a548f3fch
-
-       mov     rsi,r9
-       xor     rcx,rcx
-       mov     rdx,QWORD PTR[8+rsi]
-       mov     r8,QWORD PTR[rsi]
-       mov     r9,QWORD PTR[16+rsi]
-       mov     r10,QWORD PTR[40+rsi]
-       lea     r11,QWORD PTR[56+rsi]
-       lea     r12,QWORD PTR[24+rsi]
-       mov     QWORD PTR[32+rsp],r10
-       mov     QWORD PTR[40+rsp],r11
-       mov     QWORD PTR[48+rsp],r12
-       mov     QWORD PTR[56+rsp],rcx
-       call    QWORD PTR[__imp_RtlVirtualUnwind]
-
-       mov     eax,1
-       add     rsp,64
-       popfq
-       pop     r15
-       pop     r14
-       pop     r13
-       pop     r12
-       pop     rbp
-       pop     rbx
-       pop     rdi
-       pop     rsi
-       DB      0F3h,0C3h               ;repret
-
 
 .text$ ENDS
-.pdata SEGMENT READONLY ALIGN(4)
-       DD      imagerel $L$SEH_begin_aesni_cbc_sha256_enc_xop
-       DD      imagerel $L$SEH_end_aesni_cbc_sha256_enc_xop
-       DD      imagerel $L$SEH_info_aesni_cbc_sha256_enc_xop
-
-       DD      imagerel $L$SEH_begin_aesni_cbc_sha256_enc_avx
-       DD      imagerel $L$SEH_end_aesni_cbc_sha256_enc_avx
-       DD      imagerel $L$SEH_info_aesni_cbc_sha256_enc_avx
-
-.pdata ENDS
 END
index 53d8afc..5e84812 100644 (file)
@@ -18,7 +18,10 @@ DB   102,15,56,220,209
        lea     r8,QWORD PTR[16+r8]
        jnz     $L$oop_enc1_1
 DB     102,15,56,221,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[rdx],xmm2
+       pxor    xmm2,xmm2
        DB      0F3h,0C3h               ;repret
 aesni_encrypt  ENDP
 
@@ -39,7 +42,10 @@ DB   102,15,56,222,209
        lea     r8,QWORD PTR[16+r8]
        jnz     $L$oop_dec1_2
 DB     102,15,56,223,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[rdx],xmm2
+       pxor    xmm2,xmm2
        DB      0F3h,0C3h               ;repret
 aesni_decrypt  ENDP
 
@@ -265,21 +271,18 @@ DB        102,15,56,220,217
        pxor    xmm6,xmm0
 DB     102,15,56,220,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
        add     rax,16
-DB     102,15,56,220,233
-DB     102,15,56,220,241
-DB     102,15,56,220,249
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
        jmp     $L$enc_loop6_enter
 ALIGN  16
 $L$enc_loop6::
 DB     102,15,56,220,209
 DB     102,15,56,220,217
 DB     102,15,56,220,225
+$L$enc_loop6_enter::
 DB     102,15,56,220,233
 DB     102,15,56,220,241
 DB     102,15,56,220,249
-$L$enc_loop6_enter::
        movups  xmm1,XMMWORD PTR[rax*1+rcx]
        add     rax,32
 DB     102,15,56,220,208
@@ -322,21 +325,18 @@ DB        102,15,56,222,217
        pxor    xmm6,xmm0
 DB     102,15,56,222,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
        add     rax,16
-DB     102,15,56,222,233
-DB     102,15,56,222,241
-DB     102,15,56,222,249
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
        jmp     $L$dec_loop6_enter
 ALIGN  16
 $L$dec_loop6::
 DB     102,15,56,222,209
 DB     102,15,56,222,217
 DB     102,15,56,222,225
+$L$dec_loop6_enter::
 DB     102,15,56,222,233
 DB     102,15,56,222,241
 DB     102,15,56,222,249
-$L$dec_loop6_enter::
        movups  xmm1,XMMWORD PTR[rax*1+rcx]
        add     rax,32
 DB     102,15,56,222,208
@@ -376,23 +376,18 @@ _aesni_encrypt8   PROC PRIVATE
        lea     rcx,QWORD PTR[32+rax*1+rcx]
        neg     rax
 DB     102,15,56,220,209
-       add     rax,16
        pxor    xmm7,xmm0
-DB     102,15,56,220,217
        pxor    xmm8,xmm0
+DB     102,15,56,220,217
        pxor    xmm9,xmm0
-DB     102,15,56,220,225
-DB     102,15,56,220,233
-DB     102,15,56,220,241
-DB     102,15,56,220,249
-DB     102,68,15,56,220,193
-DB     102,68,15,56,220,201
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
-       jmp     $L$enc_loop8_enter
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
+       add     rax,16
+       jmp     $L$enc_loop8_inner
 ALIGN  16
 $L$enc_loop8::
 DB     102,15,56,220,209
 DB     102,15,56,220,217
+$L$enc_loop8_inner::
 DB     102,15,56,220,225
 DB     102,15,56,220,233
 DB     102,15,56,220,241
@@ -445,23 +440,18 @@ _aesni_decrypt8   PROC PRIVATE
        lea     rcx,QWORD PTR[32+rax*1+rcx]
        neg     rax
 DB     102,15,56,222,209
-       add     rax,16
        pxor    xmm7,xmm0
-DB     102,15,56,222,217
        pxor    xmm8,xmm0
+DB     102,15,56,222,217
        pxor    xmm9,xmm0
-DB     102,15,56,222,225
-DB     102,15,56,222,233
-DB     102,15,56,222,241
-DB     102,15,56,222,249
-DB     102,68,15,56,222,193
-DB     102,68,15,56,222,201
-       movups  xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
-       jmp     $L$dec_loop8_enter
+       movups  xmm0,XMMWORD PTR[rax*1+rcx]
+       add     rax,16
+       jmp     $L$dec_loop8_inner
 ALIGN  16
 $L$dec_loop8::
 DB     102,15,56,222,209
 DB     102,15,56,222,217
+$L$dec_loop8_inner::
 DB     102,15,56,222,225
 DB     102,15,56,222,233
 DB     102,15,56,222,241
@@ -605,6 +595,7 @@ $L$ecb_enc_tail::
        movups  xmm7,XMMWORD PTR[80+rdi]
        je      $L$ecb_enc_six
        movdqu  xmm8,XMMWORD PTR[96+rdi]
+       xorps   xmm9,xmm9
        call    _aesni_encrypt8
        movups  XMMWORD PTR[rsi],xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
@@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter::
        jnc     $L$ecb_dec_loop8
 
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        mov     rcx,r11
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        mov     eax,r10d
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        movups  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
        movups  XMMWORD PTR[96+rsi],xmm8
+       pxor    xmm8,xmm8
        movups  XMMWORD PTR[112+rsi],xmm9
+       pxor    xmm9,xmm9
        lea     rsi,QWORD PTR[128+rsi]
        add     rdx,080h
        jz      $L$ecb_ret
@@ -749,14 +748,23 @@ $L$ecb_dec_tail::
        je      $L$ecb_dec_six
        movups  xmm8,XMMWORD PTR[96+rdi]
        movups  xmm0,XMMWORD PTR[rcx]
+       xorps   xmm9,xmm9
        call    _aesni_decrypt8
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        movups  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
        movups  XMMWORD PTR[96+rsi],xmm8
+       pxor    xmm8,xmm8
+       pxor    xmm9,xmm9
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_one::
@@ -772,53 +780,81 @@ DB        102,15,56,222,209
        jnz     $L$oop_dec1_4
 DB     102,15,56,223,209
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_two::
        call    _aesni_decrypt2
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_three::
        call    _aesni_decrypt3
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_four::
        call    _aesni_decrypt4
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_five::
        xorps   xmm7,xmm7
        call    _aesni_decrypt6
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        jmp     $L$ecb_ret
 ALIGN  16
 $L$ecb_dec_six::
        call    _aesni_decrypt6
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        movups  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        movups  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
 
 $L$ecb_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
        movaps  xmm6,XMMWORD PTR[rsp]
+       movaps  XMMWORD PTR[rsp],xmm0
        movaps  xmm7,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        lea     rsp,QWORD PTR[88+rsp]
 $L$ecb_enc_ret::
        mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
@@ -898,11 +934,21 @@ DB        102,15,56,0,215
        lea     rsi,QWORD PTR[16+rsi]
        jnz     $L$ccm64_enc_outer
 
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[r9],xmm3
+       pxor    xmm3,xmm3
+       pxor    xmm8,xmm8
+       pxor    xmm6,xmm6
        movaps  xmm6,XMMWORD PTR[rsp]
+       movaps  XMMWORD PTR[rsp],xmm0
        movaps  xmm7,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        lea     rsp,QWORD PTR[88+rsp]
 $L$ccm64_enc_ret::
        mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
@@ -1016,11 +1062,21 @@ DB      102,15,56,220,217
        lea     r11,QWORD PTR[16+r11]
        jnz     $L$oop_enc1_6
 DB     102,15,56,221,217
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
        movups  XMMWORD PTR[r9],xmm3
+       pxor    xmm3,xmm3
+       pxor    xmm8,xmm8
+       pxor    xmm6,xmm6
        movaps  xmm6,XMMWORD PTR[rsp]
+       movaps  XMMWORD PTR[rsp],xmm0
        movaps  xmm7,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        lea     rsp,QWORD PTR[88+rsp]
 $L$ccm64_dec_ret::
        mov     rdi,QWORD PTR[8+rsp]    ;WIN64 epilogue
@@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
        mov     r8,QWORD PTR[40+rsp]
 
 
+       cmp     rdx,1
+       jne     $L$ctr32_bulk
+
+
+
+       movups  xmm2,XMMWORD PTR[r8]
+       movups  xmm3,XMMWORD PTR[rdi]
+       mov     edx,DWORD PTR[240+rcx]
+       movups  xmm0,XMMWORD PTR[rcx]
+       movups  xmm1,XMMWORD PTR[16+rcx]
+       lea     rcx,QWORD PTR[32+rcx]
+       xorps   xmm2,xmm0
+$L$oop_enc1_7::
+DB     102,15,56,220,209
+       dec     edx
+       movups  xmm1,XMMWORD PTR[rcx]
+       lea     rcx,QWORD PTR[16+rcx]
+       jnz     $L$oop_enc1_7
+DB     102,15,56,221,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       xorps   xmm2,xmm3
+       pxor    xmm3,xmm3
+       movups  XMMWORD PTR[rsi],xmm2
+       xorps   xmm2,xmm2
+       jmp     $L$ctr32_epilogue
+
+ALIGN  16
+$L$ctr32_bulk::
        lea     rax,QWORD PTR[rsp]
        push    rbp
        sub     rsp,288
@@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
 $L$ctr32_body::
        lea     rbp,QWORD PTR[((-8))+rax]
 
-       cmp     rdx,1
-       je      $L$ctr32_one_shortcut
+
+
 
        movdqu  xmm2,XMMWORD PTR[r8]
        movdqu  xmm0,XMMWORD PTR[rcx]
@@ -1452,11 +1537,14 @@ DB      102,69,15,56,221,202
        lea     rcx,QWORD PTR[((-128))+rcx]
 
 $L$ctr32_tail::
+
+
        lea     rcx,QWORD PTR[16+rcx]
        cmp     rdx,4
        jb      $L$ctr32_loop3
        je      $L$ctr32_loop4
 
+
        shl     eax,4
        movdqa  xmm8,XMMWORD PTR[96+rsp]
        pxor    xmm9,xmm9
@@ -1559,40 +1647,43 @@ DB      102,15,56,221,225
        movups  xmm12,XMMWORD PTR[32+rdi]
        xorps   xmm4,xmm12
        movups  XMMWORD PTR[32+rsi],xmm4
-       jmp     $L$ctr32_done
 
-ALIGN  16
-$L$ctr32_one_shortcut::
-       movups  xmm2,XMMWORD PTR[r8]
-       movups  xmm10,XMMWORD PTR[rdi]
-       mov     eax,DWORD PTR[240+rcx]
-       movups  xmm0,XMMWORD PTR[rcx]
-       movups  xmm1,XMMWORD PTR[16+rcx]
-       lea     rcx,QWORD PTR[32+rcx]
-       xorps   xmm2,xmm0
-$L$oop_enc1_7::
-DB     102,15,56,220,209
-       dec     eax
-       movups  xmm1,XMMWORD PTR[rcx]
-       lea     rcx,QWORD PTR[16+rcx]
-       jnz     $L$oop_enc1_7
-DB     102,15,56,221,209
-       xorps   xmm2,xmm10
-       movups  XMMWORD PTR[rsi],xmm2
-       jmp     $L$ctr32_done
-
-ALIGN  16
 $L$ctr32_done::
+       xorps   xmm0,xmm0
+       xor     r11d,r11d
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        movaps  xmm6,XMMWORD PTR[((-160))+rbp]
+       movaps  XMMWORD PTR[(-160)+rbp],xmm0
        movaps  xmm7,XMMWORD PTR[((-144))+rbp]
+       movaps  XMMWORD PTR[(-144)+rbp],xmm0
        movaps  xmm8,XMMWORD PTR[((-128))+rbp]
+       movaps  XMMWORD PTR[(-128)+rbp],xmm0
        movaps  xmm9,XMMWORD PTR[((-112))+rbp]
+       movaps  XMMWORD PTR[(-112)+rbp],xmm0
        movaps  xmm10,XMMWORD PTR[((-96))+rbp]
+       movaps  XMMWORD PTR[(-96)+rbp],xmm0
        movaps  xmm11,XMMWORD PTR[((-80))+rbp]
+       movaps  XMMWORD PTR[(-80)+rbp],xmm0
        movaps  xmm12,XMMWORD PTR[((-64))+rbp]
+       movaps  XMMWORD PTR[(-64)+rbp],xmm0
        movaps  xmm13,XMMWORD PTR[((-48))+rbp]
+       movaps  XMMWORD PTR[(-48)+rbp],xmm0
        movaps  xmm14,XMMWORD PTR[((-32))+rbp]
+       movaps  XMMWORD PTR[(-32)+rbp],xmm0
        movaps  xmm15,XMMWORD PTR[((-16))+rbp]
+       movaps  XMMWORD PTR[(-16)+rbp],xmm0
+       movaps  XMMWORD PTR[rsp],xmm0
+       movaps  XMMWORD PTR[16+rsp],xmm0
+       movaps  XMMWORD PTR[32+rsp],xmm0
+       movaps  XMMWORD PTR[48+rsp],xmm0
+       movaps  XMMWORD PTR[64+rsp],xmm0
+       movaps  XMMWORD PTR[80+rsp],xmm0
+       movaps  XMMWORD PTR[96+rsp],xmm0
+       movaps  XMMWORD PTR[112+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$ctr32_epilogue::
@@ -1889,6 +1980,7 @@ DB        102,15,56,221,124,36,80
        shr     eax,4
 
 $L$xts_enc_short::
+
        mov     r10d,eax
        pxor    xmm10,xmm0
        add     rdx,16*6
@@ -1917,6 +2009,7 @@ $L$xts_enc_short::
        pxor    xmm4,xmm12
        pxor    xmm5,xmm13
        pxor    xmm6,xmm14
+       pxor    xmm7,xmm7
 
        call    _aesni_encrypt6
 
@@ -2059,16 +2152,39 @@ DB      102,15,56,221,209
        movups  XMMWORD PTR[(-16)+rsi],xmm2
 
 $L$xts_enc_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        movaps  xmm6,XMMWORD PTR[((-160))+rbp]
+       movaps  XMMWORD PTR[(-160)+rbp],xmm0
        movaps  xmm7,XMMWORD PTR[((-144))+rbp]
+       movaps  XMMWORD PTR[(-144)+rbp],xmm0
        movaps  xmm8,XMMWORD PTR[((-128))+rbp]
+       movaps  XMMWORD PTR[(-128)+rbp],xmm0
        movaps  xmm9,XMMWORD PTR[((-112))+rbp]
+       movaps  XMMWORD PTR[(-112)+rbp],xmm0
        movaps  xmm10,XMMWORD PTR[((-96))+rbp]
+       movaps  XMMWORD PTR[(-96)+rbp],xmm0
        movaps  xmm11,XMMWORD PTR[((-80))+rbp]
+       movaps  XMMWORD PTR[(-80)+rbp],xmm0
        movaps  xmm12,XMMWORD PTR[((-64))+rbp]
+       movaps  XMMWORD PTR[(-64)+rbp],xmm0
        movaps  xmm13,XMMWORD PTR[((-48))+rbp]
+       movaps  XMMWORD PTR[(-48)+rbp],xmm0
        movaps  xmm14,XMMWORD PTR[((-32))+rbp]
+       movaps  XMMWORD PTR[(-32)+rbp],xmm0
        movaps  xmm15,XMMWORD PTR[((-16))+rbp]
+       movaps  XMMWORD PTR[(-16)+rbp],xmm0
+       movaps  XMMWORD PTR[rsp],xmm0
+       movaps  XMMWORD PTR[16+rsp],xmm0
+       movaps  XMMWORD PTR[32+rsp],xmm0
+       movaps  XMMWORD PTR[48+rsp],xmm0
+       movaps  XMMWORD PTR[64+rsp],xmm0
+       movaps  XMMWORD PTR[80+rsp],xmm0
+       movaps  XMMWORD PTR[96+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$xts_enc_epilogue::
@@ -2371,6 +2487,7 @@ DB        102,15,56,223,124,36,80
        shr     eax,4
 
 $L$xts_dec_short::
+
        mov     r10d,eax
        pxor    xmm10,xmm0
        pxor    xmm11,xmm0
@@ -2573,16 +2690,39 @@ DB      102,15,56,223,209
        movups  XMMWORD PTR[rsi],xmm2
 
 $L$xts_dec_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        movaps  xmm6,XMMWORD PTR[((-160))+rbp]
+       movaps  XMMWORD PTR[(-160)+rbp],xmm0
        movaps  xmm7,XMMWORD PTR[((-144))+rbp]
+       movaps  XMMWORD PTR[(-144)+rbp],xmm0
        movaps  xmm8,XMMWORD PTR[((-128))+rbp]
+       movaps  XMMWORD PTR[(-128)+rbp],xmm0
        movaps  xmm9,XMMWORD PTR[((-112))+rbp]
+       movaps  XMMWORD PTR[(-112)+rbp],xmm0
        movaps  xmm10,XMMWORD PTR[((-96))+rbp]
+       movaps  XMMWORD PTR[(-96)+rbp],xmm0
        movaps  xmm11,XMMWORD PTR[((-80))+rbp]
+       movaps  XMMWORD PTR[(-80)+rbp],xmm0
        movaps  xmm12,XMMWORD PTR[((-64))+rbp]
+       movaps  XMMWORD PTR[(-64)+rbp],xmm0
        movaps  xmm13,XMMWORD PTR[((-48))+rbp]
+       movaps  XMMWORD PTR[(-48)+rbp],xmm0
        movaps  xmm14,XMMWORD PTR[((-32))+rbp]
+       movaps  XMMWORD PTR[(-32)+rbp],xmm0
        movaps  xmm15,XMMWORD PTR[((-16))+rbp]
+       movaps  XMMWORD PTR[(-16)+rbp],xmm0
+       movaps  XMMWORD PTR[rsp],xmm0
+       movaps  XMMWORD PTR[16+rsp],xmm0
+       movaps  XMMWORD PTR[32+rsp],xmm0
+       movaps  XMMWORD PTR[48+rsp],xmm0
+       movaps  XMMWORD PTR[64+rsp],xmm0
+       movaps  XMMWORD PTR[80+rsp],xmm0
+       movaps  XMMWORD PTR[96+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$xts_dec_epilogue::
@@ -2646,7 +2786,11 @@ DB       102,15,56,221,209
        jnc     $L$cbc_enc_loop
        add     rdx,16
        jnz     $L$cbc_enc_tail
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[r8],xmm2
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
        jmp     $L$cbc_ret
 
 $L$cbc_enc_tail::
@@ -2666,6 +2810,35 @@ $L$cbc_enc_tail::
 
 ALIGN  16
 $L$cbc_decrypt::
+       cmp     rdx,16
+       jne     $L$cbc_decrypt_bulk
+
+
+
+       movdqu  xmm2,XMMWORD PTR[rdi]
+       movdqu  xmm3,XMMWORD PTR[r8]
+       movdqa  xmm4,xmm2
+       movups  xmm0,XMMWORD PTR[rcx]
+       movups  xmm1,XMMWORD PTR[16+rcx]
+       lea     rcx,QWORD PTR[32+rcx]
+       xorps   xmm2,xmm0
+$L$oop_dec1_16::
+DB     102,15,56,222,209
+       dec     r10d
+       movups  xmm1,XMMWORD PTR[rcx]
+       lea     rcx,QWORD PTR[16+rcx]
+       jnz     $L$oop_dec1_16
+DB     102,15,56,223,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       movdqu  XMMWORD PTR[r8],xmm4
+       xorps   xmm2,xmm3
+       pxor    xmm3,xmm3
+       movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
+       jmp     $L$cbc_ret
+ALIGN  16
+$L$cbc_decrypt_bulk::
        lea     rax,QWORD PTR[rsp]
        push    rbp
        sub     rsp,176
@@ -2913,7 +3086,7 @@ DB        102,69,15,56,223,202
        movaps  xmm2,xmm9
        lea     rcx,QWORD PTR[((-112))+rcx]
        add     rdx,070h
-       jle     $L$cbc_dec_tail_collected
+       jle     $L$cbc_dec_clear_tail_collected
        movups  XMMWORD PTR[rsi],xmm9
        lea     rsi,QWORD PTR[16+rsi]
        cmp     rdx,050h
@@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        pxor    xmm6,xmm14
        movdqu  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        pxor    xmm7,xmm15
        movdqu  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        lea     rsi,QWORD PTR[80+rsi]
        movdqa  xmm2,xmm7
+       pxor    xmm7,xmm7
        jmp     $L$cbc_dec_tail_collected
 
 ALIGN  16
@@ -2954,16 +3132,23 @@ $L$cbc_dec_seven::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        pxor    xmm6,xmm14
        movdqu  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        pxor    xmm7,xmm15
        movdqu  XMMWORD PTR[64+rsi],xmm6
+       pxor    xmm6,xmm6
        pxor    xmm8,xmm9
        movdqu  XMMWORD PTR[80+rsi],xmm7
+       pxor    xmm7,xmm7
        lea     rsi,QWORD PTR[96+rsi]
        movdqa  xmm2,xmm8
+       pxor    xmm8,xmm8
+       pxor    xmm9,xmm9
        jmp     $L$cbc_dec_tail_collected
 
 ALIGN  16
@@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter::
 
        movdqa  xmm2,xmm7
        add     rdx,050h
-       jle     $L$cbc_dec_tail_collected
+       jle     $L$cbc_dec_clear_tail_collected
        movups  XMMWORD PTR[rsi],xmm7
        lea     rsi,QWORD PTR[16+rsi]
 
@@ -3042,12 +3227,17 @@ $L$cbc_dec_tail::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        pxor    xmm6,xmm14
        movdqu  XMMWORD PTR[48+rsi],xmm5
+       pxor    xmm5,xmm5
        lea     rsi,QWORD PTR[64+rsi]
        movdqa  xmm2,xmm6
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        sub     rdx,010h
        jmp     $L$cbc_dec_tail_collected
 
@@ -3058,12 +3248,12 @@ $L$cbc_dec_one::
        movups  xmm1,XMMWORD PTR[16+rcx]
        lea     rcx,QWORD PTR[32+rcx]
        xorps   xmm2,xmm0
-$L$oop_dec1_16::
+$L$oop_dec1_17::
 DB     102,15,56,222,209
        dec     eax
        movups  xmm1,XMMWORD PTR[rcx]
        lea     rcx,QWORD PTR[16+rcx]
-       jnz     $L$oop_dec1_16
+       jnz     $L$oop_dec1_17
 DB     102,15,56,223,209
        xorps   xmm2,xmm10
        movaps  xmm10,xmm11
@@ -3077,6 +3267,7 @@ $L$cbc_dec_two::
        pxor    xmm3,xmm11
        movdqu  XMMWORD PTR[rsi],xmm2
        movdqa  xmm2,xmm3
+       pxor    xmm3,xmm3
        lea     rsi,QWORD PTR[16+rsi]
        jmp     $L$cbc_dec_tail_collected
 ALIGN  16
@@ -3089,7 +3280,9 @@ $L$cbc_dec_three::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        movdqa  xmm2,xmm4
+       pxor    xmm4,xmm4
        lea     rsi,QWORD PTR[32+rsi]
        jmp     $L$cbc_dec_tail_collected
 ALIGN  16
@@ -3102,39 +3295,61 @@ $L$cbc_dec_four::
        movdqu  XMMWORD PTR[rsi],xmm2
        pxor    xmm4,xmm12
        movdqu  XMMWORD PTR[16+rsi],xmm3
+       pxor    xmm3,xmm3
        pxor    xmm5,xmm13
        movdqu  XMMWORD PTR[32+rsi],xmm4
+       pxor    xmm4,xmm4
        movdqa  xmm2,xmm5
+       pxor    xmm5,xmm5
        lea     rsi,QWORD PTR[48+rsi]
        jmp     $L$cbc_dec_tail_collected
 
 ALIGN  16
+$L$cbc_dec_clear_tail_collected::
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
 $L$cbc_dec_tail_collected::
        movups  XMMWORD PTR[r8],xmm10
        and     rdx,15
        jnz     $L$cbc_dec_tail_partial
        movups  XMMWORD PTR[rsi],xmm2
+       pxor    xmm2,xmm2
        jmp     $L$cbc_dec_ret
 ALIGN  16
 $L$cbc_dec_tail_partial::
        movaps  XMMWORD PTR[rsp],xmm2
+       pxor    xmm2,xmm2
        mov     rcx,16
        mov     rdi,rsi
        sub     rcx,rdx
        lea     rsi,QWORD PTR[rsp]
        DD      09066A4F3h
+       movdqa  XMMWORD PTR[rsp],xmm2
 
 $L$cbc_dec_ret::
+       xorps   xmm0,xmm0
+       pxor    xmm1,xmm1
        movaps  xmm6,XMMWORD PTR[16+rsp]
+       movaps  XMMWORD PTR[16+rsp],xmm0
        movaps  xmm7,XMMWORD PTR[32+rsp]
+       movaps  XMMWORD PTR[32+rsp],xmm0
        movaps  xmm8,XMMWORD PTR[48+rsp]
+       movaps  XMMWORD PTR[48+rsp],xmm0
        movaps  xmm9,XMMWORD PTR[64+rsp]
+       movaps  XMMWORD PTR[64+rsp],xmm0
        movaps  xmm10,XMMWORD PTR[80+rsp]
+       movaps  XMMWORD PTR[80+rsp],xmm0
        movaps  xmm11,XMMWORD PTR[96+rsp]
+       movaps  XMMWORD PTR[96+rsp],xmm0
        movaps  xmm12,XMMWORD PTR[112+rsp]
+       movaps  XMMWORD PTR[112+rsp],xmm0
        movaps  xmm13,XMMWORD PTR[128+rsp]
+       movaps  XMMWORD PTR[128+rsp],xmm0
        movaps  xmm14,XMMWORD PTR[144+rsp]
+       movaps  XMMWORD PTR[144+rsp],xmm0
        movaps  xmm15,XMMWORD PTR[160+rsp]
+       movaps  XMMWORD PTR[160+rsp],xmm0
        lea     rsp,QWORD PTR[rbp]
        pop     rbp
 $L$cbc_ret::
@@ -3175,7 +3390,9 @@ DB        102,15,56,219,201
 
        movups  xmm0,XMMWORD PTR[r8]
 DB     102,15,56,219,192
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR[rcx],xmm0
+       pxor    xmm0,xmm0
 $L$dec_key_ret::
        add     rsp,8
        DB      0F3h,0C3h               ;repret
@@ -3193,8 +3410,10 @@ DB       048h,083h,0ECh,008h
        test    r8,r8
        jz      $L$enc_key_ret
 
+       mov     r10d,268437504
        movups  xmm0,XMMWORD PTR[rcx]
        xorps   xmm4,xmm4
+       and     r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))]
        lea     rax,QWORD PTR[16+r8]
        cmp     edx,256
        je      $L$14rounds
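
The two added instructions above, together with the cmp r10d,268435456 tests further down, gate the new _alt key-schedule paths on CPU capability bits: 268437504 is 0x10000800 and 268435456 is 0x10000000, so two bits are masked out of the second OPENSSL_ia32cap_P word and the alternative path runs only when exactly the 0x10000000 bit is set. A C sketch of the same predicate (function name hypothetical; array layout assumed from the .comm OPENSSL_ia32cap_P,16,4 declaration later in this diff):

    #include <stdint.h>

    extern uint32_t OPENSSL_ia32cap_P[4];   /* filled in by OpenSSL's cpuid setup */

    /* 1 if the $L$10rounds_alt/$L$12rounds_alt/$L$14rounds_alt paths are taken */
    static int use_alt_key_schedule(void)
    {
        return (OPENSSL_ia32cap_P[1] & 0x10000800u) == 0x10000000u;
    }
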
@@ -3205,6 +3424,9 @@ DB        048h,083h,0ECh,008h
 
 $L$10rounds::
        mov     edx,9
+       cmp     r10d,268435456
+       je      $L$10rounds_alt
+
        movups  XMMWORD PTR[r8],xmm0
 DB     102,15,58,223,200,1
        call    $L$key_expansion_128_cold
@@ -3232,9 +3454,79 @@ DB       102,15,58,223,200,54
        jmp     $L$enc_key_ret
 
 ALIGN  16
+$L$10rounds_alt::
+       movdqa  xmm5,XMMWORD PTR[$L$key_rotate]
+       mov     r10d,8
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1]
+       movdqa  xmm2,xmm0
+       movdqu  XMMWORD PTR[r8],xmm0
+       jmp     $L$oop_key128
+
+ALIGN  16
+$L$oop_key128::
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+       lea     rax,QWORD PTR[16+rax]
+
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[(-16)+rax],xmm0
+       movdqa  xmm2,xmm0
+
+       dec     r10d
+       jnz     $L$oop_key128
+
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1b]
+
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[rax],xmm0
+
+       movdqa  xmm2,xmm0
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[16+rax],xmm0
+
+       mov     DWORD PTR[96+rax],edx
+       xor     eax,eax
+       jmp     $L$enc_key_ret
+
+ALIGN  16
 $L$12rounds::
        movq    xmm2,QWORD PTR[16+rcx]
        mov     edx,11
+       cmp     r10d,268435456
+       je      $L$12rounds_alt
+
        movups  XMMWORD PTR[r8],xmm0
 DB     102,15,58,223,202,1
        call    $L$key_expansion_192a_cold
@@ -3258,10 +3550,54 @@ DB      102,15,58,223,202,128
        jmp     $L$enc_key_ret
 
 ALIGN  16
+$L$12rounds_alt::
+       movdqa  xmm5,XMMWORD PTR[$L$key_rotate192]
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1]
+       mov     r10d,8
+       movdqu  XMMWORD PTR[r8],xmm0
+       jmp     $L$oop_key192
+
+ALIGN  16
+$L$oop_key192::
+       movq    QWORD PTR[rax],xmm2
+       movdqa  xmm1,xmm2
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+       pslld   xmm4,1
+       lea     rax,QWORD PTR[24+rax]
+
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+
+       pshufd  xmm3,xmm0,0ffh
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+
+       pxor    xmm0,xmm2
+       pxor    xmm2,xmm3
+       movdqu  XMMWORD PTR[(-16)+rax],xmm0
+
+       dec     r10d
+       jnz     $L$oop_key192
+
+       mov     DWORD PTR[32+rax],edx
+       xor     eax,eax
+       jmp     $L$enc_key_ret
+
+ALIGN  16
 $L$14rounds::
        movups  xmm2,XMMWORD PTR[16+rcx]
        mov     edx,13
        lea     rax,QWORD PTR[16+rax]
+       cmp     r10d,268435456
+       je      $L$14rounds_alt
+
        movups  XMMWORD PTR[r8],xmm0
        movups  XMMWORD PTR[16+r8],xmm2
 DB     102,15,58,223,202,1
@@ -3296,9 +3632,69 @@ DB       102,15,58,223,202,64
        jmp     $L$enc_key_ret
 
 ALIGN  16
+$L$14rounds_alt::
+       movdqa  xmm5,XMMWORD PTR[$L$key_rotate]
+       movdqa  xmm4,XMMWORD PTR[$L$key_rcon1]
+       mov     r10d,7
+       movdqu  XMMWORD PTR[r8],xmm0
+       movdqa  xmm1,xmm2
+       movdqu  XMMWORD PTR[16+r8],xmm2
+       jmp     $L$oop_key256
+
+ALIGN  16
+$L$oop_key256::
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+       pslld   xmm4,1
+
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR[rax],xmm0
+
+       dec     r10d
+       jz      $L$done_key256
+
+       pshufd  xmm2,xmm0,0ffh
+       pxor    xmm3,xmm3
+DB     102,15,56,221,211
+
+       movdqa  xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm1,xmm3
+
+       pxor    xmm2,xmm1
+       movdqu  XMMWORD PTR[16+rax],xmm2
+       lea     rax,QWORD PTR[32+rax]
+       movdqa  xmm1,xmm2
+
+       jmp     $L$oop_key256
+
+$L$done_key256::
+       mov     DWORD PTR[16+rax],edx
+       xor     eax,eax
+       jmp     $L$enc_key_ret
+
+ALIGN  16
 $L$bad_keybits::
        mov     rax,-2
 $L$enc_key_ret::
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
        add     rsp,8
        DB      0F3h,0C3h               ;repret
 $L$SEH_end_set_encrypt_key::
@@ -3384,6 +3780,14 @@ $L$xts_magic::
        DD      087h,0,1,0
 $L$increment1::
 DB     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$key_rotate::
+       DD      00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh
+$L$key_rotate192::
+       DD      004070605h,004070605h,004070605h,004070605h
+$L$key_rcon1::
+       DD      1,1,1,1
+$L$key_rcon1b::
+       DD      01bh,01bh,01bh,01bh
 
 DB     65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 DB     83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
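
The four 16-byte constants added above ($L$key_rotate through $L$key_rcon1b) feed those _alt schedules: $L$key_rotate and $L$key_rotate192 are pshufb masks that broadcast a rotated word of the previous round key into all four lanes, while $L$key_rcon1/$L$key_rcon1b hold the round constants, doubled in place by pslld xmm4,1. With all four lanes equal, the ShiftRows step inside aesenclast is a no-op, so each $L$oop_key128 iteration computes SubWord(RotWord(w)) ^ rcon without aeskeygenassist. A hedged intrinsics rendering of one iteration (function name hypothetical; build with -maes -mssse3):

    #include <immintrin.h>

    /* One $L$oop_key128 step: key is the previous round key, rcon the
       current $L$key_rcon1 value; returns the next round key. */
    static __m128i key128_step(__m128i key, __m128i rcon)
    {
        const __m128i rot = _mm_set1_epi32(0x0c0f0e0d);      /* $L$key_rotate */
        __m128i t = _mm_aesenclast_si128(_mm_shuffle_epi8(key, rot), rcon);
        __m128i k = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        k = _mm_xor_si128(k, _mm_slli_si128(key, 8));
        k = _mm_xor_si128(k, _mm_slli_si128(key, 12));
        return _mm_xor_si128(k, t);
    }
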
@@ -3489,7 +3893,7 @@ cbc_se_handler    PROC PRIVATE
        mov     rax,QWORD PTR[152+r8]
        mov     rbx,QWORD PTR[248+r8]
 
-       lea     r10,QWORD PTR[$L$cbc_decrypt]
+       lea     r10,QWORD PTR[$L$cbc_decrypt_bulk]
        cmp     rbx,r10
        jb      $L$common_seh_tail
 
index c47130f..f690ba5 100644
@@ -1832,11 +1832,16 @@ PUBLIC  bn_get_bits5
 
 ALIGN  16
 bn_get_bits5   PROC PUBLIC
-       mov     r10,rcx
+       lea     r10,QWORD PTR[rcx]
+       lea     r11,QWORD PTR[1+rcx]
        mov     ecx,edx
-       shr     edx,3
-       movzx   eax,WORD PTR[rdx*1+r10]
-       and     ecx,7
+       shr     edx,4
+       and     ecx,15
+       lea     eax,DWORD PTR[((-8))+rcx]
+       cmp     ecx,11
+       cmova   r10,r11
+       cmova   ecx,eax
+       movzx   eax,WORD PTR[rdx*2+r10]
        shr     eax,cl
        and     eax,31
        DB      0F3h,0C3h               ;repret
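
The rewritten bn_get_bits5 above closes an out-of-range read: the old code fetched a 16-bit word at an arbitrary byte offset (bits>>3), which for windows near the top of the number could read a byte past the limb array. The new code indexes whole 16-bit words and, when the 5-bit window starts above bit 11 of its word, moves the load one byte forward (cmova r10,r11) while reducing the shift count by 8 (cmova ecx,eax), keeping the access inside the limbs that contain the window. A C rendering of the new logic (little-endian load assumed, as on x86):

    #include <stdint.h>
    #include <string.h>

    static int bn_get_bits5(const unsigned char *a, int bits)
    {
        int word = bits >> 4;          /* 16-bit word index           */
        int off  = bits & 15;          /* bit offset inside that word */
        const unsigned char *p = a + 2 * word;
        if (off > 11) {                /* window crosses the word:    */
            p++;                       /*   start one byte later      */
            off -= 8;                  /*   and shift 8 bits less     */
        }
        uint16_t w;
        memcpy(&w, p, 2);
        return (w >> off) & 31;
    }
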
index a68f7cd..3bbc4e4 100644
@@ -21,7 +21,10 @@ aesni_encrypt:
        leal    16(%edx),%edx
        jnz     .L000enc1_loop_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .size  aesni_encrypt,.-.L_aesni_encrypt_begin
 .globl aesni_decrypt
@@ -45,7 +48,10 @@ aesni_decrypt:
        leal    16(%edx),%edx
        jnz     .L001dec1_loop_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .size  aesni_decrypt,.-.L_aesni_decrypt_begin
 .type  _aesni_encrypt2,@function
@@ -259,17 +265,15 @@ _aesni_encrypt6:
        negl    %ecx
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     .L_aesni_encrypt6_enter
+       jmp     .L008_aesni_encrypt6_inner
 .align 16
-.L008enc6_loop:
+.L009enc6_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+.L008_aesni_encrypt6_inner:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
@@ -283,7 +287,7 @@ _aesni_encrypt6:
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L008enc6_loop
+       jnz     .L009enc6_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -315,17 +319,15 @@ _aesni_decrypt6:
        negl    %ecx
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     .L_aesni_decrypt6_enter
+       jmp     .L010_aesni_decrypt6_inner
 .align 16
-.L009dec6_loop:
+.L011dec6_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+.L010_aesni_decrypt6_inner:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
@@ -339,7 +341,7 @@ _aesni_decrypt6:
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L009dec6_loop
+       jnz     .L011dec6_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
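
Both _aesni_encrypt6 and _aesni_decrypt6 are restructured the same way here: the next round key is loaded (movups (%edx,%ecx,1),%xmm0) before the counter update, and the prologue now jumps to a new inner label (.L008_aesni_encrypt6_inner / .L010_aesni_decrypt6_inner) inside the loop body instead of duplicating three rounds ahead of it. The control-flow shape, as a C sketch (names hypothetical; the function bodies only stand in for the aesenc/aesdec rounds):

    #include <stdio.h>

    static void rounds_1_to_3(int i) { printf("first half, pass %d\n", i); }
    static void rounds_4_to_6(int i) { printf("second half, pass %d\n", i); }

    static void encrypt6_shape(int passes)
    {
        int i = 0;
        goto inner;                    /* prologue enters mid-loop */
        for (; i < passes; i++) {
            rounds_1_to_3(i);
    inner:                             /* .L008_aesni_encrypt6_inner */
            rounds_4_to_6(i);
        }
    }
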
@@ -369,14 +371,14 @@ aesni_ecb_encrypt:
        movl    32(%esp),%edx
        movl    36(%esp),%ebx
        andl    $-16,%eax
-       jz      .L010ecb_ret
+       jz      .L012ecb_ret
        movl    240(%edx),%ecx
        testl   %ebx,%ebx
-       jz      .L011ecb_decrypt
+       jz      .L013ecb_decrypt
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L012ecb_enc_tail
+       jb      .L014ecb_enc_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -385,9 +387,9 @@ aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L013ecb_enc_loop6_enter
+       jmp     .L015ecb_enc_loop6_enter
 .align 16
-.L014ecb_enc_loop6:
+.L016ecb_enc_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -402,12 +404,12 @@ aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L013ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
        call    _aesni_encrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L014ecb_enc_loop6
+       jnc     .L016ecb_enc_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -416,18 +418,18 @@ aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L010ecb_ret
-.L012ecb_enc_tail:
+       jz      .L012ecb_ret
+.L014ecb_enc_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L015ecb_enc_one
+       jb      .L017ecb_enc_one
        movups  16(%esi),%xmm3
-       je      .L016ecb_enc_two
+       je      .L018ecb_enc_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L017ecb_enc_three
+       jb      .L019ecb_enc_three
        movups  48(%esi),%xmm5
-       je      .L018ecb_enc_four
+       je      .L020ecb_enc_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    _aesni_encrypt6
@@ -436,49 +438,49 @@ aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L015ecb_enc_one:
+.L017ecb_enc_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L019enc1_loop_3:
+.L021enc1_loop_3:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L019enc1_loop_3
+       jnz     .L021enc1_loop_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L016ecb_enc_two:
+.L018ecb_enc_two:
        call    _aesni_encrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L017ecb_enc_three:
+.L019ecb_enc_three:
        call    _aesni_encrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L018ecb_enc_four:
+.L020ecb_enc_four:
        call    _aesni_encrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L011ecb_decrypt:
+.L013ecb_decrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      .L020ecb_dec_tail
+       jb      .L022ecb_dec_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -487,9 +489,9 @@ aesni_ecb_encrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     .L021ecb_dec_loop6_enter
+       jmp     .L023ecb_dec_loop6_enter
 .align 16
-.L022ecb_dec_loop6:
+.L024ecb_dec_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -504,12 +506,12 @@ aesni_ecb_encrypt:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-.L021ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
        call    _aesni_decrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     .L022ecb_dec_loop6
+       jnc     .L024ecb_dec_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -518,18 +520,18 @@ aesni_ecb_encrypt:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      .L010ecb_ret
-.L020ecb_dec_tail:
+       jz      .L012ecb_ret
+.L022ecb_dec_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      .L023ecb_dec_one
+       jb      .L025ecb_dec_one
        movups  16(%esi),%xmm3
-       je      .L024ecb_dec_two
+       je      .L026ecb_dec_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      .L025ecb_dec_three
+       jb      .L027ecb_dec_three
        movups  48(%esi),%xmm5
-       je      .L026ecb_dec_four
+       je      .L028ecb_dec_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    _aesni_decrypt6
@@ -538,43 +540,51 @@ aesni_ecb_encrypt:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L023ecb_dec_one:
+.L025ecb_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L027dec1_loop_4:
+.L029dec1_loop_4:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L027dec1_loop_4
+       jnz     .L029dec1_loop_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L024ecb_dec_two:
+.L026ecb_dec_two:
        call    _aesni_decrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L025ecb_dec_three:
+.L027ecb_dec_three:
        call    _aesni_decrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L010ecb_ret
+       jmp     .L012ecb_ret
 .align 16
-.L026ecb_dec_four:
+.L028ecb_dec_four:
        call    _aesni_decrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L010ecb_ret:
+.L012ecb_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -621,7 +631,7 @@ aesni_ccm64_encrypt_blocks:
        leal    32(%edx,%ecx,1),%edx
        subl    %ecx,%ebx
 .byte  102,15,56,0,253
-.L028ccm64_enc_outer:
+.L030ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  (%esi),%xmm6
@@ -630,7 +640,7 @@ aesni_ccm64_encrypt_blocks:
        xorps   %xmm6,%xmm0
        xorps   %xmm0,%xmm3
        movups  32(%ebp),%xmm0
-.L029ccm64_enc2_loop:
+.L031ccm64_enc2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -638,7 +648,7 @@ aesni_ccm64_encrypt_blocks:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L029ccm64_enc2_loop
+       jnz     .L031ccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
@@ -651,10 +661,18 @@ aesni_ccm64_encrypt_blocks:
        movups  %xmm6,(%edi)
 .byte  102,15,56,0,213
        leal    16(%edi),%edi
-       jnz     .L028ccm64_enc_outer
+       jnz     .L030ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -702,12 +720,12 @@ aesni_ccm64_decrypt_blocks:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L030enc1_loop_5:
+.L032enc1_loop_5:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L030enc1_loop_5
+       jnz     .L032enc1_loop_5
 .byte  102,15,56,221,209
        shll    $4,%ebx
        movl    $16,%ecx
@@ -717,16 +735,16 @@ aesni_ccm64_decrypt_blocks:
        subl    %ebx,%ecx
        leal    32(%ebp,%ebx,1),%edx
        movl    %ecx,%ebx
-       jmp     .L031ccm64_dec_outer
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L031ccm64_dec_outer:
+.L033ccm64_dec_outer:
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
 .byte  102,15,56,0,213
        subl    $1,%eax
-       jz      .L032ccm64_dec_break
+       jz      .L034ccm64_dec_break
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  16(%ebp),%xmm1
@@ -734,7 +752,7 @@ aesni_ccm64_decrypt_blocks:
        xorps   %xmm0,%xmm2
        xorps   %xmm6,%xmm3
        movups  32(%ebp),%xmm0
-.L033ccm64_dec2_loop:
+.L035ccm64_dec2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -742,7 +760,7 @@ aesni_ccm64_decrypt_blocks:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     .L033ccm64_dec2_loop
+       jnz     .L035ccm64_dec2_loop
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
@@ -750,9 +768,9 @@ aesni_ccm64_decrypt_blocks:
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
        leal    16(%esi),%esi
-       jmp     .L031ccm64_dec_outer
+       jmp     .L033ccm64_dec_outer
 .align 16
-.L032ccm64_dec_break:
+.L034ccm64_dec_break:
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movups  (%edx),%xmm0
@@ -760,16 +778,24 @@ aesni_ccm64_decrypt_blocks:
        xorps   %xmm0,%xmm6
        leal    32(%edx),%edx
        xorps   %xmm6,%xmm3
-.L034enc1_loop_6:
+.L036enc1_loop_6:
 .byte  102,15,56,220,217
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L034enc1_loop_6
+       jnz     .L036enc1_loop_6
 .byte  102,15,56,221,217
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -795,7 +821,7 @@ aesni_ctr32_encrypt_blocks:
        andl    $-16,%esp
        movl    %ebp,80(%esp)
        cmpl    $1,%eax
-       je      .L035ctr32_one_shortcut
+       je      .L037ctr32_one_shortcut
        movdqu  (%ebx),%xmm7
        movl    $202182159,(%esp)
        movl    $134810123,4(%esp)
@@ -833,7 +859,7 @@ aesni_ctr32_encrypt_blocks:
        pshufd  $192,%xmm0,%xmm2
        pshufd  $128,%xmm0,%xmm3
        cmpl    $6,%eax
-       jb      .L036ctr32_tail
+       jb      .L038ctr32_tail
        pxor    %xmm6,%xmm7
        shll    $4,%ecx
        movl    $16,%ebx
@@ -842,9 +868,9 @@ aesni_ctr32_encrypt_blocks:
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
        subl    $6,%eax
-       jmp     .L037ctr32_loop6
+       jmp     .L039ctr32_loop6
 .align 16
-.L037ctr32_loop6:
+.L039ctr32_loop6:
        pshufd  $64,%xmm0,%xmm4
        movdqa  32(%esp),%xmm0
        pshufd  $192,%xmm1,%xmm5
@@ -898,27 +924,27 @@ aesni_ctr32_encrypt_blocks:
        leal    96(%edi),%edi
        pshufd  $128,%xmm0,%xmm3
        subl    $6,%eax
-       jnc     .L037ctr32_loop6
+       jnc     .L039ctr32_loop6
        addl    $6,%eax
-       jz      .L038ctr32_ret
+       jz      .L040ctr32_ret
        movdqu  (%ebp),%xmm7
        movl    %ebp,%edx
        pxor    32(%esp),%xmm7
        movl    240(%ebp),%ecx
-.L036ctr32_tail:
+.L038ctr32_tail:
        por     %xmm7,%xmm2
        cmpl    $2,%eax
-       jb      .L039ctr32_one
+       jb      .L041ctr32_one
        pshufd  $64,%xmm0,%xmm4
        por     %xmm7,%xmm3
-       je      .L040ctr32_two
+       je      .L042ctr32_two
        pshufd  $192,%xmm1,%xmm5
        por     %xmm7,%xmm4
        cmpl    $4,%eax
-       jb      .L041ctr32_three
+       jb      .L043ctr32_three
        pshufd  $128,%xmm1,%xmm6
        por     %xmm7,%xmm5
-       je      .L042ctr32_four
+       je      .L044ctr32_four
        por     %xmm7,%xmm6
        call    _aesni_encrypt6
        movups  (%esi),%xmm1
@@ -936,29 +962,29 @@ aesni_ctr32_encrypt_blocks:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L035ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
        movups  (%ebx),%xmm2
        movl    240(%edx),%ecx
-.L039ctr32_one:
+.L041ctr32_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L043enc1_loop_7:
+.L045enc1_loop_7:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L043enc1_loop_7
+       jnz     .L045enc1_loop_7
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        xorps   %xmm2,%xmm6
        movups  %xmm6,(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L040ctr32_two:
+.L042ctr32_two:
        call    _aesni_encrypt2
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -966,9 +992,9 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L041ctr32_three:
+.L043ctr32_three:
        call    _aesni_encrypt3
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -979,9 +1005,9 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm7,%xmm4
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     .L038ctr32_ret
+       jmp     .L040ctr32_ret
 .align 16
-.L042ctr32_four:
+.L044ctr32_four:
        call    _aesni_encrypt4
        movups  (%esi),%xmm6
        movups  16(%esi),%xmm7
@@ -995,7 +1021,18 @@ aesni_ctr32_encrypt_blocks:
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-.L038ctr32_ret:
+.L040ctr32_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
        movl    80(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1020,12 +1057,12 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L044enc1_loop_8:
+.L046enc1_loop_8:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L044enc1_loop_8
+       jnz     .L046enc1_loop_8
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1049,14 +1086,14 @@ aesni_xts_encrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        subl    $96,%eax
-       jc      .L045xts_enc_short
+       jc      .L047xts_enc_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     .L046xts_enc_loop6
+       jmp     .L048xts_enc_loop6
 .align 16
-.L046xts_enc_loop6:
+.L048xts_enc_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1145,23 +1182,23 @@ aesni_xts_encrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L046xts_enc_loop6
+       jnc     .L048xts_enc_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L045xts_enc_short:
+.L047xts_enc_short:
        addl    $96,%eax
-       jz      .L047xts_enc_done6x
+       jz      .L049xts_enc_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L048xts_enc_one
+       jb      .L050xts_enc_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L049xts_enc_two
+       je      .L051xts_enc_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1170,7 +1207,7 @@ aesni_xts_encrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L050xts_enc_three
+       jb      .L052xts_enc_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1180,7 +1217,7 @@ aesni_xts_encrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L051xts_enc_four
+       je      .L053xts_enc_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1212,9 +1249,9 @@ aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L048xts_enc_one:
+.L050xts_enc_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1222,20 +1259,20 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L053enc1_loop_9:
+.L055enc1_loop_9:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L053enc1_loop_9
+       jnz     .L055enc1_loop_9
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L049xts_enc_two:
+.L051xts_enc_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1249,9 +1286,9 @@ aesni_xts_encrypt:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L050xts_enc_three:
+.L052xts_enc_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1269,9 +1306,9 @@ aesni_xts_encrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L051xts_enc_four:
+.L053xts_enc_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1293,28 +1330,28 @@ aesni_xts_encrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L052xts_enc_done
+       jmp     .L054xts_enc_done
 .align 16
-.L047xts_enc_done6x:
+.L049xts_enc_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L054xts_enc_ret
+       jz      .L056xts_enc_ret
        movdqa  %xmm1,%xmm5
        movl    %eax,112(%esp)
-       jmp     .L055xts_enc_steal
+       jmp     .L057xts_enc_steal
 .align 16
-.L052xts_enc_done:
+.L054xts_enc_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L054xts_enc_ret
+       jz      .L056xts_enc_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm5
        paddq   %xmm1,%xmm1
        pand    96(%esp),%xmm5
        pxor    %xmm1,%xmm5
-.L055xts_enc_steal:
+.L057xts_enc_steal:
        movzbl  (%esi),%ecx
        movzbl  -16(%edi),%edx
        leal    1(%esi),%esi
@@ -1322,7 +1359,7 @@ aesni_xts_encrypt:
        movb    %dl,(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L055xts_enc_steal
+       jnz     .L057xts_enc_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1332,16 +1369,30 @@ aesni_xts_encrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L056enc1_loop_10:
+.L058enc1_loop_10:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L056enc1_loop_10
+       jnz     .L058enc1_loop_10
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,-16(%edi)
-.L054xts_enc_ret:
+.L056xts_enc_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1366,12 +1417,12 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L057enc1_loop_11:
+.L059enc1_loop_11:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L057enc1_loop_11
+       jnz     .L059enc1_loop_11
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1400,14 +1451,14 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        andl    $-16,%eax
        subl    $96,%eax
-       jc      .L058xts_dec_short
+       jc      .L060xts_dec_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     .L059xts_dec_loop6
+       jmp     .L061xts_dec_loop6
 .align 16
-.L059xts_dec_loop6:
+.L061xts_dec_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1496,23 +1547,23 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     .L059xts_dec_loop6
+       jnc     .L061xts_dec_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-.L058xts_dec_short:
+.L060xts_dec_short:
        addl    $96,%eax
-       jz      .L060xts_dec_done6x
+       jz      .L062xts_dec_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      .L061xts_dec_one
+       jb      .L063xts_dec_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      .L062xts_dec_two
+       je      .L064xts_dec_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1521,7 +1572,7 @@ aesni_xts_decrypt:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      .L063xts_dec_three
+       jb      .L065xts_dec_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1531,7 +1582,7 @@ aesni_xts_decrypt:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      .L064xts_dec_four
+       je      .L066xts_dec_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1563,9 +1614,9 @@ aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L061xts_dec_one:
+.L063xts_dec_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1573,20 +1624,20 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L066dec1_loop_12:
+.L068dec1_loop_12:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L066dec1_loop_12
+       jnz     .L068dec1_loop_12
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L062xts_dec_two:
+.L064xts_dec_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1600,9 +1651,9 @@ aesni_xts_decrypt:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L063xts_dec_three:
+.L065xts_dec_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1620,9 +1671,9 @@ aesni_xts_decrypt:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L064xts_dec_four:
+.L066xts_dec_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1644,20 +1695,20 @@ aesni_xts_decrypt:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     .L065xts_dec_done
+       jmp     .L067xts_dec_done
 .align 16
-.L060xts_dec_done6x:
+.L062xts_dec_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      .L067xts_dec_ret
+       jz      .L069xts_dec_ret
        movl    %eax,112(%esp)
-       jmp     .L068xts_dec_only_one_more
+       jmp     .L070xts_dec_only_one_more
 .align 16
-.L065xts_dec_done:
+.L067xts_dec_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      .L067xts_dec_ret
+       jz      .L069xts_dec_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm2
@@ -1667,7 +1718,7 @@ aesni_xts_decrypt:
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-.L068xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
        pshufd  $19,%xmm0,%xmm5
        movdqa  %xmm1,%xmm6
        paddq   %xmm1,%xmm1
@@ -1681,16 +1732,16 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L069dec1_loop_13:
+.L071dec1_loop_13:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L069dec1_loop_13
+       jnz     .L071dec1_loop_13
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
-.L070xts_dec_steal:
+.L072xts_dec_steal:
        movzbl  16(%esi),%ecx
        movzbl  (%edi),%edx
        leal    1(%esi),%esi
@@ -1698,7 +1749,7 @@ aesni_xts_decrypt:
        movb    %dl,16(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     .L070xts_dec_steal
+       jnz     .L072xts_dec_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1708,16 +1759,30 @@ aesni_xts_decrypt:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L071dec1_loop_14:
+.L073dec1_loop_14:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L071dec1_loop_14
+       jnz     .L073dec1_loop_14
 .byte  102,15,56,223,209
        xorps   %xmm6,%xmm2
        movups  %xmm2,(%edi)
-.L067xts_dec_ret:
+.L069xts_dec_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1743,7 +1808,7 @@ aesni_cbc_encrypt:
        movl    32(%esp),%edx
        movl    36(%esp),%ebp
        testl   %eax,%eax
-       jz      .L072cbc_abort
+       jz      .L074cbc_abort
        cmpl    $0,40(%esp)
        xchgl   %esp,%ebx
        movups  (%ebp),%xmm7
@@ -1751,14 +1816,14 @@ aesni_cbc_encrypt:
        movl    %edx,%ebp
        movl    %ebx,16(%esp)
        movl    %ecx,%ebx
-       je      .L073cbc_decrypt
+       je      .L075cbc_decrypt
        movaps  %xmm7,%xmm2
        cmpl    $16,%eax
-       jb      .L074cbc_enc_tail
+       jb      .L076cbc_enc_tail
        subl    $16,%eax
-       jmp     .L075cbc_enc_loop
+       jmp     .L077cbc_enc_loop
 .align 16
-.L075cbc_enc_loop:
+.L077cbc_enc_loop:
        movups  (%esi),%xmm7
        leal    16(%esi),%esi
        movups  (%edx),%xmm0
@@ -1766,24 +1831,25 @@ aesni_cbc_encrypt:
        xorps   %xmm0,%xmm7
        leal    32(%edx),%edx
        xorps   %xmm7,%xmm2
-.L076enc1_loop_15:
+.L078enc1_loop_15:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L076enc1_loop_15
+       jnz     .L078enc1_loop_15
 .byte  102,15,56,221,209
        movl    %ebx,%ecx
        movl    %ebp,%edx
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        subl    $16,%eax
-       jnc     .L075cbc_enc_loop
+       jnc     .L077cbc_enc_loop
        addl    $16,%eax
-       jnz     .L074cbc_enc_tail
+       jnz     .L076cbc_enc_tail
        movaps  %xmm2,%xmm7
-       jmp     .L077cbc_ret
-.L074cbc_enc_tail:
+       pxor    %xmm2,%xmm2
+       jmp     .L079cbc_ret
+.L076cbc_enc_tail:
        movl    %eax,%ecx
 .long  2767451785
        movl    $16,%ecx
@@ -1794,20 +1860,20 @@ aesni_cbc_encrypt:
        movl    %ebx,%ecx
        movl    %edi,%esi
        movl    %ebp,%edx
-       jmp     .L075cbc_enc_loop
+       jmp     .L077cbc_enc_loop
 .align 16
-.L073cbc_decrypt:
+.L075cbc_decrypt:
        cmpl    $80,%eax
-       jbe     .L078cbc_dec_tail
+       jbe     .L080cbc_dec_tail
        movaps  %xmm7,(%esp)
        subl    $80,%eax
-       jmp     .L079cbc_dec_loop6_enter
+       jmp     .L081cbc_dec_loop6_enter
 .align 16
-.L080cbc_dec_loop6:
+.L082cbc_dec_loop6:
        movaps  %xmm0,(%esp)
        movups  %xmm7,(%edi)
        leal    16(%edi),%edi
-.L079cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -1837,28 +1903,28 @@ aesni_cbc_encrypt:
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
        subl    $96,%eax
-       ja      .L080cbc_dec_loop6
+       ja      .L082cbc_dec_loop6
        movaps  %xmm7,%xmm2
        movaps  %xmm0,%xmm7
        addl    $80,%eax
-       jle     .L081cbc_dec_tail_collected
+       jle     .L083cbc_dec_clear_tail_collected
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-.L078cbc_dec_tail:
+.L080cbc_dec_tail:
        movups  (%esi),%xmm2
        movaps  %xmm2,%xmm6
        cmpl    $16,%eax
-       jbe     .L082cbc_dec_one
+       jbe     .L084cbc_dec_one
        movups  16(%esi),%xmm3
        movaps  %xmm3,%xmm5
        cmpl    $32,%eax
-       jbe     .L083cbc_dec_two
+       jbe     .L085cbc_dec_two
        movups  32(%esi),%xmm4
        cmpl    $48,%eax
-       jbe     .L084cbc_dec_three
+       jbe     .L086cbc_dec_three
        movups  48(%esi),%xmm5
        cmpl    $64,%eax
-       jbe     .L085cbc_dec_four
+       jbe     .L087cbc_dec_four
        movups  64(%esi),%xmm6
        movaps  %xmm7,(%esp)
        movups  (%esi),%xmm2
@@ -1876,55 +1942,62 @@ aesni_cbc_encrypt:
        xorps   %xmm0,%xmm6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%edi)
+       pxor    %xmm5,%xmm5
        leal    64(%edi),%edi
        movaps  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
        subl    $80,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L082cbc_dec_one:
+.L084cbc_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-.L086dec1_loop_16:
+.L089dec1_loop_16:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     .L086dec1_loop_16
+       jnz     .L089dec1_loop_16
 .byte  102,15,56,223,209
        xorps   %xmm7,%xmm2
        movaps  %xmm6,%xmm7
        subl    $16,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L083cbc_dec_two:
+.L085cbc_dec_two:
        call    _aesni_decrypt2
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movaps  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leal    16(%edi),%edi
        movaps  %xmm5,%xmm7
        subl    $32,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L084cbc_dec_three:
+.L086cbc_dec_three:
        call    _aesni_decrypt3
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        xorps   %xmm5,%xmm4
        movups  %xmm2,(%edi)
        movaps  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        leal    32(%edi),%edi
        movups  32(%esi),%xmm7
        subl    $48,%eax
-       jmp     .L081cbc_dec_tail_collected
+       jmp     .L088cbc_dec_tail_collected
 .align 16
-.L085cbc_dec_four:
+.L087cbc_dec_four:
        call    _aesni_decrypt4
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm0
@@ -1934,28 +2007,44 @@ aesni_cbc_encrypt:
        movups  %xmm2,(%edi)
        xorps   %xmm1,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        leal    48(%edi),%edi
        movaps  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        subl    $64,%eax
-.L081cbc_dec_tail_collected:
+       jmp     .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
        andl    $15,%eax
-       jnz     .L087cbc_dec_tail_partial
+       jnz     .L090cbc_dec_tail_partial
        movups  %xmm2,(%edi)
-       jmp     .L077cbc_ret
+       pxor    %xmm0,%xmm0
+       jmp     .L079cbc_ret
 .align 16
-.L087cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
        movaps  %xmm2,(%esp)
+       pxor    %xmm0,%xmm0
        movl    $16,%ecx
        movl    %esp,%esi
        subl    %eax,%ecx
 .long  2767451785
-.L077cbc_ret:
+       movdqa  %xmm2,(%esp)
+.L079cbc_ret:
        movl    16(%esp),%esp
        movl    36(%esp),%ebp
+       pxor    %xmm2,%xmm2
+       pxor    %xmm1,%xmm1
        movups  %xmm7,(%ebp)
-.L072cbc_abort:
+       pxor    %xmm7,%xmm7
+.L074cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -1965,52 +2054,62 @@ aesni_cbc_encrypt:
 .type  _aesni_set_encrypt_key,@function
 .align 16
 _aesni_set_encrypt_key:
+       pushl   %ebp
+       pushl   %ebx
        testl   %eax,%eax
-       jz      .L088bad_pointer
+       jz      .L091bad_pointer
        testl   %edx,%edx
-       jz      .L088bad_pointer
+       jz      .L091bad_pointer
+       call    .L092pic
+.L092pic:
+       popl    %ebx
+       leal    .Lkey_const-.L092pic(%ebx),%ebx
+       leal    OPENSSL_ia32cap_P,%ebp
        movups  (%eax),%xmm0
        xorps   %xmm4,%xmm4
+       movl    4(%ebp),%ebp
        leal    16(%edx),%edx
+       andl    $268437504,%ebp
        cmpl    $256,%ecx
-       je      .L08914rounds
+       je      .L09314rounds
        cmpl    $192,%ecx
-       je      .L09012rounds
+       je      .L09412rounds
        cmpl    $128,%ecx
-       jne     .L091bad_keybits
+       jne     .L095bad_keybits
 .align 16
-.L09210rounds:
+.L09610rounds:
+       cmpl    $268435456,%ebp
+       je      .L09710rounds_alt
        movl    $9,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,200,1
-       call    .L093key_128_cold
+       call    .L098key_128_cold
 .byte  102,15,58,223,200,2
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,4
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,8
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,16
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,32
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,64
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,128
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,27
-       call    .L094key_128
+       call    .L099key_128
 .byte  102,15,58,223,200,54
-       call    .L094key_128
+       call    .L099key_128
        movups  %xmm0,(%edx)
        movl    %ecx,80(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L100good_key
 .align 16
-.L094key_128:
+.L099key_128:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
-.L093key_128_cold:
+.L098key_128_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2019,38 +2118,91 @@ _aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L09012rounds:
+.L09710rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movl    $8,%ecx
+       movdqa  32(%ebx),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,-16(%edx)
+.L101loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leal    16(%edx),%edx
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%edx)
+       movdqa  %xmm0,%xmm2
+       decl    %ecx
+       jnz     .L101loop_key128
+       movdqa  48(%ebx),%xmm4
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%edx)
+       movl    $9,%ecx
+       movl    %ecx,96(%edx)
+       jmp     .L100good_key
+.align 16
+.L09412rounds:
        movq    16(%eax),%xmm2
+       cmpl    $268435456,%ebp
+       je      .L10212rounds_alt
        movl    $11,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L095key_192a_cold
+       call    .L103key_192a_cold
 .byte  102,15,58,223,202,2
-       call    .L096key_192b
+       call    .L104key_192b
 .byte  102,15,58,223,202,4
-       call    .L097key_192a
+       call    .L105key_192a
 .byte  102,15,58,223,202,8
-       call    .L096key_192b
+       call    .L104key_192b
 .byte  102,15,58,223,202,16
-       call    .L097key_192a
+       call    .L105key_192a
 .byte  102,15,58,223,202,32
-       call    .L096key_192b
+       call    .L104key_192b
 .byte  102,15,58,223,202,64
-       call    .L097key_192a
+       call    .L105key_192a
 .byte  102,15,58,223,202,128
-       call    .L096key_192b
+       call    .L104key_192b
        movups  %xmm0,(%edx)
        movl    %ecx,48(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     .L100good_key
 .align 16
-.L097key_192a:
+.L105key_192a:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
 .align 16
-.L095key_192a_cold:
+.L103key_192a_cold:
        movaps  %xmm2,%xmm5
-.L098key_192b_warm:
+.L106key_192b_warm:
        shufps  $16,%xmm0,%xmm4
        movdqa  %xmm2,%xmm3
        xorps   %xmm4,%xmm0
@@ -2064,56 +2216,90 @@ _aesni_set_encrypt_key:
        pxor    %xmm3,%xmm2
        ret
 .align 16
-.L096key_192b:
+.L104key_192b:
        movaps  %xmm0,%xmm3
        shufps  $68,%xmm0,%xmm5
        movups  %xmm5,(%edx)
        shufps  $78,%xmm2,%xmm3
        movups  %xmm3,16(%edx)
        leal    32(%edx),%edx
-       jmp     .L098key_192b_warm
+       jmp     .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+       movdqa  16(%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $8,%ecx
+       movdqu  %xmm0,-16(%edx)
+.L107loop_key192:
+       movq    %xmm2,(%edx)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leal    24(%edx),%edx
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%edx)
+       decl    %ecx
+       jnz     .L107loop_key192
+       movl    $11,%ecx
+       movl    %ecx,32(%edx)
+       jmp     .L100good_key
 .align 16
-.L08914rounds:
+.L09314rounds:
        movups  16(%eax),%xmm2
-       movl    $13,%ecx
        leal    16(%edx),%edx
+       cmpl    $268435456,%ebp
+       je      .L10814rounds_alt
+       movl    $13,%ecx
        movups  %xmm0,-32(%edx)
        movups  %xmm2,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    .L099key_256a_cold
+       call    .L109key_256a_cold
 .byte  102,15,58,223,200,1
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,2
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,2
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,4
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,4
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,8
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,8
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,16
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,16
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,32
-       call    .L101key_256a
+       call    .L111key_256a
 .byte  102,15,58,223,200,32
-       call    .L100key_256b
+       call    .L110key_256b
 .byte  102,15,58,223,202,64
-       call    .L101key_256a
+       call    .L111key_256a
        movups  %xmm0,(%edx)
        movl    %ecx,16(%edx)
        xorl    %eax,%eax
-       ret
+       jmp     .L100good_key
 .align 16
-.L101key_256a:
+.L111key_256a:
        movups  %xmm2,(%edx)
        leal    16(%edx),%edx
-.L099key_256a_cold:
+.L109key_256a_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2122,7 +2308,7 @@ _aesni_set_encrypt_key:
        xorps   %xmm1,%xmm0
        ret
 .align 16
-.L100key_256b:
+.L110key_256b:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
        shufps  $16,%xmm2,%xmm4
@@ -2132,13 +2318,70 @@ _aesni_set_encrypt_key:
        shufps  $170,%xmm1,%xmm1
        xorps   %xmm1,%xmm2
        ret
+.align 16
+.L10814rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $7,%ecx
+       movdqu  %xmm0,-32(%edx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,-16(%edx)
+.L112loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       decl    %ecx
+       jz      .L113done_key256
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%edx)
+       leal    32(%edx),%edx
+       movdqa  %xmm2,%xmm1
+       jmp     .L112loop_key256
+.L113done_key256:
+       movl    $13,%ecx
+       movl    %ecx,16(%edx)
+.L100good_key:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorl    %eax,%eax
+       popl    %ebx
+       popl    %ebp
+       ret
 .align 4
-.L088bad_pointer:
+.L091bad_pointer:
        movl    $-1,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .align 4
-.L091bad_keybits:
+.L095bad_keybits:
+       pxor    %xmm0,%xmm0
        movl    $-2,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .size  _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
 .globl aesni_set_encrypt_key
@@ -2164,7 +2407,7 @@ aesni_set_decrypt_key:
        movl    12(%esp),%edx
        shll    $4,%ecx
        testl   %eax,%eax
-       jnz     .L102dec_key_ret
+       jnz     .L114dec_key_ret
        leal    16(%edx,%ecx,1),%eax
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
@@ -2172,7 +2415,7 @@ aesni_set_decrypt_key:
        movups  %xmm1,(%edx)
        leal    16(%edx),%edx
        leal    -16(%eax),%eax
-.L103dec_key_inverse:
+.L115dec_key_inverse:
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
 .byte  102,15,56,219,192
@@ -2182,15 +2425,24 @@ aesni_set_decrypt_key:
        movups  %xmm0,16(%eax)
        movups  %xmm1,-16(%edx)
        cmpl    %edx,%eax
-       ja      .L103dec_key_inverse
+       ja      .L115dec_key_inverse
        movups  (%edx),%xmm0
 .byte  102,15,56,219,192
        movups  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        xorl    %eax,%eax
-.L102dec_key_ret:
+.L114dec_key_ret:
        ret
 .size  aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long  202313229,202313229,202313229,202313229
+.long  67569157,67569157,67569157,67569157
+.long  1,1,1,1
+.long  27,27,27,27
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte  83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
+.comm  OPENSSL_ia32cap_P,16,4
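
In this 32-bit build, _aesni_set_encrypt_key now saves %ebp/%ebx, locates the new .Lkey_const table through the call/pop PIC idiom at .L092pic, and routes every success through .L100good_key, which clears xmm0-xmm5 before returning 0; the failure paths return -1 (.L091bad_pointer) and -2 (.L095bad_keybits). A small C sketch of that return contract (helper name hypothetical):

    /* Return codes produced by the epilogues above. */
    static const char *aesni_setkey_strerror(int rc)
    {
        switch (rc) {
        case  0: return "key schedule written";
        case -1: return "NULL key or schedule pointer";
        case -2: return "unsupported key length";
        default: return "unexpected";
        }
    }
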
index cecd5f8..c1f5aec 100644
@@ -20,7 +20,10 @@ L000enc1_loop_1:
        leal    16(%edx),%edx
        jnz     L000enc1_loop_1
 .byte  102,15,56,221,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .globl _aesni_decrypt
 .align 4
@@ -42,7 +45,10 @@ L001dec1_loop_2:
        leal    16(%edx),%edx
        jnz     L001dec1_loop_2
 .byte  102,15,56,223,209
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        movups  %xmm2,(%eax)
+       pxor    %xmm2,%xmm2
        ret
 .align 4
 __aesni_encrypt2:
@@ -242,17 +248,15 @@ __aesni_encrypt6:
        negl    %ecx
 .byte  102,15,56,220,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,220,233
-.byte  102,15,56,220,241
-.byte  102,15,56,220,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     L_aesni_encrypt6_enter
+       jmp     L008_aesni_encrypt6_inner
 .align 4,0x90
-L008enc6_loop:
+L009enc6_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
+L008_aesni_encrypt6_inner:
 .byte  102,15,56,220,233
 .byte  102,15,56,220,241
 .byte  102,15,56,220,249
@@ -266,7 +270,7 @@ L_aesni_encrypt6_enter:
 .byte  102,15,56,220,240
 .byte  102,15,56,220,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L008enc6_loop
+       jnz     L009enc6_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
 .byte  102,15,56,220,225
@@ -296,17 +300,15 @@ __aesni_decrypt6:
        negl    %ecx
 .byte  102,15,56,222,225
        pxor    %xmm0,%xmm7
+       movups  (%edx,%ecx,1),%xmm0
        addl    $16,%ecx
-.byte  102,15,56,222,233
-.byte  102,15,56,222,241
-.byte  102,15,56,222,249
-       movups  -16(%edx,%ecx,1),%xmm0
-       jmp     L_aesni_decrypt6_enter
+       jmp     L010_aesni_decrypt6_inner
 .align 4,0x90
-L009dec6_loop:
+L011dec6_loop:
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
+L010_aesni_decrypt6_inner:
 .byte  102,15,56,222,233
 .byte  102,15,56,222,241
 .byte  102,15,56,222,249
@@ -320,7 +322,7 @@ L_aesni_decrypt6_enter:
 .byte  102,15,56,222,240
 .byte  102,15,56,222,248
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L009dec6_loop
+       jnz     L011dec6_loop
 .byte  102,15,56,222,209
 .byte  102,15,56,222,217
 .byte  102,15,56,222,225
@@ -348,14 +350,14 @@ L_aesni_ecb_encrypt_begin:
        movl    32(%esp),%edx
        movl    36(%esp),%ebx
        andl    $-16,%eax
-       jz      L010ecb_ret
+       jz      L012ecb_ret
        movl    240(%edx),%ecx
        testl   %ebx,%ebx
-       jz      L011ecb_decrypt
+       jz      L013ecb_decrypt
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      L012ecb_enc_tail
+       jb      L014ecb_enc_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -364,9 +366,9 @@ L_aesni_ecb_encrypt_begin:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     L013ecb_enc_loop6_enter
+       jmp     L015ecb_enc_loop6_enter
 .align 4,0x90
-L014ecb_enc_loop6:
+L016ecb_enc_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -381,12 +383,12 @@ L014ecb_enc_loop6:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-L013ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
        call    __aesni_encrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     L014ecb_enc_loop6
+       jnc     L016ecb_enc_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -395,18 +397,18 @@ L013ecb_enc_loop6_enter:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      L010ecb_ret
-L012ecb_enc_tail:
+       jz      L012ecb_ret
+L014ecb_enc_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      L015ecb_enc_one
+       jb      L017ecb_enc_one
        movups  16(%esi),%xmm3
-       je      L016ecb_enc_two
+       je      L018ecb_enc_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      L017ecb_enc_three
+       jb      L019ecb_enc_three
        movups  48(%esi),%xmm5
-       je      L018ecb_enc_four
+       je      L020ecb_enc_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_encrypt6
@@ -415,49 +417,49 @@ L012ecb_enc_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L015ecb_enc_one:
+L017ecb_enc_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L019enc1_loop_3:
+L021enc1_loop_3:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L019enc1_loop_3
+       jnz     L021enc1_loop_3
 .byte  102,15,56,221,209
        movups  %xmm2,(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L016ecb_enc_two:
+L018ecb_enc_two:
        call    __aesni_encrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L017ecb_enc_three:
+L019ecb_enc_three:
        call    __aesni_encrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L018ecb_enc_four:
+L020ecb_enc_four:
        call    __aesni_encrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L011ecb_decrypt:
+L013ecb_decrypt:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        cmpl    $96,%eax
-       jb      L020ecb_dec_tail
+       jb      L022ecb_dec_tail
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -466,9 +468,9 @@ L011ecb_decrypt:
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
        subl    $96,%eax
-       jmp     L021ecb_dec_loop6_enter
+       jmp     L023ecb_dec_loop6_enter
 .align 4,0x90
-L022ecb_dec_loop6:
+L024ecb_dec_loop6:
        movups  %xmm2,(%edi)
        movdqu  (%esi),%xmm2
        movups  %xmm3,16(%edi)
@@ -483,12 +485,12 @@ L022ecb_dec_loop6:
        leal    96(%edi),%edi
        movdqu  80(%esi),%xmm7
        leal    96(%esi),%esi
-L021ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
        call    __aesni_decrypt6
        movl    %ebp,%edx
        movl    %ebx,%ecx
        subl    $96,%eax
-       jnc     L022ecb_dec_loop6
+       jnc     L024ecb_dec_loop6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
@@ -497,18 +499,18 @@ L021ecb_dec_loop6_enter:
        movups  %xmm7,80(%edi)
        leal    96(%edi),%edi
        addl    $96,%eax
-       jz      L010ecb_ret
-L020ecb_dec_tail:
+       jz      L012ecb_ret
+L022ecb_dec_tail:
        movups  (%esi),%xmm2
        cmpl    $32,%eax
-       jb      L023ecb_dec_one
+       jb      L025ecb_dec_one
        movups  16(%esi),%xmm3
-       je      L024ecb_dec_two
+       je      L026ecb_dec_two
        movups  32(%esi),%xmm4
        cmpl    $64,%eax
-       jb      L025ecb_dec_three
+       jb      L027ecb_dec_three
        movups  48(%esi),%xmm5
-       je      L026ecb_dec_four
+       je      L028ecb_dec_four
        movups  64(%esi),%xmm6
        xorps   %xmm7,%xmm7
        call    __aesni_decrypt6
@@ -517,43 +519,51 @@ L020ecb_dec_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L023ecb_dec_one:
+L025ecb_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L027dec1_loop_4:
+L029dec1_loop_4:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L027dec1_loop_4
+       jnz     L029dec1_loop_4
 .byte  102,15,56,223,209
        movups  %xmm2,(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L024ecb_dec_two:
+L026ecb_dec_two:
        call    __aesni_decrypt2
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L025ecb_dec_three:
+L027ecb_dec_three:
        call    __aesni_decrypt3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L010ecb_ret
+       jmp     L012ecb_ret
 .align 4,0x90
-L026ecb_dec_four:
+L028ecb_dec_four:
        call    __aesni_decrypt4
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-L010ecb_ret:
+L012ecb_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -598,7 +608,7 @@ L_aesni_ccm64_encrypt_blocks_begin:
        leal    32(%edx,%ecx,1),%edx
        subl    %ecx,%ebx
 .byte  102,15,56,0,253
-L028ccm64_enc_outer:
+L030ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  (%esi),%xmm6
@@ -607,7 +617,7 @@ L028ccm64_enc_outer:
        xorps   %xmm6,%xmm0
        xorps   %xmm0,%xmm3
        movups  32(%ebp),%xmm0
-L029ccm64_enc2_loop:
+L031ccm64_enc2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -615,7 +625,7 @@ L029ccm64_enc2_loop:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L029ccm64_enc2_loop
+       jnz     L031ccm64_enc2_loop
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
@@ -628,10 +638,18 @@ L029ccm64_enc2_loop:
        movups  %xmm6,(%edi)
 .byte  102,15,56,0,213
        leal    16(%edi),%edi
-       jnz     L028ccm64_enc_outer
+       jnz     L030ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -677,12 +695,12 @@ L_aesni_ccm64_decrypt_blocks_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L030enc1_loop_5:
+L032enc1_loop_5:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L030enc1_loop_5
+       jnz     L032enc1_loop_5
 .byte  102,15,56,221,209
        shll    $4,%ebx
        movl    $16,%ecx
@@ -692,16 +710,16 @@ L030enc1_loop_5:
        subl    %ebx,%ecx
        leal    32(%ebp,%ebx,1),%edx
        movl    %ecx,%ebx
-       jmp     L031ccm64_dec_outer
+       jmp     L033ccm64_dec_outer
 .align 4,0x90
-L031ccm64_dec_outer:
+L033ccm64_dec_outer:
        xorps   %xmm2,%xmm6
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
 .byte  102,15,56,0,213
        subl    $1,%eax
-       jz      L032ccm64_dec_break
+       jz      L034ccm64_dec_break
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
        movups  16(%ebp),%xmm1
@@ -709,7 +727,7 @@ L031ccm64_dec_outer:
        xorps   %xmm0,%xmm2
        xorps   %xmm6,%xmm3
        movups  32(%ebp),%xmm0
-L033ccm64_dec2_loop:
+L035ccm64_dec2_loop:
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        movups  (%edx,%ecx,1),%xmm1
@@ -717,7 +735,7 @@ L033ccm64_dec2_loop:
 .byte  102,15,56,220,208
 .byte  102,15,56,220,216
        movups  -16(%edx,%ecx,1),%xmm0
-       jnz     L033ccm64_dec2_loop
+       jnz     L035ccm64_dec2_loop
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
@@ -725,9 +743,9 @@ L033ccm64_dec2_loop:
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
        leal    16(%esi),%esi
-       jmp     L031ccm64_dec_outer
+       jmp     L033ccm64_dec_outer
 .align 4,0x90
-L032ccm64_dec_break:
+L034ccm64_dec_break:
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movups  (%edx),%xmm0
@@ -735,16 +753,24 @@ L032ccm64_dec_break:
        xorps   %xmm0,%xmm6
        leal    32(%edx),%edx
        xorps   %xmm6,%xmm3
-L034enc1_loop_6:
+L036enc1_loop_6:
 .byte  102,15,56,220,217
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L034enc1_loop_6
+       jnz     L036enc1_loop_6
 .byte  102,15,56,221,217
        movl    48(%esp),%esp
        movl    40(%esp),%edi
        movups  %xmm3,(%edi)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -768,7 +794,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
        andl    $-16,%esp
        movl    %ebp,80(%esp)
        cmpl    $1,%eax
-       je      L035ctr32_one_shortcut
+       je      L037ctr32_one_shortcut
        movdqu  (%ebx),%xmm7
        movl    $202182159,(%esp)
        movl    $134810123,4(%esp)
@@ -806,7 +832,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
        pshufd  $192,%xmm0,%xmm2
        pshufd  $128,%xmm0,%xmm3
        cmpl    $6,%eax
-       jb      L036ctr32_tail
+       jb      L038ctr32_tail
        pxor    %xmm6,%xmm7
        shll    $4,%ecx
        movl    $16,%ebx
@@ -815,9 +841,9 @@ L_aesni_ctr32_encrypt_blocks_begin:
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
        subl    $6,%eax
-       jmp     L037ctr32_loop6
+       jmp     L039ctr32_loop6
 .align 4,0x90
-L037ctr32_loop6:
+L039ctr32_loop6:
        pshufd  $64,%xmm0,%xmm4
        movdqa  32(%esp),%xmm0
        pshufd  $192,%xmm1,%xmm5
@@ -871,27 +897,27 @@ L037ctr32_loop6:
        leal    96(%edi),%edi
        pshufd  $128,%xmm0,%xmm3
        subl    $6,%eax
-       jnc     L037ctr32_loop6
+       jnc     L039ctr32_loop6
        addl    $6,%eax
-       jz      L038ctr32_ret
+       jz      L040ctr32_ret
        movdqu  (%ebp),%xmm7
        movl    %ebp,%edx
        pxor    32(%esp),%xmm7
        movl    240(%ebp),%ecx
-L036ctr32_tail:
+L038ctr32_tail:
        por     %xmm7,%xmm2
        cmpl    $2,%eax
-       jb      L039ctr32_one
+       jb      L041ctr32_one
        pshufd  $64,%xmm0,%xmm4
        por     %xmm7,%xmm3
-       je      L040ctr32_two
+       je      L042ctr32_two
        pshufd  $192,%xmm1,%xmm5
        por     %xmm7,%xmm4
        cmpl    $4,%eax
-       jb      L041ctr32_three
+       jb      L043ctr32_three
        pshufd  $128,%xmm1,%xmm6
        por     %xmm7,%xmm5
-       je      L042ctr32_four
+       je      L044ctr32_four
        por     %xmm7,%xmm6
        call    __aesni_encrypt6
        movups  (%esi),%xmm1
@@ -909,29 +935,29 @@ L036ctr32_tail:
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L035ctr32_one_shortcut:
+L037ctr32_one_shortcut:
        movups  (%ebx),%xmm2
        movl    240(%edx),%ecx
-L039ctr32_one:
+L041ctr32_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L043enc1_loop_7:
+L045enc1_loop_7:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L043enc1_loop_7
+       jnz     L045enc1_loop_7
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        xorps   %xmm2,%xmm6
        movups  %xmm6,(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L040ctr32_two:
+L042ctr32_two:
        call    __aesni_encrypt2
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -939,9 +965,9 @@ L040ctr32_two:
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L041ctr32_three:
+L043ctr32_three:
        call    __aesni_encrypt3
        movups  (%esi),%xmm5
        movups  16(%esi),%xmm6
@@ -952,9 +978,9 @@ L041ctr32_three:
        xorps   %xmm7,%xmm4
        movups  %xmm3,16(%edi)
        movups  %xmm4,32(%edi)
-       jmp     L038ctr32_ret
+       jmp     L040ctr32_ret
 .align 4,0x90
-L042ctr32_four:
+L044ctr32_four:
        call    __aesni_encrypt4
        movups  (%esi),%xmm6
        movups  16(%esi),%xmm7
@@ -968,7 +994,18 @@ L042ctr32_four:
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
        movups  %xmm5,48(%edi)
-L038ctr32_ret:
+L040ctr32_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
        movl    80(%esp),%esp
        popl    %edi
        popl    %esi
@@ -991,12 +1028,12 @@ L_aesni_xts_encrypt_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L044enc1_loop_8:
+L046enc1_loop_8:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L044enc1_loop_8
+       jnz     L046enc1_loop_8
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1020,14 +1057,14 @@ L044enc1_loop_8:
        movl    %edx,%ebp
        movl    %ecx,%ebx
        subl    $96,%eax
-       jc      L045xts_enc_short
+       jc      L047xts_enc_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     L046xts_enc_loop6
+       jmp     L048xts_enc_loop6
 .align 4,0x90
-L046xts_enc_loop6:
+L048xts_enc_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1116,23 +1153,23 @@ L046xts_enc_loop6:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     L046xts_enc_loop6
+       jnc     L048xts_enc_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-L045xts_enc_short:
+L047xts_enc_short:
        addl    $96,%eax
-       jz      L047xts_enc_done6x
+       jz      L049xts_enc_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      L048xts_enc_one
+       jb      L050xts_enc_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      L049xts_enc_two
+       je      L051xts_enc_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1141,7 +1178,7 @@ L045xts_enc_short:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      L050xts_enc_three
+       jb      L052xts_enc_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1151,7 +1188,7 @@ L045xts_enc_short:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      L051xts_enc_four
+       je      L053xts_enc_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1183,9 +1220,9 @@ L045xts_enc_short:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L048xts_enc_one:
+L050xts_enc_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1193,20 +1230,20 @@ L048xts_enc_one:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L053enc1_loop_9:
+L055enc1_loop_9:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L053enc1_loop_9
+       jnz     L055enc1_loop_9
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L049xts_enc_two:
+L051xts_enc_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1220,9 +1257,9 @@ L049xts_enc_two:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L050xts_enc_three:
+L052xts_enc_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1240,9 +1277,9 @@ L050xts_enc_three:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L051xts_enc_four:
+L053xts_enc_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1264,28 +1301,28 @@ L051xts_enc_four:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L052xts_enc_done
+       jmp     L054xts_enc_done
 .align 4,0x90
-L047xts_enc_done6x:
+L049xts_enc_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      L054xts_enc_ret
+       jz      L056xts_enc_ret
        movdqa  %xmm1,%xmm5
        movl    %eax,112(%esp)
-       jmp     L055xts_enc_steal
+       jmp     L057xts_enc_steal
 .align 4,0x90
-L052xts_enc_done:
+L054xts_enc_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      L054xts_enc_ret
+       jz      L056xts_enc_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm5
        paddq   %xmm1,%xmm1
        pand    96(%esp),%xmm5
        pxor    %xmm1,%xmm5
-L055xts_enc_steal:
+L057xts_enc_steal:
        movzbl  (%esi),%ecx
        movzbl  -16(%edi),%edx
        leal    1(%esi),%esi
@@ -1293,7 +1330,7 @@ L055xts_enc_steal:
        movb    %dl,(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     L055xts_enc_steal
+       jnz     L057xts_enc_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1303,16 +1340,30 @@ L055xts_enc_steal:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L056enc1_loop_10:
+L058enc1_loop_10:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L056enc1_loop_10
+       jnz     L058enc1_loop_10
 .byte  102,15,56,221,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,-16(%edi)
-L054xts_enc_ret:
+L056xts_enc_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1335,12 +1386,12 @@ L_aesni_xts_decrypt_begin:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L057enc1_loop_11:
+L059enc1_loop_11:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L057enc1_loop_11
+       jnz     L059enc1_loop_11
 .byte  102,15,56,221,209
        movl    20(%esp),%esi
        movl    24(%esp),%edi
@@ -1369,14 +1420,14 @@ L057enc1_loop_11:
        pcmpgtd %xmm1,%xmm0
        andl    $-16,%eax
        subl    $96,%eax
-       jc      L058xts_dec_short
+       jc      L060xts_dec_short
        shll    $4,%ecx
        movl    $16,%ebx
        subl    %ecx,%ebx
        leal    32(%edx,%ecx,1),%edx
-       jmp     L059xts_dec_loop6
+       jmp     L061xts_dec_loop6
 .align 4,0x90
-L059xts_dec_loop6:
+L061xts_dec_loop6:
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,(%esp)
@@ -1465,23 +1516,23 @@ L059xts_dec_loop6:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        subl    $96,%eax
-       jnc     L059xts_dec_loop6
+       jnc     L061xts_dec_loop6
        movl    240(%ebp),%ecx
        movl    %ebp,%edx
        movl    %ecx,%ebx
-L058xts_dec_short:
+L060xts_dec_short:
        addl    $96,%eax
-       jz      L060xts_dec_done6x
+       jz      L062xts_dec_done6x
        movdqa  %xmm1,%xmm5
        cmpl    $32,%eax
-       jb      L061xts_dec_one
+       jb      L063xts_dec_one
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        paddq   %xmm1,%xmm1
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-       je      L062xts_dec_two
+       je      L064xts_dec_two
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm6
@@ -1490,7 +1541,7 @@ L058xts_dec_short:
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
        cmpl    $64,%eax
-       jb      L063xts_dec_three
+       jb      L065xts_dec_three
        pshufd  $19,%xmm0,%xmm2
        pxor    %xmm0,%xmm0
        movdqa  %xmm1,%xmm7
@@ -1500,7 +1551,7 @@ L058xts_dec_short:
        pxor    %xmm2,%xmm1
        movdqa  %xmm5,(%esp)
        movdqa  %xmm6,16(%esp)
-       je      L064xts_dec_four
+       je      L066xts_dec_four
        movdqa  %xmm7,32(%esp)
        pshufd  $19,%xmm0,%xmm7
        movdqa  %xmm1,48(%esp)
@@ -1532,9 +1583,9 @@ L058xts_dec_short:
        movups  %xmm5,48(%edi)
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L061xts_dec_one:
+L063xts_dec_one:
        movups  (%esi),%xmm2
        leal    16(%esi),%esi
        xorps   %xmm5,%xmm2
@@ -1542,20 +1593,20 @@ L061xts_dec_one:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L066dec1_loop_12:
+L068dec1_loop_12:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L066dec1_loop_12
+       jnz     L068dec1_loop_12
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        movdqa  %xmm5,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L062xts_dec_two:
+L064xts_dec_two:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1569,9 +1620,9 @@ L062xts_dec_two:
        movups  %xmm3,16(%edi)
        leal    32(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L063xts_dec_three:
+L065xts_dec_three:
        movaps  %xmm1,%xmm7
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1589,9 +1640,9 @@ L063xts_dec_three:
        movups  %xmm4,32(%edi)
        leal    48(%edi),%edi
        movdqa  %xmm7,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L064xts_dec_four:
+L066xts_dec_four:
        movaps  %xmm1,%xmm6
        movups  (%esi),%xmm2
        movups  16(%esi),%xmm3
@@ -1613,20 +1664,20 @@ L064xts_dec_four:
        movups  %xmm5,48(%edi)
        leal    64(%edi),%edi
        movdqa  %xmm6,%xmm1
-       jmp     L065xts_dec_done
+       jmp     L067xts_dec_done
 .align 4,0x90
-L060xts_dec_done6x:
+L062xts_dec_done6x:
        movl    112(%esp),%eax
        andl    $15,%eax
-       jz      L067xts_dec_ret
+       jz      L069xts_dec_ret
        movl    %eax,112(%esp)
-       jmp     L068xts_dec_only_one_more
+       jmp     L070xts_dec_only_one_more
 .align 4,0x90
-L065xts_dec_done:
+L067xts_dec_done:
        movl    112(%esp),%eax
        pxor    %xmm0,%xmm0
        andl    $15,%eax
-       jz      L067xts_dec_ret
+       jz      L069xts_dec_ret
        pcmpgtd %xmm1,%xmm0
        movl    %eax,112(%esp)
        pshufd  $19,%xmm0,%xmm2
@@ -1636,7 +1687,7 @@ L065xts_dec_done:
        pand    %xmm3,%xmm2
        pcmpgtd %xmm1,%xmm0
        pxor    %xmm2,%xmm1
-L068xts_dec_only_one_more:
+L070xts_dec_only_one_more:
        pshufd  $19,%xmm0,%xmm5
        movdqa  %xmm1,%xmm6
        paddq   %xmm1,%xmm1
@@ -1650,16 +1701,16 @@ L068xts_dec_only_one_more:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L069dec1_loop_13:
+L071dec1_loop_13:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L069dec1_loop_13
+       jnz     L071dec1_loop_13
 .byte  102,15,56,223,209
        xorps   %xmm5,%xmm2
        movups  %xmm2,(%edi)
-L070xts_dec_steal:
+L072xts_dec_steal:
        movzbl  16(%esi),%ecx
        movzbl  (%edi),%edx
        leal    1(%esi),%esi
@@ -1667,7 +1718,7 @@ L070xts_dec_steal:
        movb    %dl,16(%edi)
        leal    1(%edi),%edi
        subl    $1,%eax
-       jnz     L070xts_dec_steal
+       jnz     L072xts_dec_steal
        subl    112(%esp),%edi
        movl    %ebp,%edx
        movl    %ebx,%ecx
@@ -1677,16 +1728,30 @@ L070xts_dec_steal:
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L071dec1_loop_14:
+L073dec1_loop_14:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L071dec1_loop_14
+       jnz     L073dec1_loop_14
 .byte  102,15,56,223,209
        xorps   %xmm6,%xmm2
        movups  %xmm2,(%edi)
-L067xts_dec_ret:
+L069xts_dec_ret:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       movdqa  %xmm0,(%esp)
+       pxor    %xmm3,%xmm3
+       movdqa  %xmm0,16(%esp)
+       pxor    %xmm4,%xmm4
+       movdqa  %xmm0,32(%esp)
+       pxor    %xmm5,%xmm5
+       movdqa  %xmm0,48(%esp)
+       pxor    %xmm6,%xmm6
+       movdqa  %xmm0,64(%esp)
+       pxor    %xmm7,%xmm7
+       movdqa  %xmm0,80(%esp)
        movl    116(%esp),%esp
        popl    %edi
        popl    %esi
@@ -1710,7 +1775,7 @@ L_aesni_cbc_encrypt_begin:
        movl    32(%esp),%edx
        movl    36(%esp),%ebp
        testl   %eax,%eax
-       jz      L072cbc_abort
+       jz      L074cbc_abort
        cmpl    $0,40(%esp)
        xchgl   %esp,%ebx
        movups  (%ebp),%xmm7
@@ -1718,14 +1783,14 @@ L_aesni_cbc_encrypt_begin:
        movl    %edx,%ebp
        movl    %ebx,16(%esp)
        movl    %ecx,%ebx
-       je      L073cbc_decrypt
+       je      L075cbc_decrypt
        movaps  %xmm7,%xmm2
        cmpl    $16,%eax
-       jb      L074cbc_enc_tail
+       jb      L076cbc_enc_tail
        subl    $16,%eax
-       jmp     L075cbc_enc_loop
+       jmp     L077cbc_enc_loop
 .align 4,0x90
-L075cbc_enc_loop:
+L077cbc_enc_loop:
        movups  (%esi),%xmm7
        leal    16(%esi),%esi
        movups  (%edx),%xmm0
@@ -1733,24 +1798,25 @@ L075cbc_enc_loop:
        xorps   %xmm0,%xmm7
        leal    32(%edx),%edx
        xorps   %xmm7,%xmm2
-L076enc1_loop_15:
+L078enc1_loop_15:
 .byte  102,15,56,220,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L076enc1_loop_15
+       jnz     L078enc1_loop_15
 .byte  102,15,56,221,209
        movl    %ebx,%ecx
        movl    %ebp,%edx
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
        subl    $16,%eax
-       jnc     L075cbc_enc_loop
+       jnc     L077cbc_enc_loop
        addl    $16,%eax
-       jnz     L074cbc_enc_tail
+       jnz     L076cbc_enc_tail
        movaps  %xmm2,%xmm7
-       jmp     L077cbc_ret
-L074cbc_enc_tail:
+       pxor    %xmm2,%xmm2
+       jmp     L079cbc_ret
+L076cbc_enc_tail:
        movl    %eax,%ecx
 .long  2767451785
        movl    $16,%ecx
@@ -1761,20 +1827,20 @@ L074cbc_enc_tail:
        movl    %ebx,%ecx
        movl    %edi,%esi
        movl    %ebp,%edx
-       jmp     L075cbc_enc_loop
+       jmp     L077cbc_enc_loop
 .align 4,0x90
-L073cbc_decrypt:
+L075cbc_decrypt:
        cmpl    $80,%eax
-       jbe     L078cbc_dec_tail
+       jbe     L080cbc_dec_tail
        movaps  %xmm7,(%esp)
        subl    $80,%eax
-       jmp     L079cbc_dec_loop6_enter
+       jmp     L081cbc_dec_loop6_enter
 .align 4,0x90
-L080cbc_dec_loop6:
+L082cbc_dec_loop6:
        movaps  %xmm0,(%esp)
        movups  %xmm7,(%edi)
        leal    16(%edi),%edi
-L079cbc_dec_loop6_enter:
+L081cbc_dec_loop6_enter:
        movdqu  (%esi),%xmm2
        movdqu  16(%esi),%xmm3
        movdqu  32(%esi),%xmm4
@@ -1804,28 +1870,28 @@ L079cbc_dec_loop6_enter:
        movups  %xmm6,64(%edi)
        leal    80(%edi),%edi
        subl    $96,%eax
-       ja      L080cbc_dec_loop6
+       ja      L082cbc_dec_loop6
        movaps  %xmm7,%xmm2
        movaps  %xmm0,%xmm7
        addl    $80,%eax
-       jle     L081cbc_dec_tail_collected
+       jle     L083cbc_dec_clear_tail_collected
        movups  %xmm2,(%edi)
        leal    16(%edi),%edi
-L078cbc_dec_tail:
+L080cbc_dec_tail:
        movups  (%esi),%xmm2
        movaps  %xmm2,%xmm6
        cmpl    $16,%eax
-       jbe     L082cbc_dec_one
+       jbe     L084cbc_dec_one
        movups  16(%esi),%xmm3
        movaps  %xmm3,%xmm5
        cmpl    $32,%eax
-       jbe     L083cbc_dec_two
+       jbe     L085cbc_dec_two
        movups  32(%esi),%xmm4
        cmpl    $48,%eax
-       jbe     L084cbc_dec_three
+       jbe     L086cbc_dec_three
        movups  48(%esi),%xmm5
        cmpl    $64,%eax
-       jbe     L085cbc_dec_four
+       jbe     L087cbc_dec_four
        movups  64(%esi),%xmm6
        movaps  %xmm7,(%esp)
        movups  (%esi),%xmm2
@@ -1843,55 +1909,62 @@ L078cbc_dec_tail:
        xorps   %xmm0,%xmm6
        movups  %xmm2,(%edi)
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        movups  %xmm5,48(%edi)
+       pxor    %xmm5,%xmm5
        leal    64(%edi),%edi
        movaps  %xmm6,%xmm2
+       pxor    %xmm6,%xmm6
        subl    $80,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L082cbc_dec_one:
+L084cbc_dec_one:
        movups  (%edx),%xmm0
        movups  16(%edx),%xmm1
        leal    32(%edx),%edx
        xorps   %xmm0,%xmm2
-L086dec1_loop_16:
+L089dec1_loop_16:
 .byte  102,15,56,222,209
        decl    %ecx
        movups  (%edx),%xmm1
        leal    16(%edx),%edx
-       jnz     L086dec1_loop_16
+       jnz     L089dec1_loop_16
 .byte  102,15,56,223,209
        xorps   %xmm7,%xmm2
        movaps  %xmm6,%xmm7
        subl    $16,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L083cbc_dec_two:
+L085cbc_dec_two:
        call    __aesni_decrypt2
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        movups  %xmm2,(%edi)
        movaps  %xmm3,%xmm2
+       pxor    %xmm3,%xmm3
        leal    16(%edi),%edi
        movaps  %xmm5,%xmm7
        subl    $32,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L084cbc_dec_three:
+L086cbc_dec_three:
        call    __aesni_decrypt3
        xorps   %xmm7,%xmm2
        xorps   %xmm6,%xmm3
        xorps   %xmm5,%xmm4
        movups  %xmm2,(%edi)
        movaps  %xmm4,%xmm2
+       pxor    %xmm4,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        leal    32(%edi),%edi
        movups  32(%esi),%xmm7
        subl    $48,%eax
-       jmp     L081cbc_dec_tail_collected
+       jmp     L088cbc_dec_tail_collected
 .align 4,0x90
-L085cbc_dec_four:
+L087cbc_dec_four:
        call    __aesni_decrypt4
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm0
@@ -1901,28 +1974,44 @@ L085cbc_dec_four:
        movups  %xmm2,(%edi)
        xorps   %xmm1,%xmm4
        movups  %xmm3,16(%edi)
+       pxor    %xmm3,%xmm3
        xorps   %xmm0,%xmm5
        movups  %xmm4,32(%edi)
+       pxor    %xmm4,%xmm4
        leal    48(%edi),%edi
        movaps  %xmm5,%xmm2
+       pxor    %xmm5,%xmm5
        subl    $64,%eax
-L081cbc_dec_tail_collected:
+       jmp     L088cbc_dec_tail_collected
+.align 4,0x90
+L083cbc_dec_clear_tail_collected:
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+L088cbc_dec_tail_collected:
        andl    $15,%eax
-       jnz     L087cbc_dec_tail_partial
+       jnz     L090cbc_dec_tail_partial
        movups  %xmm2,(%edi)
-       jmp     L077cbc_ret
+       pxor    %xmm0,%xmm0
+       jmp     L079cbc_ret
 .align 4,0x90
-L087cbc_dec_tail_partial:
+L090cbc_dec_tail_partial:
        movaps  %xmm2,(%esp)
+       pxor    %xmm0,%xmm0
        movl    $16,%ecx
        movl    %esp,%esi
        subl    %eax,%ecx
 .long  2767451785
-L077cbc_ret:
+       movdqa  %xmm2,(%esp)
+L079cbc_ret:
        movl    16(%esp),%esp
        movl    36(%esp),%ebp
+       pxor    %xmm2,%xmm2
+       pxor    %xmm1,%xmm1
        movups  %xmm7,(%ebp)
-L072cbc_abort:
+       pxor    %xmm7,%xmm7
+L074cbc_abort:
        popl    %edi
        popl    %esi
        popl    %ebx
@@ -1930,52 +2019,62 @@ L072cbc_abort:
        ret
 .align 4
 __aesni_set_encrypt_key:
+       pushl   %ebp
+       pushl   %ebx
        testl   %eax,%eax
-       jz      L088bad_pointer
+       jz      L091bad_pointer
        testl   %edx,%edx
-       jz      L088bad_pointer
+       jz      L091bad_pointer
+       call    L092pic
+L092pic:
+       popl    %ebx
+       leal    Lkey_const-L092pic(%ebx),%ebx
+       movl    L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
        movups  (%eax),%xmm0
        xorps   %xmm4,%xmm4
+       movl    4(%ebp),%ebp
        leal    16(%edx),%edx
+       andl    $268437504,%ebp
        cmpl    $256,%ecx
-       je      L08914rounds
+       je      L09314rounds
        cmpl    $192,%ecx
-       je      L09012rounds
+       je      L09412rounds
        cmpl    $128,%ecx
-       jne     L091bad_keybits
+       jne     L095bad_keybits
 .align 4,0x90
-L09210rounds:
+L09610rounds:
+       cmpl    $268435456,%ebp
+       je      L09710rounds_alt
        movl    $9,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,200,1
-       call    L093key_128_cold
+       call    L098key_128_cold
 .byte  102,15,58,223,200,2
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,4
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,8
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,16
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,32
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,64
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,128
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,27
-       call    L094key_128
+       call    L099key_128
 .byte  102,15,58,223,200,54
-       call    L094key_128
+       call    L099key_128
        movups  %xmm0,(%edx)
        movl    %ecx,80(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     L100good_key
 .align 4,0x90
-L094key_128:
+L099key_128:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
-L093key_128_cold:
+L098key_128_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -1984,38 +2083,91 @@ L093key_128_cold:
        xorps   %xmm1,%xmm0
        ret
 .align 4,0x90
-L09012rounds:
+L09710rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movl    $8,%ecx
+       movdqa  32(%ebx),%xmm4
+       movdqa  %xmm0,%xmm2
+       movdqu  %xmm0,-16(%edx)
+L101loop_key128:
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       leal    16(%edx),%edx
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,-16(%edx)
+       movdqa  %xmm0,%xmm2
+       decl    %ecx
+       jnz     L101loop_key128
+       movdqa  48(%ebx),%xmm4
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       pslld   $1,%xmm4
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       movdqa  %xmm0,%xmm2
+.byte  102,15,56,0,197
+.byte  102,15,56,221,196
+       movdqa  %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm2,%xmm3
+       pslldq  $4,%xmm2
+       pxor    %xmm3,%xmm2
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,16(%edx)
+       movl    $9,%ecx
+       movl    %ecx,96(%edx)
+       jmp     L100good_key
+.align 4,0x90
+L09412rounds:
        movq    16(%eax),%xmm2
+       cmpl    $268435456,%ebp
+       je      L10212rounds_alt
        movl    $11,%ecx
        movups  %xmm0,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    L095key_192a_cold
+       call    L103key_192a_cold
 .byte  102,15,58,223,202,2
-       call    L096key_192b
+       call    L104key_192b
 .byte  102,15,58,223,202,4
-       call    L097key_192a
+       call    L105key_192a
 .byte  102,15,58,223,202,8
-       call    L096key_192b
+       call    L104key_192b
 .byte  102,15,58,223,202,16
-       call    L097key_192a
+       call    L105key_192a
 .byte  102,15,58,223,202,32
-       call    L096key_192b
+       call    L104key_192b
 .byte  102,15,58,223,202,64
-       call    L097key_192a
+       call    L105key_192a
 .byte  102,15,58,223,202,128
-       call    L096key_192b
+       call    L104key_192b
        movups  %xmm0,(%edx)
        movl    %ecx,48(%edx)
-       xorl    %eax,%eax
-       ret
+       jmp     L100good_key
 .align 4,0x90
-L097key_192a:
+L105key_192a:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
 .align 4,0x90
-L095key_192a_cold:
+L103key_192a_cold:
        movaps  %xmm2,%xmm5
-L098key_192b_warm:
+L106key_192b_warm:
        shufps  $16,%xmm0,%xmm4
        movdqa  %xmm2,%xmm3
        xorps   %xmm4,%xmm0
@@ -2029,56 +2181,90 @@ L098key_192b_warm:
        pxor    %xmm3,%xmm2
        ret
 .align 4,0x90
-L096key_192b:
+L104key_192b:
        movaps  %xmm0,%xmm3
        shufps  $68,%xmm0,%xmm5
        movups  %xmm5,(%edx)
        shufps  $78,%xmm2,%xmm3
        movups  %xmm3,16(%edx)
        leal    32(%edx),%edx
-       jmp     L098key_192b_warm
+       jmp     L106key_192b_warm
 .align 4,0x90
-L08914rounds:
+L10212rounds_alt:
+       movdqa  16(%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $8,%ecx
+       movdqu  %xmm0,-16(%edx)
+L107loop_key192:
+       movq    %xmm2,(%edx)
+       movdqa  %xmm2,%xmm1
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       pslld   $1,%xmm4
+       leal    24(%edx),%edx
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pshufd  $255,%xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pxor    %xmm2,%xmm0
+       pxor    %xmm3,%xmm2
+       movdqu  %xmm0,-16(%edx)
+       decl    %ecx
+       jnz     L107loop_key192
+       movl    $11,%ecx
+       movl    %ecx,32(%edx)
+       jmp     L100good_key
+.align 4,0x90
+L09314rounds:
        movups  16(%eax),%xmm2
-       movl    $13,%ecx
        leal    16(%edx),%edx
+       cmpl    $268435456,%ebp
+       je      L10814rounds_alt
+       movl    $13,%ecx
        movups  %xmm0,-32(%edx)
        movups  %xmm2,-16(%edx)
 .byte  102,15,58,223,202,1
-       call    L099key_256a_cold
+       call    L109key_256a_cold
 .byte  102,15,58,223,200,1
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,2
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,2
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,4
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,4
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,8
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,8
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,16
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,16
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,32
-       call    L101key_256a
+       call    L111key_256a
 .byte  102,15,58,223,200,32
-       call    L100key_256b
+       call    L110key_256b
 .byte  102,15,58,223,202,64
-       call    L101key_256a
+       call    L111key_256a
        movups  %xmm0,(%edx)
        movl    %ecx,16(%edx)
        xorl    %eax,%eax
-       ret
+       jmp     L100good_key
 .align 4,0x90
-L101key_256a:
+L111key_256a:
        movups  %xmm2,(%edx)
        leal    16(%edx),%edx
-L099key_256a_cold:
+L109key_256a_cold:
        shufps  $16,%xmm0,%xmm4
        xorps   %xmm4,%xmm0
        shufps  $140,%xmm0,%xmm4
@@ -2087,7 +2273,7 @@ L099key_256a_cold:
        xorps   %xmm1,%xmm0
        ret
 .align 4,0x90
-L100key_256b:
+L110key_256b:
        movups  %xmm0,(%edx)
        leal    16(%edx),%edx
        shufps  $16,%xmm2,%xmm4
@@ -2097,13 +2283,70 @@ L100key_256b:
        shufps  $170,%xmm1,%xmm1
        xorps   %xmm1,%xmm2
        ret
+.align 4,0x90
+L10814rounds_alt:
+       movdqa  (%ebx),%xmm5
+       movdqa  32(%ebx),%xmm4
+       movl    $7,%ecx
+       movdqu  %xmm0,-32(%edx)
+       movdqa  %xmm2,%xmm1
+       movdqu  %xmm2,-16(%edx)
+L112loop_key256:
+.byte  102,15,56,0,213
+.byte  102,15,56,221,212
+       movdqa  %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm0,%xmm3
+       pslldq  $4,%xmm0
+       pxor    %xmm3,%xmm0
+       pslld   $1,%xmm4
+       pxor    %xmm2,%xmm0
+       movdqu  %xmm0,(%edx)
+       decl    %ecx
+       jz      L113done_key256
+       pshufd  $255,%xmm0,%xmm2
+       pxor    %xmm3,%xmm3
+.byte  102,15,56,221,211
+       movdqa  %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm1,%xmm3
+       pslldq  $4,%xmm1
+       pxor    %xmm3,%xmm1
+       pxor    %xmm1,%xmm2
+       movdqu  %xmm2,16(%edx)
+       leal    32(%edx),%edx
+       movdqa  %xmm2,%xmm1
+       jmp     L112loop_key256
+L113done_key256:
+       movl    $13,%ecx
+       movl    %ecx,16(%edx)
+L100good_key:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorl    %eax,%eax
+       popl    %ebx
+       popl    %ebp
+       ret
 .align 2,0x90
-L088bad_pointer:
+L091bad_pointer:
        movl    $-1,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .align 2,0x90
-L091bad_keybits:
+L095bad_keybits:
+       pxor    %xmm0,%xmm0
        movl    $-2,%eax
+       popl    %ebx
+       popl    %ebp
        ret
 .globl _aesni_set_encrypt_key
 .align 4
@@ -2125,7 +2368,7 @@ L_aesni_set_decrypt_key_begin:
        movl    12(%esp),%edx
        shll    $4,%ecx
        testl   %eax,%eax
-       jnz     L102dec_key_ret
+       jnz     L114dec_key_ret
        leal    16(%edx,%ecx,1),%eax
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
@@ -2133,7 +2376,7 @@ L_aesni_set_decrypt_key_begin:
        movups  %xmm1,(%edx)
        leal    16(%edx),%edx
        leal    -16(%eax),%eax
-L103dec_key_inverse:
+L115dec_key_inverse:
        movups  (%edx),%xmm0
        movups  (%eax),%xmm1
 .byte  102,15,56,219,192
@@ -2143,14 +2386,27 @@ L103dec_key_inverse:
        movups  %xmm0,16(%eax)
        movups  %xmm1,-16(%edx)
        cmpl    %edx,%eax
-       ja      L103dec_key_inverse
+       ja      L115dec_key_inverse
        movups  (%edx),%xmm0
 .byte  102,15,56,219,192
        movups  %xmm0,(%edx)
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
        xorl    %eax,%eax
-L102dec_key_ret:
+L114dec_key_ret:
        ret
+.align 6,0x90
+Lkey_const:
+.long  202313229,202313229,202313229,202313229
+.long  67569157,67569157,67569157,67569157
+.long  1,1,1,1
+.long  27,27,27,27
 .byte  65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 .byte  83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol       _OPENSSL_ia32cap_P
+.long  0
+.comm  _OPENSSL_ia32cap_P,16,2
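Note: in this Mach-O copy, __aesni_set_encrypt_key now reads the OPENSSL_ia32cap_P capability vector through the non-lazy symbol pointer added above, using the 32-bit call/pop idiom (call L092pic / popl %ebx) to form position-independent addresses for both the pointer and the Lkey_const table. A minimal sketch of that idiom under illustrative label names (not taken from the generated file):

.text
.globl	_pic_const_demo
.align	4
_pic_const_demo:
	call	L_pic_point		# pushes the address of the next instruction
L_pic_point:
	popl	%ebx			# %ebx = runtime address of L_pic_point
	leal	L_my_const-L_pic_point(%ebx),%ebx	# PIC address of the constant
	movl	(%ebx),%eax		# load it; no absolute relocation needed
	ret
.align	2,0x90
L_my_const:
.long	42

The capability word fetched this way is masked with $268437504 (0x10000800) and compared against $268435456 (0x10000000) to decide whether to branch to the new *rounds_alt key schedules, whose .byte sequences encode pshufb (102,15,56,0,...) and aesenclast (102,15,56,221,...) over the constants at Lkey_const instead of the aeskeygenassist (102,15,58,223,...) path.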
index 43fdb5a..6511c21 100644
@@ -17,6 +17,7 @@ IF @Version LT 800
 ELSE
 .text$ SEGMENT ALIGN(64) 'CODE'
 ENDIF
+;EXTERN        _OPENSSL_ia32cap_P:NEAR
 ALIGN  16
 _aesni_encrypt PROC PUBLIC
 $L_aesni_encrypt_begin::
@@ -36,7 +37,10 @@ DB   102,15,56,220,209
        lea     edx,DWORD PTR 16[edx]
        jnz     $L000enc1_loop_1
 DB     102,15,56,221,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR [eax],xmm2
+       pxor    xmm2,xmm2
        ret
 _aesni_encrypt ENDP
 ALIGN  16
@@ -58,7 +62,10 @@ DB   102,15,56,222,209
        lea     edx,DWORD PTR 16[edx]
        jnz     $L001dec1_loop_2
 DB     102,15,56,223,209
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR [eax],xmm2
+       pxor    xmm2,xmm2
        ret
 _aesni_decrypt ENDP
 ALIGN  16
@@ -265,17 +272,15 @@ DB        102,15,56,220,217
        neg     ecx
 DB     102,15,56,220,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR [ecx*1+edx]
        add     ecx,16
-DB     102,15,56,220,233
-DB     102,15,56,220,241
-DB     102,15,56,220,249
-       movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jmp     $L_aesni_encrypt6_enter
+       jmp     $L008_aesni_encrypt6_inner
 ALIGN  16
-$L008enc6_loop:
+$L009enc6_loop:
 DB     102,15,56,220,209
 DB     102,15,56,220,217
 DB     102,15,56,220,225
+$L008_aesni_encrypt6_inner:
 DB     102,15,56,220,233
 DB     102,15,56,220,241
 DB     102,15,56,220,249
@@ -289,7 +294,7 @@ DB  102,15,56,220,232
 DB     102,15,56,220,240
 DB     102,15,56,220,248
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L008enc6_loop
+       jnz     $L009enc6_loop
 DB     102,15,56,220,209
 DB     102,15,56,220,217
 DB     102,15,56,220,225
@@ -320,17 +325,15 @@ DB        102,15,56,222,217
        neg     ecx
 DB     102,15,56,222,225
        pxor    xmm7,xmm0
+       movups  xmm0,XMMWORD PTR [ecx*1+edx]
        add     ecx,16
-DB     102,15,56,222,233
-DB     102,15,56,222,241
-DB     102,15,56,222,249
-       movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jmp     $L_aesni_decrypt6_enter
+       jmp     $L010_aesni_decrypt6_inner
 ALIGN  16
-$L009dec6_loop:
+$L011dec6_loop:
 DB     102,15,56,222,209
 DB     102,15,56,222,217
 DB     102,15,56,222,225
+$L010_aesni_decrypt6_inner:
 DB     102,15,56,222,233
 DB     102,15,56,222,241
 DB     102,15,56,222,249
@@ -344,7 +347,7 @@ DB  102,15,56,222,232
 DB     102,15,56,222,240
 DB     102,15,56,222,248
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L009dec6_loop
+       jnz     $L011dec6_loop
 DB     102,15,56,222,209
 DB     102,15,56,222,217
 DB     102,15,56,222,225
@@ -372,14 +375,14 @@ $L_aesni_ecb_encrypt_begin::
        mov     edx,DWORD PTR 32[esp]
        mov     ebx,DWORD PTR 36[esp]
        and     eax,-16
-       jz      $L010ecb_ret
+       jz      $L012ecb_ret
        mov     ecx,DWORD PTR 240[edx]
        test    ebx,ebx
-       jz      $L011ecb_decrypt
+       jz      $L013ecb_decrypt
        mov     ebp,edx
        mov     ebx,ecx
        cmp     eax,96
-       jb      $L012ecb_enc_tail
+       jb      $L014ecb_enc_tail
        movdqu  xmm2,XMMWORD PTR [esi]
        movdqu  xmm3,XMMWORD PTR 16[esi]
        movdqu  xmm4,XMMWORD PTR 32[esi]
@@ -388,9 +391,9 @@ $L_aesni_ecb_encrypt_begin::
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
        sub     eax,96
-       jmp     $L013ecb_enc_loop6_enter
+       jmp     $L015ecb_enc_loop6_enter
 ALIGN  16
-$L014ecb_enc_loop6:
+$L016ecb_enc_loop6:
        movups  XMMWORD PTR [edi],xmm2
        movdqu  xmm2,XMMWORD PTR [esi]
        movups  XMMWORD PTR 16[edi],xmm3
@@ -405,12 +408,12 @@ $L014ecb_enc_loop6:
        lea     edi,DWORD PTR 96[edi]
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
-$L013ecb_enc_loop6_enter:
+$L015ecb_enc_loop6_enter:
        call    __aesni_encrypt6
        mov     edx,ebp
        mov     ecx,ebx
        sub     eax,96
-       jnc     $L014ecb_enc_loop6
+       jnc     $L016ecb_enc_loop6
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
@@ -419,18 +422,18 @@ $L013ecb_enc_loop6_enter:
        movups  XMMWORD PTR 80[edi],xmm7
        lea     edi,DWORD PTR 96[edi]
        add     eax,96
-       jz      $L010ecb_ret
-$L012ecb_enc_tail:
+       jz      $L012ecb_ret
+$L014ecb_enc_tail:
        movups  xmm2,XMMWORD PTR [esi]
        cmp     eax,32
-       jb      $L015ecb_enc_one
+       jb      $L017ecb_enc_one
        movups  xmm3,XMMWORD PTR 16[esi]
-       je      $L016ecb_enc_two
+       je      $L018ecb_enc_two
        movups  xmm4,XMMWORD PTR 32[esi]
        cmp     eax,64
-       jb      $L017ecb_enc_three
+       jb      $L019ecb_enc_three
        movups  xmm5,XMMWORD PTR 48[esi]
-       je      $L018ecb_enc_four
+       je      $L020ecb_enc_four
        movups  xmm6,XMMWORD PTR 64[esi]
        xorps   xmm7,xmm7
        call    __aesni_encrypt6
@@ -439,49 +442,49 @@ $L012ecb_enc_tail:
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L015ecb_enc_one:
+$L017ecb_enc_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L019enc1_loop_3:
+$L021enc1_loop_3:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L019enc1_loop_3
+       jnz     $L021enc1_loop_3
 DB     102,15,56,221,209
        movups  XMMWORD PTR [edi],xmm2
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L016ecb_enc_two:
+$L018ecb_enc_two:
        call    __aesni_encrypt2
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L017ecb_enc_three:
+$L019ecb_enc_three:
        call    __aesni_encrypt3
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L018ecb_enc_four:
+$L020ecb_enc_four:
        call    __aesni_encrypt4
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L011ecb_decrypt:
+$L013ecb_decrypt:
        mov     ebp,edx
        mov     ebx,ecx
        cmp     eax,96
-       jb      $L020ecb_dec_tail
+       jb      $L022ecb_dec_tail
        movdqu  xmm2,XMMWORD PTR [esi]
        movdqu  xmm3,XMMWORD PTR 16[esi]
        movdqu  xmm4,XMMWORD PTR 32[esi]
@@ -490,9 +493,9 @@ $L011ecb_decrypt:
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
        sub     eax,96
-       jmp     $L021ecb_dec_loop6_enter
+       jmp     $L023ecb_dec_loop6_enter
 ALIGN  16
-$L022ecb_dec_loop6:
+$L024ecb_dec_loop6:
        movups  XMMWORD PTR [edi],xmm2
        movdqu  xmm2,XMMWORD PTR [esi]
        movups  XMMWORD PTR 16[edi],xmm3
@@ -507,12 +510,12 @@ $L022ecb_dec_loop6:
        lea     edi,DWORD PTR 96[edi]
        movdqu  xmm7,XMMWORD PTR 80[esi]
        lea     esi,DWORD PTR 96[esi]
-$L021ecb_dec_loop6_enter:
+$L023ecb_dec_loop6_enter:
        call    __aesni_decrypt6
        mov     edx,ebp
        mov     ecx,ebx
        sub     eax,96
-       jnc     $L022ecb_dec_loop6
+       jnc     $L024ecb_dec_loop6
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
@@ -521,18 +524,18 @@ $L021ecb_dec_loop6_enter:
        movups  XMMWORD PTR 80[edi],xmm7
        lea     edi,DWORD PTR 96[edi]
        add     eax,96
-       jz      $L010ecb_ret
-$L020ecb_dec_tail:
+       jz      $L012ecb_ret
+$L022ecb_dec_tail:
        movups  xmm2,XMMWORD PTR [esi]
        cmp     eax,32
-       jb      $L023ecb_dec_one
+       jb      $L025ecb_dec_one
        movups  xmm3,XMMWORD PTR 16[esi]
-       je      $L024ecb_dec_two
+       je      $L026ecb_dec_two
        movups  xmm4,XMMWORD PTR 32[esi]
        cmp     eax,64
-       jb      $L025ecb_dec_three
+       jb      $L027ecb_dec_three
        movups  xmm5,XMMWORD PTR 48[esi]
-       je      $L026ecb_dec_four
+       je      $L028ecb_dec_four
        movups  xmm6,XMMWORD PTR 64[esi]
        xorps   xmm7,xmm7
        call    __aesni_decrypt6
@@ -541,43 +544,51 @@ $L020ecb_dec_tail:
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L023ecb_dec_one:
+$L025ecb_dec_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L027dec1_loop_4:
+$L029dec1_loop_4:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L027dec1_loop_4
+       jnz     $L029dec1_loop_4
 DB     102,15,56,223,209
        movups  XMMWORD PTR [edi],xmm2
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L024ecb_dec_two:
+$L026ecb_dec_two:
        call    __aesni_decrypt2
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L025ecb_dec_three:
+$L027ecb_dec_three:
        call    __aesni_decrypt3
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
-       jmp     $L010ecb_ret
+       jmp     $L012ecb_ret
 ALIGN  16
-$L026ecb_dec_four:
+$L028ecb_dec_four:
        call    __aesni_decrypt4
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
-$L010ecb_ret:
+$L012ecb_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        pop     edi
        pop     esi
        pop     ebx
@@ -622,7 +633,7 @@ $L_aesni_ccm64_encrypt_blocks_begin::
        lea     edx,DWORD PTR 32[ecx*1+edx]
        sub     ebx,ecx
 DB     102,15,56,0,253
-$L028ccm64_enc_outer:
+$L030ccm64_enc_outer:
        movups  xmm0,XMMWORD PTR [ebp]
        mov     ecx,ebx
        movups  xmm6,XMMWORD PTR [esi]
@@ -631,7 +642,7 @@ $L028ccm64_enc_outer:
        xorps   xmm0,xmm6
        xorps   xmm3,xmm0
        movups  xmm0,XMMWORD PTR 32[ebp]
-$L029ccm64_enc2_loop:
+$L031ccm64_enc2_loop:
 DB     102,15,56,220,209
 DB     102,15,56,220,217
        movups  xmm1,XMMWORD PTR [ecx*1+edx]
@@ -639,7 +650,7 @@ DB  102,15,56,220,217
 DB     102,15,56,220,208
 DB     102,15,56,220,216
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L029ccm64_enc2_loop
+       jnz     $L031ccm64_enc2_loop
 DB     102,15,56,220,209
 DB     102,15,56,220,217
        paddq   xmm7,XMMWORD PTR 16[esp]
@@ -652,10 +663,18 @@ DB        102,15,56,221,216
        movups  XMMWORD PTR [edi],xmm6
 DB     102,15,56,0,213
        lea     edi,DWORD PTR 16[edi]
-       jnz     $L028ccm64_enc_outer
+       jnz     $L030ccm64_enc_outer
        mov     esp,DWORD PTR 48[esp]
        mov     edi,DWORD PTR 40[esp]
        movups  XMMWORD PTR [edi],xmm3
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        pop     edi
        pop     esi
        pop     ebx
@@ -701,12 +720,12 @@ DB        102,15,56,0,253
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L030enc1_loop_5:
+$L032enc1_loop_5:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L030enc1_loop_5
+       jnz     $L032enc1_loop_5
 DB     102,15,56,221,209
        shl     ebx,4
        mov     ecx,16
@@ -716,16 +735,16 @@ DB        102,15,56,221,209
        sub     ecx,ebx
        lea     edx,DWORD PTR 32[ebx*1+ebp]
        mov     ebx,ecx
-       jmp     $L031ccm64_dec_outer
+       jmp     $L033ccm64_dec_outer
 ALIGN  16
-$L031ccm64_dec_outer:
+$L033ccm64_dec_outer:
        xorps   xmm6,xmm2
        movdqa  xmm2,xmm7
        movups  XMMWORD PTR [edi],xmm6
        lea     edi,DWORD PTR 16[edi]
 DB     102,15,56,0,213
        sub     eax,1
-       jz      $L032ccm64_dec_break
+       jz      $L034ccm64_dec_break
        movups  xmm0,XMMWORD PTR [ebp]
        mov     ecx,ebx
        movups  xmm1,XMMWORD PTR 16[ebp]
@@ -733,7 +752,7 @@ DB  102,15,56,0,213
        xorps   xmm2,xmm0
        xorps   xmm3,xmm6
        movups  xmm0,XMMWORD PTR 32[ebp]
-$L033ccm64_dec2_loop:
+$L035ccm64_dec2_loop:
 DB     102,15,56,220,209
 DB     102,15,56,220,217
        movups  xmm1,XMMWORD PTR [ecx*1+edx]
@@ -741,7 +760,7 @@ DB  102,15,56,220,217
 DB     102,15,56,220,208
 DB     102,15,56,220,216
        movups  xmm0,XMMWORD PTR [ecx*1+edx-16]
-       jnz     $L033ccm64_dec2_loop
+       jnz     $L035ccm64_dec2_loop
        movups  xmm6,XMMWORD PTR [esi]
        paddq   xmm7,XMMWORD PTR 16[esp]
 DB     102,15,56,220,209
@@ -749,9 +768,9 @@ DB  102,15,56,220,217
 DB     102,15,56,221,208
 DB     102,15,56,221,216
        lea     esi,QWORD PTR 16[esi]
-       jmp     $L031ccm64_dec_outer
+       jmp     $L033ccm64_dec_outer
 ALIGN  16
-$L032ccm64_dec_break:
+$L034ccm64_dec_break:
        mov     ecx,DWORD PTR 240[ebp]
        mov     edx,ebp
        movups  xmm0,XMMWORD PTR [edx]
@@ -759,16 +778,24 @@ $L032ccm64_dec_break:
        xorps   xmm6,xmm0
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm3,xmm6
-$L034enc1_loop_6:
+$L036enc1_loop_6:
 DB     102,15,56,220,217
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L034enc1_loop_6
+       jnz     $L036enc1_loop_6
 DB     102,15,56,221,217
        mov     esp,DWORD PTR 48[esp]
        mov     edi,DWORD PTR 40[esp]
        movups  XMMWORD PTR [edi],xmm3
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+       pxor    xmm7,xmm7
        pop     edi
        pop     esi
        pop     ebx
@@ -792,7 +819,7 @@ $L_aesni_ctr32_encrypt_blocks_begin::
        and     esp,-16
        mov     DWORD PTR 80[esp],ebp
        cmp     eax,1
-       je      $L035ctr32_one_shortcut
+       je      $L037ctr32_one_shortcut
        movdqu  xmm7,XMMWORD PTR [ebx]
        mov     DWORD PTR [esp],202182159
        mov     DWORD PTR 4[esp],134810123
@@ -830,7 +857,7 @@ DB  102,15,56,0,202
        pshufd  xmm2,xmm0,192
        pshufd  xmm3,xmm0,128
        cmp     eax,6
-       jb      $L036ctr32_tail
+       jb      $L038ctr32_tail
        pxor    xmm7,xmm6
        shl     ecx,4
        mov     ebx,16
@@ -839,9 +866,9 @@ DB  102,15,56,0,202
        sub     ebx,ecx
        lea     edx,DWORD PTR 32[ecx*1+edx]
        sub     eax,6
-       jmp     $L037ctr32_loop6
+       jmp     $L039ctr32_loop6
 ALIGN  16
-$L037ctr32_loop6:
+$L039ctr32_loop6:
        pshufd  xmm4,xmm0,64
        movdqa  xmm0,XMMWORD PTR 32[esp]
        pshufd  xmm5,xmm1,192
@@ -895,27 +922,27 @@ DB        102,15,56,0,202
        lea     edi,DWORD PTR 96[edi]
        pshufd  xmm3,xmm0,128
        sub     eax,6
-       jnc     $L037ctr32_loop6
+       jnc     $L039ctr32_loop6
        add     eax,6
-       jz      $L038ctr32_ret
+       jz      $L040ctr32_ret
        movdqu  xmm7,XMMWORD PTR [ebp]
        mov     edx,ebp
        pxor    xmm7,XMMWORD PTR 32[esp]
        mov     ecx,DWORD PTR 240[ebp]
-$L036ctr32_tail:
+$L038ctr32_tail:
        por     xmm2,xmm7
        cmp     eax,2
-       jb      $L039ctr32_one
+       jb      $L041ctr32_one
        pshufd  xmm4,xmm0,64
        por     xmm3,xmm7
-       je      $L040ctr32_two
+       je      $L042ctr32_two
        pshufd  xmm5,xmm1,192
        por     xmm4,xmm7
        cmp     eax,4
-       jb      $L041ctr32_three
+       jb      $L043ctr32_three
        pshufd  xmm6,xmm1,128
        por     xmm5,xmm7
-       je      $L042ctr32_four
+       je      $L044ctr32_four
        por     xmm6,xmm7
        call    __aesni_encrypt6
        movups  xmm1,XMMWORD PTR [esi]
@@ -933,29 +960,29 @@ $L036ctr32_tail:
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L035ctr32_one_shortcut:
+$L037ctr32_one_shortcut:
        movups  xmm2,XMMWORD PTR [ebx]
        mov     ecx,DWORD PTR 240[edx]
-$L039ctr32_one:
+$L041ctr32_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L043enc1_loop_7:
+$L045enc1_loop_7:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L043enc1_loop_7
+       jnz     $L045enc1_loop_7
 DB     102,15,56,221,209
        movups  xmm6,XMMWORD PTR [esi]
        xorps   xmm6,xmm2
        movups  XMMWORD PTR [edi],xmm6
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L040ctr32_two:
+$L042ctr32_two:
        call    __aesni_encrypt2
        movups  xmm5,XMMWORD PTR [esi]
        movups  xmm6,XMMWORD PTR 16[esi]
@@ -963,9 +990,9 @@ $L040ctr32_two:
        xorps   xmm3,xmm6
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L041ctr32_three:
+$L043ctr32_three:
        call    __aesni_encrypt3
        movups  xmm5,XMMWORD PTR [esi]
        movups  xmm6,XMMWORD PTR 16[esi]
@@ -976,9 +1003,9 @@ $L041ctr32_three:
        xorps   xmm4,xmm7
        movups  XMMWORD PTR 16[edi],xmm3
        movups  XMMWORD PTR 32[edi],xmm4
-       jmp     $L038ctr32_ret
+       jmp     $L040ctr32_ret
 ALIGN  16
-$L042ctr32_four:
+$L044ctr32_four:
        call    __aesni_encrypt4
        movups  xmm6,XMMWORD PTR [esi]
        movups  xmm7,XMMWORD PTR 16[esi]
@@ -992,7 +1019,18 @@ $L042ctr32_four:
        xorps   xmm5,xmm0
        movups  XMMWORD PTR 32[edi],xmm4
        movups  XMMWORD PTR 48[edi],xmm5
-$L038ctr32_ret:
+$L040ctr32_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       movdqa  XMMWORD PTR 32[esp],xmm0
+       pxor    xmm5,xmm5
+       movdqa  XMMWORD PTR 48[esp],xmm0
+       pxor    xmm6,xmm6
+       movdqa  XMMWORD PTR 64[esp],xmm0
+       pxor    xmm7,xmm7
        mov     esp,DWORD PTR 80[esp]
        pop     edi
        pop     esi
@@ -1015,12 +1053,12 @@ $L_aesni_xts_encrypt_begin::
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L044enc1_loop_8:
+$L046enc1_loop_8:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L044enc1_loop_8
+       jnz     $L046enc1_loop_8
 DB     102,15,56,221,209
        mov     esi,DWORD PTR 20[esp]
        mov     edi,DWORD PTR 24[esp]
@@ -1044,14 +1082,14 @@ DB      102,15,56,221,209
        mov     ebp,edx
        mov     ebx,ecx
        sub     eax,96
-       jc      $L045xts_enc_short
+       jc      $L047xts_enc_short
        shl     ecx,4
        mov     ebx,16
        sub     ebx,ecx
        lea     edx,DWORD PTR 32[ecx*1+edx]
-       jmp     $L046xts_enc_loop6
+       jmp     $L048xts_enc_loop6
 ALIGN  16
-$L046xts_enc_loop6:
+$L048xts_enc_loop6:
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  XMMWORD PTR [esp],xmm1
@@ -1140,23 +1178,23 @@ DB      102,15,56,220,249
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        sub     eax,96
-       jnc     $L046xts_enc_loop6
+       jnc     $L048xts_enc_loop6
        mov     ecx,DWORD PTR 240[ebp]
        mov     edx,ebp
        mov     ebx,ecx
-$L045xts_enc_short:
+$L047xts_enc_short:
        add     eax,96
-       jz      $L047xts_enc_done6x
+       jz      $L049xts_enc_done6x
        movdqa  xmm5,xmm1
        cmp     eax,32
-       jb      $L048xts_enc_one
+       jb      $L050xts_enc_one
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        paddq   xmm1,xmm1
        pand    xmm2,xmm3
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
-       je      $L049xts_enc_two
+       je      $L051xts_enc_two
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm6,xmm1
@@ -1165,7 +1203,7 @@ $L045xts_enc_short:
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        cmp     eax,64
-       jb      $L050xts_enc_three
+       jb      $L052xts_enc_three
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm7,xmm1
@@ -1175,7 +1213,7 @@ $L045xts_enc_short:
        pxor    xmm1,xmm2
        movdqa  XMMWORD PTR [esp],xmm5
        movdqa  XMMWORD PTR 16[esp],xmm6
-       je      $L051xts_enc_four
+       je      $L053xts_enc_four
        movdqa  XMMWORD PTR 32[esp],xmm7
        pshufd  xmm7,xmm0,19
        movdqa  XMMWORD PTR 48[esp],xmm1
@@ -1207,9 +1245,9 @@ $L045xts_enc_short:
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
        lea     edi,DWORD PTR 80[edi]
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L048xts_enc_one:
+$L050xts_enc_one:
        movups  xmm2,XMMWORD PTR [esi]
        lea     esi,DWORD PTR 16[esi]
        xorps   xmm2,xmm5
@@ -1217,20 +1255,20 @@ $L048xts_enc_one:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L053enc1_loop_9:
+$L055enc1_loop_9:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L053enc1_loop_9
+       jnz     $L055enc1_loop_9
 DB     102,15,56,221,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
        movdqa  xmm1,xmm5
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L049xts_enc_two:
+$L051xts_enc_two:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1244,9 +1282,9 @@ $L049xts_enc_two:
        movups  XMMWORD PTR 16[edi],xmm3
        lea     edi,DWORD PTR 32[edi]
        movdqa  xmm1,xmm6
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L050xts_enc_three:
+$L052xts_enc_three:
        movaps  xmm7,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1264,9 +1302,9 @@ $L050xts_enc_three:
        movups  XMMWORD PTR 32[edi],xmm4
        lea     edi,DWORD PTR 48[edi]
        movdqa  xmm1,xmm7
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L051xts_enc_four:
+$L053xts_enc_four:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1288,28 +1326,28 @@ $L051xts_enc_four:
        movups  XMMWORD PTR 48[edi],xmm5
        lea     edi,DWORD PTR 64[edi]
        movdqa  xmm1,xmm6
-       jmp     $L052xts_enc_done
+       jmp     $L054xts_enc_done
 ALIGN  16
-$L047xts_enc_done6x:
+$L049xts_enc_done6x:
        mov     eax,DWORD PTR 112[esp]
        and     eax,15
-       jz      $L054xts_enc_ret
+       jz      $L056xts_enc_ret
        movdqa  xmm5,xmm1
        mov     DWORD PTR 112[esp],eax
-       jmp     $L055xts_enc_steal
+       jmp     $L057xts_enc_steal
 ALIGN  16
-$L052xts_enc_done:
+$L054xts_enc_done:
        mov     eax,DWORD PTR 112[esp]
        pxor    xmm0,xmm0
        and     eax,15
-       jz      $L054xts_enc_ret
+       jz      $L056xts_enc_ret
        pcmpgtd xmm0,xmm1
        mov     DWORD PTR 112[esp],eax
        pshufd  xmm5,xmm0,19
        paddq   xmm1,xmm1
        pand    xmm5,XMMWORD PTR 96[esp]
        pxor    xmm5,xmm1
-$L055xts_enc_steal:
+$L057xts_enc_steal:
        movzx   ecx,BYTE PTR [esi]
        movzx   edx,BYTE PTR [edi-16]
        lea     esi,DWORD PTR 1[esi]
@@ -1317,7 +1355,7 @@ $L055xts_enc_steal:
        mov     BYTE PTR [edi],dl
        lea     edi,DWORD PTR 1[edi]
        sub     eax,1
-       jnz     $L055xts_enc_steal
+       jnz     $L057xts_enc_steal
        sub     edi,DWORD PTR 112[esp]
        mov     edx,ebp
        mov     ecx,ebx
@@ -1327,16 +1365,30 @@ $L055xts_enc_steal:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L056enc1_loop_10:
+$L058enc1_loop_10:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L056enc1_loop_10
+       jnz     $L058enc1_loop_10
 DB     102,15,56,221,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi-16],xmm2
-$L054xts_enc_ret:
+$L056xts_enc_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       movdqa  XMMWORD PTR [esp],xmm0
+       pxor    xmm3,xmm3
+       movdqa  XMMWORD PTR 16[esp],xmm0
+       pxor    xmm4,xmm4
+       movdqa  XMMWORD PTR 32[esp],xmm0
+       pxor    xmm5,xmm5
+       movdqa  XMMWORD PTR 48[esp],xmm0
+       pxor    xmm6,xmm6
+       movdqa  XMMWORD PTR 64[esp],xmm0
+       pxor    xmm7,xmm7
+       movdqa  XMMWORD PTR 80[esp],xmm0
        mov     esp,DWORD PTR 116[esp]
        pop     edi
        pop     esi
@@ -1359,12 +1411,12 @@ $L_aesni_xts_decrypt_begin::
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L057enc1_loop_11:
+$L059enc1_loop_11:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L057enc1_loop_11
+       jnz     $L059enc1_loop_11
 DB     102,15,56,221,209
        mov     esi,DWORD PTR 20[esp]
        mov     edi,DWORD PTR 24[esp]
@@ -1393,14 +1445,14 @@ DB      102,15,56,221,209
        pcmpgtd xmm0,xmm1
        and     eax,-16
        sub     eax,96
-       jc      $L058xts_dec_short
+       jc      $L060xts_dec_short
        shl     ecx,4
        mov     ebx,16
        sub     ebx,ecx
        lea     edx,DWORD PTR 32[ecx*1+edx]
-       jmp     $L059xts_dec_loop6
+       jmp     $L061xts_dec_loop6
 ALIGN  16
-$L059xts_dec_loop6:
+$L061xts_dec_loop6:
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  XMMWORD PTR [esp],xmm1
@@ -1489,23 +1541,23 @@ DB      102,15,56,222,249
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        sub     eax,96
-       jnc     $L059xts_dec_loop6
+       jnc     $L061xts_dec_loop6
        mov     ecx,DWORD PTR 240[ebp]
        mov     edx,ebp
        mov     ebx,ecx
-$L058xts_dec_short:
+$L060xts_dec_short:
        add     eax,96
-       jz      $L060xts_dec_done6x
+       jz      $L062xts_dec_done6x
        movdqa  xmm5,xmm1
        cmp     eax,32
-       jb      $L061xts_dec_one
+       jb      $L063xts_dec_one
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        paddq   xmm1,xmm1
        pand    xmm2,xmm3
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
-       je      $L062xts_dec_two
+       je      $L064xts_dec_two
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm6,xmm1
@@ -1514,7 +1566,7 @@ $L058xts_dec_short:
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
        cmp     eax,64
-       jb      $L063xts_dec_three
+       jb      $L065xts_dec_three
        pshufd  xmm2,xmm0,19
        pxor    xmm0,xmm0
        movdqa  xmm7,xmm1
@@ -1524,7 +1576,7 @@ $L058xts_dec_short:
        pxor    xmm1,xmm2
        movdqa  XMMWORD PTR [esp],xmm5
        movdqa  XMMWORD PTR 16[esp],xmm6
-       je      $L064xts_dec_four
+       je      $L066xts_dec_four
        movdqa  XMMWORD PTR 32[esp],xmm7
        pshufd  xmm7,xmm0,19
        movdqa  XMMWORD PTR 48[esp],xmm1
@@ -1556,9 +1608,9 @@ $L058xts_dec_short:
        movups  XMMWORD PTR 48[edi],xmm5
        movups  XMMWORD PTR 64[edi],xmm6
        lea     edi,DWORD PTR 80[edi]
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L061xts_dec_one:
+$L063xts_dec_one:
        movups  xmm2,XMMWORD PTR [esi]
        lea     esi,DWORD PTR 16[esi]
        xorps   xmm2,xmm5
@@ -1566,20 +1618,20 @@ $L061xts_dec_one:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L066dec1_loop_12:
+$L068dec1_loop_12:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L066dec1_loop_12
+       jnz     $L068dec1_loop_12
 DB     102,15,56,223,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
        movdqa  xmm1,xmm5
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L062xts_dec_two:
+$L064xts_dec_two:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1593,9 +1645,9 @@ $L062xts_dec_two:
        movups  XMMWORD PTR 16[edi],xmm3
        lea     edi,DWORD PTR 32[edi]
        movdqa  xmm1,xmm6
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L063xts_dec_three:
+$L065xts_dec_three:
        movaps  xmm7,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1613,9 +1665,9 @@ $L063xts_dec_three:
        movups  XMMWORD PTR 32[edi],xmm4
        lea     edi,DWORD PTR 48[edi]
        movdqa  xmm1,xmm7
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L064xts_dec_four:
+$L066xts_dec_four:
        movaps  xmm6,xmm1
        movups  xmm2,XMMWORD PTR [esi]
        movups  xmm3,XMMWORD PTR 16[esi]
@@ -1637,20 +1689,20 @@ $L064xts_dec_four:
        movups  XMMWORD PTR 48[edi],xmm5
        lea     edi,DWORD PTR 64[edi]
        movdqa  xmm1,xmm6
-       jmp     $L065xts_dec_done
+       jmp     $L067xts_dec_done
 ALIGN  16
-$L060xts_dec_done6x:
+$L062xts_dec_done6x:
        mov     eax,DWORD PTR 112[esp]
        and     eax,15
-       jz      $L067xts_dec_ret
+       jz      $L069xts_dec_ret
        mov     DWORD PTR 112[esp],eax
-       jmp     $L068xts_dec_only_one_more
+       jmp     $L070xts_dec_only_one_more
 ALIGN  16
-$L065xts_dec_done:
+$L067xts_dec_done:
        mov     eax,DWORD PTR 112[esp]
        pxor    xmm0,xmm0
        and     eax,15
-       jz      $L067xts_dec_ret
+       jz      $L069xts_dec_ret
        pcmpgtd xmm0,xmm1
        mov     DWORD PTR 112[esp],eax
        pshufd  xmm2,xmm0,19
@@ -1660,7 +1712,7 @@ $L065xts_dec_done:
        pand    xmm2,xmm3
        pcmpgtd xmm0,xmm1
        pxor    xmm1,xmm2
-$L068xts_dec_only_one_more:
+$L070xts_dec_only_one_more:
        pshufd  xmm5,xmm0,19
        movdqa  xmm6,xmm1
        paddq   xmm1,xmm1
@@ -1674,16 +1726,16 @@ $L068xts_dec_only_one_more:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L069dec1_loop_13:
+$L071dec1_loop_13:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L069dec1_loop_13
+       jnz     $L071dec1_loop_13
 DB     102,15,56,223,209
        xorps   xmm2,xmm5
        movups  XMMWORD PTR [edi],xmm2
-$L070xts_dec_steal:
+$L072xts_dec_steal:
        movzx   ecx,BYTE PTR 16[esi]
        movzx   edx,BYTE PTR [edi]
        lea     esi,DWORD PTR 1[esi]
@@ -1691,7 +1743,7 @@ $L070xts_dec_steal:
        mov     BYTE PTR 16[edi],dl
        lea     edi,DWORD PTR 1[edi]
        sub     eax,1
-       jnz     $L070xts_dec_steal
+       jnz     $L072xts_dec_steal
        sub     edi,DWORD PTR 112[esp]
        mov     edx,ebp
        mov     ecx,ebx
@@ -1701,16 +1753,30 @@ $L070xts_dec_steal:
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L071dec1_loop_14:
+$L073dec1_loop_14:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L071dec1_loop_14
+       jnz     $L073dec1_loop_14
 DB     102,15,56,223,209
        xorps   xmm2,xmm6
        movups  XMMWORD PTR [edi],xmm2
-$L067xts_dec_ret:
+$L069xts_dec_ret:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       movdqa  XMMWORD PTR [esp],xmm0
+       pxor    xmm3,xmm3
+       movdqa  XMMWORD PTR 16[esp],xmm0
+       pxor    xmm4,xmm4
+       movdqa  XMMWORD PTR 32[esp],xmm0
+       pxor    xmm5,xmm5
+       movdqa  XMMWORD PTR 48[esp],xmm0
+       pxor    xmm6,xmm6
+       movdqa  XMMWORD PTR 64[esp],xmm0
+       pxor    xmm7,xmm7
+       movdqa  XMMWORD PTR 80[esp],xmm0
        mov     esp,DWORD PTR 116[esp]
        pop     edi
        pop     esi
@@ -1734,7 +1800,7 @@ $L_aesni_cbc_encrypt_begin::
        mov     edx,DWORD PTR 32[esp]
        mov     ebp,DWORD PTR 36[esp]
        test    eax,eax
-       jz      $L072cbc_abort
+       jz      $L074cbc_abort
        cmp     DWORD PTR 40[esp],0
        xchg    ebx,esp
        movups  xmm7,XMMWORD PTR [ebp]
@@ -1742,14 +1808,14 @@ $L_aesni_cbc_encrypt_begin::
        mov     ebp,edx
        mov     DWORD PTR 16[esp],ebx
        mov     ebx,ecx
-       je      $L073cbc_decrypt
+       je      $L075cbc_decrypt
        movaps  xmm2,xmm7
        cmp     eax,16
-       jb      $L074cbc_enc_tail
+       jb      $L076cbc_enc_tail
        sub     eax,16
-       jmp     $L075cbc_enc_loop
+       jmp     $L077cbc_enc_loop
 ALIGN  16
-$L075cbc_enc_loop:
+$L077cbc_enc_loop:
        movups  xmm7,XMMWORD PTR [esi]
        lea     esi,DWORD PTR 16[esi]
        movups  xmm0,XMMWORD PTR [edx]
@@ -1757,24 +1823,25 @@ $L075cbc_enc_loop:
        xorps   xmm7,xmm0
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm7
-$L076enc1_loop_15:
+$L078enc1_loop_15:
 DB     102,15,56,220,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L076enc1_loop_15
+       jnz     $L078enc1_loop_15
 DB     102,15,56,221,209
        mov     ecx,ebx
        mov     edx,ebp
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
        sub     eax,16
-       jnc     $L075cbc_enc_loop
+       jnc     $L077cbc_enc_loop
        add     eax,16
-       jnz     $L074cbc_enc_tail
+       jnz     $L076cbc_enc_tail
        movaps  xmm7,xmm2
-       jmp     $L077cbc_ret
-$L074cbc_enc_tail:
+       pxor    xmm2,xmm2
+       jmp     $L079cbc_ret
+$L076cbc_enc_tail:
        mov     ecx,eax
 DD     2767451785
        mov     ecx,16
@@ -1785,20 +1852,20 @@ DD      2868115081
        mov     ecx,ebx
        mov     esi,edi
        mov     edx,ebp
-       jmp     $L075cbc_enc_loop
+       jmp     $L077cbc_enc_loop
 ALIGN  16
-$L073cbc_decrypt:
+$L075cbc_decrypt:
        cmp     eax,80
-       jbe     $L078cbc_dec_tail
+       jbe     $L080cbc_dec_tail
        movaps  XMMWORD PTR [esp],xmm7
        sub     eax,80
-       jmp     $L079cbc_dec_loop6_enter
+       jmp     $L081cbc_dec_loop6_enter
 ALIGN  16
-$L080cbc_dec_loop6:
+$L082cbc_dec_loop6:
        movaps  XMMWORD PTR [esp],xmm0
        movups  XMMWORD PTR [edi],xmm7
        lea     edi,DWORD PTR 16[edi]
-$L079cbc_dec_loop6_enter:
+$L081cbc_dec_loop6_enter:
        movdqu  xmm2,XMMWORD PTR [esi]
        movdqu  xmm3,XMMWORD PTR 16[esi]
        movdqu  xmm4,XMMWORD PTR 32[esi]
@@ -1828,28 +1895,28 @@ $L079cbc_dec_loop6_enter:
        movups  XMMWORD PTR 64[edi],xmm6
        lea     edi,DWORD PTR 80[edi]
        sub     eax,96
-       ja      $L080cbc_dec_loop6
+       ja      $L082cbc_dec_loop6
        movaps  xmm2,xmm7
        movaps  xmm7,xmm0
        add     eax,80
-       jle     $L081cbc_dec_tail_collected
+       jle     $L083cbc_dec_clear_tail_collected
        movups  XMMWORD PTR [edi],xmm2
        lea     edi,DWORD PTR 16[edi]
-$L078cbc_dec_tail:
+$L080cbc_dec_tail:
        movups  xmm2,XMMWORD PTR [esi]
        movaps  xmm6,xmm2
        cmp     eax,16
-       jbe     $L082cbc_dec_one
+       jbe     $L084cbc_dec_one
        movups  xmm3,XMMWORD PTR 16[esi]
        movaps  xmm5,xmm3
        cmp     eax,32
-       jbe     $L083cbc_dec_two
+       jbe     $L085cbc_dec_two
        movups  xmm4,XMMWORD PTR 32[esi]
        cmp     eax,48
-       jbe     $L084cbc_dec_three
+       jbe     $L086cbc_dec_three
        movups  xmm5,XMMWORD PTR 48[esi]
        cmp     eax,64
-       jbe     $L085cbc_dec_four
+       jbe     $L087cbc_dec_four
        movups  xmm6,XMMWORD PTR 64[esi]
        movaps  XMMWORD PTR [esp],xmm7
        movups  xmm2,XMMWORD PTR [esi]
@@ -1867,55 +1934,62 @@ $L078cbc_dec_tail:
        xorps   xmm6,xmm0
        movups  XMMWORD PTR [edi],xmm2
        movups  XMMWORD PTR 16[edi],xmm3
+       pxor    xmm3,xmm3
        movups  XMMWORD PTR 32[edi],xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR 48[edi],xmm5
+       pxor    xmm5,xmm5
        lea     edi,DWORD PTR 64[edi]
        movaps  xmm2,xmm6
+       pxor    xmm6,xmm6
        sub     eax,80
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L082cbc_dec_one:
+$L084cbc_dec_one:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR 16[edx]
        lea     edx,DWORD PTR 32[edx]
        xorps   xmm2,xmm0
-$L086dec1_loop_16:
+$L089dec1_loop_16:
 DB     102,15,56,222,209
        dec     ecx
        movups  xmm1,XMMWORD PTR [edx]
        lea     edx,DWORD PTR 16[edx]
-       jnz     $L086dec1_loop_16
+       jnz     $L089dec1_loop_16
 DB     102,15,56,223,209
        xorps   xmm2,xmm7
        movaps  xmm7,xmm6
        sub     eax,16
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L083cbc_dec_two:
+$L085cbc_dec_two:
        call    __aesni_decrypt2
        xorps   xmm2,xmm7
        xorps   xmm3,xmm6
        movups  XMMWORD PTR [edi],xmm2
        movaps  xmm2,xmm3
+       pxor    xmm3,xmm3
        lea     edi,DWORD PTR 16[edi]
        movaps  xmm7,xmm5
        sub     eax,32
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L084cbc_dec_three:
+$L086cbc_dec_three:
        call    __aesni_decrypt3
        xorps   xmm2,xmm7
        xorps   xmm3,xmm6
        xorps   xmm4,xmm5
        movups  XMMWORD PTR [edi],xmm2
        movaps  xmm2,xmm4
+       pxor    xmm4,xmm4
        movups  XMMWORD PTR 16[edi],xmm3
+       pxor    xmm3,xmm3
        lea     edi,DWORD PTR 32[edi]
        movups  xmm7,XMMWORD PTR 32[esi]
        sub     eax,48
-       jmp     $L081cbc_dec_tail_collected
+       jmp     $L088cbc_dec_tail_collected
 ALIGN  16
-$L085cbc_dec_four:
+$L087cbc_dec_four:
        call    __aesni_decrypt4
        movups  xmm1,XMMWORD PTR 16[esi]
        movups  xmm0,XMMWORD PTR 32[esi]
@@ -1925,28 +1999,44 @@ $L085cbc_dec_four:
        movups  XMMWORD PTR [edi],xmm2
        xorps   xmm4,xmm1
        movups  XMMWORD PTR 16[edi],xmm3
+       pxor    xmm3,xmm3
        xorps   xmm5,xmm0
        movups  XMMWORD PTR 32[edi],xmm4
+       pxor    xmm4,xmm4
        lea     edi,DWORD PTR 48[edi]
        movaps  xmm2,xmm5
+       pxor    xmm5,xmm5
        sub     eax,64
-$L081cbc_dec_tail_collected:
+       jmp     $L088cbc_dec_tail_collected
+ALIGN  16
+$L083cbc_dec_clear_tail_collected:
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       pxor    xmm6,xmm6
+$L088cbc_dec_tail_collected:
        and     eax,15
-       jnz     $L087cbc_dec_tail_partial
+       jnz     $L090cbc_dec_tail_partial
        movups  XMMWORD PTR [edi],xmm2
-       jmp     $L077cbc_ret
+       pxor    xmm0,xmm0
+       jmp     $L079cbc_ret
 ALIGN  16
-$L087cbc_dec_tail_partial:
+$L090cbc_dec_tail_partial:
        movaps  XMMWORD PTR [esp],xmm2
+       pxor    xmm0,xmm0
        mov     ecx,16
        mov     esi,esp
        sub     ecx,eax
 DD     2767451785
-$L077cbc_ret:
+       movdqa  XMMWORD PTR [esp],xmm2
+$L079cbc_ret:
        mov     esp,DWORD PTR 16[esp]
        mov     ebp,DWORD PTR 36[esp]
+       pxor    xmm2,xmm2
+       pxor    xmm1,xmm1
        movups  XMMWORD PTR [ebp],xmm7
-$L072cbc_abort:
+       pxor    xmm7,xmm7
+$L074cbc_abort:
        pop     edi
        pop     esi
        pop     ebx
@@ -1955,52 +2045,62 @@ $L072cbc_abort:
 _aesni_cbc_encrypt ENDP
 ALIGN  16
 __aesni_set_encrypt_key        PROC PRIVATE
+       push    ebp
+       push    ebx
        test    eax,eax
-       jz      $L088bad_pointer
+       jz      $L091bad_pointer
        test    edx,edx
-       jz      $L088bad_pointer
+       jz      $L091bad_pointer
+       call    $L092pic
+$L092pic:
+       pop     ebx
+       lea     ebx,DWORD PTR ($Lkey_const-$L092pic)[ebx]
+       lea     ebp,DWORD PTR _OPENSSL_ia32cap_P
        movups  xmm0,XMMWORD PTR [eax]
        xorps   xmm4,xmm4
+       mov     ebp,DWORD PTR 4[ebp]
        lea     edx,DWORD PTR 16[edx]
+       and     ebp,268437504
        cmp     ecx,256
-       je      $L08914rounds
+       je      $L09314rounds
        cmp     ecx,192
-       je      $L09012rounds
+       je      $L09412rounds
        cmp     ecx,128
-       jne     $L091bad_keybits
+       jne     $L095bad_keybits
 ALIGN  16
-$L09210rounds:
+$L09610rounds:
+       cmp     ebp,268435456
+       je      $L09710rounds_alt
        mov     ecx,9
        movups  XMMWORD PTR [edx-16],xmm0
 DB     102,15,58,223,200,1
-       call    $L093key_128_cold
+       call    $L098key_128_cold
 DB     102,15,58,223,200,2
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,4
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,8
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,16
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,32
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,64
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,128
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,27
-       call    $L094key_128
+       call    $L099key_128
 DB     102,15,58,223,200,54
-       call    $L094key_128
+       call    $L099key_128
        movups  XMMWORD PTR [edx],xmm0
        mov     DWORD PTR 80[edx],ecx
-       xor     eax,eax
-       ret
+       jmp     $L100good_key
 ALIGN  16
-$L094key_128:
+$L099key_128:
        movups  XMMWORD PTR [edx],xmm0
        lea     edx,DWORD PTR 16[edx]
-$L093key_128_cold:
+$L098key_128_cold:
        shufps  xmm4,xmm0,16
        xorps   xmm0,xmm4
        shufps  xmm4,xmm0,140
@@ -2009,38 +2109,91 @@ $L093key_128_cold:
        xorps   xmm0,xmm1
        ret
 ALIGN  16
-$L09012rounds:
+$L09710rounds_alt:
+       movdqa  xmm5,XMMWORD PTR [ebx]
+       mov     ecx,8
+       movdqa  xmm4,XMMWORD PTR 32[ebx]
+       movdqa  xmm2,xmm0
+       movdqu  XMMWORD PTR [edx-16],xmm0
+$L101loop_key128:
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+       lea     edx,DWORD PTR 16[edx]
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR [edx-16],xmm0
+       movdqa  xmm2,xmm0
+       dec     ecx
+       jnz     $L101loop_key128
+       movdqa  xmm4,XMMWORD PTR 48[ebx]
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       pslld   xmm4,1
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR [edx],xmm0
+       movdqa  xmm2,xmm0
+DB     102,15,56,0,197
+DB     102,15,56,221,196
+       movdqa  xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm3,xmm2
+       pslldq  xmm2,4
+       pxor    xmm2,xmm3
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR 16[edx],xmm0
+       mov     ecx,9
+       mov     DWORD PTR 96[edx],ecx
+       jmp     $L100good_key
+ALIGN  16
+$L09412rounds:
        movq    xmm2,QWORD PTR 16[eax]
+       cmp     ebp,268435456
+       je      $L10212rounds_alt
        mov     ecx,11
        movups  XMMWORD PTR [edx-16],xmm0
 DB     102,15,58,223,202,1
-       call    $L095key_192a_cold
+       call    $L103key_192a_cold
 DB     102,15,58,223,202,2
-       call    $L096key_192b
+       call    $L104key_192b
 DB     102,15,58,223,202,4
-       call    $L097key_192a
+       call    $L105key_192a
 DB     102,15,58,223,202,8
-       call    $L096key_192b
+       call    $L104key_192b
 DB     102,15,58,223,202,16
-       call    $L097key_192a
+       call    $L105key_192a
 DB     102,15,58,223,202,32
-       call    $L096key_192b
+       call    $L104key_192b
 DB     102,15,58,223,202,64
-       call    $L097key_192a
+       call    $L105key_192a
 DB     102,15,58,223,202,128
-       call    $L096key_192b
+       call    $L104key_192b
        movups  XMMWORD PTR [edx],xmm0
        mov     DWORD PTR 48[edx],ecx
-       xor     eax,eax
-       ret
+       jmp     $L100good_key
 ALIGN  16
-$L097key_192a:
+$L105key_192a:
        movups  XMMWORD PTR [edx],xmm0
        lea     edx,DWORD PTR 16[edx]
 ALIGN  16
-$L095key_192a_cold:
+$L103key_192a_cold:
        movaps  xmm5,xmm2
-$L098key_192b_warm:
+$L106key_192b_warm:
        shufps  xmm4,xmm0,16
        movdqa  xmm3,xmm2
        xorps   xmm0,xmm4
@@ -2054,56 +2207,90 @@ $L098key_192b_warm:
        pxor    xmm2,xmm3
        ret
 ALIGN  16
-$L096key_192b:
+$L104key_192b:
        movaps  xmm3,xmm0
        shufps  xmm5,xmm0,68
        movups  XMMWORD PTR [edx],xmm5
        shufps  xmm3,xmm2,78
        movups  XMMWORD PTR 16[edx],xmm3
        lea     edx,DWORD PTR 32[edx]
-       jmp     $L098key_192b_warm
+       jmp     $L106key_192b_warm
+ALIGN  16
+$L10212rounds_alt:
+       movdqa  xmm5,XMMWORD PTR 16[ebx]
+       movdqa  xmm4,XMMWORD PTR 32[ebx]
+       mov     ecx,8
+       movdqu  XMMWORD PTR [edx-16],xmm0
+$L107loop_key192:
+       movq    QWORD PTR [edx],xmm2
+       movdqa  xmm1,xmm2
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+       pslld   xmm4,1
+       lea     edx,DWORD PTR 24[edx]
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+       pshufd  xmm3,xmm0,255
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pxor    xmm0,xmm2
+       pxor    xmm2,xmm3
+       movdqu  XMMWORD PTR [edx-16],xmm0
+       dec     ecx
+       jnz     $L107loop_key192
+       mov     ecx,11
+       mov     DWORD PTR 32[edx],ecx
+       jmp     $L100good_key
 ALIGN  16
-$L08914rounds:
+$L09314rounds:
        movups  xmm2,XMMWORD PTR 16[eax]
-       mov     ecx,13
        lea     edx,DWORD PTR 16[edx]
+       cmp     ebp,268435456
+       je      $L10814rounds_alt
+       mov     ecx,13
        movups  XMMWORD PTR [edx-32],xmm0
        movups  XMMWORD PTR [edx-16],xmm2
 DB     102,15,58,223,202,1
-       call    $L099key_256a_cold
+       call    $L109key_256a_cold
 DB     102,15,58,223,200,1
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,2
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,2
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,4
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,4
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,8
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,8
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,16
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,16
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,32
-       call    $L101key_256a
+       call    $L111key_256a
 DB     102,15,58,223,200,32
-       call    $L100key_256b
+       call    $L110key_256b
 DB     102,15,58,223,202,64
-       call    $L101key_256a
+       call    $L111key_256a
        movups  XMMWORD PTR [edx],xmm0
        mov     DWORD PTR 16[edx],ecx
        xor     eax,eax
-       ret
+       jmp     $L100good_key
 ALIGN  16
-$L101key_256a:
+$L111key_256a:
        movups  XMMWORD PTR [edx],xmm2
        lea     edx,DWORD PTR 16[edx]
-$L099key_256a_cold:
+$L109key_256a_cold:
        shufps  xmm4,xmm0,16
        xorps   xmm0,xmm4
        shufps  xmm4,xmm0,140
@@ -2112,7 +2299,7 @@ $L099key_256a_cold:
        xorps   xmm0,xmm1
        ret
 ALIGN  16
-$L100key_256b:
+$L110key_256b:
        movups  XMMWORD PTR [edx],xmm0
        lea     edx,DWORD PTR 16[edx]
        shufps  xmm4,xmm2,16
@@ -2122,13 +2309,70 @@ $L100key_256b:
        shufps  xmm1,xmm1,170
        xorps   xmm2,xmm1
        ret
+ALIGN  16
+$L10814rounds_alt:
+       movdqa  xmm5,XMMWORD PTR [ebx]
+       movdqa  xmm4,XMMWORD PTR 32[ebx]
+       mov     ecx,7
+       movdqu  XMMWORD PTR [edx-32],xmm0
+       movdqa  xmm1,xmm2
+       movdqu  XMMWORD PTR [edx-16],xmm2
+$L112loop_key256:
+DB     102,15,56,0,213
+DB     102,15,56,221,212
+       movdqa  xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm3,xmm0
+       pslldq  xmm0,4
+       pxor    xmm0,xmm3
+       pslld   xmm4,1
+       pxor    xmm0,xmm2
+       movdqu  XMMWORD PTR [edx],xmm0
+       dec     ecx
+       jz      $L113done_key256
+       pshufd  xmm2,xmm0,255
+       pxor    xmm3,xmm3
+DB     102,15,56,221,211
+       movdqa  xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm3,xmm1
+       pslldq  xmm1,4
+       pxor    xmm1,xmm3
+       pxor    xmm2,xmm1
+       movdqu  XMMWORD PTR 16[edx],xmm2
+       lea     edx,DWORD PTR 32[edx]
+       movdqa  xmm1,xmm2
+       jmp     $L112loop_key256
+$L113done_key256:
+       mov     ecx,13
+       mov     DWORD PTR 16[edx],ecx
+$L100good_key:
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       xor     eax,eax
+       pop     ebx
+       pop     ebp
+       ret
 ALIGN  4
-$L088bad_pointer:
+$L091bad_pointer:
        mov     eax,-1
+       pop     ebx
+       pop     ebp
        ret
 ALIGN  4
-$L091bad_keybits:
+$L095bad_keybits:
+       pxor    xmm0,xmm0
        mov     eax,-2
+       pop     ebx
+       pop     ebp
        ret
 __aesni_set_encrypt_key ENDP
 ALIGN  16
@@ -2150,7 +2394,7 @@ $L_aesni_set_decrypt_key_begin::
        mov     edx,DWORD PTR 12[esp]
        shl     ecx,4
        test    eax,eax
-       jnz     $L102dec_key_ret
+       jnz     $L114dec_key_ret
        lea     eax,DWORD PTR 16[ecx*1+edx]
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR [eax]
@@ -2158,7 +2402,7 @@ $L_aesni_set_decrypt_key_begin::
        movups  XMMWORD PTR [edx],xmm1
        lea     edx,DWORD PTR 16[edx]
        lea     eax,DWORD PTR [eax-16]
-$L103dec_key_inverse:
+$L115dec_key_inverse:
        movups  xmm0,XMMWORD PTR [edx]
        movups  xmm1,XMMWORD PTR [eax]
 DB     102,15,56,219,192
@@ -2168,17 +2412,28 @@ DB      102,15,56,219,201
        movups  XMMWORD PTR 16[eax],xmm0
        movups  XMMWORD PTR [edx-16],xmm1
        cmp     eax,edx
-       ja      $L103dec_key_inverse
+       ja      $L115dec_key_inverse
        movups  xmm0,XMMWORD PTR [edx]
 DB     102,15,56,219,192
        movups  XMMWORD PTR [edx],xmm0
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
        xor     eax,eax
-$L102dec_key_ret:
+$L114dec_key_ret:
        ret
 _aesni_set_decrypt_key ENDP
+ALIGN  64
+$Lkey_const::
+DD     202313229,202313229,202313229,202313229
+DD     67569157,67569157,67569157,67569157
+DD     1,1,1,1
+DD     27,27,27,27
 DB     65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
 DB     83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 DB     32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 DB     115,108,46,111,114,103,62,0
 .text$ ENDS
+.bss   SEGMENT 'BSS'
+COMM   _OPENSSL_ia32cap_P:DWORD:4
+.bss   ENDS
 END
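
Apart from the mechanical label renumbering ($L010ecb_ret becoming $L012ecb_ret and so on), the recurring functional addition in this file is register and stack cleansing: before each AES-NI routine restores esp and pops edi/esi/ebx/ebp, it now executes pxor xmm0,xmm0 through pxor xmm7,xmm7, and where a routine cached key-schedule or tweak blocks on the stack (the ctr32 and xts paths) it also overwrites those [esp] slots with the zeroed xmm0 via movdqa. The effect is that round keys, counters, and tweaks do not linger in XMM registers or stack scratch after the call returns. A minimal C sketch of the same idea, using a hypothetical toy_block_op helper (illustration only, not OpenSSL code):

    /* sketch.c — build with: cc -msse2 -c sketch.c */
    #include <emmintrin.h>

    void toy_block_op(unsigned char out[16], const unsigned char in[16],
                      const unsigned char key[16])
    {
        __m128i k = _mm_loadu_si128((const __m128i *)key);
        __m128i b = _mm_loadu_si128((const __m128i *)in);

        b = _mm_xor_si128(b, k);             /* stands in for the AES rounds */
        _mm_storeu_si128((__m128i *)out, b);

        /* Counterpart of the added "pxor xmmN,xmmN" instructions (and the
         * "movdqa XMMWORD PTR [esp],xmm0" stores in the ctr32/xts paths):
         * zero every key-derived temporary before returning. */
        k = _mm_setzero_si128();
        b = _mm_setzero_si128();
        (void)k;
        (void)b;
    }

One caveat the comment hints at: a C compiler may legally elide such dead stores, which is one reason the generated .asm performs the cleansing explicitly at every ret path, including $L100good_key and the error exits of __aesni_set_encrypt_key.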